[PATCH net-next v9 01/13] mm: page_frag: add a test module for page

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH net-next v9 01/13] mm: page_frag: add a test module for page_frag
       [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
@ 2024-06-25 13:52 ` Yunsheng Lin
  2024-06-25 13:52 ` [PATCH net-next v9 02/13] mm: move the page fragment allocator from page_alloc into its own file Yunsheng Lin
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-25 13:52 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

Basing on the lib/objpool.c, change it to something like a
ptrpool, so that we can utilize that to test the correctness
and performance of the page_frag.

The testing is done by ensuring that the fragment allocated
from a frag_frag_cache instance is pushed into a ptrpool
instance in a kthread binded to a specified cpu, and a kthread
binded to a specified cpu will pop the fragment from the
ptrpool and free the fragment.

We may refactor out the common part between objpool and ptrpool
if this ptrpool thing turns out to be helpful for other place.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 mm/Kconfig.debug    |   8 +
 mm/Makefile         |   1 +
 mm/page_frag_test.c | 389 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 398 insertions(+)
 create mode 100644 mm/page_frag_test.c

diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index afc72fde0f03..1ebcd45f47d4 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -142,6 +142,14 @@ config DEBUG_PAGE_REF
 	  kernel code.  However the runtime performance overhead is virtually
 	  nil until the tracepoints are actually enabled.
 
+config DEBUG_PAGE_FRAG_TEST
+	tristate "Test module for page_frag"
+	default n
+	depends on m && DEBUG_KERNEL
+	help
+	  This builds the "page_frag_test" module that is used to test the
+	  correctness and performance of page_frag's implementation.
+
 config DEBUG_RODATA_TEST
     bool "Testcase for the marking rodata read-only"
     depends on STRICT_KERNEL_RWX
diff --git a/mm/Makefile b/mm/Makefile
index 8fb85acda1b1..29d9f7618a33 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
+obj-$(CONFIG_DEBUG_PAGE_FRAG_TEST) += page_frag_test.o
 obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
 obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
new file mode 100644
index 000000000000..5ee3f33b756d
--- /dev/null
+++ b/mm/page_frag_test.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test module for page_frag cache
+ *
+ * Copyright: linyunsheng@huawei.com
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+#include <linux/irqflags.h>
+#include <linux/cpumask.h>
+#include <linux/log2.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+
+#define OBJPOOL_NR_OBJECT_MAX	BIT(24)
+
+struct objpool_slot {
+	u32 head;
+	u32 tail;
+	u32 last;
+	u32 mask;
+	void *entries[];
+} __packed;
+
+struct objpool_head {
+	int nr_cpus;
+	int capacity;
+	struct objpool_slot **cpu_slots;
+};
+
+/* initialize percpu objpool_slot */
+static void objpool_init_percpu_slot(struct objpool_head *pool,
+				     struct objpool_slot *slot)
+{
+	/* initialize elements of percpu objpool_slot */
+	slot->mask = pool->capacity - 1;
+}
+
+/* allocate and initialize percpu slots */
+static int objpool_init_percpu_slots(struct objpool_head *pool,
+				     int nr_objs, gfp_t gfp)
+{
+	int i;
+
+	for (i = 0; i < pool->nr_cpus; i++) {
+		struct objpool_slot *slot;
+		int size;
+
+		/* skip the cpu node which could never be present */
+		if (!cpu_possible(i))
+			continue;
+
+		size = struct_size(slot, entries, pool->capacity);
+
+		/*
+		 * here we allocate percpu-slot & objs together in a single
+		 * allocation to make it more compact, taking advantage of
+		 * warm caches and TLB hits. in default vmalloc is used to
+		 * reduce the pressure of kernel slab system. as we know,
+		 * minimal size of vmalloc is one page since vmalloc would
+		 * always align the requested size to page size
+		 */
+		if (gfp & GFP_ATOMIC)
+			slot = kmalloc_node(size, gfp, cpu_to_node(i));
+		else
+			slot = __vmalloc_node(size, sizeof(void *), gfp,
+					      cpu_to_node(i),
+					      __builtin_return_address(0));
+		if (!slot)
+			return -ENOMEM;
+
+		memset(slot, 0, size);
+		pool->cpu_slots[i] = slot;
+
+		objpool_init_percpu_slot(pool, slot);
+	}
+
+	return 0;
+}
+
+/* cleanup all percpu slots of the object pool */
+static void objpool_fini_percpu_slots(struct objpool_head *pool)
+{
+	int i;
+
+	if (!pool->cpu_slots)
+		return;
+
+	for (i = 0; i < pool->nr_cpus; i++)
+		kvfree(pool->cpu_slots[i]);
+	kfree(pool->cpu_slots);
+}
+
+/* initialize object pool and pre-allocate objects */
+static int objpool_init(struct objpool_head *pool, int nr_objs, gfp_t gfp)
+{
+	int rc, capacity, slot_size;
+
+	/* check input parameters */
+	if (nr_objs <= 0 || nr_objs > OBJPOOL_NR_OBJECT_MAX)
+		return -EINVAL;
+
+	/* calculate capacity of percpu objpool_slot */
+	capacity = roundup_pow_of_two(nr_objs);
+	if (!capacity)
+		return -EINVAL;
+
+	gfp = gfp & ~__GFP_ZERO;
+
+	/* initialize objpool pool */
+	memset(pool, 0, sizeof(struct objpool_head));
+	pool->nr_cpus = nr_cpu_ids;
+	pool->capacity = capacity;
+	slot_size = pool->nr_cpus * sizeof(struct objpool_slot *);
+	pool->cpu_slots = kzalloc(slot_size, gfp);
+	if (!pool->cpu_slots)
+		return -ENOMEM;
+
+	/* initialize per-cpu slots */
+	rc = objpool_init_percpu_slots(pool, nr_objs, gfp);
+	if (rc)
+		objpool_fini_percpu_slots(pool);
+
+	return rc;
+}
+
+/* adding object to slot, abort if the slot was already full */
+static int objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	u32 head, tail;
+
+	/* loading tail and head as a local snapshot, tail first */
+	tail = READ_ONCE(slot->tail);
+
+	do {
+		head = READ_ONCE(slot->head);
+		/* fault caught: something must be wrong */
+		if (unlikely(tail - head >= pool->capacity))
+			return -ENOSPC;
+	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
+
+	/* now the tail position is reserved for the given obj */
+	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
+	/* update sequence to make this obj available for pop() */
+	smp_store_release(&slot->last, tail + 1);
+
+	return 0;
+}
+
+/* reclaim an object to object pool */
+static int objpool_push(void *obj, struct objpool_head *pool)
+{
+	unsigned long flags;
+	int rc;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+	rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
+	raw_local_irq_restore(flags);
+
+	return rc;
+}
+
+/* try to retrieve object from slot */
+static void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	/* load head snapshot, other cpus may change it */
+	u32 head = smp_load_acquire(&slot->head);
+
+	while (head != READ_ONCE(slot->last)) {
+		void *obj;
+
+		/*
+		 * data visibility of 'last' and 'head' could be out of
+		 * order since memory updating of 'last' and 'head' are
+		 * performed in push() and pop() independently
+		 *
+		 * before any retrieving attempts, pop() must guarantee
+		 * 'last' is behind 'head', that is to say, there must
+		 * be available objects in slot, which could be ensured
+		 * by condition 'last != head && last - head <= nr_objs'
+		 * that is equivalent to 'last - head - 1 < nr_objs' as
+		 * 'last' and 'head' are both unsigned int32
+		 */
+		if (READ_ONCE(slot->last) - head - 1 >= pool->capacity) {
+			head = READ_ONCE(slot->head);
+			continue;
+		}
+
+		/* obj must be retrieved before moving forward head */
+		obj = READ_ONCE(slot->entries[head & slot->mask]);
+
+		/* move head forward to mark it's consumption */
+		if (try_cmpxchg_release(&slot->head, &head, head + 1))
+			return obj;
+	}
+
+	return NULL;
+}
+
+/* allocate an object from object pool */
+static void *objpool_pop(struct objpool_head *pool)
+{
+	void *obj = NULL;
+	unsigned long flags;
+	int i, cpu;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+
+	cpu = raw_smp_processor_id();
+	for (i = 0; i < num_possible_cpus(); i++) {
+		obj = objpool_try_get_slot(pool, cpu);
+		if (obj)
+			break;
+		cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
+	}
+	raw_local_irq_restore(flags);
+
+	return obj;
+}
+
+/* release whole objpool forcely */
+static void objpool_free(struct objpool_head *pool)
+{
+	if (!pool->cpu_slots)
+		return;
+
+	/* release percpu slots */
+	objpool_fini_percpu_slots(pool);
+}
+
+static struct objpool_head ptr_pool;
+static int nr_objs = 512;
+static atomic_t nthreads;
+static struct completion wait;
+static struct page_frag_cache test_frag;
+
+static int nr_test = 5120000;
+module_param(nr_test, int, 0);
+MODULE_PARM_DESC(nr_test, "number of iterations to test");
+
+static bool test_align;
+module_param(test_align, bool, 0);
+MODULE_PARM_DESC(test_align, "use align API for testing");
+
+static int test_alloc_len = 2048;
+module_param(test_alloc_len, int, 0);
+MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
+
+static int test_push_cpu;
+module_param(test_push_cpu, int, 0);
+MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing fragment");
+
+static int test_pop_cpu;
+module_param(test_pop_cpu, int, 0);
+MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping fragment");
+
+static int page_frag_pop_thread(void *arg)
+{
+	struct objpool_head *pool = arg;
+	int nr = nr_test;
+
+	pr_info("page_frag pop test thread begins on cpu %d\n",
+		smp_processor_id());
+
+	while (nr > 0) {
+		void *obj = objpool_pop(pool);
+
+		if (obj) {
+			nr--;
+			page_frag_free(obj);
+		} else {
+			cond_resched();
+		}
+	}
+
+	if (atomic_dec_and_test(&nthreads))
+		complete(&wait);
+
+	pr_info("page_frag pop test thread exits on cpu %d\n",
+		smp_processor_id());
+
+	return 0;
+}
+
+static int page_frag_push_thread(void *arg)
+{
+	struct objpool_head *pool = arg;
+	int nr = nr_test;
+
+	pr_info("page_frag push test thread begins on cpu %d\n",
+		smp_processor_id());
+
+	while (nr > 0) {
+		void *va;
+		int ret;
+
+		if (test_align)
+			va = page_frag_alloc_align(&test_frag, test_alloc_len,
+						   GFP_KERNEL, SMP_CACHE_BYTES);
+		else
+			va = page_frag_alloc(&test_frag, test_alloc_len, GFP_KERNEL);
+
+		if (!va)
+			continue;
+
+		ret = objpool_push(va, pool);
+		if (ret) {
+			page_frag_free(va);
+			cond_resched();
+		} else {
+			nr--;
+		}
+	}
+
+	pr_info("page_frag push test thread exits on cpu %d\n",
+		smp_processor_id());
+
+	if (atomic_dec_and_test(&nthreads))
+		complete(&wait);
+
+	return 0;
+}
+
+static int __init page_frag_test_init(void)
+{
+	struct task_struct *tsk_push, *tsk_pop;
+	ktime_t start;
+	u64 duration;
+	int ret;
+
+	test_frag.va = NULL;
+	atomic_set(&nthreads, 2);
+	init_completion(&wait);
+
+	if (test_alloc_len > PAGE_SIZE || test_alloc_len <= 0)
+		return -EINVAL;
+
+	ret = objpool_init(&ptr_pool, nr_objs, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	tsk_push = kthread_create_on_cpu(page_frag_push_thread, &ptr_pool,
+					 test_push_cpu, "page_frag_push");
+	if (IS_ERR(tsk_push))
+		return PTR_ERR(tsk_push);
+
+	tsk_pop = kthread_create_on_cpu(page_frag_pop_thread, &ptr_pool,
+					test_pop_cpu, "page_frag_pop");
+	if (IS_ERR(tsk_pop)) {
+		kthread_stop(tsk_push);
+		return PTR_ERR(tsk_pop);
+	}
+
+	start = ktime_get();
+	wake_up_process(tsk_push);
+	wake_up_process(tsk_pop);
+
+	pr_info("waiting for test to complete\n");
+	wait_for_completion(&wait);
+
+	duration = (u64)ktime_us_delta(ktime_get(), start);
+	pr_info("%d of iterations for %s testing took: %lluus\n", nr_test,
+		test_align ? "aligned" : "non-aligned", duration);
+
+	objpool_free(&ptr_pool);
+	page_frag_cache_drain(&test_frag);
+
+	return -EAGAIN;
+}
+
+static void __exit page_frag_test_exit(void)
+{
+}
+
+module_init(page_frag_test_init);
+module_exit(page_frag_test_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yunsheng Lin <linyunsheng@huawei.com>");
+MODULE_DESCRIPTION("Test module for page_frag");
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH net-next v9 02/13] mm: move the page fragment allocator from page_alloc into its own file
       [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
  2024-06-25 13:52 ` [PATCH net-next v9 01/13] mm: page_frag: add a test module for page_frag Yunsheng Lin
@ 2024-06-25 13:52 ` Yunsheng Lin
  2024-07-01 23:10   ` Alexander H Duyck
  2024-06-25 13:52 ` [PATCH net-next v9 03/13] mm: page_frag: use initial zero offset for page_frag_alloc_align() Yunsheng Lin
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-25 13:52 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, David Howells,
	Alexander Duyck, Andrew Morton, linux-mm

Inspired by [1], move the page fragment allocator from page_alloc
into its own c file and header file, as we are about to make more
change for it to replace another page_frag implementation in
sock.c

1. https://lore.kernel.org/all/20230411160902.4134381-3-dhowells@redhat.com/

CC: David Howells <dhowells@redhat.com>
CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/gfp.h             |  22 -----
 include/linux/mm_types.h        |  18 ----
 include/linux/page_frag_cache.h |  47 +++++++++++
 include/linux/skbuff.h          |   1 +
 mm/Makefile                     |   1 +
 mm/page_alloc.c                 | 136 ------------------------------
 mm/page_frag_cache.c            | 144 ++++++++++++++++++++++++++++++++
 mm/page_frag_test.c             |   1 +
 8 files changed, 194 insertions(+), 176 deletions(-)
 create mode 100644 include/linux/page_frag_cache.h
 create mode 100644 mm/page_frag_cache.c

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7f9691d375f0..3d8f9dc6c6ee 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -363,28 +363,6 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 
-struct page_frag_cache;
-void page_frag_cache_drain(struct page_frag_cache *nc);
-extern void __page_frag_cache_drain(struct page *page, unsigned int count);
-void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
-			      gfp_t gfp_mask, unsigned int align_mask);
-
-static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
-					  unsigned int fragsz, gfp_t gfp_mask,
-					  unsigned int align)
-{
-	WARN_ON_ONCE(!is_power_of_2(align));
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
-}
-
-static inline void *page_frag_alloc(struct page_frag_cache *nc,
-			     unsigned int fragsz, gfp_t gfp_mask)
-{
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
-}
-
-extern void page_frag_free(void *addr);
-
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index af3a0256fa93..7a4e695a7a1e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -505,9 +505,6 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
  */
 #define STRUCT_PAGE_MAX_SHIFT	(order_base_2(sizeof(struct page)))
 
-#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
-#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
-
 /*
  * page_private can be used on tail pages.  However, PagePrivate is only
  * checked by the VM on the head page.  So page_private on the tail pages
@@ -526,21 +523,6 @@ static inline void *folio_get_private(struct folio *folio)
 	return folio->private;
 }
 
-struct page_frag_cache {
-	void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
-	__u16 size;
-#else
-	__u32 offset;
-#endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
-};
-
 typedef unsigned long vm_flags_t;
 
 /*
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
new file mode 100644
index 000000000000..3a44bfc99750
--- /dev/null
+++ b/include/linux/page_frag_cache.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_PAGE_FRAG_CACHE_H
+#define _LINUX_PAGE_FRAG_CACHE_H
+
+#include <linux/gfp_types.h>
+
+#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
+#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
+
+struct page_frag_cache {
+	void *va;
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	__u16 offset;
+	__u16 size;
+#else
+	__u32 offset;
+#endif
+	/* we maintain a pagecount bias, so that we dont dirty cache line
+	 * containing page->_refcount every time we allocate a fragment.
+	 */
+	unsigned int		pagecnt_bias;
+	bool pfmemalloc;
+};
+
+void page_frag_cache_drain(struct page_frag_cache *nc);
+void __page_frag_cache_drain(struct page *page, unsigned int count);
+void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
+			      gfp_t gfp_mask, unsigned int align_mask);
+
+static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
+					  unsigned int fragsz, gfp_t gfp_mask,
+					  unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
+}
+
+static inline void *page_frag_alloc(struct page_frag_cache *nc,
+				    unsigned int fragsz, gfp_t gfp_mask)
+{
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+}
+
+void page_frag_free(void *addr);
+
+#endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f4cda3fbdb75..eb8ae8292c48 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -31,6 +31,7 @@
 #include <linux/in6.h>
 #include <linux/if_packet.h>
 #include <linux/llist.h>
+#include <linux/page_frag_cache.h>
 #include <net/flow.h>
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <linux/netfilter/nf_conntrack_common.h>
diff --git a/mm/Makefile b/mm/Makefile
index 29d9f7618a33..3080257a0a75 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -64,6 +64,7 @@ page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
 memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 
 obj-y += page-alloc.o
+obj-y += page_frag_cache.o
 obj-y += init-mm.o
 obj-y += memblock.o
 obj-y += $(memory-hotplug-y)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7300aa9f14b0..f9d4cfc9aeb7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4781,142 +4781,6 @@ void free_pages(unsigned long addr, unsigned int order)
 
 EXPORT_SYMBOL(free_pages);
 
-/*
- * Page Fragment:
- *  An arbitrary-length arbitrary-offset area of memory which resides
- *  within a 0 or higher order page.  Multiple fragments within that page
- *  are individually refcounted, in the page's reference counter.
- *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments.  This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
- */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
-					     gfp_t gfp_mask)
-{
-	struct page *page = NULL;
-	gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
-		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-#endif
-	if (unlikely(!page))
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
-	nc->va = page ? page_address(page) : NULL;
-
-	return page;
-}
-
-void page_frag_cache_drain(struct page_frag_cache *nc)
-{
-	if (!nc->va)
-		return;
-
-	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
-	nc->va = NULL;
-}
-EXPORT_SYMBOL(page_frag_cache_drain);
-
-void __page_frag_cache_drain(struct page *page, unsigned int count)
-{
-	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
-
-	if (page_ref_sub_and_test(page, count))
-		free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(__page_frag_cache_drain);
-
-void *__page_frag_alloc_align(struct page_frag_cache *nc,
-			      unsigned int fragsz, gfp_t gfp_mask,
-			      unsigned int align_mask)
-{
-	unsigned int size = PAGE_SIZE;
-	struct page *page;
-	int offset;
-
-	if (unlikely(!nc->va)) {
-refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
-		if (!page)
-			return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
-	}
-
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
-		page = virt_to_page(nc->va);
-
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
-			goto refill;
-		}
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
-			/*
-			 * The caller is trying to allocate a fragment
-			 * with fragsz > PAGE_SIZE but the cache isn't big
-			 * enough to satisfy the request, this may
-			 * happen in low memory conditions.
-			 * We don't release the cache page because
-			 * it could make memory pressure worse
-			 * so we simply return NULL here.
-			 */
-			return NULL;
-		}
-	}
-
-	nc->pagecnt_bias--;
-	offset &= align_mask;
-	nc->offset = offset;
-
-	return nc->va + offset;
-}
-EXPORT_SYMBOL(__page_frag_alloc_align);
-
-/*
- * Frees a page fragment allocated out of either a compound or order 0 page.
- */
-void page_frag_free(void *addr)
-{
-	struct page *page = virt_to_head_page(addr);
-
-	if (unlikely(put_page_testzero(page)))
-		free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(page_frag_free);
-
 static void *make_alloc_exact(unsigned long addr, unsigned int order,
 		size_t size)
 {
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
new file mode 100644
index 000000000000..88f567ef0e29
--- /dev/null
+++ b/mm/page_frag_cache.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Page fragment allocator
+ *
+ * Page Fragment:
+ *  An arbitrary-length arbitrary-offset area of memory which resides within a
+ *  0 or higher order page.  Multiple fragments within that page are
+ *  individually refcounted, in the page's reference counter.
+ *
+ * The page_frag functions provide a simple allocation framework for page
+ * fragments.  This is used by the network stack and network device drivers to
+ * provide a backing region of memory for use as either an sk_buff->head, or to
+ * be used in the "frags" portion of skb_shared_info.
+ */
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/page_frag_cache.h>
+#include "internal.h"
+
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+					     gfp_t gfp_mask)
+{
+	struct page *page = NULL;
+	gfp_t gfp = gfp_mask;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
+	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+				PAGE_FRAG_CACHE_MAX_ORDER);
+	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+#endif
+	if (unlikely(!page))
+		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+	nc->va = page ? page_address(page) : NULL;
+
+	return page;
+}
+
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+	if (!nc->va)
+		return;
+
+	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
+	nc->va = NULL;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
+void __page_frag_cache_drain(struct page *page, unsigned int count)
+{
+	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+	if (page_ref_sub_and_test(page, count))
+		free_unref_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(__page_frag_cache_drain);
+
+void *__page_frag_alloc_align(struct page_frag_cache *nc,
+			      unsigned int fragsz, gfp_t gfp_mask,
+			      unsigned int align_mask)
+{
+	unsigned int size = PAGE_SIZE;
+	struct page *page;
+	int offset;
+
+	if (unlikely(!nc->va)) {
+refill:
+		page = __page_frag_cache_refill(nc, gfp_mask);
+		if (!page)
+			return NULL;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
+		/* Even if we own the page, we do not use atomic_set().
+		 * This would break get_page_unless_zero() users.
+		 */
+		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pfmemalloc = page_is_pfmemalloc(page);
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		nc->offset = size;
+	}
+
+	offset = nc->offset - fragsz;
+	if (unlikely(offset < 0)) {
+		page = virt_to_page(nc->va);
+
+		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+			goto refill;
+
+		if (unlikely(nc->pfmemalloc)) {
+			free_unref_page(page, compound_order(page));
+			goto refill;
+		}
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
+		/* OK, page count is 0, we can safely set it */
+		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		offset = size - fragsz;
+		if (unlikely(offset < 0)) {
+			/*
+			 * The caller is trying to allocate a fragment
+			 * with fragsz > PAGE_SIZE but the cache isn't big
+			 * enough to satisfy the request, this may
+			 * happen in low memory conditions.
+			 * We don't release the cache page because
+			 * it could make memory pressure worse
+			 * so we simply return NULL here.
+			 */
+			return NULL;
+		}
+	}
+
+	nc->pagecnt_bias--;
+	offset &= align_mask;
+	nc->offset = offset;
+
+	return nc->va + offset;
+}
+EXPORT_SYMBOL(__page_frag_alloc_align);
+
+/*
+ * Frees a page fragment allocated out of either a compound or order 0 page.
+ */
+void page_frag_free(void *addr)
+{
+	struct page *page = virt_to_head_page(addr);
+
+	if (unlikely(put_page_testzero(page)))
+		free_unref_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(page_frag_free);
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index 5ee3f33b756d..07748ee0a21f 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -16,6 +16,7 @@
 #include <linux/log2.h>
 #include <linux/completion.h>
 #include <linux/kthread.h>
+#include <linux/page_frag_cache.h>
 
 #define OBJPOOL_NR_OBJECT_MAX	BIT(24)
 
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH net-next v9 03/13] mm: page_frag: use initial zero offset for page_frag_alloc_align()
       [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
  2024-06-25 13:52 ` [PATCH net-next v9 01/13] mm: page_frag: add a test module for page_frag Yunsheng Lin
  2024-06-25 13:52 ` [PATCH net-next v9 02/13] mm: move the page fragment allocator from page_alloc into its own file Yunsheng Lin
@ 2024-06-25 13:52 ` Yunsheng Lin
  2024-07-01 23:27   ` Alexander H Duyck
  2024-06-25 13:52 ` [PATCH net-next v9 04/13] mm: page_frag: add '_va' suffix to page_frag API Yunsheng Lin
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-25 13:52 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

We are above to use page_frag_alloc_*() API to not just
allocate memory for skb->data, but also use them to do
the memory allocation for skb frag too. Currently the
implementation of page_frag in mm subsystem is running
the offset as a countdown rather than count-up value,
there may have several advantages to that as mentioned
in [1], but it may have some disadvantages, for example,
it may disable skb frag coaleasing and more correct cache
prefetching

We have a trade-off to make in order to have a unified
implementation and API for page_frag, so use a initial zero
offset in this patch, and the following patch will try to
make some optimization to aovid the disadvantages as much
as possible.

As offsets is added due to alignment requirement before
actually checking if the cache is enough, which might make
it exploitable if caller passes a align value bigger than
32K mistakenly. As we are allowing order 3 page allocation
to fail easily under low memory condition, align value bigger
than PAGE_SIZE is not really allowed, so add a 'align >
PAGE_SIZE' checking in page_frag_alloc_va_align() to catch
that.

1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h |  2 +-
 include/linux/skbuff.h          |  4 ++--
 mm/page_frag_cache.c            | 26 +++++++++++---------------
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 3a44bfc99750..b9411f0db25a 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -32,7 +32,7 @@ static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
 					  unsigned int fragsz, gfp_t gfp_mask,
 					  unsigned int align)
 {
-	WARN_ON_ONCE(!is_power_of_2(align));
+	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
 	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
 }
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index eb8ae8292c48..d1fea23ec386 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3320,7 +3320,7 @@ static inline void *netdev_alloc_frag(unsigned int fragsz)
 static inline void *netdev_alloc_frag_align(unsigned int fragsz,
 					    unsigned int align)
 {
-	WARN_ON_ONCE(!is_power_of_2(align));
+	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
 	return __netdev_alloc_frag_align(fragsz, -align);
 }
 
@@ -3391,7 +3391,7 @@ static inline void *napi_alloc_frag(unsigned int fragsz)
 static inline void *napi_alloc_frag_align(unsigned int fragsz,
 					  unsigned int align)
 {
-	WARN_ON_ONCE(!is_power_of_2(align));
+	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
 	return __napi_alloc_frag_align(fragsz, -align);
 }
 
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 88f567ef0e29..da244851b8a4 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -72,10 +72,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		if (!page)
 			return NULL;
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
@@ -84,11 +80,16 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
+		nc->offset = 0;
 	}
 
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	/* if size can vary use size else just use PAGE_SIZE */
+	size = nc->size;
+#endif
+
+	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
+	if (unlikely(offset + fragsz > size)) {
 		page = virt_to_page(nc->va);
 
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
@@ -99,17 +100,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			goto refill;
 		}
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
 		/* OK, page count is 0, we can safely set it */
 		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
+		offset = 0;
+		if (unlikely(fragsz > PAGE_SIZE)) {
 			/*
 			 * The caller is trying to allocate a fragment
 			 * with fragsz > PAGE_SIZE but the cache isn't big
@@ -124,8 +121,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 	}
 
 	nc->pagecnt_bias--;
-	offset &= align_mask;
-	nc->offset = offset;
+	nc->offset = offset + fragsz;
 
 	return nc->va + offset;
 }
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH net-next v9 04/13] mm: page_frag: add '_va' suffix to page_frag API
       [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
                   ` (2 preceding siblings ...)
  2024-06-25 13:52 ` [PATCH net-next v9 03/13] mm: page_frag: use initial zero offset for page_frag_alloc_align() Yunsheng Lin
@ 2024-06-25 13:52 ` Yunsheng Lin
  2024-06-25 13:52 ` [PATCH net-next v9 05/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly Yunsheng Lin
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-25 13:52 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Jeroen de Borst, Praveen Kaligineedi, Shailend Chand,
	Eric Dumazet, Jesse Brandeburg, Tony Nguyen, Sunil Goutham,
	Geetha sowjanya, Subbaraya Sundeep, hariprasad, Felix Fietkau,
	Sean Wang, Mark Lee, Lorenzo Bianconi, Matthias Brugger,
	AngeloGioacchino Del Regno, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Chaitanya Kulkarni,
	Michael S. Tsirkin, Jason Wang, Eugenio Pérez, Andrew Morton,
	Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
	John Fastabend, Andrii Nakryiko, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
	Stanislav Fomichev, Hao Luo, Jiri Olsa, David Howells,
	Marc Dionne, Chuck Lever, Jeff Layton, Neil Brown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, intel-wired-lan, linux-arm-kernel, linux-mediatek,
	linux-nvme, kvm, virtualization, linux-mm, bpf, linux-afs,
	linux-nfs

Currently the page_frag API is returning 'virtual address'
or 'va' when allocing and expecting 'virtual address' or
'va' as input when freeing.

As we are about to support new use cases that the caller
need to deal with 'struct page' or need to deal with both
'va' and 'struct page'. In order to differentiate the API
handling between 'va' and 'struct page', add '_va' suffix
to the corresponding API mirroring the page_pool_alloc_va()
API of the page_pool. So that callers expecting to deal with
va, page or both va and page may call page_frag_alloc_va*,
page_frag_alloc_pg*, or page_frag_alloc* API accordingly.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 drivers/net/ethernet/google/gve/gve_rx.c      |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c     |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h     |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c       |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c    |  4 ++--
 drivers/nvme/host/tcp.c                       |  8 +++----
 drivers/nvme/target/tcp.c                     | 22 +++++++++----------
 drivers/vhost/net.c                           |  6 ++---
 include/linux/page_frag_cache.h               | 21 +++++++++---------
 include/linux/skbuff.h                        |  2 +-
 kernel/bpf/cpumap.c                           |  2 +-
 mm/page_frag_cache.c                          | 12 +++++-----
 mm/page_frag_test.c                           | 13 ++++++-----
 net/core/skbuff.c                             | 14 ++++++------
 net/core/xdp.c                                |  2 +-
 net/rxrpc/txbuf.c                             | 15 +++++++------
 net/sunrpc/svcsock.c                          |  6 ++---
 19 files changed, 74 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
 
 	total_len = headroom + SKB_DATA_ALIGN(len) +
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+	frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
 	if (!frame) {
 		u64_stats_update_begin(&rx->statss);
 		rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
 
 	err = xdp_do_redirect(dev, &new, xdp_prog);
 	if (err)
-		page_frag_free(frame);
+		page_frag_free_va(frame);
 
 	return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf)
 		dev_kfree_skb_any(tx_buf->skb);
 		break;
 	case ICE_TX_BUF_XDP_TX:
-		page_frag_free(tx_buf->raw_buf);
+		page_frag_free_va(tx_buf->raw_buf);
 		break;
 	case ICE_TX_BUF_XDP_XMIT:
 		xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf *tx_buf,
 
 	switch (tx_buf->type) {
 	case ICE_TX_BUF_XDP_TX:
-		page_frag_free(tx_buf->raw_buf);
+		page_frag_free_va(tx_buf->raw_buf);
 		break;
 	case ICE_TX_BUF_XDP_XMIT:
 		xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index b938dc06045d..fcd1b149a45d 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -303,7 +303,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 
 		/* free the skb */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			page_frag_free_va(tx_buffer->data);
 		else
 			napi_consume_skb(tx_buffer->skb, napi_budget);
 
@@ -2413,7 +2413,7 @@ static void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring)
 
 		/* Free all the Tx ring sk_buffs */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			page_frag_free_va(tx_buffer->data);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
index a85ac039d779..8eb5820b8a70 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
@@ -553,7 +553,7 @@ static int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
 	*dma = dma_map_single_attrs(pfvf->dev, buf, pool->rbsize,
 				    DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
 	if (unlikely(dma_mapping_error(pfvf->dev, *dma))) {
-		page_frag_free(buf);
+		page_frag_free_va(buf);
 		return -ENOMEM;
 	}
 
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index 7063c78bd35f..c4228719f8a4 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -142,8 +142,8 @@ mtk_wed_wo_queue_refill(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q,
 		dma_addr_t addr;
 		void *buf;
 
-		buf = page_frag_alloc(&q->cache, q->buf_size,
-				      GFP_ATOMIC | GFP_DMA32);
+		buf = page_frag_alloc_va(&q->cache, q->buf_size,
+					 GFP_ATOMIC | GFP_DMA32);
 		if (!buf)
 			break;
 
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 8b5e4327fe83..4b7a897897fc 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -506,7 +506,7 @@ static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
 {
 	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 
-	page_frag_free(req->pdu);
+	page_frag_free_va(req->pdu);
 }
 
 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
@@ -520,7 +520,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	req->pdu = page_frag_alloc(&queue->pf_cache,
+	req->pdu = page_frag_alloc_va(&queue->pf_cache,
 		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 		GFP_KERNEL | __GFP_ZERO);
 	if (!req->pdu)
@@ -1337,7 +1337,7 @@ static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
 {
 	struct nvme_tcp_request *async = &ctrl->async_req;
 
-	page_frag_free(async->pdu);
+	page_frag_free_va(async->pdu);
 }
 
 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
@@ -1346,7 +1346,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 	struct nvme_tcp_request *async = &ctrl->async_req;
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	async->pdu = page_frag_alloc(&queue->pf_cache,
+	async->pdu = page_frag_alloc_va(&queue->pf_cache,
 		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 		GFP_KERNEL | __GFP_ZERO);
 	if (!async->pdu)
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 380f22ee3ebb..bea3aa79ef43 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1463,24 +1463,24 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
 	c->queue = queue;
 	c->req.port = queue->port->nport;
 
-	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
+	c->cmd_pdu = page_frag_alloc_va(&queue->pf_cache,
 			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
 	if (!c->cmd_pdu)
 		return -ENOMEM;
 	c->req.cmd = &c->cmd_pdu->cmd;
 
-	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
+	c->rsp_pdu = page_frag_alloc_va(&queue->pf_cache,
 			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
 	if (!c->rsp_pdu)
 		goto out_free_cmd;
 	c->req.cqe = &c->rsp_pdu->cqe;
 
-	c->data_pdu = page_frag_alloc(&queue->pf_cache,
+	c->data_pdu = page_frag_alloc_va(&queue->pf_cache,
 			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
 	if (!c->data_pdu)
 		goto out_free_rsp;
 
-	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
+	c->r2t_pdu = page_frag_alloc_va(&queue->pf_cache,
 			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
 	if (!c->r2t_pdu)
 		goto out_free_data;
@@ -1495,20 +1495,20 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
 
 	return 0;
 out_free_data:
-	page_frag_free(c->data_pdu);
+	page_frag_free_va(c->data_pdu);
 out_free_rsp:
-	page_frag_free(c->rsp_pdu);
+	page_frag_free_va(c->rsp_pdu);
 out_free_cmd:
-	page_frag_free(c->cmd_pdu);
+	page_frag_free_va(c->cmd_pdu);
 	return -ENOMEM;
 }
 
 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
 {
-	page_frag_free(c->r2t_pdu);
-	page_frag_free(c->data_pdu);
-	page_frag_free(c->rsp_pdu);
-	page_frag_free(c->cmd_pdu);
+	page_frag_free_va(c->r2t_pdu);
+	page_frag_free_va(c->data_pdu);
+	page_frag_free_va(c->rsp_pdu);
+	page_frag_free_va(c->cmd_pdu);
 }
 
 static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f16279351db5..6691fac01e0d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -686,8 +686,8 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 		return -ENOSPC;
 
 	buflen += SKB_DATA_ALIGN(len + pad);
-	buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
-				    SMP_CACHE_BYTES);
+	buf = page_frag_alloc_va_align(&net->pf_cache, buflen, GFP_KERNEL,
+				       SMP_CACHE_BYTES);
 	if (unlikely(!buf))
 		return -ENOMEM;
 
@@ -734,7 +734,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 	return 0;
 
 err:
-	page_frag_free(buf);
+	page_frag_free_va(buf);
 	return ret;
 }
 
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index b9411f0db25a..c6fde197a6eb 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -25,23 +25,24 @@ struct page_frag_cache {
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
-void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
-			      gfp_t gfp_mask, unsigned int align_mask);
+void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
+				 unsigned int fragsz, gfp_t gfp_mask,
+				 unsigned int align_mask);
 
-static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
-					  unsigned int fragsz, gfp_t gfp_mask,
-					  unsigned int align)
+static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
+					     unsigned int fragsz,
+					     gfp_t gfp_mask, unsigned int align)
 {
 	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
+	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
 }
 
-static inline void *page_frag_alloc(struct page_frag_cache *nc,
-				    unsigned int fragsz, gfp_t gfp_mask)
+static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
+				       unsigned int fragsz, gfp_t gfp_mask)
 {
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, ~0u);
 }
 
-void page_frag_free(void *addr);
+void page_frag_free_va(void *addr);
 
 #endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d1fea23ec386..4cfcbbcf8666 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3378,7 +3378,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
 
 static inline void skb_free_frag(void *addr)
 {
-	page_frag_free(addr);
+	page_frag_free_va(addr);
 }
 
 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 068e994ed781..fa3c70db5aa1 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -323,7 +323,7 @@ static int cpu_map_kthread_run(void *data)
 
 			/* Bring struct page memory area to curr CPU. Read by
 			 * build_skb_around via page_is_pfmemalloc(), and when
-			 * freed written by page_frag_free call.
+			 * freed written by page_frag_free_va call.
 			 */
 			prefetchw(page);
 		}
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index da244851b8a4..dd640af5607a 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -58,9 +58,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
-void *__page_frag_alloc_align(struct page_frag_cache *nc,
-			      unsigned int fragsz, gfp_t gfp_mask,
-			      unsigned int align_mask)
+void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
+				 unsigned int fragsz, gfp_t gfp_mask,
+				 unsigned int align_mask)
 {
 	unsigned int size = PAGE_SIZE;
 	struct page *page;
@@ -125,16 +125,16 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 
 	return nc->va + offset;
 }
-EXPORT_SYMBOL(__page_frag_alloc_align);
+EXPORT_SYMBOL(__page_frag_alloc_va_align);
 
 /*
  * Frees a page fragment allocated out of either a compound or order 0 page.
  */
-void page_frag_free(void *addr)
+void page_frag_free_va(void *addr)
 {
 	struct page *page = virt_to_head_page(addr);
 
 	if (unlikely(put_page_testzero(page)))
 		free_unref_page(page, compound_order(page));
 }
-EXPORT_SYMBOL(page_frag_free);
+EXPORT_SYMBOL(page_frag_free_va);
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index 07748ee0a21f..a0bd0ca5f343 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -277,7 +277,7 @@ static int page_frag_pop_thread(void *arg)
 
 		if (obj) {
 			nr--;
-			page_frag_free(obj);
+			page_frag_free_va(obj);
 		} else {
 			cond_resched();
 		}
@@ -305,17 +305,20 @@ static int page_frag_push_thread(void *arg)
 		int ret;
 
 		if (test_align)
-			va = page_frag_alloc_align(&test_frag, test_alloc_len,
-						   GFP_KERNEL, SMP_CACHE_BYTES);
+			va = page_frag_alloc_va_align(&test_frag,
+						      test_alloc_len,
+						      GFP_KERNEL,
+						      SMP_CACHE_BYTES);
 		else
-			va = page_frag_alloc(&test_frag, test_alloc_len, GFP_KERNEL);
+			va = page_frag_alloc_va(&test_frag, test_alloc_len,
+						GFP_KERNEL);
 
 		if (!va)
 			continue;
 
 		ret = objpool_push(va, pool);
 		if (ret) {
-			page_frag_free(va);
+			page_frag_free_va(va);
 			cond_resched();
 		} else {
 			nr--;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index eb9a7e65b5c8..6a84cc929505 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -314,8 +314,8 @@ void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 	fragsz = SKB_DATA_ALIGN(fragsz);
 
 	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
-	data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
-				       align_mask);
+	data = __page_frag_alloc_va_align(&nc->page, fragsz, GFP_ATOMIC,
+					  align_mask);
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 	return data;
 
@@ -330,8 +330,8 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
 
 		fragsz = SKB_DATA_ALIGN(fragsz);
-		data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
-					       align_mask);
+		data = __page_frag_alloc_va_align(nc, fragsz, GFP_ATOMIC,
+						  align_mask);
 	} else {
 		local_bh_disable();
 		data = __napi_alloc_frag_align(fragsz, align_mask);
@@ -748,14 +748,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 
 	if (in_hardirq() || irqs_disabled()) {
 		nc = this_cpu_ptr(&netdev_alloc_cache);
-		data = page_frag_alloc(nc, len, gfp_mask);
+		data = page_frag_alloc_va(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
 	} else {
 		local_bh_disable();
 		local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
 		nc = this_cpu_ptr(&napi_alloc_cache.page);
-		data = page_frag_alloc(nc, len, gfp_mask);
+		data = page_frag_alloc_va(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
 
 		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
@@ -845,7 +845,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 	} else {
 		len = SKB_HEAD_ALIGN(len);
 
-		data = page_frag_alloc(&nc->page, len, gfp_mask);
+		data = page_frag_alloc_va(&nc->page, len, gfp_mask);
 		pfmemalloc = nc->page.pfmemalloc;
 	}
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 41693154e426..245a2d011aeb 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -391,7 +391,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		page_pool_put_full_page(page->pp, page, napi_direct);
 		break;
 	case MEM_TYPE_PAGE_SHARED:
-		page_frag_free(data);
+		page_frag_free_va(data);
 		break;
 	case MEM_TYPE_PAGE_ORDER0:
 		page = virt_to_page(data); /* Assumes order0 page*/
diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c
index c3913d8a50d3..dccb0353ee84 100644
--- a/net/rxrpc/txbuf.c
+++ b/net/rxrpc/txbuf.c
@@ -33,8 +33,8 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_
 
 	data_align = umax(data_align, L1_CACHE_BYTES);
 	mutex_lock(&call->conn->tx_data_alloc_lock);
-	buf = page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp,
-				    data_align);
+	buf = page_frag_alloc_va_align(&call->conn->tx_data_alloc, total, gfp,
+				       data_align);
 	mutex_unlock(&call->conn->tx_data_alloc_lock);
 	if (!buf) {
 		kfree(txb);
@@ -96,17 +96,18 @@ struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_s
 	if (!txb)
 		return NULL;
 
-	buf = page_frag_alloc(&call->local->tx_alloc,
-			      sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp);
+	buf = page_frag_alloc_va(&call->local->tx_alloc,
+				 sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp);
 	if (!buf) {
 		kfree(txb);
 		return NULL;
 	}
 
 	if (sack_size) {
-		buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp);
+		buf2 = page_frag_alloc_va(&call->local->tx_alloc, sack_size,
+					  gfp);
 		if (!buf2) {
-			page_frag_free(buf);
+			page_frag_free_va(buf);
 			kfree(txb);
 			return NULL;
 		}
@@ -180,7 +181,7 @@ static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb)
 			  rxrpc_txbuf_free);
 	for (i = 0; i < txb->nr_kvec; i++)
 		if (txb->kvec[i].iov_base)
-			page_frag_free(txb->kvec[i].iov_base);
+			page_frag_free_va(txb->kvec[i].iov_base);
 	kfree(txb);
 	atomic_dec(&rxrpc_nr_txbuf);
 }
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 6b3f01beb294..42d20412c1c3 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1222,8 +1222,8 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
 	/* The stream record marker is copied into a temporary page
 	 * fragment buffer so that it can be included in rq_bvec.
 	 */
-	buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker),
-			      GFP_KERNEL);
+	buf = page_frag_alloc_va(&svsk->sk_frag_cache, sizeof(marker),
+				 GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 	memcpy(buf, &marker, sizeof(marker));
@@ -1235,7 +1235,7 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
 	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
 		      1 + count, sizeof(marker) + rqstp->rq_res.len);
 	ret = sock_sendmsg(svsk->sk_sock, &msg);
-	page_frag_free(buf);
+	page_frag_free_va(buf);
 	if (ret < 0)
 		return ret;
 	*sentp += ret;
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH net-next v9 05/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly
       [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
                   ` (3 preceding siblings ...)
  2024-06-25 13:52 ` [PATCH net-next v9 04/13] mm: page_frag: add '_va' suffix to page_frag API Yunsheng Lin
@ 2024-06-25 13:52 ` Yunsheng Lin
  2024-06-25 13:52 ` [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc' Yunsheng Lin
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-25 13:52 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Michael S. Tsirkin, Jason Wang, Eugenio Pérez, Andrew Morton,
	Eric Dumazet, David Howells, Marc Dionne, Chuck Lever,
	Jeff Layton, Neil Brown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, kvm, virtualization, linux-mm,
	linux-afs, linux-nfs

Use appropriate frag_page API instead of caller accessing
'page_frag_cache' directly.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 drivers/vhost/net.c             |  2 +-
 include/linux/page_frag_cache.h | 10 ++++++++++
 mm/page_frag_test.c             |  2 +-
 net/core/skbuff.c               |  6 +++---
 net/rxrpc/conn_object.c         |  4 +---
 net/rxrpc/local_object.c        |  4 +---
 net/sunrpc/svcsock.c            |  6 ++----
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 			vqs[VHOST_NET_VQ_RX]);
 
 	f->private_data = n;
-	n->pf_cache.va = NULL;
+	page_frag_cache_init(&n->pf_cache);
 
 	return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index c6fde197a6eb..6ac3a25089d1 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
 	bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+	nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+	return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index a0bd0ca5f343..cdffebc20a10 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -341,7 +341,7 @@ static int __init page_frag_test_init(void)
 	u64 duration;
 	int ret;
 
-	test_frag.va = NULL;
+	page_frag_cache_init(&test_frag);
 	atomic_set(&nthreads, 2);
 	init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6a84cc929505..59d42d642067 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -749,14 +749,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 	if (in_hardirq() || irqs_disabled()) {
 		nc = this_cpu_ptr(&netdev_alloc_cache);
 		data = page_frag_alloc_va(nc, len, gfp_mask);
-		pfmemalloc = nc->pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 	} else {
 		local_bh_disable();
 		local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
 		nc = this_cpu_ptr(&napi_alloc_cache.page);
 		data = page_frag_alloc_va(nc, len, gfp_mask);
-		pfmemalloc = nc->pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 
 		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 		local_bh_enable();
@@ -846,7 +846,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 		len = SKB_HEAD_ALIGN(len);
 
 		data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-		pfmemalloc = nc->page.pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
 	}
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
 	 */
 	rxrpc_purge_queue(&conn->rx_queue);
 
-	if (conn->tx_data_alloc.va)
-		__page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-					conn->tx_data_alloc.pagecnt_bias);
+	page_frag_cache_drain(&conn->tx_data_alloc);
 	call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
 	rxrpc_purge_queue(&local->rx_queue);
 	rxrpc_purge_client_connections(local);
-	if (local->tx_alloc.va)
-		__page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-					local->tx_alloc.pagecnt_bias);
+	page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
 	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-	struct page_frag_cache *pfc = &svsk->sk_frag_cache;
 	struct socket *sock = svsk->sk_sock;
 
 	trace_svcsock_free(svsk, sock);
@@ -1619,8 +1618,7 @@ static void svc_sock_free(struct svc_xprt *xprt)
 		sockfd_put(sock);
 	else
 		sock_release(sock);
-	if (pfc->va)
-		__page_frag_cache_drain(virt_to_head_page(pfc->va),
-					pfc->pagecnt_bias);
+
+	page_frag_cache_drain(&svsk->sk_frag_cache);
 	kfree(svsk);
 }
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
       [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
                   ` (4 preceding siblings ...)
  2024-06-25 13:52 ` [PATCH net-next v9 05/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly Yunsheng Lin
@ 2024-06-25 13:52 ` Yunsheng Lin
  2024-07-02  0:08   ` Alexander H Duyck
  2024-06-25 13:52 ` [PATCH net-next v9 07/13] mm: page_frag: some minor refactoring before adding new API Yunsheng Lin
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-25 13:52 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

Currently there is one 'struct page_frag' for every 'struct
sock' and 'struct task_struct', we are about to replace the
'struct page_frag' with 'struct page_frag_cache' for them.
Before begin the replacing, we need to ensure the size of
'struct page_frag_cache' is not bigger than the size of
'struct page_frag', as there may be tens of thousands of
'struct sock' and 'struct task_struct' instances in the
system.

By or'ing the page order & pfmemalloc with lower bits of
'va' instead of using 'u16' or 'u32' for page size and 'u8'
for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
And page address & pfmemalloc & order is unchanged for the
same page in the same 'page_frag_cache' instance, it makes
sense to fit them together.

Also, it is better to replace 'offset' with 'remaining', which
is the remaining size for the cache in a 'page_frag_cache'
instance, we are able to do a single 'fragsz > remaining'
checking for the case of cache not being enough, which should be
the fast path if we ensure size is zoro when 'va' == NULL by
memset'ing 'struct page_frag_cache' in page_frag_cache_init()
and page_frag_cache_drain().

After this patch, the size of 'struct page_frag_cache' should be
the same as the size of 'struct page_frag'.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h | 76 +++++++++++++++++++++++-----
 mm/page_frag_cache.c            | 90 ++++++++++++++++++++-------------
 2 files changed, 118 insertions(+), 48 deletions(-)

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 6ac3a25089d1..b33904d4494f 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -8,29 +8,81 @@
 #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 
-struct page_frag_cache {
-	void *va;
+/*
+ * struct encoded_va - a nonexistent type marking this pointer
+ *
+ * An 'encoded_va' pointer is a pointer to a aligned virtual address, which is
+ * at least aligned to PAGE_SIZE, that means there are at least 12 lower bits
+ * space available for other purposes.
+ *
+ * Currently we use the lower 8 bits and bit 9 for the order and PFMEMALLOC
+ * flag of the page this 'va' is corresponding to.
+ *
+ * Use the supplied helper functions to endcode/decode the pointer and bits.
+ */
+struct encoded_va;
+
+#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
+#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
+#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
+
+static inline struct encoded_va *encode_aligned_va(void *va,
+						   unsigned int order,
+						   bool pfmemalloc)
+{
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
-	__u16 size;
+	return (struct encoded_va *)((unsigned long)va | order |
+			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
 #else
-	__u32 offset;
+	return (struct encoded_va *)((unsigned long)va |
+			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
+#endif
+}
+
+static inline unsigned long encoded_page_order(struct encoded_va *encoded_va)
+{
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va;
+#else
+	return 0;
+#endif
+}
+
+static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va)
+{
+	return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va;
+}
+
+static inline void *encoded_page_address(struct encoded_va *encoded_va)
+{
+	return (void *)((unsigned long)encoded_va & PAGE_MASK);
+}
+
+struct page_frag_cache {
+	struct encoded_va *encoded_va;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
+	u16 pagecnt_bias;
+	u16 remaining;
+#else
+	u32 pagecnt_bias;
+	u32 remaining;
 #endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
 };
 
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
 {
-	nc->va = NULL;
+	memset(nc, 0, sizeof(*nc));
 }
 
 static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
 {
-	return !!nc->pfmemalloc;
+	return encoded_page_pfmemalloc(nc->encoded_va);
+}
+
+static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_va)
+{
+	return PAGE_SIZE << encoded_page_order(encoded_va);
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index dd640af5607a..a3316dd50eff 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -18,34 +18,61 @@
 #include <linux/page_frag_cache.h>
 #include "internal.h"
 
+static void *page_frag_cache_current_va(struct page_frag_cache *nc)
+{
+	struct encoded_va *encoded_va = nc->encoded_va;
+
+	return (void *)(((unsigned long)encoded_va & PAGE_MASK) |
+		(page_frag_cache_page_size(encoded_va) - nc->remaining));
+}
+
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 					     gfp_t gfp_mask)
 {
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
+	unsigned int order;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
 	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
 	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
 				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
 #endif
-	if (unlikely(!page))
+	if (unlikely(!page)) {
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+		if (unlikely(!page)) {
+			memset(nc, 0, sizeof(*nc));
+			return NULL;
+		}
+
+		order = 0;
+		nc->remaining = PAGE_SIZE;
+	} else {
+		order = PAGE_FRAG_CACHE_MAX_ORDER;
+		nc->remaining = PAGE_FRAG_CACHE_MAX_SIZE;
+	}
 
-	nc->va = page ? page_address(page) : NULL;
+	/* Even if we own the page, we do not use atomic_set().
+	 * This would break get_page_unless_zero() users.
+	 */
+	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
 
+	/* reset page count bias of new frag */
+	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+	nc->encoded_va = encode_aligned_va(page_address(page), order,
+					   page_is_pfmemalloc(page));
 	return page;
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc)
 {
-	if (!nc->va)
+	if (!nc->encoded_va)
 		return;
 
-	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
-	nc->va = NULL;
+	__page_frag_cache_drain(virt_to_head_page(nc->encoded_va),
+				nc->pagecnt_bias);
+	memset(nc, 0, sizeof(*nc));
 }
 EXPORT_SYMBOL(page_frag_cache_drain);
 
@@ -62,51 +89,41 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask)
 {
-	unsigned int size = PAGE_SIZE;
+	struct encoded_va *encoded_va = nc->encoded_va;
 	struct page *page;
-	int offset;
+	int remaining;
+	void *va;
 
-	if (unlikely(!nc->va)) {
+	if (unlikely(!encoded_va)) {
 refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
-		if (!page)
+		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
 			return NULL;
 
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = 0;
+		encoded_va = nc->encoded_va;
 	}
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	/* if size can vary use size else just use PAGE_SIZE */
-	size = nc->size;
-#endif
-
-	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
-	if (unlikely(offset + fragsz > size)) {
-		page = virt_to_page(nc->va);
-
+	remaining = nc->remaining & align_mask;
+	remaining -= fragsz;
+	if (unlikely(remaining < 0)) {
+		page = virt_to_page(encoded_va);
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
 			goto refill;
 
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
+		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
+			VM_BUG_ON(compound_order(page) !=
+				  encoded_page_order(encoded_va));
+			free_unref_page(page, encoded_page_order(encoded_va));
 			goto refill;
 		}
 
 		/* OK, page count is 0, we can safely set it */
 		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
-		/* reset page count bias and offset to start of new frag */
+		/* reset page count bias and remaining of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = 0;
-		if (unlikely(fragsz > PAGE_SIZE)) {
+		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
+		remaining -= fragsz;
+		if (unlikely(remaining < 0)) {
 			/*
 			 * The caller is trying to allocate a fragment
 			 * with fragsz > PAGE_SIZE but the cache isn't big
@@ -120,10 +137,11 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 		}
 	}
 
+	va = page_frag_cache_current_va(nc);
 	nc->pagecnt_bias--;
-	nc->offset = offset + fragsz;
+	nc->remaining = remaining;
 
-	return nc->va + offset;
+	return va;
 }
 EXPORT_SYMBOL(__page_frag_alloc_va_align);
 
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH net-next v9 07/13] mm: page_frag: some minor refactoring before adding new API
       [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
                   ` (5 preceding siblings ...)
  2024-06-25 13:52 ` [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc' Yunsheng Lin
@ 2024-06-25 13:52 ` Yunsheng Lin
  2024-07-02 15:30   ` Alexander H Duyck
  2024-06-25 13:52 ` [PATCH net-next v9 08/13] mm: page_frag: use __alloc_pages() to replace alloc_pages_node() Yunsheng Lin
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-25 13:52 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

Refactor common codes from __page_frag_alloc_va_align()
to __page_frag_cache_refill(), so that the new API can
make use of them.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 mm/page_frag_cache.c | 61 ++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index a3316dd50eff..4fd421d4f22c 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -29,10 +29,36 @@ static void *page_frag_cache_current_va(struct page_frag_cache *nc)
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 					     gfp_t gfp_mask)
 {
-	struct page *page = NULL;
+	struct encoded_va *encoded_va = nc->encoded_va;
 	gfp_t gfp = gfp_mask;
 	unsigned int order;
+	struct page *page;
+
+	if (unlikely(!encoded_va))
+		goto alloc;
+
+	page = virt_to_page(encoded_va);
+	if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+		goto alloc;
+
+	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
+		VM_BUG_ON(compound_order(page) !=
+			  encoded_page_order(encoded_va));
+		free_unref_page(page, encoded_page_order(encoded_va));
+		goto alloc;
+	}
+
+	/* OK, page count is 0, we can safely set it */
+	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+	/* reset page count bias and remaining of new frag */
+	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+	nc->remaining = page_frag_cache_page_size(encoded_va);
+
+	return page;
 
+alloc:
+	page = NULL;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
 	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
@@ -89,40 +115,15 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask)
 {
-	struct encoded_va *encoded_va = nc->encoded_va;
-	struct page *page;
-	int remaining;
+	int remaining = nc->remaining & align_mask;
 	void *va;
 
-	if (unlikely(!encoded_va)) {
-refill:
-		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
-			return NULL;
-
-		encoded_va = nc->encoded_va;
-	}
-
-	remaining = nc->remaining & align_mask;
 	remaining -= fragsz;
 	if (unlikely(remaining < 0)) {
-		page = virt_to_page(encoded_va);
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
-			VM_BUG_ON(compound_order(page) !=
-				  encoded_page_order(encoded_va));
-			free_unref_page(page, encoded_page_order(encoded_va));
-			goto refill;
-		}
-
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
+			return NULL;
 
-		/* reset page count bias and remaining of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
-		remaining -= fragsz;
+		remaining = nc->remaining - fragsz;
 		if (unlikely(remaining < 0)) {
 			/*
 			 * The caller is trying to allocate a fragment
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH net-next v9 08/13] mm: page_frag: use __alloc_pages() to replace alloc_pages_node()
       [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
                   ` (6 preceding siblings ...)
  2024-06-25 13:52 ` [PATCH net-next v9 07/13] mm: page_frag: some minor refactoring before adding new API Yunsheng Lin
@ 2024-06-25 13:52 ` Yunsheng Lin
  2024-06-25 13:52 ` [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API Yunsheng Lin
  2024-06-25 13:52 ` [PATCH net-next v9 12/13] mm: page_frag: update documentation for page_frag Yunsheng Lin
  9 siblings, 0 replies; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-25 13:52 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

There are more new APIs calling __page_frag_cache_refill() in
this patchset, which may cause compiler not being able to inline
__page_frag_cache_refill() into __page_frag_alloc_va_align().

Not being able to do the inlining seems to casue some notiable
performance degradation in arm64 system with 64K PAGE_SIZE after
adding new API calling __page_frag_cache_refill().

It seems there is about 24Bytes binary size increase for
__page_frag_cache_refill() and __page_frag_cache_refill() in
arm64 system with 64K PAGE_SIZE. By doing the gdb disassembling,
It seems we can have more than 100Bytes decrease for the binary
size by using __alloc_pages() to replace alloc_pages_node(), as
there seems to be some unnecessary checking for nid being
NUMA_NO_NODE, especially when page_frag is still part of the mm
system.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 mm/page_frag_cache.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 4fd421d4f22c..58facd2b59f7 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -62,11 +62,11 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
 	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-				PAGE_FRAG_CACHE_MAX_ORDER);
+	page = __alloc_pages(gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER,
+			     numa_mem_id(), NULL);
 #endif
 	if (unlikely(!page)) {
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+		page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
 		if (unlikely(!page)) {
 			memset(nc, 0, sizeof(*nc));
 			return NULL;
-- 
2.33.0

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
       [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
                   ` (7 preceding siblings ...)
  2024-06-25 13:52 ` [PATCH net-next v9 08/13] mm: page_frag: use __alloc_pages() to replace alloc_pages_node() Yunsheng Lin
@ 2024-06-25 13:52 ` Yunsheng Lin
  2024-06-28 22:35   ` Alexander H Duyck
  2024-06-25 13:52 ` [PATCH net-next v9 12/13] mm: page_frag: update documentation for page_frag Yunsheng Lin
  9 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-25 13:52 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

There are many use cases that need minimum memory in order
for forward progress, but more performant if more memory is
available or need to probe the cache info to use any memory
available for frag caoleasing reason.

Currently skb_page_frag_refill() API is used to solve the
above use cases, but caller needs to know about the internal
detail and access the data field of 'struct page_frag' to
meet the requirement of the above use cases and its
implementation is similar to the one in mm subsystem.

To unify those two page_frag implementations, introduce a
prepare API to ensure minimum memory is satisfied and return
how much the actual memory is available to the caller and a
probe API to report the current available memory to caller
without doing cache refilling. The caller needs to either call
the commit API to report how much memory it actually uses, or
not do so if deciding to not use any memory.

As next patch is about to replace 'struct page_frag' with
'struct page_frag_cache' in linux/sched.h, which is included
by the asm-offsets.s, using the virt_to_page() in the inline
helper of page_frag_cache.h cause a "'vmemmap' undeclared"
compiling error for asm-offsets.s, use a macro for probe API
to avoid that compiling error.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h |  82 +++++++++++++++++++++++
 mm/page_frag_cache.c            | 114 ++++++++++++++++++++++++++++++++
 2 files changed, 196 insertions(+)

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index b33904d4494f..e95d44a36ec9 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -4,6 +4,7 @@
 #define _LINUX_PAGE_FRAG_CACHE_H
 
 #include <linux/gfp_types.h>
+#include <linux/mmdebug.h>
 
 #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
@@ -87,6 +88,9 @@ static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
+struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
+				unsigned int *offset, unsigned int fragsz,
+				gfp_t gfp);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask);
@@ -99,12 +103,90 @@ static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
 	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
 }
 
+static inline unsigned int page_frag_cache_page_offset(const struct page_frag_cache *nc)
+{
+	return page_frag_cache_page_size(nc->encoded_va) - nc->remaining;
+}
+
 static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
 				       unsigned int fragsz, gfp_t gfp_mask)
 {
 	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, ~0u);
 }
 
+void *page_frag_alloc_va_prepare(struct page_frag_cache *nc, unsigned int *fragsz,
+				 gfp_t gfp);
+
+static inline void *page_frag_alloc_va_prepare_align(struct page_frag_cache *nc,
+						     unsigned int *fragsz,
+						     gfp_t gfp,
+						     unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
+	nc->remaining = nc->remaining & -align;
+	return page_frag_alloc_va_prepare(nc, fragsz, gfp);
+}
+
+struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
+					unsigned int *offset,
+					unsigned int *fragsz, gfp_t gfp);
+
+struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
+				     unsigned int *offset,
+				     unsigned int *fragsz,
+				     void **va, gfp_t gfp);
+
+static inline struct encoded_va *__page_frag_alloc_probe(struct page_frag_cache *nc,
+							 unsigned int *offset,
+							 unsigned int *fragsz,
+							 void **va)
+{
+	struct encoded_va *encoded_va;
+
+	*fragsz = nc->remaining;
+	encoded_va = nc->encoded_va;
+	*offset = page_frag_cache_page_size(encoded_va) - *fragsz;
+	*va = encoded_page_address(encoded_va) + *offset;
+
+	return encoded_va;
+}
+
+#define page_frag_alloc_probe(nc, offset, fragsz, va)			\
+({									\
+	struct page *__page = NULL;					\
+									\
+	VM_BUG_ON(!*(fragsz));						\
+	if (likely((nc)->remaining >= *(fragsz)))			\
+		__page = virt_to_page(__page_frag_alloc_probe(nc,	\
+							      offset,	\
+							      fragsz,	\
+							      va));	\
+									\
+	__page;								\
+})
+
+static inline void page_frag_alloc_commit(struct page_frag_cache *nc,
+					  unsigned int fragsz)
+{
+	VM_BUG_ON(fragsz > nc->remaining || !nc->pagecnt_bias);
+	nc->pagecnt_bias--;
+	nc->remaining -= fragsz;
+}
+
+static inline void page_frag_alloc_commit_noref(struct page_frag_cache *nc,
+						unsigned int fragsz)
+{
+	VM_BUG_ON(fragsz > nc->remaining);
+	nc->remaining -= fragsz;
+}
+
+static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
+					 unsigned int fragsz)
+{
+	nc->pagecnt_bias++;
+	nc->remaining += fragsz;
+}
+
 void page_frag_free_va(void *addr);
 
 #endif
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 58facd2b59f7..a6eb0ab2e7f9 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -91,6 +91,120 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	return page;
 }
 
+void *page_frag_alloc_va_prepare(struct page_frag_cache *nc,
+				 unsigned int *fragsz, gfp_t gfp)
+{
+	struct encoded_va *encoded_va;
+	unsigned int remaining;
+
+	remaining = nc->remaining;
+	if (unlikely(*fragsz > remaining)) {
+		if (unlikely(!__page_frag_cache_refill(nc, gfp) ||
+			     *fragsz > PAGE_SIZE))
+			return NULL;
+
+		remaining = nc->remaining;
+	}
+
+	encoded_va = nc->encoded_va;
+	*fragsz = remaining;
+	return encoded_page_address(encoded_va) +
+			page_frag_cache_page_size(encoded_va) - remaining;
+}
+EXPORT_SYMBOL(page_frag_alloc_va_prepare);
+
+struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
+					unsigned int *offset,
+					unsigned int *fragsz, gfp_t gfp)
+{
+	struct encoded_va *encoded_va;
+	unsigned int remaining;
+	struct page *page;
+
+	remaining = nc->remaining;
+	if (unlikely(*fragsz > remaining)) {
+		if (unlikely(*fragsz > PAGE_SIZE)) {
+			*fragsz = 0;
+			return NULL;
+		}
+
+		page = __page_frag_cache_refill(nc, gfp);
+		remaining = nc->remaining;
+		encoded_va = nc->encoded_va;
+	} else {
+		encoded_va = nc->encoded_va;
+		page = virt_to_page(encoded_va);
+	}
+
+	*offset = page_frag_cache_page_size(encoded_va) - remaining;
+	*fragsz = remaining;
+
+	return page;
+}
+EXPORT_SYMBOL(page_frag_alloc_pg_prepare);
+
+struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
+				     unsigned int *offset,
+				     unsigned int *fragsz,
+				     void **va, gfp_t gfp)
+{
+	struct encoded_va *encoded_va;
+	unsigned int remaining;
+	struct page *page;
+
+	remaining = nc->remaining;
+	if (unlikely(*fragsz > remaining)) {
+		if (unlikely(*fragsz > PAGE_SIZE)) {
+			*fragsz = 0;
+			return NULL;
+		}
+
+		page = __page_frag_cache_refill(nc, gfp);
+		remaining = nc->remaining;
+		encoded_va = nc->encoded_va;
+	} else {
+		encoded_va = nc->encoded_va;
+		page = virt_to_page(encoded_va);
+	}
+
+	*offset = page_frag_cache_page_size(encoded_va) - remaining;
+	*fragsz = remaining;
+	*va = encoded_page_address(encoded_va) + *offset;
+
+	return page;
+}
+EXPORT_SYMBOL(page_frag_alloc_prepare);
+
+struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
+				unsigned int *offset, unsigned int fragsz,
+				gfp_t gfp)
+{
+	struct page *page;
+
+	if (unlikely(fragsz > nc->remaining)) {
+		if (unlikely(fragsz > PAGE_SIZE))
+			return NULL;
+
+		page = __page_frag_cache_refill(nc, gfp);
+		if (unlikely(!page))
+			return NULL;
+
+		*offset = 0;
+	} else {
+		struct encoded_va *encoded_va = nc->encoded_va;
+
+		page = virt_to_page(encoded_va);
+		*offset = page_frag_cache_page_size(encoded_va) -
+					nc->remaining;
+	}
+
+	nc->remaining -= fragsz;
+	nc->pagecnt_bias--;
+
+	return page;
+}
+EXPORT_SYMBOL(page_frag_alloc_pg);
+
 void page_frag_cache_drain(struct page_frag_cache *nc)
 {
 	if (!nc->encoded_va)
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH net-next v9 12/13] mm: page_frag: update documentation for page_frag
       [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
                   ` (8 preceding siblings ...)
  2024-06-25 13:52 ` [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API Yunsheng Lin
@ 2024-06-25 13:52 ` Yunsheng Lin
  9 siblings, 0 replies; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-25 13:52 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Jonathan Corbet, Andrew Morton, linux-mm, linux-doc

Update documentation about design, implementation and API usages
for page_frag.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 Documentation/mm/page_frags.rst | 163 +++++++++++++++++++++++++++++++-
 include/linux/page_frag_cache.h | 107 +++++++++++++++++++++
 mm/page_frag_cache.c            |  77 ++++++++++++++-
 3 files changed, 344 insertions(+), 3 deletions(-)

diff --git a/Documentation/mm/page_frags.rst b/Documentation/mm/page_frags.rst
index 503ca6cdb804..6a4ac2616098 100644
--- a/Documentation/mm/page_frags.rst
+++ b/Documentation/mm/page_frags.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
 ==============
 Page fragments
 ==============
@@ -40,4 +42,163 @@ page via a single call.  The advantage to doing this is that it allows for
 cleaning up the multiple references that were added to a page in order to
 avoid calling get_page per allocation.
 
-Alexander Duyck, Nov 29, 2016.
+
+Architecture overview
+=====================
+
+.. code-block:: none
+
+                      +----------------------+
+                      | page_frag API caller |
+                      +----------------------+
+                                  |
+                                  |
+                                  v
+    +---------------------------------------------------------------+
+    |                   request page fragment                       |
+    +---------------------------------------------------------------+
+             |                                 |                  |
+             |                                 |                  |
+             |                          Cache not enough          |
+             |                                 |                  |
+             |                                 v                  |
+        Cache empty                   +-----------------+         |
+             |                        | drain old cache |         |
+             |                        +-----------------+         |
+             |                                 |                  |
+             v_________________________________v                  |
+                              |                                   |
+                              |                                   |
+             _________________v_______________                    |
+            |                                 |            Cache is enough
+            |                                 |                   |
+ PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE         |                   |
+            |                                 |                   |
+            |               PAGE_SIZE >= PAGE_FRAG_CACHE_MAX_SIZE |
+            v                                 |                   |
+    +----------------------------------+      |                   |
+    | refill cache with order > 0 page |      |                   |
+    +----------------------------------+      |                   |
+      |                    |                  |                   |
+      |                    |                  |                   |
+      |              Refill failed            |                   |
+      |                    |                  |                   |
+      |                    v                  v                   |
+      |      +------------------------------------+               |
+      |      |   refill cache with order 0 page   |               |
+      |      +----------------------------------=-+               |
+      |                       |                                   |
+ Refill succeed               |                                   |
+      |                 Refill succeed                            |
+      |                       |                                   |
+      v                       v                                   v
+    +---------------------------------------------------------------+
+    |             allocate fragment from cache                      |
+    +---------------------------------------------------------------+
+
+API interface
+=============
+As the design and implementation of page_frag API implies, the allocation side
+does not allow concurrent calling. Instead it is assumed that the caller must
+ensure there is not concurrent alloc calling to the same page_frag_cache
+instance by using its own lock or rely on some lockless guarantee like NAPI
+softirq.
+
+Depending on different aligning requirement, the page_frag API caller may call
+page_frag_alloc*_align*() to ensure the returned virtual address or offset of
+the page is aligned according to the 'align/alignment' parameter. Note the size
+of the allocated fragment is not aligned, the caller needs to provide an aligned
+fragsz if there is an alignment requirement for the size of the fragment.
+
+Depending on different use cases, callers expecting to deal with va, page or
+both va and page for them may call page_frag_alloc_va*, page_frag_alloc_pg*,
+or page_frag_alloc* API accordingly.
+
+There is also a use case that needs minimum memory in order for forward progress,
+but more performant if more memory is available. Using page_frag_alloc_prepare()
+and page_frag_alloc_commit() related API, the caller requests the minimum memory
+it needs and the prepare API will return the maximum size of the fragment
+returned. The caller needs to either call the commit API to report how much
+memory it actually uses, or not do so if deciding to not use any memory.
+
+.. kernel-doc:: include/linux/page_frag_cache.h
+   :identifiers: page_frag_cache_init page_frag_cache_is_pfmemalloc
+                 page_frag_cache_page_offset page_frag_alloc_va
+                 page_frag_alloc_va_align page_frag_alloc_va_prepare_align
+                 page_frag_alloc_probe page_frag_alloc_commit
+                 page_frag_alloc_commit_noref page_frag_alloc_abort
+
+.. kernel-doc:: mm/page_frag_cache.c
+   :identifiers: __page_frag_alloc_va_align page_frag_alloc_pg
+                 page_frag_alloc_va_prepare page_frag_alloc_pg_prepare
+                 page_frag_alloc_prepare page_frag_cache_drain
+                 page_frag_free_va
+
+Coding examples
+===============
+
+Init & Drain API
+----------------
+
+.. code-block:: c
+
+   page_frag_cache_init(pfrag);
+   ...
+   page_frag_cache_drain(pfrag);
+
+
+Alloc & Free API
+----------------
+
+.. code-block:: c
+
+    void *va;
+
+    va = page_frag_alloc_va_align(pfrag, size, gfp, align);
+    if (!va)
+        goto do_error;
+
+    err = do_something(va, size);
+    if (err) {
+        page_frag_free_va(va);
+        goto do_error;
+    }
+
+Prepare & Commit API
+--------------------
+
+.. code-block:: c
+
+    unsigned int offset, size;
+    bool merge = true;
+    struct page *page;
+    void *va;
+
+    size = 32U;
+    page = page_frag_alloc_prepare(pfrag, &offset, &size, &va);
+    if (!page)
+        goto wait_for_space;
+
+    copy = min_t(unsigned int, copy, size);
+    if (!skb_can_coalesce(skb, i, page, offset)) {
+        if (i >= max_skb_frags)
+            goto new_segment;
+
+        merge = false;
+    }
+
+    copy = mem_schedule(copy);
+    if (!copy)
+        goto wait_for_space;
+
+    err = copy_from_iter_full_nocache(va, copy, iter);
+    if (err)
+        goto do_error;
+
+    if (merge) {
+        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+        page_frag_alloc_commit_noref(pfrag, offset, copy);
+    } else {
+        skb_fill_page_desc(skb, i, page, offset, copy);
+        page_frag_alloc_commit(pfrag, offset, copy);
+    }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index e95d44a36ec9..234b9b1c6f63 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -71,11 +71,28 @@ struct page_frag_cache {
 #endif
 };
 
+/**
+ * page_frag_cache_init() - Init page_frag cache.
+ * @nc: page_frag cache from which to init
+ *
+ * Inline helper to init the page_frag cache.
+ */
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
 {
 	memset(nc, 0, sizeof(*nc));
 }
 
+/**
+ * page_frag_cache_is_pfmemalloc() - Check for pfmemalloc.
+ * @nc: page_frag cache from which to check
+ *
+ * Used to check if the current page in page_frag cache is pfmemalloc'ed.
+ * It has the same calling context expection as the alloc API.
+ *
+ * Return:
+ * true if the current page in page_frag cache is pfmemalloc'ed, otherwise
+ * return false.
+ */
 static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
 {
 	return encoded_page_pfmemalloc(nc->encoded_va);
@@ -95,6 +112,19 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask);
 
+/**
+ * page_frag_alloc_va_align() - Alloc a page fragment with aligning requirement.
+ * @nc: page_frag cache from which to allocate
+ * @fragsz: the requested fragment size
+ * @gfp_mask: the allocation gfp to use when cache needs to be refilled
+ * @align: the requested aligning requirement for virtual address of fragment
+ *
+ * WARN_ON_ONCE() checking for @align before allocing a page fragment from
+ * page_frag cache with aligning requirement.
+ *
+ * Return:
+ * virtual address of the page fragment, otherwise return NULL.
+ */
 static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
 					     unsigned int fragsz,
 					     gfp_t gfp_mask, unsigned int align)
@@ -103,11 +133,32 @@ static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
 	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
 }
 
+/**
+ * page_frag_cache_page_offset() - Return the current page fragment's offset.
+ * @nc: page_frag cache from which to check
+ *
+ * The API is only used in net/sched/em_meta.c for historical reason, do not use
+ * it for new caller unless there is a strong reason.
+ *
+ * Return:
+ * the offset of the current page fragment in the page_frag cache.
+ */
 static inline unsigned int page_frag_cache_page_offset(const struct page_frag_cache *nc)
 {
 	return page_frag_cache_page_size(nc->encoded_va) - nc->remaining;
 }
 
+/**
+ * page_frag_alloc_va() - Alloc a page fragment.
+ * @nc: page_frag cache from which to allocate
+ * @fragsz: the requested fragment size
+ * @gfp_mask: the allocation gfp to use when cache need to be refilled
+ *
+ * Get a page fragment from page_frag cache.
+ *
+ * Return:
+ * virtual address of the page fragment, otherwise return NULL.
+ */
 static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
 				       unsigned int fragsz, gfp_t gfp_mask)
 {
@@ -117,6 +168,21 @@ static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
 void *page_frag_alloc_va_prepare(struct page_frag_cache *nc, unsigned int *fragsz,
 				 gfp_t gfp);
 
+/**
+ * page_frag_alloc_va_prepare_align() - Prepare allocing a page fragment with
+ * aligning requirement.
+ * @nc: page_frag cache from which to prepare
+ * @fragsz: in as the requested size, out as the available size
+ * @gfp: the allocation gfp to use when cache need to be refilled
+ * @align: the requested aligning requirement
+ *
+ * WARN_ON_ONCE() checking for @align before preparing an aligned page fragment
+ * with minimum size of @fragsz, @fragsz is also used to report the maximum size
+ * of the page fragment the caller can use.
+ *
+ * Return:
+ * virtual address of the page fragment, otherwise return NULL.
+ */
 static inline void *page_frag_alloc_va_prepare_align(struct page_frag_cache *nc,
 						     unsigned int *fragsz,
 						     gfp_t gfp,
@@ -151,6 +217,21 @@ static inline struct encoded_va *__page_frag_alloc_probe(struct page_frag_cache
 	return encoded_va;
 }
 
+/**
+ * page_frag_alloc_probe - Probe the available page fragment.
+ * @nc: page_frag cache from which to probe
+ * @offset: out as the offset of the page fragment
+ * @fragsz: in as the requested size, out as the available size
+ * @va: out as the virtual address of the returned page fragment
+ *
+ * Probe the current available memory to caller without doing cache refilling.
+ * If no space is available in the page_frag cache, return NULL.
+ * If the requested space is available, up to @fragsz bytes may be added to the
+ * fragment using commit API.
+ *
+ * Return:
+ * the page fragment, otherwise return NULL.
+ */
 #define page_frag_alloc_probe(nc, offset, fragsz, va)			\
 ({									\
 	struct page *__page = NULL;					\
@@ -165,6 +246,14 @@ static inline struct encoded_va *__page_frag_alloc_probe(struct page_frag_cache
 	__page;								\
 })
 
+/**
+ * page_frag_alloc_commit - Commit allocing a page fragment.
+ * @nc: page_frag cache from which to commit
+ * @fragsz: size of the page fragment has been used
+ *
+ * Commit the actual used size for the allocation that was either prepared or
+ * probed.
+ */
 static inline void page_frag_alloc_commit(struct page_frag_cache *nc,
 					  unsigned int fragsz)
 {
@@ -173,6 +262,16 @@ static inline void page_frag_alloc_commit(struct page_frag_cache *nc,
 	nc->remaining -= fragsz;
 }
 
+/**
+ * page_frag_alloc_commit_noref - Commit allocing a page fragment without taking
+ * page refcount.
+ * @nc: page_frag cache from which to commit
+ * @fragsz: size of the page fragment has been used
+ *
+ * Commit the alloc preparing or probing by passing the actual used size, but
+ * not taking refcount. Mostly used for fragmemt coalescing case when the
+ * current fragment can share the same refcount with previous fragment.
+ */
 static inline void page_frag_alloc_commit_noref(struct page_frag_cache *nc,
 						unsigned int fragsz)
 {
@@ -180,6 +279,14 @@ static inline void page_frag_alloc_commit_noref(struct page_frag_cache *nc,
 	nc->remaining -= fragsz;
 }
 
+/**
+ * page_frag_alloc_abort - Abort the page fragment allocation.
+ * @nc: page_frag cache to which the page fragment is aborted back
+ * @fragsz: size of the page fragment to be aborted
+ *
+ * It is expected to be called from the same context as the alloc API.
+ * Mostly used for error handling cases where the fragment is no longer needed.
+ */
 static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
 					 unsigned int fragsz)
 {
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index a6eb0ab2e7f9..b42864ee6f5d 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -91,6 +91,18 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	return page;
 }
 
+/**
+ * page_frag_alloc_va_prepare() - Prepare allocing a page fragment.
+ * @nc: page_frag cache from which to prepare
+ * @fragsz: in as the requested size, out as the available size
+ * @gfp: the allocation gfp to use when cache needs to be refilled
+ *
+ * Prepare a page fragment with minimum size of @fragsz, @fragsz is also used
+ * to report the maximum size of the page fragment the caller can use.
+ *
+ * Return:
+ * virtual address of the page fragment, otherwise return NULL.
+ */
 void *page_frag_alloc_va_prepare(struct page_frag_cache *nc,
 				 unsigned int *fragsz, gfp_t gfp)
 {
@@ -113,6 +125,19 @@ void *page_frag_alloc_va_prepare(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(page_frag_alloc_va_prepare);
 
+/**
+ * page_frag_alloc_pg_prepare - Prepare allocing a page fragment.
+ * @nc: page_frag cache from which to prepare
+ * @offset: out as the offset of the page fragment
+ * @fragsz: in as the requested size, out as the available size
+ * @gfp: the allocation gfp to use when cache needs to be refilled
+ *
+ * Prepare a page fragment with minimum size of @fragsz, @fragsz is also used
+ * to report the maximum size of the page fragment the caller can use.
+ *
+ * Return:
+ * the page fragment, otherwise return NULL.
+ */
 struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
 					unsigned int *offset,
 					unsigned int *fragsz, gfp_t gfp)
@@ -143,6 +168,21 @@ struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(page_frag_alloc_pg_prepare);
 
+/**
+ * page_frag_alloc_prepare - Prepare allocing a page fragment.
+ * @nc: page_frag cache from which to prepare
+ * @offset: out as the offset of the page fragment
+ * @fragsz: in as the requested size, out as the available size
+ * @va: out as the virtual address of the returned page fragment
+ * @gfp: the allocation gfp to use when cache needs to be refilled
+ *
+ * Prepare a page fragment with minimum size of @fragsz, @fragsz is also used
+ * to report the maximum size of the page fragment. Return both 'struct page'
+ * and virtual address of the fragment to the caller.
+ *
+ * Return:
+ * the page fragment, otherwise return NULL.
+ */
 struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
 				     unsigned int *offset,
 				     unsigned int *fragsz,
@@ -175,6 +215,18 @@ struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(page_frag_alloc_prepare);
 
+/**
+ * page_frag_alloc_pg - Alloce a page fragment.
+ * @nc: page_frag cache from which to alloce
+ * @offset: out as the offset of the page fragment
+ * @fragsz: the requested fragment size
+ * @gfp: the allocation gfp to use when cache needs to be refilled
+ *
+ * Get a page fragment from page_frag cache.
+ *
+ * Return:
+ * the page fragment, otherwise return NULL.
+ */
 struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
 				unsigned int *offset, unsigned int fragsz,
 				gfp_t gfp)
@@ -205,6 +257,10 @@ struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(page_frag_alloc_pg);
 
+/**
+ * page_frag_cache_drain - Drain the current page from page_frag cache.
+ * @nc: page_frag cache from which to drain
+ */
 void page_frag_cache_drain(struct page_frag_cache *nc)
 {
 	if (!nc->encoded_va)
@@ -225,6 +281,19 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
+/**
+ * __page_frag_alloc_va_align() - Alloc a page fragment with aligning
+ * requirement.
+ * @nc: page_frag cache from which to allocate
+ * @fragsz: the requested fragment size
+ * @gfp_mask: the allocation gfp to use when cache need to be refilled
+ * @align_mask: the requested aligning requirement for the 'va'
+ *
+ * Get a page fragment from page_frag cache with aligning requirement.
+ *
+ * Return:
+ * Return va of the page fragment, otherwise return NULL.
+ */
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask)
@@ -260,8 +329,12 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(__page_frag_alloc_va_align);
 
-/*
- * Frees a page fragment allocated out of either a compound or order 0 page.
+/**
+ * page_frag_free_va - Free a page fragment.
+ * @addr: va of page fragment to be freed
+ *
+ * Free a page fragment allocated out of either a compound or order 0 page by
+ * virtual address.
  */
 void page_frag_free_va(void *addr)
 {
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
  2024-06-25 13:52 ` [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API Yunsheng Lin
@ 2024-06-28 22:35   ` Alexander H Duyck
  2024-06-29 11:15     ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2024-06-28 22:35 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
> There are many use cases that need minimum memory in order
> for forward progress, but more performant if more memory is
> available or need to probe the cache info to use any memory
> available for frag caoleasing reason.
> 
> Currently skb_page_frag_refill() API is used to solve the
> above use cases, but caller needs to know about the internal
> detail and access the data field of 'struct page_frag' to
> meet the requirement of the above use cases and its
> implementation is similar to the one in mm subsystem.
> 
> To unify those two page_frag implementations, introduce a
> prepare API to ensure minimum memory is satisfied and return
> how much the actual memory is available to the caller and a
> probe API to report the current available memory to caller
> without doing cache refilling. The caller needs to either call
> the commit API to report how much memory it actually uses, or
> not do so if deciding to not use any memory.
> 
> As next patch is about to replace 'struct page_frag' with
> 'struct page_frag_cache' in linux/sched.h, which is included
> by the asm-offsets.s, using the virt_to_page() in the inline
> helper of page_frag_cache.h cause a "'vmemmap' undeclared"
> compiling error for asm-offsets.s, use a macro for probe API
> to avoid that compiling error.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/page_frag_cache.h |  82 +++++++++++++++++++++++
>  mm/page_frag_cache.c            | 114 ++++++++++++++++++++++++++++++++
>  2 files changed, 196 insertions(+)
> 
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index b33904d4494f..e95d44a36ec9 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -4,6 +4,7 @@
>  #define _LINUX_PAGE_FRAG_CACHE_H
>  
>  #include <linux/gfp_types.h>
> +#include <linux/mmdebug.h>
>  
>  #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
>  #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
> @@ -87,6 +88,9 @@ static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_
>  
>  void page_frag_cache_drain(struct page_frag_cache *nc);
>  void __page_frag_cache_drain(struct page *page, unsigned int count);
> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
> +				unsigned int *offset, unsigned int fragsz,
> +				gfp_t gfp);
>  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  				 unsigned int fragsz, gfp_t gfp_mask,
>  				 unsigned int align_mask);
> @@ -99,12 +103,90 @@ static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
>  	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
>  }
>  
> +static inline unsigned int page_frag_cache_page_offset(const struct page_frag_cache *nc)
> +{
> +	return page_frag_cache_page_size(nc->encoded_va) - nc->remaining;
> +}
> +
>  static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
>  				       unsigned int fragsz, gfp_t gfp_mask)
>  {
>  	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, ~0u);
>  }
>  
> +void *page_frag_alloc_va_prepare(struct page_frag_cache *nc, unsigned int *fragsz,
> +				 gfp_t gfp);
> +
> +static inline void *page_frag_alloc_va_prepare_align(struct page_frag_cache *nc,
> +						     unsigned int *fragsz,
> +						     gfp_t gfp,
> +						     unsigned int align)
> +{
> +	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
> +	nc->remaining = nc->remaining & -align;
> +	return page_frag_alloc_va_prepare(nc, fragsz, gfp);
> +}
> +
> +struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
> +					unsigned int *offset,
> +					unsigned int *fragsz, gfp_t gfp);
> +
> +struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
> +				     unsigned int *offset,
> +				     unsigned int *fragsz,
> +				     void **va, gfp_t gfp);
> +
> +static inline struct encoded_va *__page_frag_alloc_probe(struct page_frag_cache *nc,
> +							 unsigned int *offset,
> +							 unsigned int *fragsz,
> +							 void **va)
> +{
> +	struct encoded_va *encoded_va;
> +
> +	*fragsz = nc->remaining;
> +	encoded_va = nc->encoded_va;
> +	*offset = page_frag_cache_page_size(encoded_va) - *fragsz;
> +	*va = encoded_page_address(encoded_va) + *offset;
> +
> +	return encoded_va;
> +}
> +
> +#define page_frag_alloc_probe(nc, offset, fragsz, va)			\
> +({									\
> +	struct page *__page = NULL;					\
> +									\
> +	VM_BUG_ON(!*(fragsz));						\
> +	if (likely((nc)->remaining >= *(fragsz)))			\
> +		__page = virt_to_page(__page_frag_alloc_probe(nc,	\
> +							      offset,	\
> +							      fragsz,	\
> +							      va));	\
> +									\
> +	__page;								\
> +})
> +

Why is this a macro instead of just being an inline? Are you trying to
avoid having to include a header due to the virt_to_page?



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
  2024-06-28 22:35   ` Alexander H Duyck
@ 2024-06-29 11:15     ` Yunsheng Lin
  2024-06-29 17:37       ` Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-29 11:15 UTC (permalink / raw)
  To: Alexander H Duyck, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On 2024/6/29 6:35, Alexander H Duyck wrote:
> On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
>> There are many use cases that need minimum memory in order
>> for forward progress, but more performant if more memory is
>> available or need to probe the cache info to use any memory
>> available for frag caoleasing reason.
>>
>> Currently skb_page_frag_refill() API is used to solve the
>> above use cases, but caller needs to know about the internal
>> detail and access the data field of 'struct page_frag' to
>> meet the requirement of the above use cases and its
>> implementation is similar to the one in mm subsystem.
>>
>> To unify those two page_frag implementations, introduce a
>> prepare API to ensure minimum memory is satisfied and return
>> how much the actual memory is available to the caller and a
>> probe API to report the current available memory to caller
>> without doing cache refilling. The caller needs to either call
>> the commit API to report how much memory it actually uses, or
>> not do so if deciding to not use any memory.
>>
>> As next patch is about to replace 'struct page_frag' with
>> 'struct page_frag_cache' in linux/sched.h, which is included
>> by the asm-offsets.s, using the virt_to_page() in the inline
>> helper of page_frag_cache.h cause a "'vmemmap' undeclared"
>> compiling error for asm-offsets.s, use a macro for probe API
>> to avoid that compiling error.
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  include/linux/page_frag_cache.h |  82 +++++++++++++++++++++++
>>  mm/page_frag_cache.c            | 114 ++++++++++++++++++++++++++++++++
>>  2 files changed, 196 insertions(+)
>>
>> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
>> index b33904d4494f..e95d44a36ec9 100644
>> --- a/include/linux/page_frag_cache.h
>> +++ b/include/linux/page_frag_cache.h
>> @@ -4,6 +4,7 @@
>>  #define _LINUX_PAGE_FRAG_CACHE_H
>>  
>>  #include <linux/gfp_types.h>
>> +#include <linux/mmdebug.h>
>>  
>>  #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
>>  #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
>> @@ -87,6 +88,9 @@ static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_
>>  
>>  void page_frag_cache_drain(struct page_frag_cache *nc);
>>  void __page_frag_cache_drain(struct page *page, unsigned int count);
>> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
>> +				unsigned int *offset, unsigned int fragsz,
>> +				gfp_t gfp);
>>  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>>  				 unsigned int fragsz, gfp_t gfp_mask,
>>  				 unsigned int align_mask);
>> @@ -99,12 +103,90 @@ static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
>>  	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
>>  }
>>  
>> +static inline unsigned int page_frag_cache_page_offset(const struct page_frag_cache *nc)
>> +{
>> +	return page_frag_cache_page_size(nc->encoded_va) - nc->remaining;
>> +}
>> +
>>  static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
>>  				       unsigned int fragsz, gfp_t gfp_mask)
>>  {
>>  	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, ~0u);
>>  }
>>  
>> +void *page_frag_alloc_va_prepare(struct page_frag_cache *nc, unsigned int *fragsz,
>> +				 gfp_t gfp);
>> +
>> +static inline void *page_frag_alloc_va_prepare_align(struct page_frag_cache *nc,
>> +						     unsigned int *fragsz,
>> +						     gfp_t gfp,
>> +						     unsigned int align)
>> +{
>> +	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
>> +	nc->remaining = nc->remaining & -align;
>> +	return page_frag_alloc_va_prepare(nc, fragsz, gfp);
>> +}
>> +
>> +struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
>> +					unsigned int *offset,
>> +					unsigned int *fragsz, gfp_t gfp);
>> +
>> +struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
>> +				     unsigned int *offset,
>> +				     unsigned int *fragsz,
>> +				     void **va, gfp_t gfp);
>> +
>> +static inline struct encoded_va *__page_frag_alloc_probe(struct page_frag_cache *nc,
>> +							 unsigned int *offset,
>> +							 unsigned int *fragsz,
>> +							 void **va)
>> +{
>> +	struct encoded_va *encoded_va;
>> +
>> +	*fragsz = nc->remaining;
>> +	encoded_va = nc->encoded_va;
>> +	*offset = page_frag_cache_page_size(encoded_va) - *fragsz;
>> +	*va = encoded_page_address(encoded_va) + *offset;
>> +
>> +	return encoded_va;
>> +}
>> +
>> +#define page_frag_alloc_probe(nc, offset, fragsz, va)			\
>> +({									\
>> +	struct page *__page = NULL;					\
>> +									\
>> +	VM_BUG_ON(!*(fragsz));						\
>> +	if (likely((nc)->remaining >= *(fragsz)))			\
>> +		__page = virt_to_page(__page_frag_alloc_probe(nc,	\
>> +							      offset,	\
>> +							      fragsz,	\
>> +							      va));	\
>> +									\
>> +	__page;								\
>> +})
>> +
> 
> Why is this a macro instead of just being an inline? Are you trying to
> avoid having to include a header due to the virt_to_page?

Yes, you are right.
I tried including different headers for virt_to_page(), and it did not
work for arch/x86/kernel/asm-offsets.s, which has included linux/sched.h,
and linux/sched.h need 'struct page_frag_cache' for 'struct task_struct'
after this patchset, including page_frag_cache.h for sched.h causes the
below compiler error:

  CC      arch/x86/kernel/asm-offsets.s
In file included from ./arch/x86/include/asm/page.h:89,
                 from ./arch/x86/include/asm/thread_info.h:12,
                 from ./include/linux/thread_info.h:60,
                 from ./include/linux/spinlock.h:60,
                 from ./include/linux/swait.h:7,
                 from ./include/linux/completion.h:12,
                 from ./include/linux/crypto.h:15,
                 from arch/x86/kernel/asm-offsets.c:9:
./include/linux/page_frag_cache.h: In function ‘page_frag_alloc_align’:
./include/asm-generic/memory_model.h:37:34: error: ‘vmemmap’ undeclared (first use in this function); did you mean ‘mem_map’?
   37 | #define __pfn_to_page(pfn)      (vmemmap + (pfn))
      |                                  ^~~~~~~
./include/asm-generic/memory_model.h:65:21: note: in expansion of macro ‘__pfn_to_page’
   65 | #define pfn_to_page __pfn_to_page
      |                     ^~~~~~~~~~~~~
./arch/x86/include/asm/page.h:68:33: note: in expansion of macro ‘pfn_to_page’
   68 | #define virt_to_page(kaddr)     pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
      |                                 ^~~~~~~~~~~
./include/linux/page_frag_cache.h:151:16: note: in expansion of macro ‘virt_to_page’
  151 |         return virt_to_page(va);
      |                ^~~~~~~~~~~~
./include/asm-generic/memory_model.h:37:34: note: each undeclared identifier is reported only once for each function it appears in
   37 | #define __pfn_to_page(pfn)      (vmemmap + (pfn))
      |                                  ^~~~~~~
./include/asm-generic/memory_model.h:65:21: note: in expansion of macro ‘__pfn_to_page’
   65 | #define pfn_to_page __pfn_to_page
      |                     ^~~~~~~~~~~~~
./arch/x86/include/asm/page.h:68:33: note: in expansion of macro ‘pfn_to_page’
   68 | #define virt_to_page(kaddr)     pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
      |                                 ^~~~~~~~~~~
./include/linux/page_frag_cache.h:151:16: note: in expansion of macro ‘virt_to_page’
  151 |         return virt_to_page(va);


Another possible way I can think of to aovid the above problem is to
split the page_frag_cache.h to something like page_frag_cache/types.h
and page_frag_cache/helpers.h as page_pool does, so that sched.h only
need to include page_frag_cache/types.h.
But I am not sure it is the correct way or it is worth the effort, what
do you think about this?

> 
> .
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
  2024-06-29 11:15     ` Yunsheng Lin
@ 2024-06-29 17:37       ` Alexander Duyck
       [not found]         ` <0a80e362-1eb7-40b0-b1b9-07ec5a6506ea@gmail.com>
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2024-06-29 17:37 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Sat, Jun 29, 2024 at 4:15 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/6/29 6:35, Alexander H Duyck wrote:
> > On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
> >> There are many use cases that need minimum memory in order
> >> for forward progress, but more performant if more memory is
> >> available or need to probe the cache info to use any memory
> >> available for frag caoleasing reason.
> >>
> >> Currently skb_page_frag_refill() API is used to solve the
> >> above use cases, but caller needs to know about the internal
> >> detail and access the data field of 'struct page_frag' to
> >> meet the requirement of the above use cases and its
> >> implementation is similar to the one in mm subsystem.
> >>
> >> To unify those two page_frag implementations, introduce a
> >> prepare API to ensure minimum memory is satisfied and return
> >> how much the actual memory is available to the caller and a
> >> probe API to report the current available memory to caller
> >> without doing cache refilling. The caller needs to either call
> >> the commit API to report how much memory it actually uses, or
> >> not do so if deciding to not use any memory.
> >>
> >> As next patch is about to replace 'struct page_frag' with
> >> 'struct page_frag_cache' in linux/sched.h, which is included
> >> by the asm-offsets.s, using the virt_to_page() in the inline
> >> helper of page_frag_cache.h cause a "'vmemmap' undeclared"
> >> compiling error for asm-offsets.s, use a macro for probe API
> >> to avoid that compiling error.
> >>
> >> CC: Alexander Duyck <alexander.duyck@gmail.com>
> >> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> >> ---
> >>  include/linux/page_frag_cache.h |  82 +++++++++++++++++++++++
> >>  mm/page_frag_cache.c            | 114 ++++++++++++++++++++++++++++++++
> >>  2 files changed, 196 insertions(+)
> >>
> >> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> >> index b33904d4494f..e95d44a36ec9 100644
> >> --- a/include/linux/page_frag_cache.h
> >> +++ b/include/linux/page_frag_cache.h
> >> @@ -4,6 +4,7 @@
> >>  #define _LINUX_PAGE_FRAG_CACHE_H
> >>
> >>  #include <linux/gfp_types.h>
> >> +#include <linux/mmdebug.h>
> >>
> >>  #define PAGE_FRAG_CACHE_MAX_SIZE    __ALIGN_MASK(32768, ~PAGE_MASK)
> >>  #define PAGE_FRAG_CACHE_MAX_ORDER   get_order(PAGE_FRAG_CACHE_MAX_SIZE)
> >> @@ -87,6 +88,9 @@ static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_
> >>
> >>  void page_frag_cache_drain(struct page_frag_cache *nc);
> >>  void __page_frag_cache_drain(struct page *page, unsigned int count);
> >> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
> >> +                            unsigned int *offset, unsigned int fragsz,
> >> +                            gfp_t gfp);
> >>  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
> >>                               unsigned int fragsz, gfp_t gfp_mask,
> >>                               unsigned int align_mask);
> >> @@ -99,12 +103,90 @@ static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
> >>      return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
> >>  }
> >>
> >> +static inline unsigned int page_frag_cache_page_offset(const struct page_frag_cache *nc)
> >> +{
> >> +    return page_frag_cache_page_size(nc->encoded_va) - nc->remaining;
> >> +}
> >> +
> >>  static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
> >>                                     unsigned int fragsz, gfp_t gfp_mask)
> >>  {
> >>      return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, ~0u);
> >>  }
> >>
> >> +void *page_frag_alloc_va_prepare(struct page_frag_cache *nc, unsigned int *fragsz,
> >> +                             gfp_t gfp);
> >> +
> >> +static inline void *page_frag_alloc_va_prepare_align(struct page_frag_cache *nc,
> >> +                                                 unsigned int *fragsz,
> >> +                                                 gfp_t gfp,
> >> +                                                 unsigned int align)
> >> +{
> >> +    WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
> >> +    nc->remaining = nc->remaining & -align;
> >> +    return page_frag_alloc_va_prepare(nc, fragsz, gfp);
> >> +}
> >> +
> >> +struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
> >> +                                    unsigned int *offset,
> >> +                                    unsigned int *fragsz, gfp_t gfp);
> >> +
> >> +struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
> >> +                                 unsigned int *offset,
> >> +                                 unsigned int *fragsz,
> >> +                                 void **va, gfp_t gfp);
> >> +
> >> +static inline struct encoded_va *__page_frag_alloc_probe(struct page_frag_cache *nc,
> >> +                                                     unsigned int *offset,
> >> +                                                     unsigned int *fragsz,
> >> +                                                     void **va)
> >> +{
> >> +    struct encoded_va *encoded_va;
> >> +
> >> +    *fragsz = nc->remaining;
> >> +    encoded_va = nc->encoded_va;
> >> +    *offset = page_frag_cache_page_size(encoded_va) - *fragsz;
> >> +    *va = encoded_page_address(encoded_va) + *offset;
> >> +
> >> +    return encoded_va;
> >> +}
> >> +
> >> +#define page_frag_alloc_probe(nc, offset, fragsz, va)                       \
> >> +({                                                                  \
> >> +    struct page *__page = NULL;                                     \
> >> +                                                                    \
> >> +    VM_BUG_ON(!*(fragsz));                                          \
> >> +    if (likely((nc)->remaining >= *(fragsz)))                       \
> >> +            __page = virt_to_page(__page_frag_alloc_probe(nc,       \
> >> +                                                          offset,   \
> >> +                                                          fragsz,   \
> >> +                                                          va));     \
> >> +                                                                    \
> >> +    __page;                                                         \
> >> +})
> >> +
> >
> > Why is this a macro instead of just being an inline? Are you trying to
> > avoid having to include a header due to the virt_to_page?
>
> Yes, you are right.
> I tried including different headers for virt_to_page(), and it did not
> work for arch/x86/kernel/asm-offsets.s, which has included linux/sched.h,
> and linux/sched.h need 'struct page_frag_cache' for 'struct task_struct'
> after this patchset, including page_frag_cache.h for sched.h causes the
> below compiler error:
>
>   CC      arch/x86/kernel/asm-offsets.s
> In file included from ./arch/x86/include/asm/page.h:89,
>                  from ./arch/x86/include/asm/thread_info.h:12,
>                  from ./include/linux/thread_info.h:60,
>                  from ./include/linux/spinlock.h:60,
>                  from ./include/linux/swait.h:7,
>                  from ./include/linux/completion.h:12,
>                  from ./include/linux/crypto.h:15,
>                  from arch/x86/kernel/asm-offsets.c:9:
> ./include/linux/page_frag_cache.h: In function ‘page_frag_alloc_align’:
> ./include/asm-generic/memory_model.h:37:34: error: ‘vmemmap’ undeclared (first use in this function); did you mean ‘mem_map’?
>    37 | #define __pfn_to_page(pfn)      (vmemmap + (pfn))
>       |                                  ^~~~~~~
> ./include/asm-generic/memory_model.h:65:21: note: in expansion of macro ‘__pfn_to_page’
>    65 | #define pfn_to_page __pfn_to_page
>       |                     ^~~~~~~~~~~~~
> ./arch/x86/include/asm/page.h:68:33: note: in expansion of macro ‘pfn_to_page’
>    68 | #define virt_to_page(kaddr)     pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
>       |                                 ^~~~~~~~~~~
> ./include/linux/page_frag_cache.h:151:16: note: in expansion of macro ‘virt_to_page’
>   151 |         return virt_to_page(va);
>       |                ^~~~~~~~~~~~
> ./include/asm-generic/memory_model.h:37:34: note: each undeclared identifier is reported only once for each function it appears in
>    37 | #define __pfn_to_page(pfn)      (vmemmap + (pfn))
>       |                                  ^~~~~~~
> ./include/asm-generic/memory_model.h:65:21: note: in expansion of macro ‘__pfn_to_page’
>    65 | #define pfn_to_page __pfn_to_page
>       |                     ^~~~~~~~~~~~~
> ./arch/x86/include/asm/page.h:68:33: note: in expansion of macro ‘pfn_to_page’
>    68 | #define virt_to_page(kaddr)     pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
>       |                                 ^~~~~~~~~~~
> ./include/linux/page_frag_cache.h:151:16: note: in expansion of macro ‘virt_to_page’
>   151 |         return virt_to_page(va);
>
>

I am pretty sure you just need to add:
#include <asm/page.h>


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
       [not found]         ` <0a80e362-1eb7-40b0-b1b9-07ec5a6506ea@gmail.com>
@ 2024-06-30 14:35           ` Alexander Duyck
  2024-06-30 15:05             ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2024-06-30 14:35 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: Yunsheng Lin, davem, kuba, pabeni, netdev, linux-kernel,
	Andrew Morton, linux-mm

On Sun, Jun 30, 2024 at 7:05 AM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
>
> On 6/30/2024 1:37 AM, Alexander Duyck wrote:
> > On Sat, Jun 29, 2024 at 4:15 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> ...
>
> >>>
> >>> Why is this a macro instead of just being an inline? Are you trying to
> >>> avoid having to include a header due to the virt_to_page?
> >>
> >> Yes, you are right.

...

> > I am pretty sure you just need to add:
> > #include <asm/page.h>
>
> I am supposing you mean adding the above to page_frag_cache.h, right?
>
> It seems thing is more complicated for SPARSEMEM_VMEMMAP case, as it
> needs the declaration of 'vmemmap'(some arch defines it as a pointer
> variable while some arch defines it as a macro) and the definition of
> 'struct page' for '(vmemmap + (pfn))' operation.
>
> Adding below for 'vmemmap' and 'struct page' seems to have some compiler
> error caused by interdependence between linux/mm_types.h and asm/pgtable.h:
> #include <asm/pgtable.h>
> #include <linux/mm_types.h>
>

Maybe you should just include linux/mm.h as that should have all the
necessary includes to handle these cases. In any case though it
doesn't make any sense to have a define in one include that expects
the user to then figure out what other headers to include in order to
make the define work they should be included in the header itself to
avoid any sort of weird dependencies.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
  2024-06-30 14:35           ` Alexander Duyck
@ 2024-06-30 15:05             ` Yunsheng Lin
  2024-07-03 12:40               ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-06-30 15:05 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Yunsheng Lin, davem, kuba, pabeni, netdev, linux-kernel,
	Andrew Morton, linux-mm

On 6/30/2024 10:35 PM, Alexander Duyck wrote:
> On Sun, Jun 30, 2024 at 7:05 AM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
>>
>> On 6/30/2024 1:37 AM, Alexander Duyck wrote:
>>> On Sat, Jun 29, 2024 at 4:15 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> ...
>>
>>>>>
>>>>> Why is this a macro instead of just being an inline? Are you trying to
>>>>> avoid having to include a header due to the virt_to_page?
>>>>
>>>> Yes, you are right.
> 
> ...
> 
>>> I am pretty sure you just need to add:
>>> #include <asm/page.h>
>>
>> I am supposing you mean adding the above to page_frag_cache.h, right?
>>
>> It seems thing is more complicated for SPARSEMEM_VMEMMAP case, as it
>> needs the declaration of 'vmemmap'(some arch defines it as a pointer
>> variable while some arch defines it as a macro) and the definition of
>> 'struct page' for '(vmemmap + (pfn))' operation.
>>
>> Adding below for 'vmemmap' and 'struct page' seems to have some compiler
>> error caused by interdependence between linux/mm_types.h and asm/pgtable.h:
>> #include <asm/pgtable.h>
>> #include <linux/mm_types.h>
>>
> 
> Maybe you should just include linux/mm.h as that should have all the
> necessary includes to handle these cases. In any case though it

Including linux/mm.h seems to have similar compiler error, just the
interdependence is between linux/mm_types.h and linux/mm.h now.

As below, linux/mmap_lock.h obviously need the definition of
'struct mm_struct' from linux/mm_types.h, and linux/mm_types.h
has some a long dependency of linux/mm.h starting from
linux/uprobes.h if we add '#include <linux/mm.h>' in 
linux/page_frag_cache.h:

In file included from ./include/linux/mm.h:16,
                  from ./include/linux/page_frag_cache.h:6,
                  from ./include/linux/sched.h:49,
                  from ./include/linux/percpu.h:13,
                  from ./arch/x86/include/asm/msr.h:15,
                  from ./arch/x86/include/asm/tsc.h:10,
                  from ./arch/x86/include/asm/timex.h:6,
                  from ./include/linux/timex.h:67,
                  from ./include/linux/time32.h:13,
                  from ./include/linux/time.h:60,
                  from ./include/linux/jiffies.h:10,
                  from ./include/linux/ktime.h:25,
                  from ./include/linux/timer.h:6,
                  from ./include/linux/workqueue.h:9,
                  from ./include/linux/srcu.h:21,
                  from ./include/linux/notifier.h:16,
                  from ./arch/x86/include/asm/uprobes.h:13,
                  from ./include/linux/uprobes.h:49,
                  from ./include/linux/mm_types.h:16,
                  from ./include/linux/mmzone.h:22,
                  from ./include/linux/gfp.h:7,
                  from ./include/linux/slab.h:16,
                  from ./include/linux/crypto.h:17,
                  from arch/x86/kernel/asm-offsets.c:9:
./include/linux/mmap_lock.h: In function ‘mmap_assert_locked’:
./include/linux/mmap_lock.h:65:30: error: invalid use of undefined type 
‘const struct mm_struct’
    65 |         rwsem_assert_held(&mm->mmap_lock);
       |                              ^~

> doesn't make any sense to have a define in one include that expects
> the user to then figure out what other headers to include in order to
> make the define work they should be included in the header itself to
> avoid any sort of weird dependencies.

Perhaps there are some season why there are two headers for the mm 
subsystem, linux/mm_types.h and linux/mm.h?
And .h file is supposed to include the linux/mm_types.h while .c file
is supposed to include the linux/mm.h?
If the above is correct, it seems the above rule is broked by including 
linux/mm.h in linux/page_frag_cache.h.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 02/13] mm: move the page fragment allocator from page_alloc into its own file
  2024-06-25 13:52 ` [PATCH net-next v9 02/13] mm: move the page fragment allocator from page_alloc into its own file Yunsheng Lin
@ 2024-07-01 23:10   ` Alexander H Duyck
  2024-07-02 12:27     ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2024-07-01 23:10 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, David Howells, Andrew Morton, linux-mm

On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
> Inspired by [1], move the page fragment allocator from page_alloc
> into its own c file and header file, as we are about to make more
> change for it to replace another page_frag implementation in
> sock.c
> 
> 1. https://lore.kernel.org/all/20230411160902.4134381-3-dhowells@redhat.com/
> 
> CC: David Howells <dhowells@redhat.com>
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>

So one thing that I think might have been overlooked in the previous
reviews is the fact that the headers weren't necessarily self
sufficient. You were introducing dependencies that had to be fulfilled
by other headers.

One thing you might try doing as part of your testing would be to add a
C file that just adds your header and calls your functions to verify
that there aren't any unincluded dependencies.

> ---
>  include/linux/gfp.h             |  22 -----
>  include/linux/mm_types.h        |  18 ----
>  include/linux/page_frag_cache.h |  47 +++++++++++
>  include/linux/skbuff.h          |   1 +
>  mm/Makefile                     |   1 +
>  mm/page_alloc.c                 | 136 ------------------------------
>  mm/page_frag_cache.c            | 144 ++++++++++++++++++++++++++++++++
>  mm/page_frag_test.c             |   1 +
>  8 files changed, 194 insertions(+), 176 deletions(-)
>  create mode 100644 include/linux/page_frag_cache.h
>  create mode 100644 mm/page_frag_cache.c
> 
...

> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> new file mode 100644
> index 000000000000..3a44bfc99750
> --- /dev/null
> +++ b/include/linux/page_frag_cache.h
> @@ -0,0 +1,47 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _LINUX_PAGE_FRAG_CACHE_H
> +#define _LINUX_PAGE_FRAG_CACHE_H
> +
> +#include <linux/gfp_types.h>
> +

The gfp_types.h only really gives you the values you pass to the
gfp_mask. Did you mean to include linux/types.h to get the gfp_t
typedef?

> +#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)

You should probably include linux/align.h to pull in the __ALIGN_MASK.

> +#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)

I am pretty sure get_order is from asm/page.h as well.

> +
> +struct page_frag_cache {
> +	void *va;
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)

I am pretty sure PAGE_SIZE is included from asm/page.h

> +	__u16 offset;
> +	__u16 size;
> +#else
> +	__u32 offset;
> +#endif
> +	/* we maintain a pagecount bias, so that we dont dirty cache line
> +	 * containing page->_refcount every time we allocate a fragment.
> +	 */
> +	unsigned int		pagecnt_bias;
> +	bool pfmemalloc;
> +};
> +
> +void page_frag_cache_drain(struct page_frag_cache *nc);
> +void __page_frag_cache_drain(struct page *page, unsigned int count);
> +void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
> +			      gfp_t gfp_mask, unsigned int align_mask);
> +
> +static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
> +					  unsigned int fragsz, gfp_t gfp_mask,
> +					  unsigned int align)
> +{
> +	WARN_ON_ONCE(!is_power_of_2(align));

To get is_power_of_2 you should be including linux/log2.h.

> +	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
> +}
> +
> +static inline void *page_frag_alloc(struct page_frag_cache *nc,
> +				    unsigned int fragsz, gfp_t gfp_mask)
> +{
> +	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
> +}
> +
> +void page_frag_free(void *addr);
> +
> +#endif
> 

...

> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> new file mode 100644
> index 000000000000..88f567ef0e29
> --- /dev/null
> +++ b/mm/page_frag_cache.c
> @@ -0,0 +1,144 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Page fragment allocator
> + *
> + * Page Fragment:
> + *  An arbitrary-length arbitrary-offset area of memory which resides within a
> + *  0 or higher order page.  Multiple fragments within that page are
> + *  individually refcounted, in the page's reference counter.
> + *
> + * The page_frag functions provide a simple allocation framework for page
> + * fragments.  This is used by the network stack and network device drivers to
> + * provide a backing region of memory for use as either an sk_buff->head, or to
> + * be used in the "frags" portion of skb_shared_info.
> + */
> +
> +#include <linux/export.h>
> +#include <linux/init.h>
> +#include <linux/mm.h>
> +#include <linux/page_frag_cache.h>
> +#include "internal.h"

You could probably include gfp_types.h here since this is where you are
using the GFP_XXX values.

> +
> +static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> +					     gfp_t gfp_mask)
> +{
> +	struct page *page = NULL;
> +	gfp_t gfp = gfp_mask;
> +
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
> +		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
> +	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
> +				PAGE_FRAG_CACHE_MAX_ORDER);
> +	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
> +#endif
> +	if (unlikely(!page))
> +		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> +
> +	nc->va = page ? page_address(page) : NULL;
> +
> +	return page;
> +}
> +
> +void page_frag_cache_drain(struct page_frag_cache *nc)
> +{
> +	if (!nc->va)
> +		return;
> +
> +	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
> +	nc->va = NULL;
> +}
> +EXPORT_SYMBOL(page_frag_cache_drain);
> +
> +void __page_frag_cache_drain(struct page *page, unsigned int count)
> +{
> +	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
> +
> +	if (page_ref_sub_and_test(page, count))
> +		free_unref_page(page, compound_order(page));
> +}
> +EXPORT_SYMBOL(__page_frag_cache_drain);
> +
> +void *__page_frag_alloc_align(struct page_frag_cache *nc,
> +			      unsigned int fragsz, gfp_t gfp_mask,
> +			      unsigned int align_mask)
> +{
> +	unsigned int size = PAGE_SIZE;
> +	struct page *page;
> +	int offset;
> +
> +	if (unlikely(!nc->va)) {
> +refill:
> +		page = __page_frag_cache_refill(nc, gfp_mask);
> +		if (!page)
> +			return NULL;
> +
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +		/* if size can vary use size else just use PAGE_SIZE */
> +		size = nc->size;
> +#endif
> +		/* Even if we own the page, we do not use atomic_set().
> +		 * This would break get_page_unless_zero() users.
> +		 */
> +		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> +
> +		/* reset page count bias and offset to start of new frag */
> +		nc->pfmemalloc = page_is_pfmemalloc(page);
> +		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> +		nc->offset = size;
> +	}
> +
> +	offset = nc->offset - fragsz;
> +	if (unlikely(offset < 0)) {
> +		page = virt_to_page(nc->va);
> +
> +		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> +			goto refill;
> +
> +		if (unlikely(nc->pfmemalloc)) {
> +			free_unref_page(page, compound_order(page));
> +			goto refill;
> +		}
> +
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +		/* if size can vary use size else just use PAGE_SIZE */
> +		size = nc->size;
> +#endif
> +		/* OK, page count is 0, we can safely set it */
> +		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> +
> +		/* reset page count bias and offset to start of new frag */
> +		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> +		offset = size - fragsz;
> +		if (unlikely(offset < 0)) {
> +			/*
> +			 * The caller is trying to allocate a fragment
> +			 * with fragsz > PAGE_SIZE but the cache isn't big
> +			 * enough to satisfy the request, this may
> +			 * happen in low memory conditions.
> +			 * We don't release the cache page because
> +			 * it could make memory pressure worse
> +			 * so we simply return NULL here.
> +			 */
> +			return NULL;
> +		}
> +	}
> +
> +	nc->pagecnt_bias--;
> +	offset &= align_mask;
> +	nc->offset = offset;
> +
> +	return nc->va + offset;
> +}
> +EXPORT_SYMBOL(__page_frag_alloc_align);
> +
> +/*
> + * Frees a page fragment allocated out of either a compound or order 0 page.
> + */
> +void page_frag_free(void *addr)
> +{
> +	struct page *page = virt_to_head_page(addr);
> +
> +	if (unlikely(put_page_testzero(page)))
> +		free_unref_page(page, compound_order(page));
> +}
> +EXPORT_SYMBOL(page_frag_free);
> diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
> index 5ee3f33b756d..07748ee0a21f 100644
> --- a/mm/page_frag_test.c
> +++ b/mm/page_frag_test.c
> @@ -16,6 +16,7 @@
>  #include <linux/log2.h>
>  #include <linux/completion.h>
>  #include <linux/kthread.h>
> +#include <linux/page_frag_cache.h>
>  
>  #define OBJPOOL_NR_OBJECT_MAX	BIT(24)
>  



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 03/13] mm: page_frag: use initial zero offset for page_frag_alloc_align()
  2024-06-25 13:52 ` [PATCH net-next v9 03/13] mm: page_frag: use initial zero offset for page_frag_alloc_align() Yunsheng Lin
@ 2024-07-01 23:27   ` Alexander H Duyck
  2024-07-02 12:28     ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2024-07-01 23:27 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
> We are above to use page_frag_alloc_*() API to not just
"about to use", not "above to use"

> allocate memory for skb->data, but also use them to do
> the memory allocation for skb frag too. Currently the
> implementation of page_frag in mm subsystem is running
> the offset as a countdown rather than count-up value,
> there may have several advantages to that as mentioned
> in [1], but it may have some disadvantages, for example,
> it may disable skb frag coaleasing and more correct cache
> prefetching
> 
> We have a trade-off to make in order to have a unified
> implementation and API for page_frag, so use a initial zero
> offset in this patch, and the following patch will try to
> make some optimization to aovid the disadvantages as much
> as possible.
> 
> As offsets is added due to alignment requirement before
> actually checking if the cache is enough, which might make
> it exploitable if caller passes a align value bigger than
> 32K mistakenly. As we are allowing order 3 page allocation
> to fail easily under low memory condition, align value bigger
> than PAGE_SIZE is not really allowed, so add a 'align >
> PAGE_SIZE' checking in page_frag_alloc_va_align() to catch
> that.
> 
> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/page_frag_cache.h |  2 +-
>  include/linux/skbuff.h          |  4 ++--
>  mm/page_frag_cache.c            | 26 +++++++++++---------------
>  3 files changed, 14 insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 3a44bfc99750..b9411f0db25a 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -32,7 +32,7 @@ static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
>  					  unsigned int fragsz, gfp_t gfp_mask,
>  					  unsigned int align)
>  {
> -	WARN_ON_ONCE(!is_power_of_2(align));
> +	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
>  	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
>  }
>  
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index eb8ae8292c48..d1fea23ec386 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3320,7 +3320,7 @@ static inline void *netdev_alloc_frag(unsigned int fragsz)
>  static inline void *netdev_alloc_frag_align(unsigned int fragsz,
>  					    unsigned int align)
>  {
> -	WARN_ON_ONCE(!is_power_of_2(align));
> +	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
>  	return __netdev_alloc_frag_align(fragsz, -align);
>  }
>  
> @@ -3391,7 +3391,7 @@ static inline void *napi_alloc_frag(unsigned int fragsz)
>  static inline void *napi_alloc_frag_align(unsigned int fragsz,
>  					  unsigned int align)
>  {
> -	WARN_ON_ONCE(!is_power_of_2(align));
> +	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
>  	return __napi_alloc_frag_align(fragsz, -align);
>  }
>  
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 88f567ef0e29..da244851b8a4 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -72,10 +72,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  		if (!page)
>  			return NULL;
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -		/* if size can vary use size else just use PAGE_SIZE */
> -		size = nc->size;
> -#endif
>  		/* Even if we own the page, we do not use atomic_set().
>  		 * This would break get_page_unless_zero() users.
>  		 */
> @@ -84,11 +80,16 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  		/* reset page count bias and offset to start of new frag */
>  		nc->pfmemalloc = page_is_pfmemalloc(page);
>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->offset = size;
> +		nc->offset = 0;
>  	}
>  
> -	offset = nc->offset - fragsz;
> -	if (unlikely(offset < 0)) {
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +	/* if size can vary use size else just use PAGE_SIZE */
> +	size = nc->size;
> +#endif
> +
> +	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
> +	if (unlikely(offset + fragsz > size)) {

The fragsz check below could be moved to here.

>  		page = virt_to_page(nc->va);
>  
>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> @@ -99,17 +100,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  			goto refill;
>  		}
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -		/* if size can vary use size else just use PAGE_SIZE */
> -		size = nc->size;
> -#endif
>  		/* OK, page count is 0, we can safely set it */
>  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>  
>  		/* reset page count bias and offset to start of new frag */
>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		offset = size - fragsz;
> -		if (unlikely(offset < 0)) {
> +		offset = 0;
> +		if (unlikely(fragsz > PAGE_SIZE)) {

Since we aren't taking advantage of the flag that is left after the
subtraction we might just want to look at moving this piece up to just
after the offset + fragsz check. That should prevent us from trying to
refill if we have a request that is larger than a single page. In
addition we could probably just drop the 3 PAGE_SIZE checks above as
they would be redundant.

>  			/*
>  			 * The caller is trying to allocate a fragment
>  			 * with fragsz > PAGE_SIZE but the cache isn't big
> @@ -124,8 +121,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  	}
>  
>  	nc->pagecnt_bias--;
> -	offset &= align_mask;
> -	nc->offset = offset;
> +	nc->offset = offset + fragsz;
>  
>  	return nc->va + offset;
>  }



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-06-25 13:52 ` [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc' Yunsheng Lin
@ 2024-07-02  0:08   ` Alexander H Duyck
  2024-07-02 12:35     ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2024-07-02  0:08 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
> Currently there is one 'struct page_frag' for every 'struct
> sock' and 'struct task_struct', we are about to replace the
> 'struct page_frag' with 'struct page_frag_cache' for them.
> Before begin the replacing, we need to ensure the size of
> 'struct page_frag_cache' is not bigger than the size of
> 'struct page_frag', as there may be tens of thousands of
> 'struct sock' and 'struct task_struct' instances in the
> system.
> 
> By or'ing the page order & pfmemalloc with lower bits of
> 'va' instead of using 'u16' or 'u32' for page size and 'u8'
> for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
> And page address & pfmemalloc & order is unchanged for the
> same page in the same 'page_frag_cache' instance, it makes
> sense to fit them together.
> 
> Also, it is better to replace 'offset' with 'remaining', which
> is the remaining size for the cache in a 'page_frag_cache'
> instance, we are able to do a single 'fragsz > remaining'
> checking for the case of cache not being enough, which should be
> the fast path if we ensure size is zoro when 'va' == NULL by
> memset'ing 'struct page_frag_cache' in page_frag_cache_init()
> and page_frag_cache_drain().
> 
> After this patch, the size of 'struct page_frag_cache' should be
> the same as the size of 'struct page_frag'.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/page_frag_cache.h | 76 +++++++++++++++++++++++-----
>  mm/page_frag_cache.c            | 90 ++++++++++++++++++++-------------
>  2 files changed, 118 insertions(+), 48 deletions(-)
> 
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 6ac3a25089d1..b33904d4494f 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -8,29 +8,81 @@
>  #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
>  #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
>  
> -struct page_frag_cache {
> -	void *va;
> +/*
> + * struct encoded_va - a nonexistent type marking this pointer
> + *
> + * An 'encoded_va' pointer is a pointer to a aligned virtual address, which is
> + * at least aligned to PAGE_SIZE, that means there are at least 12 lower bits
> + * space available for other purposes.
> + *
> + * Currently we use the lower 8 bits and bit 9 for the order and PFMEMALLOC
> + * flag of the page this 'va' is corresponding to.
> + *
> + * Use the supplied helper functions to endcode/decode the pointer and bits.
> + */
> +struct encoded_va;
> +

Why did you create a struct for this? The way you use it below it is
just a pointer. No point in defining a struct that doesn't exist
anywhere.

> +#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
> +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
> +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
> +
> +static inline struct encoded_va *encode_aligned_va(void *va,
> +						   unsigned int order,
> +						   bool pfmemalloc)
> +{
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -	__u16 offset;
> -	__u16 size;
> +	return (struct encoded_va *)((unsigned long)va | order |
> +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
>  #else
> -	__u32 offset;
> +	return (struct encoded_va *)((unsigned long)va |
> +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
> +#endif
> +}
> +
> +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va)
> +{
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +	return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va;
> +#else
> +	return 0;
> +#endif
> +}
> +
> +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va)
> +{
> +	return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va;
> +}
> +

My advice is that if you just make encoded_va an unsigned long this
just becomes some FIELD_GET and bit operations.

> +static inline void *encoded_page_address(struct encoded_va *encoded_va)
> +{
> +	return (void *)((unsigned long)encoded_va & PAGE_MASK);
> +}
> +
> +struct page_frag_cache {
> +	struct encoded_va *encoded_va;

This should be an unsigned long, not a pointer since you are storing
data other than just a pointer in here. The pointer is just one of the
things you extract out of it.

> +
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
> +	u16 pagecnt_bias;
> +	u16 remaining;
> +#else
> +	u32 pagecnt_bias;
> +	u32 remaining;
>  #endif
> -	/* we maintain a pagecount bias, so that we dont dirty cache line
> -	 * containing page->_refcount every time we allocate a fragment.
> -	 */
> -	unsigned int		pagecnt_bias;
> -	bool pfmemalloc;
>  };
>  
>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>  {
> -	nc->va = NULL;
> +	memset(nc, 0, sizeof(*nc));

Shouldn't need to memset 0 the whole thing. Just setting page and order
to 0 should be enough to indicate that there isn't anything there.

>  }
>  
>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>  {
> -	return !!nc->pfmemalloc;
> +	return encoded_page_pfmemalloc(nc->encoded_va);
> +}
> +
> +static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_va)
> +{
> +	return PAGE_SIZE << encoded_page_order(encoded_va);
>  }
>  
>  void page_frag_cache_drain(struct page_frag_cache *nc);
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index dd640af5607a..a3316dd50eff 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -18,34 +18,61 @@
>  #include <linux/page_frag_cache.h>
>  #include "internal.h"
>  
> +static void *page_frag_cache_current_va(struct page_frag_cache *nc)
> +{
> +	struct encoded_va *encoded_va = nc->encoded_va;
> +
> +	return (void *)(((unsigned long)encoded_va & PAGE_MASK) |
> +		(page_frag_cache_page_size(encoded_va) - nc->remaining));
> +}
> +

Rather than an OR here I would rather see this just use addition.
Otherwise this logic becomes overly complicated.

>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  					     gfp_t gfp_mask)
>  {
>  	struct page *page = NULL;
>  	gfp_t gfp = gfp_mask;
> +	unsigned int order;
>  
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>  	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>  	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>  				PAGE_FRAG_CACHE_MAX_ORDER);
> -	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
>  #endif
> -	if (unlikely(!page))
> +	if (unlikely(!page)) {
>  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> +		if (unlikely(!page)) {
> +			memset(nc, 0, sizeof(*nc));
> +			return NULL;
> +		}
> +
> +		order = 0;
> +		nc->remaining = PAGE_SIZE;
> +	} else {
> +		order = PAGE_FRAG_CACHE_MAX_ORDER;
> +		nc->remaining = PAGE_FRAG_CACHE_MAX_SIZE;
> +	}
>  
> -	nc->va = page ? page_address(page) : NULL;
> +	/* Even if we own the page, we do not use atomic_set().
> +	 * This would break get_page_unless_zero() users.
> +	 */
> +	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>  
> +	/* reset page count bias of new frag */
> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;

I would rather keep the pagecnt_bias, page reference addition, and
resetting of remaining outside of this. The only fields we should be
setting are order, the virtual address, and pfmemalloc since those are
what is encoded in your unsigned long variable.

> +	nc->encoded_va = encode_aligned_va(page_address(page), order,
> +					   page_is_pfmemalloc(page));
>  	return page;
>  }
>  
>  void page_frag_cache_drain(struct page_frag_cache *nc)
>  {
> -	if (!nc->va)
> +	if (!nc->encoded_va)
>  		return;
>  
> -	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
> -	nc->va = NULL;
> +	__page_frag_cache_drain(virt_to_head_page(nc->encoded_va),
> +				nc->pagecnt_bias);
> +	memset(nc, 0, sizeof(*nc));

Again, no need for memset when "nv->encoded_va = 0" will do.

>  }
>  EXPORT_SYMBOL(page_frag_cache_drain);
>  
> @@ -62,51 +89,41 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  				 unsigned int fragsz, gfp_t gfp_mask,
>  				 unsigned int align_mask)
>  {
> -	unsigned int size = PAGE_SIZE;
> +	struct encoded_va *encoded_va = nc->encoded_va;
>  	struct page *page;
> -	int offset;
> +	int remaining;
> +	void *va;
>  
> -	if (unlikely(!nc->va)) {
> +	if (unlikely(!encoded_va)) {
>  refill:
> -		page = __page_frag_cache_refill(nc, gfp_mask);
> -		if (!page)
> +		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
>  			return NULL;
>  
> -		/* Even if we own the page, we do not use atomic_set().
> -		 * This would break get_page_unless_zero() users.
> -		 */
> -		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> -
> -		/* reset page count bias and offset to start of new frag */
> -		nc->pfmemalloc = page_is_pfmemalloc(page);
> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->offset = 0;
> +		encoded_va = nc->encoded_va;
>  	}
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -	/* if size can vary use size else just use PAGE_SIZE */
> -	size = nc->size;
> -#endif
> -
> -	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
> -	if (unlikely(offset + fragsz > size)) {
> -		page = virt_to_page(nc->va);
> -
> +	remaining = nc->remaining & align_mask;
> +	remaining -= fragsz;
> +	if (unlikely(remaining < 0)) {

Now this is just getting confusing. You essentially just added an
additional addition step and went back to the countdown approach I was
using before except for the fact that you are starting at 0 whereas I
was actually moving down through the page.

What I would suggest doing since "remaining" is a negative offset
anyway would be to look at just storing it as a signed negative number.
At least with that you can keep to your original approach and would
only have to change your check to be for "remaining + fragsz <= 0".
With that you can still do your math but it becomes an addition instead
of a subtraction.

> +		page = virt_to_page(encoded_va);
>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>  			goto refill;
>  
> -		if (unlikely(nc->pfmemalloc)) {
> -			free_unref_page(page, compound_order(page));
> +		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> +			VM_BUG_ON(compound_order(page) !=
> +				  encoded_page_order(encoded_va));
> +			free_unref_page(page, encoded_page_order(encoded_va));
>  			goto refill;
>  		}
>  
>  		/* OK, page count is 0, we can safely set it */
>  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>  
> -		/* reset page count bias and offset to start of new frag */
> +		/* reset page count bias and remaining of new frag */
>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		offset = 0;
> -		if (unlikely(fragsz > PAGE_SIZE)) {
> +		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
> +		remaining -= fragsz;
> +		if (unlikely(remaining < 0)) {
>  			/*
>  			 * The caller is trying to allocate a fragment
>  			 * with fragsz > PAGE_SIZE but the cache isn't big

I find it really amusing that you went to all the trouble of flipping
the logic just to flip it back to being a countdown setup. If you were
going to bother with all that then why not just make the remaining
negative instead? You could save yourself a ton of trouble that way and
all you would need to do is flip a few signs.

> @@ -120,10 +137,11 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  		}
>  	}
>  
> +	va = page_frag_cache_current_va(nc);
>  	nc->pagecnt_bias--;
> -	nc->offset = offset + fragsz;
> +	nc->remaining = remaining;
>  
> -	return nc->va + offset;
> +	return va;
>  }
>  EXPORT_SYMBOL(__page_frag_alloc_va_align);
>  

Not sure I am huge fan of the way the order of operations has to get so
creative for this to work.  Not that I see a better way to do it, but
my concern is that this is going to add technical debt as I can easily
see somebody messing up the order of things at some point in the future
and generating a bad pointer.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 02/13] mm: move the page fragment allocator from page_alloc into its own file
  2024-07-01 23:10   ` Alexander H Duyck
@ 2024-07-02 12:27     ` Yunsheng Lin
  0 siblings, 0 replies; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-02 12:27 UTC (permalink / raw)
  To: Alexander H Duyck, davem, kuba, pabeni
  Cc: netdev, linux-kernel, David Howells, Andrew Morton, linux-mm

On 2024/7/2 7:10, Alexander H Duyck wrote:
> On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
>> Inspired by [1], move the page fragment allocator from page_alloc
>> into its own c file and header file, as we are about to make more
>> change for it to replace another page_frag implementation in
>> sock.c
>>
>> 1. https://lore.kernel.org/all/20230411160902.4134381-3-dhowells@redhat.com/
>>
>> CC: David Howells <dhowells@redhat.com>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> 
> So one thing that I think might have been overlooked in the previous
> reviews is the fact that the headers weren't necessarily self
> sufficient. You were introducing dependencies that had to be fulfilled
> by other headers.
> 
> One thing you might try doing as part of your testing would be to add a
> C file that just adds your header and calls your functions to verify
> that there aren't any unincluded dependencies.

Sure, will do.

> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 03/13] mm: page_frag: use initial zero offset for page_frag_alloc_align()
  2024-07-01 23:27   ` Alexander H Duyck
@ 2024-07-02 12:28     ` Yunsheng Lin
  2024-07-02 16:00       ` Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-02 12:28 UTC (permalink / raw)
  To: Alexander H Duyck, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On 2024/7/2 7:27, Alexander H Duyck wrote:
> On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
>> We are above to use page_frag_alloc_*() API to not just
> "about to use", not "above to use"

Ack.

> 
>> allocate memory for skb->data, but also use them to do
>> the memory allocation for skb frag too. Currently the
>> implementation of page_frag in mm subsystem is running
>> the offset as a countdown rather than count-up value,
>> there may have several advantages to that as mentioned
>> in [1], but it may have some disadvantages, for example,
>> it may disable skb frag coaleasing and more correct cache
>> prefetching
>>

...

>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index 88f567ef0e29..da244851b8a4 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -72,10 +72,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  		if (!page)
>>  			return NULL;
>>  
>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -		/* if size can vary use size else just use PAGE_SIZE */
>> -		size = nc->size;
>> -#endif
>>  		/* Even if we own the page, we do not use atomic_set().
>>  		 * This would break get_page_unless_zero() users.
>>  		 */
>> @@ -84,11 +80,16 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  		/* reset page count bias and offset to start of new frag */
>>  		nc->pfmemalloc = page_is_pfmemalloc(page);
>>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> -		nc->offset = size;
>> +		nc->offset = 0;
>>  	}
>>  
>> -	offset = nc->offset - fragsz;
>> -	if (unlikely(offset < 0)) {
>> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> +	/* if size can vary use size else just use PAGE_SIZE */
>> +	size = nc->size;
>> +#endif
>> +
>> +	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
>> +	if (unlikely(offset + fragsz > size)) {
> 
> The fragsz check below could be moved to here.
> 
>>  		page = virt_to_page(nc->va);
>>  
>>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>> @@ -99,17 +100,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  			goto refill;
>>  		}
>>  
>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -		/* if size can vary use size else just use PAGE_SIZE */
>> -		size = nc->size;
>> -#endif
>>  		/* OK, page count is 0, we can safely set it */
>>  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>>  
>>  		/* reset page count bias and offset to start of new frag */
>>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> -		offset = size - fragsz;
>> -		if (unlikely(offset < 0)) {
>> +		offset = 0;
>> +		if (unlikely(fragsz > PAGE_SIZE)) {
> 
> Since we aren't taking advantage of the flag that is left after the
> subtraction we might just want to look at moving this piece up to just
> after the offset + fragsz check. That should prevent us from trying to
> refill if we have a request that is larger than a single page. In
> addition we could probably just drop the 3 PAGE_SIZE checks above as
> they would be redundant.

I am not sure I understand the 'drop the 3 PAGE_SIZE checks' part and
the 'redundant' part, where is the '3 PAGE_SIZE checks'? And why they
are redundant?

> 
>>  			/*
>>  			 * The caller is trying to allocate a fragment
>>  			 * with fragsz > PAGE_SIZE but the cache isn't big
>> @@ -124,8 +121,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  	}
>>  
>>  	nc->pagecnt_bias--;
>> -	offset &= align_mask;
>> -	nc->offset = offset;
>> +	nc->offset = offset + fragsz;
>>  
>>  	return nc->va + offset;
>>  }
> 
> 
> .
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-02  0:08   ` Alexander H Duyck
@ 2024-07-02 12:35     ` Yunsheng Lin
  2024-07-02 14:55       ` Alexander H Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-02 12:35 UTC (permalink / raw)
  To: Alexander H Duyck, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On 2024/7/2 8:08, Alexander H Duyck wrote:
> On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
>> Currently there is one 'struct page_frag' for every 'struct
>> sock' and 'struct task_struct', we are about to replace the
>> 'struct page_frag' with 'struct page_frag_cache' for them.
>> Before begin the replacing, we need to ensure the size of
>> 'struct page_frag_cache' is not bigger than the size of
>> 'struct page_frag', as there may be tens of thousands of
>> 'struct sock' and 'struct task_struct' instances in the
>> system.
>>
>> By or'ing the page order & pfmemalloc with lower bits of
>> 'va' instead of using 'u16' or 'u32' for page size and 'u8'
>> for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
>> And page address & pfmemalloc & order is unchanged for the
>> same page in the same 'page_frag_cache' instance, it makes
>> sense to fit them together.
>>
>> Also, it is better to replace 'offset' with 'remaining', which
>> is the remaining size for the cache in a 'page_frag_cache'
>> instance, we are able to do a single 'fragsz > remaining'
>> checking for the case of cache not being enough, which should be
>> the fast path if we ensure size is zoro when 'va' == NULL by
>> memset'ing 'struct page_frag_cache' in page_frag_cache_init()
>> and page_frag_cache_drain().
>>
>> After this patch, the size of 'struct page_frag_cache' should be
>> the same as the size of 'struct page_frag'.
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  include/linux/page_frag_cache.h | 76 +++++++++++++++++++++++-----
>>  mm/page_frag_cache.c            | 90 ++++++++++++++++++++-------------
>>  2 files changed, 118 insertions(+), 48 deletions(-)
>>
>> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
>> index 6ac3a25089d1..b33904d4494f 100644
>> --- a/include/linux/page_frag_cache.h
>> +++ b/include/linux/page_frag_cache.h
>> @@ -8,29 +8,81 @@
>>  #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
>>  #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
>>  
>> -struct page_frag_cache {
>> -	void *va;
>> +/*
>> + * struct encoded_va - a nonexistent type marking this pointer
>> + *
>> + * An 'encoded_va' pointer is a pointer to a aligned virtual address, which is
>> + * at least aligned to PAGE_SIZE, that means there are at least 12 lower bits
>> + * space available for other purposes.
>> + *
>> + * Currently we use the lower 8 bits and bit 9 for the order and PFMEMALLOC
>> + * flag of the page this 'va' is corresponding to.
>> + *
>> + * Use the supplied helper functions to endcode/decode the pointer and bits.
>> + */
>> +struct encoded_va;
>> +
> 
> Why did you create a struct for this? The way you use it below it is
> just a pointer. No point in defining a struct that doesn't exist
> anywhere.

The encoded_va is mirroring the encoded_page below:
https://elixir.bootlin.com/linux/v6.10-rc6/source/include/linux/mm_types.h#L222
https://github.com/torvalds/linux/commit/70fb4fdff5826a48886152fd5c5db04eb6c59a40

"So this introduces a 'struct encoded_page' pointer that cannot be used for
anything else than to encode a real page pointer and a couple of extra
bits in the low bits.  That way the compiler can trivially track the state
of the pointer and you just explicitly encode and decode the extra bits."

It seems to be similar for encoded_va case too, I guess this is more of personal
preference for using a struct or unsigned long here, I have no strong preference
here and it can be changed if you really insist.

> 
>> +#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
>> +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
>> +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
>> +
>> +static inline struct encoded_va *encode_aligned_va(void *va,
>> +						   unsigned int order,
>> +						   bool pfmemalloc)
>> +{
>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -	__u16 offset;
>> -	__u16 size;
>> +	return (struct encoded_va *)((unsigned long)va | order |
>> +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
>>  #else
>> -	__u32 offset;
>> +	return (struct encoded_va *)((unsigned long)va |
>> +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
>> +#endif
>> +}
>> +
>> +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va)
>> +{
>> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> +	return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va;
>> +#else
>> +	return 0;
>> +#endif
>> +}
>> +
>> +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va)
>> +{
>> +	return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va;
>> +}
>> +
> 
> My advice is that if you just make encoded_va an unsigned long this
> just becomes some FIELD_GET and bit operations.

As above.

> 
>> +static inline void *encoded_page_address(struct encoded_va *encoded_va)
>> +{
>> +	return (void *)((unsigned long)encoded_va & PAGE_MASK);
>> +}
>> +
>> +struct page_frag_cache {
>> +	struct encoded_va *encoded_va;
> 
> This should be an unsigned long, not a pointer since you are storing
> data other than just a pointer in here. The pointer is just one of the
> things you extract out of it.
> 
>> +
>> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>> +	u16 pagecnt_bias;
>> +	u16 remaining;
>> +#else
>> +	u32 pagecnt_bias;
>> +	u32 remaining;
>>  #endif
>> -	/* we maintain a pagecount bias, so that we dont dirty cache line
>> -	 * containing page->_refcount every time we allocate a fragment.
>> -	 */
>> -	unsigned int		pagecnt_bias;
>> -	bool pfmemalloc;
>>  };
>>  
>>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>>  {
>> -	nc->va = NULL;
>> +	memset(nc, 0, sizeof(*nc));
> 
> Shouldn't need to memset 0 the whole thing. Just setting page and order
> to 0 should be enough to indicate that there isn't anything there.

As mentioned in the commit log:
'Also, it is better to replace 'offset' with 'remaining', which
is the remaining size for the cache in a 'page_frag_cache'
instance, we are able to do a single 'fragsz > remaining'
checking for the case of cache not being enough, which should be
the fast path if we ensure 'remaining' is zero when 'va' == NULL by
memset'ing 'struct page_frag_cache' in page_frag_cache_init()
and page_frag_cache_drain().'

Yes, we are only really depending on nc->remaining being zero
when 'va' == NULL untill next patch refactors more codes in
__page_frag_alloc_va_align() to __page_frag_cache_refill().
Perhap I should do the memset() thing in next patch.

> 
>>  }
>>  
>>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>>  {
>> -	return !!nc->pfmemalloc;
>> +	return encoded_page_pfmemalloc(nc->encoded_va);
>> +}
>> +
>> +static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_va)
>> +{
>> +	return PAGE_SIZE << encoded_page_order(encoded_va);
>>  }
>>  
>>  void page_frag_cache_drain(struct page_frag_cache *nc);
>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index dd640af5607a..a3316dd50eff 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -18,34 +18,61 @@
>>  #include <linux/page_frag_cache.h>
>>  #include "internal.h"
>>  
>> +static void *page_frag_cache_current_va(struct page_frag_cache *nc)
>> +{
>> +	struct encoded_va *encoded_va = nc->encoded_va;
>> +
>> +	return (void *)(((unsigned long)encoded_va & PAGE_MASK) |
>> +		(page_frag_cache_page_size(encoded_va) - nc->remaining));
>> +}
>> +
> 
> Rather than an OR here I would rather see this just use addition.
> Otherwise this logic becomes overly complicated.

Sure.

> 
>>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>  					     gfp_t gfp_mask)
>>  {
>>  	struct page *page = NULL;
>>  	gfp_t gfp = gfp_mask;
>> +	unsigned int order;
>>  
>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>  	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>>  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>>  	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>>  				PAGE_FRAG_CACHE_MAX_ORDER);
>> -	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
>>  #endif
>> -	if (unlikely(!page))
>> +	if (unlikely(!page)) {
>>  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>> +		if (unlikely(!page)) {
>> +			memset(nc, 0, sizeof(*nc));
>> +			return NULL;
>> +		}
>> +
>> +		order = 0;
>> +		nc->remaining = PAGE_SIZE;
>> +	} else {
>> +		order = PAGE_FRAG_CACHE_MAX_ORDER;
>> +		nc->remaining = PAGE_FRAG_CACHE_MAX_SIZE;
>> +	}
>>  
>> -	nc->va = page ? page_address(page) : NULL;
>> +	/* Even if we own the page, we do not use atomic_set().
>> +	 * This would break get_page_unless_zero() users.
>> +	 */
>> +	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>>  
>> +	/* reset page count bias of new frag */
>> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> 
> I would rather keep the pagecnt_bias, page reference addition, and
> resetting of remaining outside of this. The only fields we should be
> setting are order, the virtual address, and pfmemalloc since those are
> what is encoded in your unsigned long variable.

Is there any reason why you want to keep them outside of this?

For resetting of remaining, it seems we need more check to decide the
value of remaining if it is kept outside of this.

Also, for the next patch, more common codes are refactored out of
 __page_frag_alloc_va_align() to __page_frag_cache_refill(), so that
the new API can make use of them, so I am not sure it really matter
that much.

> 
>> +	nc->encoded_va = encode_aligned_va(page_address(page), order,
>> +					   page_is_pfmemalloc(page));
>>  	return page;
>>  }
>>  
>>  void page_frag_cache_drain(struct page_frag_cache *nc)
>>  {
>> -	if (!nc->va)
>> +	if (!nc->encoded_va)
>>  		return;
>>  
>> -	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
>> -	nc->va = NULL;
>> +	__page_frag_cache_drain(virt_to_head_page(nc->encoded_va),
>> +				nc->pagecnt_bias);
>> +	memset(nc, 0, sizeof(*nc));
> 
> Again, no need for memset when "nv->encoded_va = 0" will do.
> 
>>  }
>>  EXPORT_SYMBOL(page_frag_cache_drain);
>>  
>> @@ -62,51 +89,41 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>>  				 unsigned int fragsz, gfp_t gfp_mask,
>>  				 unsigned int align_mask)
>>  {
>> -	unsigned int size = PAGE_SIZE;
>> +	struct encoded_va *encoded_va = nc->encoded_va;
>>  	struct page *page;
>> -	int offset;
>> +	int remaining;
>> +	void *va;
>>  
>> -	if (unlikely(!nc->va)) {
>> +	if (unlikely(!encoded_va)) {
>>  refill:
>> -		page = __page_frag_cache_refill(nc, gfp_mask);
>> -		if (!page)
>> +		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
>>  			return NULL;
>>  
>> -		/* Even if we own the page, we do not use atomic_set().
>> -		 * This would break get_page_unless_zero() users.
>> -		 */
>> -		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>> -
>> -		/* reset page count bias and offset to start of new frag */
>> -		nc->pfmemalloc = page_is_pfmemalloc(page);
>> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> -		nc->offset = 0;
>> +		encoded_va = nc->encoded_va;
>>  	}
>>  
>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -	/* if size can vary use size else just use PAGE_SIZE */
>> -	size = nc->size;
>> -#endif
>> -
>> -	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
>> -	if (unlikely(offset + fragsz > size)) {
>> -		page = virt_to_page(nc->va);
>> -
>> +	remaining = nc->remaining & align_mask;
>> +	remaining -= fragsz;
>> +	if (unlikely(remaining < 0)) {
> 
> Now this is just getting confusing. You essentially just added an
> additional addition step and went back to the countdown approach I was
> using before except for the fact that you are starting at 0 whereas I
> was actually moving down through the page.

Does the 'additional addition step' mean the additional step to calculate
the offset using the new 'remaining' field? I guess that is the disadvantage
by changing 'offset' to 'remaining', but it also some advantages too:

1. it is better to replace 'offset' with 'remaining', which
   is the remaining size for the cache in a 'page_frag_cache'
   instance, we are able to do a single 'fragsz > remaining'
   checking for the case of cache not being enough, which should be
   the fast path if we ensure size is zoro when 'va' == NULL by
   memset'ing 'struct page_frag_cache' in page_frag_cache_init()
   and page_frag_cache_drain().
2. It seems more convenient to implement the commit/probe API too
   when using 'remaining' instead of 'offset' as those API needs
   the remaining size of the page_frag_cache anyway.

So it is really a trade-off between using 'offset' and 'remaining',
it is like the similar argument about trade-off between allocating
fragment 'countdown' and 'countup' way.

About confusing part, as the nc->remaining does mean how much cache
is left in the 'nc', and nc->remaining does start from
PAGE_FRAG_CACHE_MAX_SIZE/PAGE_SIZE to zero naturally if that was what
you meant by 'countdown', but it is different from the 'offset countdown'
before this patchset as my understanding.

> 
> What I would suggest doing since "remaining" is a negative offset
> anyway would be to look at just storing it as a signed negative number.
> At least with that you can keep to your original approach and would
> only have to change your check to be for "remaining + fragsz <= 0".

Did you mean by using s16/s32 for 'remaining'? And set nc->remaining like
below?
nc->remaining = -PAGE_SIZE or
nc->remaining = -PAGE_FRAG_CACHE_MAX_SIZE

struct page_frag_cache {
        struct encoded_va *encoded_va;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
        u16 pagecnt_bias;
        s16 remaining;
#else
        u32 pagecnt_bias;
        s32 remaining;
#endif
};

If I understand above correctly, it seems we really need a better name
than 'remaining' to reflect that.

> With that you can still do your math but it becomes an addition instead
> of a subtraction.

And I am not really sure what is the gain here by using an addition
instead of a subtraction here.

> 
>> +		page = virt_to_page(encoded_va);
>>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>>  			goto refill;
>>  
>> -		if (unlikely(nc->pfmemalloc)) {
>> -			free_unref_page(page, compound_order(page));
>> +		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>> +			VM_BUG_ON(compound_order(page) !=
>> +				  encoded_page_order(encoded_va));
>> +			free_unref_page(page, encoded_page_order(encoded_va));
>>  			goto refill;
>>  		}
>>  
>>  		/* OK, page count is 0, we can safely set it */
>>  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>>  
>> -		/* reset page count bias and offset to start of new frag */
>> +		/* reset page count bias and remaining of new frag */
>>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> -		offset = 0;
>> -		if (unlikely(fragsz > PAGE_SIZE)) {
>> +		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
>> +		remaining -= fragsz;
>> +		if (unlikely(remaining < 0)) {
>>  			/*
>>  			 * The caller is trying to allocate a fragment
>>  			 * with fragsz > PAGE_SIZE but the cache isn't big
> 
> I find it really amusing that you went to all the trouble of flipping
> the logic just to flip it back to being a countdown setup. If you were
> going to bother with all that then why not just make the remaining
> negative instead? You could save yourself a ton of trouble that way and
> all you would need to do is flip a few signs.

I am not sure I understand the 'a ton of trouble' part here, if 'flipping
a few signs' does save a ton of trouble here, I would like to avoid 'a
ton of trouble' here, but I am not really understand the gain here yet as
mentioned above.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-02 12:35     ` Yunsheng Lin
@ 2024-07-02 14:55       ` Alexander H Duyck
  2024-07-03 12:33         ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2024-07-02 14:55 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On Tue, 2024-07-02 at 20:35 +0800, Yunsheng Lin wrote:
> On 2024/7/2 8:08, Alexander H Duyck wrote:
> > On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
> > > Currently there is one 'struct page_frag' for every 'struct
> > > sock' and 'struct task_struct', we are about to replace the
> > > 'struct page_frag' with 'struct page_frag_cache' for them.
> > > Before begin the replacing, we need to ensure the size of
> > > 'struct page_frag_cache' is not bigger than the size of
> > > 'struct page_frag', as there may be tens of thousands of
> > > 'struct sock' and 'struct task_struct' instances in the
> > > system.
> > > 
> > > By or'ing the page order & pfmemalloc with lower bits of
> > > 'va' instead of using 'u16' or 'u32' for page size and 'u8'
> > > for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
> > > And page address & pfmemalloc & order is unchanged for the
> > > same page in the same 'page_frag_cache' instance, it makes
> > > sense to fit them together.
> > > 
> > > Also, it is better to replace 'offset' with 'remaining', which
> > > is the remaining size for the cache in a 'page_frag_cache'
> > > instance, we are able to do a single 'fragsz > remaining'
> > > checking for the case of cache not being enough, which should be
> > > the fast path if we ensure size is zoro when 'va' == NULL by
> > > memset'ing 'struct page_frag_cache' in page_frag_cache_init()
> > > and page_frag_cache_drain().
> > > 
> > > After this patch, the size of 'struct page_frag_cache' should be
> > > the same as the size of 'struct page_frag'.
> > > 
> > > CC: Alexander Duyck <alexander.duyck@gmail.com>
> > > Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> > > ---
> > >  include/linux/page_frag_cache.h | 76 +++++++++++++++++++++++-----
> > >  mm/page_frag_cache.c            | 90 ++++++++++++++++++++-------------
> > >  2 files changed, 118 insertions(+), 48 deletions(-)
> > > 
> > > diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> > > index 6ac3a25089d1..b33904d4494f 100644
> > > --- a/include/linux/page_frag_cache.h
> > > +++ b/include/linux/page_frag_cache.h
> > > @@ -8,29 +8,81 @@
> > >  #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
> > >  #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
> > >  
> > > -struct page_frag_cache {
> > > -	void *va;
> > > +/*
> > > + * struct encoded_va - a nonexistent type marking this pointer
> > > + *
> > > + * An 'encoded_va' pointer is a pointer to a aligned virtual address, which is
> > > + * at least aligned to PAGE_SIZE, that means there are at least 12 lower bits
> > > + * space available for other purposes.
> > > + *
> > > + * Currently we use the lower 8 bits and bit 9 for the order and PFMEMALLOC
> > > + * flag of the page this 'va' is corresponding to.
> > > + *
> > > + * Use the supplied helper functions to endcode/decode the pointer and bits.
> > > + */
> > > +struct encoded_va;
> > > +
> > 
> > Why did you create a struct for this? The way you use it below it is
> > just a pointer. No point in defining a struct that doesn't exist
> > anywhere.
> 
> The encoded_va is mirroring the encoded_page below:
> https://elixir.bootlin.com/linux/v6.10-rc6/source/include/linux/mm_types.h#L222
> https://github.com/torvalds/linux/commit/70fb4fdff5826a48886152fd5c5db04eb6c59a40
> 
> "So this introduces a 'struct encoded_page' pointer that cannot be used for
> anything else than to encode a real page pointer and a couple of extra
> bits in the low bits.  That way the compiler can trivially track the state
> of the pointer and you just explicitly encode and decode the extra bits."
> 
> It seems to be similar for encoded_va case too, I guess this is more of personal
> preference for using a struct or unsigned long here, I have no strong preference
> here and it can be changed if you really insist.
> 

Really this seems like a bad copy with none of the guardrails. I still
think you would be better off just storing this as a long rather than a
pointer. It doesn't need to be a pointer and it creates a bunch of
unnecessary extra overhead.

In addition if you store it as a long as I mentioned it will make it
much easier to extract the size and pfmemalloc fields. Basically this
thing is more bitfield(2 items) than pointer(1 item) which is why I
think it would make more sense as an unsigned long. Then you only have
to cast to get the pointer in and the pointer out.

> > 
> > > +#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
> > > +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
> > > +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
> > > +
> > > +static inline struct encoded_va *encode_aligned_va(void *va,
> > > +						   unsigned int order,
> > > +						   bool pfmemalloc)
> > > +{
> > >  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > > -	__u16 offset;
> > > -	__u16 size;
> > > +	return (struct encoded_va *)((unsigned long)va | order |
> > > +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
> > >  #else
> > > -	__u32 offset;
> > > +	return (struct encoded_va *)((unsigned long)va |
> > > +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
> > > +#endif
> > > +}
> > > +

This is missing any and all protection of the example you cited. If I
want to pass order as a 32b value I can and I can corrupt the virtual
address. Same thing with pfmemalloc. Lets only hope you don't hit an
architecture where a bool is a u8 in size as otherwise that shift is
going to wipe out the value, and if it is a u32 which is usually the
case lets hope they stick to the values of 0 and 1.

> > > +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va)
> > > +{
> > > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > > +	return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va;
> > > +#else
> > > +	return 0;
> > > +#endif
> > > +}
> > > +
> > > +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va)
> > > +{
> > > +	return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va;
> > > +}
> > > +
> > 
> > My advice is that if you just make encoded_va an unsigned long this
> > just becomes some FIELD_GET and bit operations.
> 
> As above.
> 

The code you mentioned had one extra block of bits that was in it and
had strict protections on what went into and out of those bits. You
don't have any of those protections.

I suggest you just use a long and don't bother storing this as a
pointer.

> > > +static inline void *encoded_page_address(struct encoded_va *encoded_va)
> > > +{
> > > +	return (void *)((unsigned long)encoded_va & PAGE_MASK);
> > > +}
> > > +
> > > +struct page_frag_cache {
> > > +	struct encoded_va *encoded_va;
> > 
> > This should be an unsigned long, not a pointer since you are storing
> > data other than just a pointer in here. The pointer is just one of the
> > things you extract out of it.
> > 
> > > +
> > > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
> > > +	u16 pagecnt_bias;
> > > +	u16 remaining;
> > > +#else
> > > +	u32 pagecnt_bias;
> > > +	u32 remaining;
> > >  #endif
> > > -	/* we maintain a pagecount bias, so that we dont dirty cache line
> > > -	 * containing page->_refcount every time we allocate a fragment.
> > > -	 */
> > > -	unsigned int		pagecnt_bias;
> > > -	bool pfmemalloc;
> > >  };
> > >  
> > >  static inline void page_frag_cache_init(struct page_frag_cache *nc)
> > >  {
> > > -	nc->va = NULL;
> > > +	memset(nc, 0, sizeof(*nc));
> > 
> > Shouldn't need to memset 0 the whole thing. Just setting page and order
> > to 0 should be enough to indicate that there isn't anything there.
> 
> As mentioned in the commit log:
> 'Also, it is better to replace 'offset' with 'remaining', which
> is the remaining size for the cache in a 'page_frag_cache'
> instance, we are able to do a single 'fragsz > remaining'
> checking for the case of cache not being enough, which should be
> the fast path if we ensure 'remaining' is zero when 'va' == NULL by
> memset'ing 'struct page_frag_cache' in page_frag_cache_init()
> and page_frag_cache_drain().'
> 
> Yes, we are only really depending on nc->remaining being zero
> when 'va' == NULL untill next patch refactors more codes in
> __page_frag_alloc_va_align() to __page_frag_cache_refill().
> Perhap I should do the memset() thing in next patch.
> 

I get that. You reverted the code back to where it was in patch 3 but
are retaining the bottom up direction. If you were going to do that you
might as well just did the "remaining" change in patch 3 and saved me
having to review the same block of logic twice.

> 
> 
> > 
> > >  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> > >  					     gfp_t gfp_mask)
> > >  {
> > >  	struct page *page = NULL;
> > >  	gfp_t gfp = gfp_mask;
> > > +	unsigned int order;
> > >  
> > >  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > >  	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
> > >  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
> > >  	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
> > >  				PAGE_FRAG_CACHE_MAX_ORDER);
> > > -	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
> > >  #endif
> > > -	if (unlikely(!page))
> > > +	if (unlikely(!page)) {
> > >  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> > > +		if (unlikely(!page)) {
> > > +			memset(nc, 0, sizeof(*nc));
> > > +			return NULL;
> > > +		}
> > > +
> > > +		order = 0;
> > > +		nc->remaining = PAGE_SIZE;
> > > +	} else {
> > > +		order = PAGE_FRAG_CACHE_MAX_ORDER;
> > > +		nc->remaining = PAGE_FRAG_CACHE_MAX_SIZE;
> > > +	}
> > >  
> > > -	nc->va = page ? page_address(page) : NULL;
> > > +	/* Even if we own the page, we do not use atomic_set().
> > > +	 * This would break get_page_unless_zero() users.
> > > +	 */
> > > +	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> > >  
> > > +	/* reset page count bias of new frag */
> > > +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> > 
> > I would rather keep the pagecnt_bias, page reference addition, and
> > resetting of remaining outside of this. The only fields we should be
> > setting are order, the virtual address, and pfmemalloc since those are
> > what is encoded in your unsigned long variable.
> 
> Is there any reason why you want to keep them outside of this?

Because this is now essentially a function to allocate an encoded page.
The output should be your encoded virtual address with the size and
pfmemalloc flags built in.

> For resetting of remaining, it seems we need more check to decide the
> value of remaining if it is kept outside of this.
> 
> Also, for the next patch, more common codes are refactored out of
>  __page_frag_alloc_va_align() to __page_frag_cache_refill(), so that
> the new API can make use of them, so I am not sure it really matter
> that much.
> 
> > 
> > > +	nc->encoded_va = encode_aligned_va(page_address(page), order,
> > > +					   page_is_pfmemalloc(page));
> > >  	return page;
> > >  }
> > >  
> > >  void page_frag_cache_drain(struct page_frag_cache *nc)
> > >  {
> > > -	if (!nc->va)
> > > +	if (!nc->encoded_va)
> > >  		return;
> > >  
> > > -	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
> > > -	nc->va = NULL;
> > > +	__page_frag_cache_drain(virt_to_head_page(nc->encoded_va),
> > > +				nc->pagecnt_bias);
> > > +	memset(nc, 0, sizeof(*nc));
> > 
> > Again, no need for memset when "nv->encoded_va = 0" will do.
> > 
> > >  }
> > >  EXPORT_SYMBOL(page_frag_cache_drain);
> > >  
> > > @@ -62,51 +89,41 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
> > >  				 unsigned int fragsz, gfp_t gfp_mask,
> > >  				 unsigned int align_mask)
> > >  {
> > > -	unsigned int size = PAGE_SIZE;
> > > +	struct encoded_va *encoded_va = nc->encoded_va;
> > >  	struct page *page;
> > > -	int offset;
> > > +	int remaining;
> > > +	void *va;
> > >  
> > > -	if (unlikely(!nc->va)) {
> > > +	if (unlikely(!encoded_va)) {
> > >  refill:
> > > -		page = __page_frag_cache_refill(nc, gfp_mask);
> > > -		if (!page)
> > > +		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> > >  			return NULL;
> > >  
> > > -		/* Even if we own the page, we do not use atomic_set().
> > > -		 * This would break get_page_unless_zero() users.
> > > -		 */
> > > -		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> > > -
> > > -		/* reset page count bias and offset to start of new frag */
> > > -		nc->pfmemalloc = page_is_pfmemalloc(page);
> > > -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> > > -		nc->offset = 0;
> > > +		encoded_va = nc->encoded_va;
> > >  	}
> > >  
> > > -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > > -	/* if size can vary use size else just use PAGE_SIZE */
> > > -	size = nc->size;
> > > -#endif
> > > -
> > > -	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
> > > -	if (unlikely(offset + fragsz > size)) {
> > > -		page = virt_to_page(nc->va);
> > > -
> > > +	remaining = nc->remaining & align_mask;
> > > +	remaining -= fragsz;
> > > +	if (unlikely(remaining < 0)) {
> > 
> > Now this is just getting confusing. You essentially just added an
> > additional addition step and went back to the countdown approach I was
> > using before except for the fact that you are starting at 0 whereas I
> > was actually moving down through the page.
> 
> Does the 'additional addition step' mean the additional step to calculate
> the offset using the new 'remaining' field? I guess that is the disadvantage
> by changing 'offset' to 'remaining', but it also some advantages too:
> 
> 1. it is better to replace 'offset' with 'remaining', which
>    is the remaining size for the cache in a 'page_frag_cache'
>    instance, we are able to do a single 'fragsz > remaining'
>    checking for the case of cache not being enough, which should be
>    the fast path if we ensure size is zoro when 'va' == NULL by
>    memset'ing 'struct page_frag_cache' in page_frag_cache_init()
>    and page_frag_cache_drain().
> 2. It seems more convenient to implement the commit/probe API too
>    when using 'remaining' instead of 'offset' as those API needs
>    the remaining size of the page_frag_cache anyway.
> 
> So it is really a trade-off between using 'offset' and 'remaining',
> it is like the similar argument about trade-off between allocating
> fragment 'countdown' and 'countup' way.
> 
> About confusing part, as the nc->remaining does mean how much cache
> is left in the 'nc', and nc->remaining does start from
> PAGE_FRAG_CACHE_MAX_SIZE/PAGE_SIZE to zero naturally if that was what
> you meant by 'countdown', but it is different from the 'offset countdown'
> before this patchset as my understanding.
> 
> > 
> > What I would suggest doing since "remaining" is a negative offset
> > anyway would be to look at just storing it as a signed negative number.
> > At least with that you can keep to your original approach and would
> > only have to change your check to be for "remaining + fragsz <= 0".
> 
> Did you mean by using s16/s32 for 'remaining'? And set nc->remaining like
> below?
> nc->remaining = -PAGE_SIZE or
> nc->remaining = -PAGE_FRAG_CACHE_MAX_SIZE

Yes. Basically if we used it as a signed value then you could just work
your way up and do your aligned math still.

> struct page_frag_cache {
>         struct encoded_va *encoded_va;
> 
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>         u16 pagecnt_bias;
>         s16 remaining;
> #else
>         u32 pagecnt_bias;
>         s32 remaining;
> #endif
> };
> 
> If I understand above correctly, it seems we really need a better name
> than 'remaining' to reflect that.

It would effectively work like a countdown. Instead of T - X in this
case it is size - X.

> > With that you can still do your math but it becomes an addition instead
> > of a subtraction.
> 
> And I am not really sure what is the gain here by using an addition
> instead of a subtraction here.
> 

The "remaining" as it currently stands is doing the same thing so odds
are it isn't too big a deal. It is just that the original code was
already somewhat confusing and this is just making it that much more
complex.

> > > +		page = virt_to_page(encoded_va);
> > >  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> > >  			goto refill;
> > >  
> > > -		if (unlikely(nc->pfmemalloc)) {
> > > -			free_unref_page(page, compound_order(page));
> > > +		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> > > +			VM_BUG_ON(compound_order(page) !=
> > > +				  encoded_page_order(encoded_va));
> > > +			free_unref_page(page, encoded_page_order(encoded_va));
> > >  			goto refill;
> > >  		}
> > >  
> > >  		/* OK, page count is 0, we can safely set it */
> > >  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> > >  
> > > -		/* reset page count bias and offset to start of new frag */
> > > +		/* reset page count bias and remaining of new frag */
> > >  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> > > -		offset = 0;
> > > -		if (unlikely(fragsz > PAGE_SIZE)) {
> > > +		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
> > > +		remaining -= fragsz;
> > > +		if (unlikely(remaining < 0)) {
> > >  			/*
> > >  			 * The caller is trying to allocate a fragment
> > >  			 * with fragsz > PAGE_SIZE but the cache isn't big
> > 
> > I find it really amusing that you went to all the trouble of flipping
> > the logic just to flip it back to being a countdown setup. If you were
> > going to bother with all that then why not just make the remaining
> > negative instead? You could save yourself a ton of trouble that way and
> > all you would need to do is flip a few signs.
> 
> I am not sure I understand the 'a ton of trouble' part here, if 'flipping
> a few signs' does save a ton of trouble here, I would like to avoid 'a
> ton of trouble' here, but I am not really understand the gain here yet as
> mentioned above.

It isn't about flipping the signs. It is more the fact that the logic
has now become even more complex then it originally was. With my work
backwards approach the advantage was that the offset could be updated
and then we just recorded everything and reported it up. Now we have to
keep a temporary "remaining" value, generate our virtual address and
store that to a temp variable, we can record the new "remaining" value,
and then we can report the virtual address. If we get the ordering on
the steps 2 and 3 in this it will cause issues as we will be pointing
to the wrong values. It is something I can see someone easily messing
up.



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 07/13] mm: page_frag: some minor refactoring before adding new API
  2024-06-25 13:52 ` [PATCH net-next v9 07/13] mm: page_frag: some minor refactoring before adding new API Yunsheng Lin
@ 2024-07-02 15:30   ` Alexander H Duyck
  2024-07-03 12:36     ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2024-07-02 15:30 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
> Refactor common codes from __page_frag_alloc_va_align()
> to __page_frag_cache_refill(), so that the new API can
> make use of them.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>

I am generally not a fan of the concept behind this patch. I really
think we should keep the page_frag_cache_refill function to just
allocating the page, or in this case the encoded_va and populating only
that portion of the struct.

> ---
>  mm/page_frag_cache.c | 61 ++++++++++++++++++++++----------------------
>  1 file changed, 31 insertions(+), 30 deletions(-)
> 
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index a3316dd50eff..4fd421d4f22c 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -29,10 +29,36 @@ static void *page_frag_cache_current_va(struct page_frag_cache *nc)
>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  					     gfp_t gfp_mask)
>  {
> -	struct page *page = NULL;
> +	struct encoded_va *encoded_va = nc->encoded_va;
>  	gfp_t gfp = gfp_mask;
>  	unsigned int order;
> +	struct page *page;
> +
> +	if (unlikely(!encoded_va))
> +		goto alloc;
> +
> +	page = virt_to_page(encoded_va);
> +	if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> +		goto alloc;
> +
> +	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> +		VM_BUG_ON(compound_order(page) !=
> +			  encoded_page_order(encoded_va));
> +		free_unref_page(page, encoded_page_order(encoded_va));
> +		goto alloc;
> +	}
> +
> +	/* OK, page count is 0, we can safely set it */
> +	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);

Why not just make this block of code a function onto itself? You put an
if statement at the top that essentially is just merging two functions
into one. Perhaps this logic could be __page_frag_cache_recharge which
would return an error if the page is busy or the wrong type. Then
acting on that you could switch to the refill attempt.

Also thinking about it more the set_page_count in this function and
page_ref_add in the other can probably be merged into the recharge and
refill functions since they are acting directly on the encoded page and
not interacting with the other parts of the page_frag_cache.

> +
> +	/* reset page count bias and remaining of new frag */
> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> +	nc->remaining = page_frag_cache_page_size(encoded_va);

These two parts are more or less agnostic to the setup and could be
applied to refill or recharge. Also one thought occurs to me. You were
encoding "order" into the encoded VA. Why use that when your choices
are either PAGE_FRAG_CACHE_MAX_SIZE or PAGE_SIZE. It should be a single
bit and doesn't need to be a fully byte to store that. That would allow
you to reduce this down to just 2 bits, one for pfmemalloc and one for
max order vs order 0.

> +
> +	return page;
>  
> +alloc:
> +	page = NULL;
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>  	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
> @@ -89,40 +115,15 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  				 unsigned int fragsz, gfp_t gfp_mask,
>  				 unsigned int align_mask)
>  {
> -	struct encoded_va *encoded_va = nc->encoded_va;
> -	struct page *page;
> -	int remaining;
> +	int remaining = nc->remaining & align_mask;
>  	void *va;
>  
> -	if (unlikely(!encoded_va)) {
> -refill:
> -		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> -			return NULL;
> -
> -		encoded_va = nc->encoded_va;
> -	}
> -
> -	remaining = nc->remaining & align_mask;
>  	remaining -= fragsz;
>  	if (unlikely(remaining < 0)) {

I see, so this is why you were using the memset calls everywhere.

> -		page = virt_to_page(encoded_va);
> -		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> -			goto refill;
> -
> -		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> -			VM_BUG_ON(compound_order(page) !=
> -				  encoded_page_order(encoded_va));
> -			free_unref_page(page, encoded_page_order(encoded_va));
> -			goto refill;
> -		}
> -
> -		/* OK, page count is 0, we can safely set it */
> -		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> +		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> +			return NULL;
>  
> -		/* reset page count bias and remaining of new frag */
> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
> -		remaining -= fragsz;
> +		remaining = nc->remaining - fragsz;
>  		if (unlikely(remaining < 0)) {
>  			/*
>  			 * The caller is trying to allocate a fragment




^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 03/13] mm: page_frag: use initial zero offset for page_frag_alloc_align()
  2024-07-02 12:28     ` Yunsheng Lin
@ 2024-07-02 16:00       ` Alexander Duyck
  2024-07-03 11:25         ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2024-07-02 16:00 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Tue, Jul 2, 2024 at 5:28 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/7/2 7:27, Alexander H Duyck wrote:
> > On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
> >> We are above to use page_frag_alloc_*() API to not just
> > "about to use", not "above to use"
>
> Ack.
>
> >
> >> allocate memory for skb->data, but also use them to do
> >> the memory allocation for skb frag too. Currently the
> >> implementation of page_frag in mm subsystem is running
> >> the offset as a countdown rather than count-up value,
> >> there may have several advantages to that as mentioned
> >> in [1], but it may have some disadvantages, for example,
> >> it may disable skb frag coaleasing and more correct cache
> >> prefetching
> >>
>
> ...
>
> >> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> >> index 88f567ef0e29..da244851b8a4 100644
> >> --- a/mm/page_frag_cache.c
> >> +++ b/mm/page_frag_cache.c
> >> @@ -72,10 +72,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >>              if (!page)
> >>                      return NULL;
> >>
> >> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >> -            /* if size can vary use size else just use PAGE_SIZE */
> >> -            size = nc->size;
> >> -#endif
> >>              /* Even if we own the page, we do not use atomic_set().
> >>               * This would break get_page_unless_zero() users.
> >>               */
> >> @@ -84,11 +80,16 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >>              /* reset page count bias and offset to start of new frag */
> >>              nc->pfmemalloc = page_is_pfmemalloc(page);
> >>              nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >> -            nc->offset = size;
> >> +            nc->offset = 0;
> >>      }
> >>
> >> -    offset = nc->offset - fragsz;
> >> -    if (unlikely(offset < 0)) {
> >> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >> +    /* if size can vary use size else just use PAGE_SIZE */
> >> +    size = nc->size;
> >> +#endif
> >> +
> >> +    offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
> >> +    if (unlikely(offset + fragsz > size)) {
> >
> > The fragsz check below could be moved to here.
> >
> >>              page = virt_to_page(nc->va);
> >>
> >>              if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> >> @@ -99,17 +100,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >>                      goto refill;
> >>              }
> >>
> >> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >> -            /* if size can vary use size else just use PAGE_SIZE */
> >> -            size = nc->size;
> >> -#endif
> >>              /* OK, page count is 0, we can safely set it */
> >>              set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> >>
> >>              /* reset page count bias and offset to start of new frag */
> >>              nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >> -            offset = size - fragsz;
> >> -            if (unlikely(offset < 0)) {
> >> +            offset = 0;
> >> +            if (unlikely(fragsz > PAGE_SIZE)) {
> >
> > Since we aren't taking advantage of the flag that is left after the
> > subtraction we might just want to look at moving this piece up to just
> > after the offset + fragsz check. That should prevent us from trying to
> > refill if we have a request that is larger than a single page. In
> > addition we could probably just drop the 3 PAGE_SIZE checks above as
> > they would be redundant.
>
> I am not sure I understand the 'drop the 3 PAGE_SIZE checks' part and
> the 'redundant' part, where is the '3 PAGE_SIZE checks'? And why they
> are redundant?

I was referring to the addition of the checks for align > PAGE_SIZE in
the alloc functions at the start of this diff. I guess I had dropped
them from the first half of it with the "...". Also looking back
through the patch you misspelled "avoid" as "aovid".

The issue is there is a ton of pulling things forward that don't
necessarily make sense into these diffs. Now that I have finished
looking through the set I have a better idea of why those are there
and they might make sense. It is just difficult to review since code
is being added for things that aren't applicable to the patch being
reviewed.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 03/13] mm: page_frag: use initial zero offset for page_frag_alloc_align()
  2024-07-02 16:00       ` Alexander Duyck
@ 2024-07-03 11:25         ` Yunsheng Lin
  0 siblings, 0 replies; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-03 11:25 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/7/3 0:00, Alexander Duyck wrote:

...

>>>> +
>>>> +    offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
>>>> +    if (unlikely(offset + fragsz > size)) {
>>>
>>> The fragsz check below could be moved to here.
>>>
>>>>              page = virt_to_page(nc->va);
>>>>
>>>>              if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>>>> @@ -99,17 +100,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>>>                      goto refill;
>>>>              }
>>>>
>>>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>> -            /* if size can vary use size else just use PAGE_SIZE */
>>>> -            size = nc->size;
>>>> -#endif
>>>>              /* OK, page count is 0, we can safely set it */
>>>>              set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>>>>
>>>>              /* reset page count bias and offset to start of new frag */
>>>>              nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>> -            offset = size - fragsz;
>>>> -            if (unlikely(offset < 0)) {
>>>> +            offset = 0;
>>>> +            if (unlikely(fragsz > PAGE_SIZE)) {
>>>
>>> Since we aren't taking advantage of the flag that is left after the
>>> subtraction we might just want to look at moving this piece up to just
>>> after the offset + fragsz check. That should prevent us from trying to
>>> refill if we have a request that is larger than a single page. In
>>> addition we could probably just drop the 3 PAGE_SIZE checks above as
>>> they would be redundant.
>>
>> I am not sure I understand the 'drop the 3 PAGE_SIZE checks' part and
>> the 'redundant' part, where is the '3 PAGE_SIZE checks'? And why they
>> are redundant?
> 
> I was referring to the addition of the checks for align > PAGE_SIZE in
> the alloc functions at the start of this diff. I guess I had dropped
> them from the first half of it with the "...". Also looking back
> through the patch you misspelled "avoid" as "aovid".
> 
> The issue is there is a ton of pulling things forward that don't
> necessarily make sense into these diffs. Now that I have finished
> looking through the set I have a better idea of why those are there
> and they might make sense. It is just difficult to review since code
> is being added for things that aren't applicable to the patch being
> reviewed.

As you mentioned in other thread, perhaps the 'remaining' changing does
need to be incorporated into this patch.

> .
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-02 14:55       ` Alexander H Duyck
@ 2024-07-03 12:33         ` Yunsheng Lin
  2024-07-10 15:28           ` Alexander H Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-03 12:33 UTC (permalink / raw)
  To: Alexander H Duyck, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On 2024/7/2 22:55, Alexander H Duyck wrote:

...

> 
>>>
>>>> +#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
>>>> +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
>>>> +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
>>>> +
>>>> +static inline struct encoded_va *encode_aligned_va(void *va,
>>>> +						   unsigned int order,
>>>> +						   bool pfmemalloc)
>>>> +{
>>>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>> -	__u16 offset;
>>>> -	__u16 size;
>>>> +	return (struct encoded_va *)((unsigned long)va | order |
>>>> +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
>>>>  #else
>>>> -	__u32 offset;
>>>> +	return (struct encoded_va *)((unsigned long)va |
>>>> +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
>>>> +#endif
>>>> +}
>>>> +
> 
> This is missing any and all protection of the example you cited. If I
> want to pass order as a 32b value I can and I can corrupt the virtual
> address. Same thing with pfmemalloc. Lets only hope you don't hit an
> architecture where a bool is a u8 in size as otherwise that shift is
> going to wipe out the value, and if it is a u32 which is usually the
> case lets hope they stick to the values of 0 and 1.

I explicitly checked that the protection is not really needed due to
performance consideration:
1. For the 'pfmemalloc' part, it does always stick to the values of 0
   and 1 as below:
https://elixir.bootlin.com/linux/v6.10-rc6/source/Documentation/process/coding-style.rst#L1053

2. For the 'order' part, its range can only be within 0~3.

> 
>>>> +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va)
>>>> +{
>>>> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>> +	return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va;
>>>> +#else
>>>> +	return 0;
>>>> +#endif
>>>> +}
>>>> +
>>>> +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va)
>>>> +{
>>>> +	return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va;
>>>> +}
>>>> +
>>>
>>> My advice is that if you just make encoded_va an unsigned long this
>>> just becomes some FIELD_GET and bit operations.
>>
>> As above.
>>
> 
> The code you mentioned had one extra block of bits that was in it and
> had strict protections on what went into and out of those bits. You
> don't have any of those protections.

As above, the protection masking/checking is explicitly avoided due
to performance consideration and reasons as above for encoded_va.

But I guess it doesn't hurt to have a VM_BUG_ON() checking to catch
possible future mistake.

> 
> I suggest you just use a long and don't bother storing this as a
> pointer.
> 

...

>>>> -
>>>> +	remaining = nc->remaining & align_mask;
>>>> +	remaining -= fragsz;
>>>> +	if (unlikely(remaining < 0)) {
>>>
>>> Now this is just getting confusing. You essentially just added an
>>> additional addition step and went back to the countdown approach I was
>>> using before except for the fact that you are starting at 0 whereas I
>>> was actually moving down through the page.
>>
>> Does the 'additional addition step' mean the additional step to calculate
>> the offset using the new 'remaining' field? I guess that is the disadvantage
>> by changing 'offset' to 'remaining', but it also some advantages too:
>>
>> 1. it is better to replace 'offset' with 'remaining', which
>>    is the remaining size for the cache in a 'page_frag_cache'
>>    instance, we are able to do a single 'fragsz > remaining'
>>    checking for the case of cache not being enough, which should be
>>    the fast path if we ensure size is zoro when 'va' == NULL by
>>    memset'ing 'struct page_frag_cache' in page_frag_cache_init()
>>    and page_frag_cache_drain().
>> 2. It seems more convenient to implement the commit/probe API too
>>    when using 'remaining' instead of 'offset' as those API needs
>>    the remaining size of the page_frag_cache anyway.
>>
>> So it is really a trade-off between using 'offset' and 'remaining',
>> it is like the similar argument about trade-off between allocating
>> fragment 'countdown' and 'countup' way.
>>
>> About confusing part, as the nc->remaining does mean how much cache
>> is left in the 'nc', and nc->remaining does start from
>> PAGE_FRAG_CACHE_MAX_SIZE/PAGE_SIZE to zero naturally if that was what
>> you meant by 'countdown', but it is different from the 'offset countdown'
>> before this patchset as my understanding.
>>
>>>
>>> What I would suggest doing since "remaining" is a negative offset
>>> anyway would be to look at just storing it as a signed negative number.
>>> At least with that you can keep to your original approach and would
>>> only have to change your check to be for "remaining + fragsz <= 0".
>>
>> Did you mean by using s16/s32 for 'remaining'? And set nc->remaining like
>> below?
>> nc->remaining = -PAGE_SIZE or
>> nc->remaining = -PAGE_FRAG_CACHE_MAX_SIZE
> 
> Yes. Basically if we used it as a signed value then you could just work
> your way up and do your aligned math still.

For the aligned math, I am not sure how can 'align_mask' be appiled to
a signed value yet. It seems that after masking nc->remaining leans
towards -PAGE_SIZE/-PAGE_FRAG_CACHE_MAX_SIZE instead of zero for
a unsigned value, for example:

nc->remaining = -4094;
nc->remaining &= -64;

It seems we got -4096 for above, is that what we are expecting?

> 
>> struct page_frag_cache {
>>         struct encoded_va *encoded_va;
>>
>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>>         u16 pagecnt_bias;
>>         s16 remaining;
>> #else
>>         u32 pagecnt_bias;
>>         s32 remaining;
>> #endif
>> };
>>
>> If I understand above correctly, it seems we really need a better name
>> than 'remaining' to reflect that.
> 
> It would effectively work like a countdown. Instead of T - X in this
> case it is size - X.
> 
>>> With that you can still do your math but it becomes an addition instead
>>> of a subtraction.
>>
>> And I am not really sure what is the gain here by using an addition
>> instead of a subtraction here.
>>
> 
> The "remaining" as it currently stands is doing the same thing so odds
> are it isn't too big a deal. It is just that the original code was
> already somewhat confusing and this is just making it that much more
> complex.
> 
>>>> +		page = virt_to_page(encoded_va);
>>>>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>>>>  			goto refill;
>>>>  
>>>> -		if (unlikely(nc->pfmemalloc)) {
>>>> -			free_unref_page(page, compound_order(page));
>>>> +		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>>>> +			VM_BUG_ON(compound_order(page) !=
>>>> +				  encoded_page_order(encoded_va));
>>>> +			free_unref_page(page, encoded_page_order(encoded_va));
>>>>  			goto refill;
>>>>  		}
>>>>  
>>>>  		/* OK, page count is 0, we can safely set it */
>>>>  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>>>>  
>>>> -		/* reset page count bias and offset to start of new frag */
>>>> +		/* reset page count bias and remaining of new frag */
>>>>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>> -		offset = 0;
>>>> -		if (unlikely(fragsz > PAGE_SIZE)) {
>>>> +		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
>>>> +		remaining -= fragsz;
>>>> +		if (unlikely(remaining < 0)) {
>>>>  			/*
>>>>  			 * The caller is trying to allocate a fragment
>>>>  			 * with fragsz > PAGE_SIZE but the cache isn't big
>>>
>>> I find it really amusing that you went to all the trouble of flipping
>>> the logic just to flip it back to being a countdown setup. If you were
>>> going to bother with all that then why not just make the remaining
>>> negative instead? You could save yourself a ton of trouble that way and
>>> all you would need to do is flip a few signs.
>>
>> I am not sure I understand the 'a ton of trouble' part here, if 'flipping
>> a few signs' does save a ton of trouble here, I would like to avoid 'a
>> ton of trouble' here, but I am not really understand the gain here yet as
>> mentioned above.
> 
> It isn't about flipping the signs. It is more the fact that the logic
> has now become even more complex then it originally was. With my work
> backwards approach the advantage was that the offset could be updated
> and then we just recorded everything and reported it up. Now we have to

I am supposing the above is referring to 'offset countdown' way
before this patchset, right?

> keep a temporary "remaining" value, generate our virtual address and
> store that to a temp variable, we can record the new "remaining" value,
> and then we can report the virtual address. If we get the ordering on

Yes, I noticed it when coding too, that is why current virtual address is
generated in page_frag_cache_current_va() basing on nc->remaining instead
of the local variable 'remaining' before assigning the local variable
'remaining' to nc->remaining. But I am not sure I understand how using a
signed negative number for 'remaining' will change the above steps. If
not, I still fail to see the gain of using a signed negative number for
'nc->remaining'.

> the steps 2 and 3 in this it will cause issues as we will be pointing
> to the wrong values. It is something I can see someone easily messing
> up.

Yes, agreed. It would be good to be more specific about how to avoid
the above problem using a signed negative number for 'remaining' as
I am not sure how it can be done yet.

> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 07/13] mm: page_frag: some minor refactoring before adding new API
  2024-07-02 15:30   ` Alexander H Duyck
@ 2024-07-03 12:36     ` Yunsheng Lin
  0 siblings, 0 replies; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-03 12:36 UTC (permalink / raw)
  To: Alexander H Duyck, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On 2024/7/2 23:30, Alexander H Duyck wrote:
> On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
>> Refactor common codes from __page_frag_alloc_va_align()
>> to __page_frag_cache_refill(), so that the new API can
>> make use of them.
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> 
> I am generally not a fan of the concept behind this patch. I really
> think we should keep the page_frag_cache_refill function to just
> allocating the page, or in this case the encoded_va and populating only
> that portion of the struct.

As my understanding, the above mainly depends on how you look at it,
at least it seems odd to me that part of a struct is populated in
one function and other part of the same struct is populated in other
function.

> 
>> ---
>>  mm/page_frag_cache.c | 61 ++++++++++++++++++++++----------------------
>>  1 file changed, 31 insertions(+), 30 deletions(-)
>>
>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index a3316dd50eff..4fd421d4f22c 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -29,10 +29,36 @@ static void *page_frag_cache_current_va(struct page_frag_cache *nc)
>>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>  					     gfp_t gfp_mask)
>>  {
>> -	struct page *page = NULL;
>> +	struct encoded_va *encoded_va = nc->encoded_va;
>>  	gfp_t gfp = gfp_mask;
>>  	unsigned int order;
>> +	struct page *page;
>> +
>> +	if (unlikely(!encoded_va))
>> +		goto alloc;
>> +
>> +	page = virt_to_page(encoded_va);
>> +	if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>> +		goto alloc;
>> +
>> +	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>> +		VM_BUG_ON(compound_order(page) !=
>> +			  encoded_page_order(encoded_va));
>> +		free_unref_page(page, encoded_page_order(encoded_va));
>> +		goto alloc;
>> +	}
>> +
>> +	/* OK, page count is 0, we can safely set it */
>> +	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> 
> Why not just make this block of code a function onto itself? You put an
> if statement at the top that essentially is just merging two functions
> into one. Perhaps this logic could be __page_frag_cache_recharge which
> would return an error if the page is busy or the wrong type. Then
> acting on that you could switch to the refill attempt.
> 
> Also thinking about it more the set_page_count in this function and
> page_ref_add in the other can probably be merged into the recharge and
> refill functions since they are acting directly on the encoded page and
> not interacting with the other parts of the page_frag_cache.

So we are agreed that the below is merged into __page_frag_cache_recharge()?
set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;

The below is merged into __page_frag_cache_refill()?
page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;

> 
>> +
>> +	/* reset page count bias and remaining of new frag */
>> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> +	nc->remaining = page_frag_cache_page_size(encoded_va);
> 
> These two parts are more or less agnostic to the setup and could be
> applied to refill or recharge. Also one thought occurs to me. You were
> encoding "order" into the encoded VA. Why use that when your choices
> are either PAGE_FRAG_CACHE_MAX_SIZE or PAGE_SIZE. It should be a single
> bit and doesn't need to be a fully byte to store that. That would allow
> you to reduce this down to just 2 bits, one for pfmemalloc and one for
> max order vs order 0.

I thought about the above and implemented it actually, but it turned out
that it was not as good as encoding "order" into the encoded VA, at least
for the generated asm code size, it didn't seem better. Using one bit,
we need a checking to decide it is PAGE_FRAG_CACHE_MAX_SIZE or PAGE_SIZE.
And we can use that to replace compound_order(page) when calling
free_unref_page() too.

> 
>> +
>> +	return page;


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
  2024-06-30 15:05             ` Yunsheng Lin
@ 2024-07-03 12:40               ` Yunsheng Lin
  2024-07-07 17:12                 ` Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-03 12:40 UTC (permalink / raw)
  To: Yunsheng Lin, Alexander Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/6/30 23:05, Yunsheng Lin wrote:
> On 6/30/2024 10:35 PM, Alexander Duyck wrote:
>> On Sun, Jun 30, 2024 at 7:05 AM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
>>>
>>> On 6/30/2024 1:37 AM, Alexander Duyck wrote:
>>>> On Sat, Jun 29, 2024 at 4:15 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>>
>>> ...
>>>
>>>>>>
>>>>>> Why is this a macro instead of just being an inline? Are you trying to
>>>>>> avoid having to include a header due to the virt_to_page?
>>>>>
>>>>> Yes, you are right.
>>
>> ...
>>
>>>> I am pretty sure you just need to add:
>>>> #include <asm/page.h>
>>>
>>> I am supposing you mean adding the above to page_frag_cache.h, right?
>>>
>>> It seems thing is more complicated for SPARSEMEM_VMEMMAP case, as it
>>> needs the declaration of 'vmemmap'(some arch defines it as a pointer
>>> variable while some arch defines it as a macro) and the definition of
>>> 'struct page' for '(vmemmap + (pfn))' operation.
>>>
>>> Adding below for 'vmemmap' and 'struct page' seems to have some compiler
>>> error caused by interdependence between linux/mm_types.h and asm/pgtable.h:
>>> #include <asm/pgtable.h>
>>> #include <linux/mm_types.h>
>>>
>>
>> Maybe you should just include linux/mm.h as that should have all the
>> necessary includes to handle these cases. In any case though it
> 
> Including linux/mm.h seems to have similar compiler error, just the
> interdependence is between linux/mm_types.h and linux/mm.h now.

How about splitting page_frag_cache.h into page_frag_types.h and
page_frag_cache.h mirroring the above linux/mm_types.h and linux/mm.h
to fix the compiler error?

> 
> As below, linux/mmap_lock.h obviously need the definition of
> 'struct mm_struct' from linux/mm_types.h, and linux/mm_types.h
> has some a long dependency of linux/mm.h starting from
> linux/uprobes.h if we add '#include <linux/mm.h>' in linux/page_frag_cache.h:
> 
> In file included from ./include/linux/mm.h:16,
>                  from ./include/linux/page_frag_cache.h:6,
>                  from ./include/linux/sched.h:49,
>                  from ./include/linux/percpu.h:13,
>                  from ./arch/x86/include/asm/msr.h:15,
>                  from ./arch/x86/include/asm/tsc.h:10,
>                  from ./arch/x86/include/asm/timex.h:6,
>                  from ./include/linux/timex.h:67,
>                  from ./include/linux/time32.h:13,
>                  from ./include/linux/time.h:60,
>                  from ./include/linux/jiffies.h:10,
>                  from ./include/linux/ktime.h:25,
>                  from ./include/linux/timer.h:6,
>                  from ./include/linux/workqueue.h:9,
>                  from ./include/linux/srcu.h:21,
>                  from ./include/linux/notifier.h:16,
>                  from ./arch/x86/include/asm/uprobes.h:13,
>                  from ./include/linux/uprobes.h:49,
>                  from ./include/linux/mm_types.h:16,
>                  from ./include/linux/mmzone.h:22,
>                  from ./include/linux/gfp.h:7,
>                  from ./include/linux/slab.h:16,
>                  from ./include/linux/crypto.h:17,
>                  from arch/x86/kernel/asm-offsets.c:9:
> ./include/linux/mmap_lock.h: In function ‘mmap_assert_locked’:
> ./include/linux/mmap_lock.h:65:30: error: invalid use of undefined type ‘const struct mm_struct’
>    65 |         rwsem_assert_held(&mm->mmap_lock);
>       |                              ^~
> 
>> doesn't make any sense to have a define in one include that expects
>> the user to then figure out what other headers to include in order to
>> make the define work they should be included in the header itself to
>> avoid any sort of weird dependencies.
> 
> Perhaps there are some season why there are two headers for the mm subsystem, linux/mm_types.h and linux/mm.h?
> And .h file is supposed to include the linux/mm_types.h while .c file
> is supposed to include the linux/mm.h?
> If the above is correct, it seems the above rule is broked by including linux/mm.h in linux/page_frag_cache.h.
> .
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
  2024-07-03 12:40               ` Yunsheng Lin
@ 2024-07-07 17:12                 ` Alexander Duyck
  2024-07-08 10:58                   ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2024-07-07 17:12 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: Yunsheng Lin, davem, kuba, pabeni, netdev, linux-kernel,
	Andrew Morton, linux-mm

On Wed, Jul 3, 2024 at 5:40 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/6/30 23:05, Yunsheng Lin wrote:
> > On 6/30/2024 10:35 PM, Alexander Duyck wrote:
> >> On Sun, Jun 30, 2024 at 7:05 AM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
> >>>
> >>> On 6/30/2024 1:37 AM, Alexander Duyck wrote:
> >>>> On Sat, Jun 29, 2024 at 4:15 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>>
> >>> ...
> >>>
> >>>>>>
> >>>>>> Why is this a macro instead of just being an inline? Are you trying to
> >>>>>> avoid having to include a header due to the virt_to_page?
> >>>>>
> >>>>> Yes, you are right.
> >>
> >> ...
> >>
> >>>> I am pretty sure you just need to add:
> >>>> #include <asm/page.h>
> >>>
> >>> I am supposing you mean adding the above to page_frag_cache.h, right?
> >>>
> >>> It seems thing is more complicated for SPARSEMEM_VMEMMAP case, as it
> >>> needs the declaration of 'vmemmap'(some arch defines it as a pointer
> >>> variable while some arch defines it as a macro) and the definition of
> >>> 'struct page' for '(vmemmap + (pfn))' operation.
> >>>
> >>> Adding below for 'vmemmap' and 'struct page' seems to have some compiler
> >>> error caused by interdependence between linux/mm_types.h and asm/pgtable.h:
> >>> #include <asm/pgtable.h>
> >>> #include <linux/mm_types.h>
> >>>
> >>
> >> Maybe you should just include linux/mm.h as that should have all the
> >> necessary includes to handle these cases. In any case though it
> >
> > Including linux/mm.h seems to have similar compiler error, just the
> > interdependence is between linux/mm_types.h and linux/mm.h now.
>
> How about splitting page_frag_cache.h into page_frag_types.h and
> page_frag_cache.h mirroring the above linux/mm_types.h and linux/mm.h
> to fix the compiler error?
>
> >
> > As below, linux/mmap_lock.h obviously need the definition of
> > 'struct mm_struct' from linux/mm_types.h, and linux/mm_types.h
> > has some a long dependency of linux/mm.h starting from
> > linux/uprobes.h if we add '#include <linux/mm.h>' in linux/page_frag_cache.h:
> >
> > In file included from ./include/linux/mm.h:16,
> >                  from ./include/linux/page_frag_cache.h:6,
> >                  from ./include/linux/sched.h:49,
> >                  from ./include/linux/percpu.h:13,
> >                  from ./arch/x86/include/asm/msr.h:15,
> >                  from ./arch/x86/include/asm/tsc.h:10,
> >                  from ./arch/x86/include/asm/timex.h:6,
> >                  from ./include/linux/timex.h:67,
> >                  from ./include/linux/time32.h:13,
> >                  from ./include/linux/time.h:60,
> >                  from ./include/linux/jiffies.h:10,
> >                  from ./include/linux/ktime.h:25,
> >                  from ./include/linux/timer.h:6,
> >                  from ./include/linux/workqueue.h:9,
> >                  from ./include/linux/srcu.h:21,
> >                  from ./include/linux/notifier.h:16,
> >                  from ./arch/x86/include/asm/uprobes.h:13,
> >                  from ./include/linux/uprobes.h:49,
> >                  from ./include/linux/mm_types.h:16,
> >                  from ./include/linux/mmzone.h:22,
> >                  from ./include/linux/gfp.h:7,
> >                  from ./include/linux/slab.h:16,
> >                  from ./include/linux/crypto.h:17,
> >                  from arch/x86/kernel/asm-offsets.c:9:
> > ./include/linux/mmap_lock.h: In function ‘mmap_assert_locked’:
> > ./include/linux/mmap_lock.h:65:30: error: invalid use of undefined type ‘const struct mm_struct’
> >    65 |         rwsem_assert_held(&mm->mmap_lock);
> >       |                              ^~
> >
> >> doesn't make any sense to have a define in one include that expects
> >> the user to then figure out what other headers to include in order to
> >> make the define work they should be included in the header itself to
> >> avoid any sort of weird dependencies.
> >
> > Perhaps there are some season why there are two headers for the mm subsystem, linux/mm_types.h and linux/mm.h?
> > And .h file is supposed to include the linux/mm_types.h while .c file
> > is supposed to include the linux/mm.h?
> > If the above is correct, it seems the above rule is broked by including linux/mm.h in linux/page_frag_cache.h.
> > .

The issue is the dependency mess that has been created with patch 11
in the set. Again you are conflating patches which makes this really
hard to debug or discuss as I make suggestions on one patch and you
claim it breaks things that are really due to issues in another patch.
So the issue is you included this header into include/linux/sched.h
which is included in linux/mm_types.h. So what happens then is that
you have to include page_frag_cache.h *before* you can include the
bits from mm_types.h

What might make more sense to solve this is to look at just moving the
page_frag_cache into mm_types_task.h and then having it replace the
page_frag struct there since mm_types.h will pull that in anyway. That
way sched.h can avoid having to pull in page_frag_cache.h.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
  2024-07-07 17:12                 ` Alexander Duyck
@ 2024-07-08 10:58                   ` Yunsheng Lin
  2024-07-08 14:30                     ` Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-08 10:58 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Yunsheng Lin, davem, kuba, pabeni, netdev, linux-kernel,
	Andrew Morton, linux-mm

On 2024/7/8 1:12, Alexander Duyck wrote:

...

> The issue is the dependency mess that has been created with patch 11
> in the set. Again you are conflating patches which makes this really
> hard to debug or discuss as I make suggestions on one patch and you
> claim it breaks things that are really due to issues in another patch.
> So the issue is you included this header into include/linux/sched.h
> which is included in linux/mm_types.h. So what happens then is that
> you have to include page_frag_cache.h *before* you can include the
> bits from mm_types.h
> 
> What might make more sense to solve this is to look at just moving the
> page_frag_cache into mm_types_task.h and then having it replace the
> page_frag struct there since mm_types.h will pull that in anyway. That
> way sched.h can avoid having to pull in page_frag_cache.h.

It seems the above didn't work either, as asm-offsets.c does depend on
mm_types_task.h too.

In file included from ./include/linux/mm.h:16,
                 from ./include/linux/page_frag_cache.h:10,
                 from ./include/linux/mm_types_task.h:11,
                 from ./include/linux/mm_types.h:5,
                 from ./include/linux/mmzone.h:22,
                 from ./include/linux/gfp.h:7,
                 from ./include/linux/slab.h:16,
                 from ./include/linux/resource_ext.h:11,
                 from ./include/linux/acpi.h:13,
                 from ./include/acpi/apei.h:9,
                 from ./include/acpi/ghes.h:5,
                 from ./include/linux/arm_sdei.h:8,
                 from arch/arm64/kernel/asm-offsets.c:10:
./include/linux/mmap_lock.h: In function ‘mmap_assert_locked’:
./include/linux/mmap_lock.h:65:23: error: invalid use of undefined type ‘const struct mm_struct’
   65 |  rwsem_assert_held(&mm->mmap_lock);


> .
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
  2024-07-08 10:58                   ` Yunsheng Lin
@ 2024-07-08 14:30                     ` Alexander Duyck
  2024-07-09  6:57                       ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2024-07-08 14:30 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: Yunsheng Lin, davem, kuba, pabeni, netdev, linux-kernel,
	Andrew Morton, linux-mm

On Mon, Jul 8, 2024 at 3:58 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/7/8 1:12, Alexander Duyck wrote:
>
> ...
>
> > The issue is the dependency mess that has been created with patch 11
> > in the set. Again you are conflating patches which makes this really
> > hard to debug or discuss as I make suggestions on one patch and you
> > claim it breaks things that are really due to issues in another patch.
> > So the issue is you included this header into include/linux/sched.h
> > which is included in linux/mm_types.h. So what happens then is that
> > you have to include page_frag_cache.h *before* you can include the
> > bits from mm_types.h
> >
> > What might make more sense to solve this is to look at just moving the
> > page_frag_cache into mm_types_task.h and then having it replace the
> > page_frag struct there since mm_types.h will pull that in anyway. That
> > way sched.h can avoid having to pull in page_frag_cache.h.
>
> It seems the above didn't work either, as asm-offsets.c does depend on
> mm_types_task.h too.
>
> In file included from ./include/linux/mm.h:16,
>                  from ./include/linux/page_frag_cache.h:10,
>                  from ./include/linux/mm_types_task.h:11,
>                  from ./include/linux/mm_types.h:5,
>                  from ./include/linux/mmzone.h:22,
>                  from ./include/linux/gfp.h:7,
>                  from ./include/linux/slab.h:16,
>                  from ./include/linux/resource_ext.h:11,
>                  from ./include/linux/acpi.h:13,
>                  from ./include/acpi/apei.h:9,
>                  from ./include/acpi/ghes.h:5,
>                  from ./include/linux/arm_sdei.h:8,
>                  from arch/arm64/kernel/asm-offsets.c:10:
> ./include/linux/mmap_lock.h: In function ‘mmap_assert_locked’:
> ./include/linux/mmap_lock.h:65:23: error: invalid use of undefined type ‘const struct mm_struct’
>    65 |  rwsem_assert_held(&mm->mmap_lock);

Do not include page_frag_cache.h in mm_types_task.h. Just move the
struct page_frag_cache there to replace struct page_frag.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
  2024-07-08 14:30                     ` Alexander Duyck
@ 2024-07-09  6:57                       ` Yunsheng Lin
  2024-07-09 13:40                         ` Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-09  6:57 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Yunsheng Lin, davem, kuba, pabeni, netdev, linux-kernel,
	Andrew Morton, linux-mm

On 2024/7/8 22:30, Alexander Duyck wrote:
> On Mon, Jul 8, 2024 at 3:58 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/7/8 1:12, Alexander Duyck wrote:
>>
>> ...
>>
>>> The issue is the dependency mess that has been created with patch 11
>>> in the set. Again you are conflating patches which makes this really
>>> hard to debug or discuss as I make suggestions on one patch and you
>>> claim it breaks things that are really due to issues in another patch.
>>> So the issue is you included this header into include/linux/sched.h
>>> which is included in linux/mm_types.h. So what happens then is that
>>> you have to include page_frag_cache.h *before* you can include the
>>> bits from mm_types.h
>>>
>>> What might make more sense to solve this is to look at just moving the
>>> page_frag_cache into mm_types_task.h and then having it replace the
>>> page_frag struct there since mm_types.h will pull that in anyway. That
>>> way sched.h can avoid having to pull in page_frag_cache.h.
>>
>> It seems the above didn't work either, as asm-offsets.c does depend on
>> mm_types_task.h too.
>>
>> In file included from ./include/linux/mm.h:16,
>>                  from ./include/linux/page_frag_cache.h:10,
>>                  from ./include/linux/mm_types_task.h:11,
>>                  from ./include/linux/mm_types.h:5,
>>                  from ./include/linux/mmzone.h:22,
>>                  from ./include/linux/gfp.h:7,
>>                  from ./include/linux/slab.h:16,
>>                  from ./include/linux/resource_ext.h:11,
>>                  from ./include/linux/acpi.h:13,
>>                  from ./include/acpi/apei.h:9,
>>                  from ./include/acpi/ghes.h:5,
>>                  from ./include/linux/arm_sdei.h:8,
>>                  from arch/arm64/kernel/asm-offsets.c:10:
>> ./include/linux/mmap_lock.h: In function ‘mmap_assert_locked’:
>> ./include/linux/mmap_lock.h:65:23: error: invalid use of undefined type ‘const struct mm_struct’
>>    65 |  rwsem_assert_held(&mm->mmap_lock);
> 
> Do not include page_frag_cache.h in mm_types_task.h. Just move the
> struct page_frag_cache there to replace struct page_frag.

The above does seem a clever idea, but doesn't doing above also seem to
defeat some purpose of patch 1? Anyway, it seems workable for trying
to avoid adding a new header for a single struct.

About the 'replace' part, as mentioned in [1], the 'struct page_frag'
is still needed as this patchset is large enough that replacing is only
done for sk_page_frag(), there are still other places using page_frag
that can be replaced by page_frag_cache in the following patchset.

1. https://lore.kernel.org/all/b200a609-2f30-ec37-39b6-f37ed8092f41@huawei.com/

> .
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API
  2024-07-09  6:57                       ` Yunsheng Lin
@ 2024-07-09 13:40                         ` Alexander Duyck
  0 siblings, 0 replies; 43+ messages in thread
From: Alexander Duyck @ 2024-07-09 13:40 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: Yunsheng Lin, davem, kuba, pabeni, netdev, linux-kernel,
	Andrew Morton, linux-mm

On Mon, Jul 8, 2024 at 11:58 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/7/8 22:30, Alexander Duyck wrote:
> > On Mon, Jul 8, 2024 at 3:58 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>
> >> On 2024/7/8 1:12, Alexander Duyck wrote:
> >>
> >> ...
> >>
> >>> The issue is the dependency mess that has been created with patch 11
> >>> in the set. Again you are conflating patches which makes this really
> >>> hard to debug or discuss as I make suggestions on one patch and you
> >>> claim it breaks things that are really due to issues in another patch.
> >>> So the issue is you included this header into include/linux/sched.h
> >>> which is included in linux/mm_types.h. So what happens then is that
> >>> you have to include page_frag_cache.h *before* you can include the
> >>> bits from mm_types.h
> >>>
> >>> What might make more sense to solve this is to look at just moving the
> >>> page_frag_cache into mm_types_task.h and then having it replace the
> >>> page_frag struct there since mm_types.h will pull that in anyway. That
> >>> way sched.h can avoid having to pull in page_frag_cache.h.
> >>
> >> It seems the above didn't work either, as asm-offsets.c does depend on
> >> mm_types_task.h too.
> >>
> >> In file included from ./include/linux/mm.h:16,
> >>                  from ./include/linux/page_frag_cache.h:10,
> >>                  from ./include/linux/mm_types_task.h:11,
> >>                  from ./include/linux/mm_types.h:5,
> >>                  from ./include/linux/mmzone.h:22,
> >>                  from ./include/linux/gfp.h:7,
> >>                  from ./include/linux/slab.h:16,
> >>                  from ./include/linux/resource_ext.h:11,
> >>                  from ./include/linux/acpi.h:13,
> >>                  from ./include/acpi/apei.h:9,
> >>                  from ./include/acpi/ghes.h:5,
> >>                  from ./include/linux/arm_sdei.h:8,
> >>                  from arch/arm64/kernel/asm-offsets.c:10:
> >> ./include/linux/mmap_lock.h: In function ‘mmap_assert_locked’:
> >> ./include/linux/mmap_lock.h:65:23: error: invalid use of undefined type ‘const struct mm_struct’
> >>    65 |  rwsem_assert_held(&mm->mmap_lock);
> >
> > Do not include page_frag_cache.h in mm_types_task.h. Just move the
> > struct page_frag_cache there to replace struct page_frag.
>
> The above does seem a clever idea, but doesn't doing above also seem to
> defeat some purpose of patch 1? Anyway, it seems workable for trying
> to avoid adding a new header for a single struct.
>
> About the 'replace' part, as mentioned in [1], the 'struct page_frag'
> is still needed as this patchset is large enough that replacing is only
> done for sk_page_frag(), there are still other places using page_frag
> that can be replaced by page_frag_cache in the following patchset.
>
> 1. https://lore.kernel.org/all/b200a609-2f30-ec37-39b6-f37ed8092f41@huawei.com/

The point is you need to avoid pulling mm.h into sched.h. To do that
you have to pull the data structure out and place it in a different
header file. So maybe instead of creating yet another header file you
can just place the structure in mm_types_task.h and once you have
dealt with all of the other users you can finally drop the page_frag
structure.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-03 12:33         ` Yunsheng Lin
@ 2024-07-10 15:28           ` Alexander H Duyck
  2024-07-11  8:16             ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2024-07-10 15:28 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On Wed, 2024-07-03 at 20:33 +0800, Yunsheng Lin wrote:
> On 2024/7/2 22:55, Alexander H Duyck wrote:
> 
> ...
> 
> > 
> > > > 
> > > > > +#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
> > > > > +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
> > > > > +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
> > > > > +
> > > > > +static inline struct encoded_va *encode_aligned_va(void *va,
> > > > > +						   unsigned int order,
> > > > > +						   bool pfmemalloc)
> > > > > +{
> > > > >  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > > > > -	__u16 offset;
> > > > > -	__u16 size;
> > > > > +	return (struct encoded_va *)((unsigned long)va | order |
> > > > > +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
> > > > >  #else
> > > > > -	__u32 offset;
> > > > > +	return (struct encoded_va *)((unsigned long)va |
> > > > > +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
> > > > > +#endif
> > > > > +}
> > > > > +
> > 
> > This is missing any and all protection of the example you cited. If I
> > want to pass order as a 32b value I can and I can corrupt the virtual
> > address. Same thing with pfmemalloc. Lets only hope you don't hit an
> > architecture where a bool is a u8 in size as otherwise that shift is
> > going to wipe out the value, and if it is a u32 which is usually the
> > case lets hope they stick to the values of 0 and 1.
> 
> I explicitly checked that the protection is not really needed due to
> performance consideration:
> 1. For the 'pfmemalloc' part, it does always stick to the values of 0
>    and 1 as below:
> https://elixir.bootlin.com/linux/v6.10-rc6/source/Documentation/process/coding-style.rst#L1053
> 
> 2. For the 'order' part, its range can only be within 0~3.
> 
> > 
> > > > > +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va)
> > > > > +{
> > > > > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > > > > +	return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va;
> > > > > +#else
> > > > > +	return 0;
> > > > > +#endif
> > > > > +}
> > > > > +
> > > > > +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va)
> > > > > +{
> > > > > +	return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va;
> > > > > +}
> > > > > +
> > > > 
> > > > My advice is that if you just make encoded_va an unsigned long this
> > > > just becomes some FIELD_GET and bit operations.
> > > 
> > > As above.
> > > 
> > 
> > The code you mentioned had one extra block of bits that was in it and
> > had strict protections on what went into and out of those bits. You
> > don't have any of those protections.
> 
> As above, the protection masking/checking is explicitly avoided due
> to performance consideration and reasons as above for encoded_va.
> 
> But I guess it doesn't hurt to have a VM_BUG_ON() checking to catch
> possible future mistake.
> 
> > 
> > I suggest you just use a long and don't bother storing this as a
> > pointer.
> > 
> 
> ...
> 
> > > > > -
> > > > > +	remaining = nc->remaining & align_mask;
> > > > > +	remaining -= fragsz;
> > > > > +	if (unlikely(remaining < 0)) {
> > > > 
> > > > Now this is just getting confusing. You essentially just added an
> > > > additional addition step and went back to the countdown approach I was
> > > > using before except for the fact that you are starting at 0 whereas I
> > > > was actually moving down through the page.
> > > 
> > > Does the 'additional addition step' mean the additional step to calculate
> > > the offset using the new 'remaining' field? I guess that is the disadvantage
> > > by changing 'offset' to 'remaining', but it also some advantages too:
> > > 
> > > 1. it is better to replace 'offset' with 'remaining', which
> > >    is the remaining size for the cache in a 'page_frag_cache'
> > >    instance, we are able to do a single 'fragsz > remaining'
> > >    checking for the case of cache not being enough, which should be
> > >    the fast path if we ensure size is zoro when 'va' == NULL by
> > >    memset'ing 'struct page_frag_cache' in page_frag_cache_init()
> > >    and page_frag_cache_drain().
> > > 2. It seems more convenient to implement the commit/probe API too
> > >    when using 'remaining' instead of 'offset' as those API needs
> > >    the remaining size of the page_frag_cache anyway.
> > > 
> > > So it is really a trade-off between using 'offset' and 'remaining',
> > > it is like the similar argument about trade-off between allocating
> > > fragment 'countdown' and 'countup' way.
> > > 
> > > About confusing part, as the nc->remaining does mean how much cache
> > > is left in the 'nc', and nc->remaining does start from
> > > PAGE_FRAG_CACHE_MAX_SIZE/PAGE_SIZE to zero naturally if that was what
> > > you meant by 'countdown', but it is different from the 'offset countdown'
> > > before this patchset as my understanding.
> > > 
> > > > 
> > > > What I would suggest doing since "remaining" is a negative offset
> > > > anyway would be to look at just storing it as a signed negative number.
> > > > At least with that you can keep to your original approach and would
> > > > only have to change your check to be for "remaining + fragsz <= 0".
> > > 
> > > Did you mean by using s16/s32 for 'remaining'? And set nc->remaining like
> > > below?
> > > nc->remaining = -PAGE_SIZE or
> > > nc->remaining = -PAGE_FRAG_CACHE_MAX_SIZE
> > 
> > Yes. Basically if we used it as a signed value then you could just work
> > your way up and do your aligned math still.
> 
> For the aligned math, I am not sure how can 'align_mask' be appiled to
> a signed value yet. It seems that after masking nc->remaining leans
> towards -PAGE_SIZE/-PAGE_FRAG_CACHE_MAX_SIZE instead of zero for
> a unsigned value, for example:
> 
> nc->remaining = -4094;
> nc->remaining &= -64;
> 
> It seems we got -4096 for above, is that what we are expecting?

No, you have to do an addition and then the mask like you were before
using __ALIGN_KERNEL.

As it stands I realized your alignment approach in this patch is
broken. You should be aligning the remaining at the start of this
function and then storing it before you call
page_frag_cache_current_va. Instead you do it after so the start of
your block may not be aligned to the requested mask if you have
multiple callers sharing this function or if you are passing an
unaligned size in the request.

> > 
> > > struct page_frag_cache {
> > >         struct encoded_va *encoded_va;
> > > 
> > > #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
> > >         u16 pagecnt_bias;
> > >         s16 remaining;
> > > #else
> > >         u32 pagecnt_bias;
> > >         s32 remaining;
> > > #endif
> > > };
> > > 
> > > If I understand above correctly, it seems we really need a better name
> > > than 'remaining' to reflect that.
> > 
> > It would effectively work like a countdown. Instead of T - X in this
> > case it is size - X.
> > 
> > > > With that you can still do your math but it becomes an addition instead
> > > > of a subtraction.
> > > 
> > > And I am not really sure what is the gain here by using an addition
> > > instead of a subtraction here.
> > > 
> > 
> > The "remaining" as it currently stands is doing the same thing so odds
> > are it isn't too big a deal. It is just that the original code was
> > already somewhat confusing and this is just making it that much more
> > complex.
> > 
> > > > > +		page = virt_to_page(encoded_va);
> > > > >  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> > > > >  			goto refill;
> > > > >  
> > > > > -		if (unlikely(nc->pfmemalloc)) {
> > > > > -			free_unref_page(page, compound_order(page));
> > > > > +		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> > > > > +			VM_BUG_ON(compound_order(page) !=
> > > > > +				  encoded_page_order(encoded_va));
> > > > > +			free_unref_page(page, encoded_page_order(encoded_va));
> > > > >  			goto refill;
> > > > >  		}
> > > > >  
> > > > >  		/* OK, page count is 0, we can safely set it */
> > > > >  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> > > > >  
> > > > > -		/* reset page count bias and offset to start of new frag */
> > > > > +		/* reset page count bias and remaining of new frag */
> > > > >  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> > > > > -		offset = 0;
> > > > > -		if (unlikely(fragsz > PAGE_SIZE)) {
> > > > > +		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
> > > > > +		remaining -= fragsz;
> > > > > +		if (unlikely(remaining < 0)) {
> > > > >  			/*
> > > > >  			 * The caller is trying to allocate a fragment
> > > > >  			 * with fragsz > PAGE_SIZE but the cache isn't big
> > > > 
> > > > I find it really amusing that you went to all the trouble of flipping
> > > > the logic just to flip it back to being a countdown setup. If you were
> > > > going to bother with all that then why not just make the remaining
> > > > negative instead? You could save yourself a ton of trouble that way and
> > > > all you would need to do is flip a few signs.
> > > 
> > > I am not sure I understand the 'a ton of trouble' part here, if 'flipping
> > > a few signs' does save a ton of trouble here, I would like to avoid 'a
> > > ton of trouble' here, but I am not really understand the gain here yet as
> > > mentioned above.
> > 
> > It isn't about flipping the signs. It is more the fact that the logic
> > has now become even more complex then it originally was. With my work
> > backwards approach the advantage was that the offset could be updated
> > and then we just recorded everything and reported it up. Now we have to
> 
> I am supposing the above is referring to 'offset countdown' way
> before this patchset, right?
> 
> > keep a temporary "remaining" value, generate our virtual address and
> > store that to a temp variable, we can record the new "remaining" value,
> > and then we can report the virtual address. If we get the ordering on
> 
> Yes, I noticed it when coding too, that is why current virtual address is
> generated in page_frag_cache_current_va() basing on nc->remaining instead
> of the local variable 'remaining' before assigning the local variable
> 'remaining' to nc->remaining. But I am not sure I understand how using a
> signed negative number for 'remaining' will change the above steps. If
> not, I still fail to see the gain of using a signed negative number for
> 'nc->remaining'.
> 
> > the steps 2 and 3 in this it will cause issues as we will be pointing
> > to the wrong values. It is something I can see someone easily messing
> > up.
> 
> Yes, agreed. It would be good to be more specific about how to avoid
> the above problem using a signed negative number for 'remaining' as
> I am not sure how it can be done yet.
> 

My advice would be to go back to patch 3 and figure out how to do this
re-ordering without changing the alignment behaviour. The old code
essentially aligned both the offset and fragsz by combining the two and
then doing the alignment. Since you are doing a count up setup you will
need to align the remaining, then add fragsz, and then I guess you
could store remaining and then subtract fragsz from your final virtual
address to get back to where the starting offset is actually located.

Basically your "remaining" value isn't a safe number to use for an
offset since it isn't aligned to your starting value at any point.

As far as the negative value, it is more about making it easier to keep
track of what is actually going on. Basically we can use regular
pointer math and as such I suspect the compiler is having to do extra
instructions to flip your value negative before it can combine the
values via something like the LEA (load effective address) assembler
call.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-10 15:28           ` Alexander H Duyck
@ 2024-07-11  8:16             ` Yunsheng Lin
  2024-07-11 16:49               ` Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-11  8:16 UTC (permalink / raw)
  To: Alexander H Duyck, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On 2024/7/10 23:28, Alexander H Duyck wrote:

...

>>>>
>>>>>
>>>>> What I would suggest doing since "remaining" is a negative offset
>>>>> anyway would be to look at just storing it as a signed negative number.
>>>>> At least with that you can keep to your original approach and would
>>>>> only have to change your check to be for "remaining + fragsz <= 0".
>>>>
>>>> Did you mean by using s16/s32 for 'remaining'? And set nc->remaining like
>>>> below?
>>>> nc->remaining = -PAGE_SIZE or
>>>> nc->remaining = -PAGE_FRAG_CACHE_MAX_SIZE
>>>
>>> Yes. Basically if we used it as a signed value then you could just work
>>> your way up and do your aligned math still.
>>
>> For the aligned math, I am not sure how can 'align_mask' be appiled to
>> a signed value yet. It seems that after masking nc->remaining leans
>> towards -PAGE_SIZE/-PAGE_FRAG_CACHE_MAX_SIZE instead of zero for
>> a unsigned value, for example:
>>
>> nc->remaining = -4094;
>> nc->remaining &= -64;
>>
>> It seems we got -4096 for above, is that what we are expecting?
> 
> No, you have to do an addition and then the mask like you were before
> using __ALIGN_KERNEL.
> 
> As it stands I realized your alignment approach in this patch is
> broken. You should be aligning the remaining at the start of this
> function and then storing it before you call
> page_frag_cache_current_va. Instead you do it after so the start of
> your block may not be aligned to the requested mask if you have
> multiple callers sharing this function or if you are passing an
> unaligned size in the request.

Yes, you seems right about it, the intention is to do the below as patch
3 does in v10:
aligned_remaining = nc->remaining & align_mask;
remaining = aligned_remaining - fragsz;
nc->remaining = remaining;
return encoded_page_address(nc->encoded_va) + (size - aligned_remaining);

> 
>>>
>>>> struct page_frag_cache {
>>>>         struct encoded_va *encoded_va;
>>>>
>>>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>>>>         u16 pagecnt_bias;
>>>>         s16 remaining;
>>>> #else
>>>>         u32 pagecnt_bias;
>>>>         s32 remaining;
>>>> #endif
>>>> };
>>>>
>>>> If I understand above correctly, it seems we really need a better name
>>>> than 'remaining' to reflect that.
>>>
>>> It would effectively work like a countdown. Instead of T - X in this
>>> case it is size - X.
>>>
>>>>> With that you can still do your math but it becomes an addition instead
>>>>> of a subtraction.
>>>>
>>>> And I am not really sure what is the gain here by using an addition
>>>> instead of a subtraction here.
>>>>
>>>
>>> The "remaining" as it currently stands is doing the same thing so odds
>>> are it isn't too big a deal. It is just that the original code was
>>> already somewhat confusing and this is just making it that much more
>>> complex.
>>>
>>>>>> +		page = virt_to_page(encoded_va);
>>>>>>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>>>>>>  			goto refill;
>>>>>>  
>>>>>> -		if (unlikely(nc->pfmemalloc)) {
>>>>>> -			free_unref_page(page, compound_order(page));
>>>>>> +		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>>>>>> +			VM_BUG_ON(compound_order(page) !=
>>>>>> +				  encoded_page_order(encoded_va));
>>>>>> +			free_unref_page(page, encoded_page_order(encoded_va));
>>>>>>  			goto refill;
>>>>>>  		}
>>>>>>  
>>>>>>  		/* OK, page count is 0, we can safely set it */
>>>>>>  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>>>>>>  
>>>>>> -		/* reset page count bias and offset to start of new frag */
>>>>>> +		/* reset page count bias and remaining of new frag */
>>>>>>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>>>> -		offset = 0;
>>>>>> -		if (unlikely(fragsz > PAGE_SIZE)) {
>>>>>> +		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
>>>>>> +		remaining -= fragsz;
>>>>>> +		if (unlikely(remaining < 0)) {
>>>>>>  			/*
>>>>>>  			 * The caller is trying to allocate a fragment
>>>>>>  			 * with fragsz > PAGE_SIZE but the cache isn't big
>>>>>
>>>>> I find it really amusing that you went to all the trouble of flipping
>>>>> the logic just to flip it back to being a countdown setup. If you were
>>>>> going to bother with all that then why not just make the remaining
>>>>> negative instead? You could save yourself a ton of trouble that way and
>>>>> all you would need to do is flip a few signs.
>>>>
>>>> I am not sure I understand the 'a ton of trouble' part here, if 'flipping
>>>> a few signs' does save a ton of trouble here, I would like to avoid 'a
>>>> ton of trouble' here, but I am not really understand the gain here yet as
>>>> mentioned above.
>>>
>>> It isn't about flipping the signs. It is more the fact that the logic
>>> has now become even more complex then it originally was. With my work
>>> backwards approach the advantage was that the offset could be updated
>>> and then we just recorded everything and reported it up. Now we have to
>>
>> I am supposing the above is referring to 'offset countdown' way
>> before this patchset, right?
>>
>>> keep a temporary "remaining" value, generate our virtual address and
>>> store that to a temp variable, we can record the new "remaining" value,
>>> and then we can report the virtual address. If we get the ordering on
>>
>> Yes, I noticed it when coding too, that is why current virtual address is
>> generated in page_frag_cache_current_va() basing on nc->remaining instead
>> of the local variable 'remaining' before assigning the local variable
>> 'remaining' to nc->remaining. But I am not sure I understand how using a
>> signed negative number for 'remaining' will change the above steps. If
>> not, I still fail to see the gain of using a signed negative number for
>> 'nc->remaining'.
>>
>>> the steps 2 and 3 in this it will cause issues as we will be pointing
>>> to the wrong values. It is something I can see someone easily messing
>>> up.
>>
>> Yes, agreed. It would be good to be more specific about how to avoid
>> the above problem using a signed negative number for 'remaining' as
>> I am not sure how it can be done yet.
>>
> 
> My advice would be to go back to patch 3 and figure out how to do this
> re-ordering without changing the alignment behaviour. The old code
> essentially aligned both the offset and fragsz by combining the two and
> then doing the alignment. Since you are doing a count up setup you will

I am not sure I understand 'aligned both the offset and fragsz ' part, as
'fragsz' being aligned or not seems to depend on last caller' align_mask,
for the same page_frag_cache instance, suppose offset = 32768 initially for
the old code:
Step 1: __page_frag_alloc_align() is called with fragsz=7 and align_mask=~0u
       the offset after this is 32761, the true fragsz is 7 too.

Step 2: __page_frag_alloc_align() is called with fragsz=7 and align_mask=-16
        the offset after this is 32752, the true fragsz is 9, which does
        not seems to be aligned.

The above is why I added the below paragraph in the doc to make the semantic
more explicit:
"Depending on different aligning requirement, the page_frag API caller may call
page_frag_alloc*_align*() to ensure the returned virtual address or offset of
the page is aligned according to the 'align/alignment' parameter. Note the size
of the allocated fragment is not aligned, the caller needs to provide an aligned
fragsz if there is an alignment requirement for the size of the fragment."

And existing callers of page_frag aligned API does seems to follow the above
rule last time I checked.

Or did I miss something obvious here?

> need to align the remaining, then add fragsz, and then I guess you
> could store remaining and then subtract fragsz from your final virtual
> address to get back to where the starting offset is actually located.

remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
remaining += fragsz;
nc->remaining = remaining;
return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz;

If yes, I am not sure what is the point of doing the above yet, it
just seem to make thing more complicated and harder to understand.

> 
> Basically your "remaining" value isn't a safe number to use for an
> offset since it isn't aligned to your starting value at any point.

Does using 'aligned_remaining' local variable to make it more obvious
seem reasonable to you?

> 
> As far as the negative value, it is more about making it easier to keep
> track of what is actually going on. Basically we can use regular
> pointer math and as such I suspect the compiler is having to do extra
> instructions to flip your value negative before it can combine the
> values via something like the LEA (load effective address) assembler
> call.

I am not an asm expert here, I am not sure I understand the optimization
trick here.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-11  8:16             ` Yunsheng Lin
@ 2024-07-11 16:49               ` Alexander Duyck
  2024-07-12  8:42                 ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2024-07-11 16:49 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Thu, Jul 11, 2024 at 1:16 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/7/10 23:28, Alexander H Duyck wrote:

...

> >>
> >> Yes, agreed. It would be good to be more specific about how to avoid
> >> the above problem using a signed negative number for 'remaining' as
> >> I am not sure how it can be done yet.
> >>
> >
> > My advice would be to go back to patch 3 and figure out how to do this
> > re-ordering without changing the alignment behaviour. The old code
> > essentially aligned both the offset and fragsz by combining the two and
> > then doing the alignment. Since you are doing a count up setup you will
>
> I am not sure I understand 'aligned both the offset and fragsz ' part, as
> 'fragsz' being aligned or not seems to depend on last caller' align_mask,
> for the same page_frag_cache instance, suppose offset = 32768 initially for
> the old code:
> Step 1: __page_frag_alloc_align() is called with fragsz=7 and align_mask=~0u
>        the offset after this is 32761, the true fragsz is 7 too.
>
> Step 2: __page_frag_alloc_align() is called with fragsz=7 and align_mask=-16
>         the offset after this is 32752, the true fragsz is 9, which does
>         not seems to be aligned.

I was referring to my original code before this patchset. I was doing
the subtraction of the fragsz first, and then aligning which would end
up padding the end of the frame as it was adding to the total size by
pulling the offset down *after* I had already subtracted fragsz. The
result was that I was always adding additional room depending on the
setting of the fragsz and how it related to the alignment. After
changing the code to realign on the start of the next frag the fragsz
is at the mercy of the next caller's alignment. In the event that the
caller didn't bother to align the fragsz by the align mask before hand
they can end up with a scenario that might result in false sharing.

> The above is why I added the below paragraph in the doc to make the semantic
> more explicit:
> "Depending on different aligning requirement, the page_frag API caller may call
> page_frag_alloc*_align*() to ensure the returned virtual address or offset of
> the page is aligned according to the 'align/alignment' parameter. Note the size
> of the allocated fragment is not aligned, the caller needs to provide an aligned
> fragsz if there is an alignment requirement for the size of the fragment."
>
> And existing callers of page_frag aligned API does seems to follow the above
> rule last time I checked.
>
> Or did I miss something obvious here?

No you didn't miss anything. It is just that there is now more
potential for error than there was before.

> > need to align the remaining, then add fragsz, and then I guess you
> > could store remaining and then subtract fragsz from your final virtual
> > address to get back to where the starting offset is actually located.
>
> remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
> remaining += fragsz;
> nc->remaining = remaining;
> return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz;
>
> If yes, I am not sure what is the point of doing the above yet, it
> just seem to make thing more complicated and harder to understand.

That isn't right. I am not sure why you are adding size + remaining or
what those are supposed to represent.

The issue was that the "remaining" ends up being an unaligned value as
you were starting by aligning it and then adding fragsz. So by
subtracting fragsz you can get back to the aliglined start. What this
patch was doing before was adding the raw unaligned nc->remaining at
the end of the function.

> >
> > Basically your "remaining" value isn't a safe number to use for an
> > offset since it isn't aligned to your starting value at any point.
>
> Does using 'aligned_remaining' local variable to make it more obvious
> seem reasonable to you?

No, as the value you are storing above isn't guaranteed to be aligned.
If you stored it before adding fragsz then it would be aligned.

> >
> > As far as the negative value, it is more about making it easier to keep
> > track of what is actually going on. Basically we can use regular
> > pointer math and as such I suspect the compiler is having to do extra
> > instructions to flip your value negative before it can combine the
> > values via something like the LEA (load effective address) assembler
> > call.
>
> I am not an asm expert here, I am not sure I understand the optimization
> trick here.

The LEA instruction takes a base address adds 1/2/4/8 times a multiple
and then a fixed offset all in one function and provides an address as
an output. The general idea is that you could look at converting
things such that you are putting together the page address +
remaining*1 + PAGE_SIZE. Basically what I was getting at is that
addition works, but it doesn't do negative values for the multiple.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-11 16:49               ` Alexander Duyck
@ 2024-07-12  8:42                 ` Yunsheng Lin
  2024-07-12 16:55                   ` Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-12  8:42 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/7/12 0:49, Alexander Duyck wrote:
> On Thu, Jul 11, 2024 at 1:16 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/7/10 23:28, Alexander H Duyck wrote:
> 
> ...
> 
>>>>
>>>> Yes, agreed. It would be good to be more specific about how to avoid
>>>> the above problem using a signed negative number for 'remaining' as
>>>> I am not sure how it can be done yet.
>>>>
>>>
>>> My advice would be to go back to patch 3 and figure out how to do this
>>> re-ordering without changing the alignment behaviour. The old code
>>> essentially aligned both the offset and fragsz by combining the two and
>>> then doing the alignment. Since you are doing a count up setup you will
>>
>> I am not sure I understand 'aligned both the offset and fragsz ' part, as
>> 'fragsz' being aligned or not seems to depend on last caller' align_mask,
>> for the same page_frag_cache instance, suppose offset = 32768 initially for
>> the old code:
>> Step 1: __page_frag_alloc_align() is called with fragsz=7 and align_mask=~0u
>>        the offset after this is 32761, the true fragsz is 7 too.
>>
>> Step 2: __page_frag_alloc_align() is called with fragsz=7 and align_mask=-16
>>         the offset after this is 32752, the true fragsz is 9, which does
>>         not seems to be aligned.
> 
> I was referring to my original code before this patchset. I was doing
> the subtraction of the fragsz first, and then aligning which would end
> up padding the end of the frame as it was adding to the total size by
> pulling the offset down *after* I had already subtracted fragsz. The
> result was that I was always adding additional room depending on the
> setting of the fragsz and how it related to the alignment. After
> changing the code to realign on the start of the next frag the fragsz
> is at the mercy of the next caller's alignment. In the event that the
> caller didn't bother to align the fragsz by the align mask before hand
> they can end up with a scenario that might result in false sharing.

So it is about ensuring the additional room due to alignment requirement
being placed at the end of a fragment, in order to avoid false sharing
between the last fragment and the current fragment as much as possible,
right?

I am generally agreed with above if we can also ensure skb coaleasing by
doing offset count-up instead of offset countdown.

If there is conflict between them, I am assuming that enabling skb frag
coaleasing is prefered over avoiding false sharing, right?

> 
>> The above is why I added the below paragraph in the doc to make the semantic
>> more explicit:
>> "Depending on different aligning requirement, the page_frag API caller may call
>> page_frag_alloc*_align*() to ensure the returned virtual address or offset of
>> the page is aligned according to the 'align/alignment' parameter. Note the size
>> of the allocated fragment is not aligned, the caller needs to provide an aligned
>> fragsz if there is an alignment requirement for the size of the fragment."
>>
>> And existing callers of page_frag aligned API does seems to follow the above
>> rule last time I checked.
>>
>> Or did I miss something obvious here?
> 
> No you didn't miss anything. It is just that there is now more
> potential for error than there was before.

I guess the 'error' is referred to the 'false sharing' mentioned above,
right? If it is indeed an error, are we not supposed to fix it instead
of allowing such implicit implication? Allowing implicit implication
seems to make the 'error' harder to reproduce and debug.

> 
>>> need to align the remaining, then add fragsz, and then I guess you
>>> could store remaining and then subtract fragsz from your final virtual
>>> address to get back to where the starting offset is actually located.
>>
>> remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
>> remaining += fragsz;
>> nc->remaining = remaining;
>> return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz;
>>
>> If yes, I am not sure what is the point of doing the above yet, it
>> just seem to make thing more complicated and harder to understand.
> 
> That isn't right. I am not sure why you are adding size + remaining or
> what those are supposed to represent.

As the above assumes 'remaining' is a negative value as you suggested,
(size + remaining) is supposed to represent the offset of next fragment
to ensure we have count-up offset for enabling skb frag coaleasing, and
'- fragsz' is used to get the offset of current fragment.

> 
> The issue was that the "remaining" ends up being an unaligned value as
> you were starting by aligning it and then adding fragsz. So by
> subtracting fragsz you can get back to the aliglined start. What this
> patch was doing before was adding the raw unaligned nc->remaining at
> the end of the function.
> 
>>>
>>> Basically your "remaining" value isn't a safe number to use for an
>>> offset since it isn't aligned to your starting value at any point.
>>
>> Does using 'aligned_remaining' local variable to make it more obvious
>> seem reasonable to you?
> 
> No, as the value you are storing above isn't guaranteed to be aligned.
> If you stored it before adding fragsz then it would be aligned.

I have a feeling that what you are proposing may be conflict with enabling
skb frag coaleasing, as doing offset count-up seems to need some room to
be reserved at the begin of a allocated fragment due to alignment requirement,
and that may mean we need to do both fragsz and offset aligning.

Perhaps the 'remaining' changing in this patch does seems to make things
harder to discuss. Anyway, it would be more helpful if there is some pseudo
code to show the steps of how the above can be done in your mind.

> 
>>>
>>> As far as the negative value, it is more about making it easier to keep
>>> track of what is actually going on. Basically we can use regular
>>> pointer math and as such I suspect the compiler is having to do extra
>>> instructions to flip your value negative before it can combine the
>>> values via something like the LEA (load effective address) assembler
>>> call.
>>
>> I am not an asm expert here, I am not sure I understand the optimization
>> trick here.
> 
> The LEA instruction takes a base address adds 1/2/4/8 times a multiple
> and then a fixed offset all in one function and provides an address as
> an output. The general idea is that you could look at converting
> things such that you are putting together the page address +
> remaining*1 + PAGE_SIZE. Basically what I was getting at is that
> addition works, but it doesn't do negative values for the multiple.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-12  8:42                 ` Yunsheng Lin
@ 2024-07-12 16:55                   ` Alexander Duyck
  2024-07-13  5:20                     ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2024-07-12 16:55 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Fri, Jul 12, 2024 at 1:42 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/7/12 0:49, Alexander Duyck wrote:
> > On Thu, Jul 11, 2024 at 1:16 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>
> >> On 2024/7/10 23:28, Alexander H Duyck wrote:
> >
> > ...
> >
> >>>>
> >>>> Yes, agreed. It would be good to be more specific about how to avoid
> >>>> the above problem using a signed negative number for 'remaining' as
> >>>> I am not sure how it can be done yet.
> >>>>
> >>>
> >>> My advice would be to go back to patch 3 and figure out how to do this
> >>> re-ordering without changing the alignment behaviour. The old code
> >>> essentially aligned both the offset and fragsz by combining the two and
> >>> then doing the alignment. Since you are doing a count up setup you will
> >>
> >> I am not sure I understand 'aligned both the offset and fragsz ' part, as
> >> 'fragsz' being aligned or not seems to depend on last caller' align_mask,
> >> for the same page_frag_cache instance, suppose offset = 32768 initially for
> >> the old code:
> >> Step 1: __page_frag_alloc_align() is called with fragsz=7 and align_mask=~0u
> >>        the offset after this is 32761, the true fragsz is 7 too.
> >>
> >> Step 2: __page_frag_alloc_align() is called with fragsz=7 and align_mask=-16
> >>         the offset after this is 32752, the true fragsz is 9, which does
> >>         not seems to be aligned.
> >
> > I was referring to my original code before this patchset. I was doing
> > the subtraction of the fragsz first, and then aligning which would end
> > up padding the end of the frame as it was adding to the total size by
> > pulling the offset down *after* I had already subtracted fragsz. The
> > result was that I was always adding additional room depending on the
> > setting of the fragsz and how it related to the alignment. After
> > changing the code to realign on the start of the next frag the fragsz
> > is at the mercy of the next caller's alignment. In the event that the
> > caller didn't bother to align the fragsz by the align mask before hand
> > they can end up with a scenario that might result in false sharing.
>
> So it is about ensuring the additional room due to alignment requirement
> being placed at the end of a fragment, in order to avoid false sharing
> between the last fragment and the current fragment as much as possible,
> right?
>
> I am generally agreed with above if we can also ensure skb coaleasing by
> doing offset count-up instead of offset countdown.
>
> If there is conflict between them, I am assuming that enabling skb frag
> coaleasing is prefered over avoiding false sharing, right?

The question I would have is where do we have opportunities for this
to result in coalescing? If we are using this to allocate skb->data
then there isn't such a chance as the tailroom gets in the way.

If this is for a device allocating for an Rx buffer we won't get that
chance as we have to preallocate some fixed size not knowing the
buffer that is coming, and it needs to usually be DMA aligned in order
to avoid causing partial cacheline reads/writes. The only way these
would combine well is if you are doing aligned fixed blocks and are
the only consumer of the page frag cache. It is essentially just
optimizing for jumbo frames in that case.

If this is for some software interface why wouldn't it request the
coalesced size and do one allocation rather than trying to figure out
how to perform a bunch of smaller allocations and then trying to merge
them together after the fact.

> >
> >> The above is why I added the below paragraph in the doc to make the semantic
> >> more explicit:
> >> "Depending on different aligning requirement, the page_frag API caller may call
> >> page_frag_alloc*_align*() to ensure the returned virtual address or offset of
> >> the page is aligned according to the 'align/alignment' parameter. Note the size
> >> of the allocated fragment is not aligned, the caller needs to provide an aligned
> >> fragsz if there is an alignment requirement for the size of the fragment."
> >>
> >> And existing callers of page_frag aligned API does seems to follow the above
> >> rule last time I checked.
> >>
> >> Or did I miss something obvious here?
> >
> > No you didn't miss anything. It is just that there is now more
> > potential for error than there was before.
>
> I guess the 'error' is referred to the 'false sharing' mentioned above,
> right? If it is indeed an error, are we not supposed to fix it instead
> of allowing such implicit implication? Allowing implicit implication
> seems to make the 'error' harder to reproduce and debug.

The concern with the code as it stands is that if I am not mistaken
remaining isn't actually aligned. You aligned it, then added fragsz.
That becomes the start of the next frame so if fragsz isn't aligned
the next requester will be getting an unaligned buffer, or one that is
only aligned to the previous caller's alignment.

> >
> >>> need to align the remaining, then add fragsz, and then I guess you
> >>> could store remaining and then subtract fragsz from your final virtual
> >>> address to get back to where the starting offset is actually located.
> >>
> >> remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
> >> remaining += fragsz;
> >> nc->remaining = remaining;
> >> return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz;
> >>
> >> If yes, I am not sure what is the point of doing the above yet, it
> >> just seem to make thing more complicated and harder to understand.
> >
> > That isn't right. I am not sure why you are adding size + remaining or
> > what those are supposed to represent.
>
> As the above assumes 'remaining' is a negative value as you suggested,
> (size + remaining) is supposed to represent the offset of next fragment
> to ensure we have count-up offset for enabling skb frag coaleasing, and
> '- fragsz' is used to get the offset of current fragment.
>
> >
> > The issue was that the "remaining" ends up being an unaligned value as
> > you were starting by aligning it and then adding fragsz. So by
> > subtracting fragsz you can get back to the aliglined start. What this
> > patch was doing before was adding the raw unaligned nc->remaining at
> > the end of the function.
> >
> >>>
> >>> Basically your "remaining" value isn't a safe number to use for an
> >>> offset since it isn't aligned to your starting value at any point.
> >>
> >> Does using 'aligned_remaining' local variable to make it more obvious
> >> seem reasonable to you?
> >
> > No, as the value you are storing above isn't guaranteed to be aligned.
> > If you stored it before adding fragsz then it would be aligned.
>
> I have a feeling that what you are proposing may be conflict with enabling
> skb frag coaleasing, as doing offset count-up seems to need some room to
> be reserved at the begin of a allocated fragment due to alignment requirement,
> and that may mean we need to do both fragsz and offset aligning.
>
> Perhaps the 'remaining' changing in this patch does seems to make things
> harder to discuss. Anyway, it would be more helpful if there is some pseudo
> code to show the steps of how the above can be done in your mind.

Basically what you would really need do for all this is:
  remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
  nc->remaining = remaining + fragsz;
  return encoded_page_address(nc->encoded_va) + size + remaining;

The issue is you cannot be using page_frag_cache_current_va as that
pointer will always be garbage as it isn't aligned to anything. It
isn't like the code that I had that was working backwards as the
offset cannot be used as soon as you compute it since you add fragsz
to it.

For any countup you have to align the current offset/remaining value,
then that value is used for computing the page address, and you store
the offset/remaining plus fragsz adjustment. At which point your
offset/remaining pointer value isn't good for the current buffer
anymore.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-12 16:55                   ` Alexander Duyck
@ 2024-07-13  5:20                     ` Yunsheng Lin
  2024-07-13 16:55                       ` Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-13  5:20 UTC (permalink / raw)
  To: Alexander Duyck, Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 7/13/2024 12:55 AM, Alexander Duyck wrote:

...

>>
>> So it is about ensuring the additional room due to alignment requirement
>> being placed at the end of a fragment, in order to avoid false sharing
>> between the last fragment and the current fragment as much as possible,
>> right?
>>
>> I am generally agreed with above if we can also ensure skb coaleasing by
>> doing offset count-up instead of offset countdown.
>>
>> If there is conflict between them, I am assuming that enabling skb frag
>> coaleasing is prefered over avoiding false sharing, right?
> 
> The question I would have is where do we have opportunities for this
> to result in coalescing? If we are using this to allocate skb->data
> then there isn't such a chance as the tailroom gets in the way.
> 
> If this is for a device allocating for an Rx buffer we won't get that
> chance as we have to preallocate some fixed size not knowing the
> buffer that is coming, and it needs to usually be DMA aligned in order
> to avoid causing partial cacheline reads/writes. The only way these
> would combine well is if you are doing aligned fixed blocks and are
> the only consumer of the page frag cache. It is essentially just
> optimizing for jumbo frames in that case.

And hw-gro or sw-gro.

> 
> If this is for some software interface why wouldn't it request the
> coalesced size and do one allocation rather than trying to figure out
> how to perform a bunch of smaller allocations and then trying to merge
> them together after the fact.

I am not sure I understand what 'some software interface' is referring
to, I hope you are not suggesting the below optimizations utilizing of
skb_can_coalesce() checking is unnecessary:(

https://elixir.bootlin.com/linux/v6.10-rc7/C/ident/skb_can_coalesce

Most of the usecases do that for the reason below as mentioned in the
Documentation/mm/page_frags.rst as my understanding:
"There is also a use case that needs minimum memory in order for forward 
progress, but more performant if more memory is available."

> 
>>>
>>>> The above is why I added the below paragraph in the doc to make the semantic
>>>> more explicit:
>>>> "Depending on different aligning requirement, the page_frag API caller may call
>>>> page_frag_alloc*_align*() to ensure the returned virtual address or offset of
>>>> the page is aligned according to the 'align/alignment' parameter. Note the size
>>>> of the allocated fragment is not aligned, the caller needs to provide an aligned
>>>> fragsz if there is an alignment requirement for the size of the fragment."
>>>>
>>>> And existing callers of page_frag aligned API does seems to follow the above
>>>> rule last time I checked.
>>>>
>>>> Or did I miss something obvious here?
>>>
>>> No you didn't miss anything. It is just that there is now more
>>> potential for error than there was before.
>>
>> I guess the 'error' is referred to the 'false sharing' mentioned above,
>> right? If it is indeed an error, are we not supposed to fix it instead
>> of allowing such implicit implication? Allowing implicit implication
>> seems to make the 'error' harder to reproduce and debug.
> 
> The concern with the code as it stands is that if I am not mistaken
> remaining isn't actually aligned. You aligned it, then added fragsz.
> That becomes the start of the next frame so if fragsz isn't aligned
> the next requester will be getting an unaligned buffer, or one that is
> only aligned to the previous caller's alignment.

As mentioned below:
https://lore.kernel.org/all/3da33d4c-a70e-23a4-8e00-23fe96dd0c1a@huawei.com/

what alignment semantics are we providing here:
1. Ensure alignment for both offset and fragsz.
2. Ensure alignment for offset only.
3. Ensure alignment for fragsz only.

As my understanding, the original code before this patchset is only 
ensuring alignment for offset too. So there may be 'false sharing'
both before this patchset and after this patchset. It would be better
not to argue about which implementation having more/less potential
to avoid the 'false sharing', it is an undefined behavior, the argument
would be endless depending on usecase and personal preference.

As I said before, I would love to retain the old undefined behavior
when there is a reasonable way to support the new usecases.

> 
>>>
>>>>> need to align the remaining, then add fragsz, and then I guess you
>>>>> could store remaining and then subtract fragsz from your final virtual
>>>>> address to get back to where the starting offset is actually located.
>>>>
>>>> remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
>>>> remaining += fragsz;
>>>> nc->remaining = remaining;
>>>> return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz;
>>>>
>>>> If yes, I am not sure what is the point of doing the above yet, it
>>>> just seem to make thing more complicated and harder to understand.
>>>
>>> That isn't right. I am not sure why you are adding size + remaining or
>>> what those are supposed to represent.
>>
>> As the above assumes 'remaining' is a negative value as you suggested,
>> (size + remaining) is supposed to represent the offset of next fragment
>> to ensure we have count-up offset for enabling skb frag coaleasing, and
>> '- fragsz' is used to get the offset of current fragment.
>>
>>>
>>> The issue was that the "remaining" ends up being an unaligned value as
>>> you were starting by aligning it and then adding fragsz. So by
>>> subtracting fragsz you can get back to the aliglined start. What this
>>> patch was doing before was adding the raw unaligned nc->remaining at
>>> the end of the function.
>>>
>>>>>
>>>>> Basically your "remaining" value isn't a safe number to use for an
>>>>> offset since it isn't aligned to your starting value at any point.
>>>>
>>>> Does using 'aligned_remaining' local variable to make it more obvious
>>>> seem reasonable to you?
>>>
>>> No, as the value you are storing above isn't guaranteed to be aligned.
>>> If you stored it before adding fragsz then it would be aligned.
>>
>> I have a feeling that what you are proposing may be conflict with enabling
>> skb frag coaleasing, as doing offset count-up seems to need some room to
>> be reserved at the begin of a allocated fragment due to alignment requirement,
>> and that may mean we need to do both fragsz and offset aligning.
>>
>> Perhaps the 'remaining' changing in this patch does seems to make things
>> harder to discuss. Anyway, it would be more helpful if there is some pseudo
>> code to show the steps of how the above can be done in your mind.
> 
> Basically what you would really need do for all this is:
>    remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
>    nc->remaining = remaining + fragsz;
>    return encoded_page_address(nc->encoded_va) + size + remaining;

I am assuming 'size + remaining' is supposed to represent the offset of 
current allocted fragment?
Are you sure the above is what you wanted?

suppose remaining = -32768 & size = 32768 initially:
Step 1: __page_frag_alloc_align() is called with fragsz=7 and
         align_mask=~0u, the remaining after this is -32761, the true
         fragsz is 7, the offset is 0

Step 2: __page_frag_alloc_align() is called with fragsz=7 and
         align_mask=-16, the offset after this is -32745, the true
         fragsz is 7, the offset = 16

The semantics of the above implementation seems to be the same as
the semantics of implementation of patch 3 in v10：
https://lore.kernel.org/all/20240709132741.47751-4-linyunsheng@huawei.com/
     aligned_remaining = nc->remaining & align_mask;
     remaining = aligned_remaining - fragsz;
     nc->remaining = remaining;
     return encoded_page_address(encoded_va) + size - aligned_remaining;

The main difference seems to be about using a negative value for 
nc->remaining or not, if yes, I am not sure what is other gain of
using a negative value for nc->remaining besides the LEA instruction
opt trick.

As using a unsigned value and a 'aligned_remaining' local variable
does seems to make thing easier to understand and avoid adding three 
checkings as mentioned below.

> 
> The issue is you cannot be using page_frag_cache_current_va as that
> pointer will always be garbage as it isn't aligned to anything. It
> isn't like the code that I had that was working backwards as the
> offset cannot be used as soon as you compute it since you add fragsz
> to it.

The above was a mistake in v9, the intend is to do:
nc->remaining &= align_mask;

That was why there were three 'align > PAGE_SIZE' checking in v10 to
avoid doing 'nc->remaining &= align_mask' prematurely if caller passes
a large align_mask value.

So 'aligned_remaining' local variable in v10 is used to avoid doing 
'nc->remaining &= align_mask', thus three 'align > PAGE_SIZE' checking
is not needed too.

> 
> For any countup you have to align the current offset/remaining value,
> then that value is used for computing the page address, and you store
> the offset/remaining plus fragsz adjustment. At which point your
> offset/remaining pointer value isn't good for the current buffer
> anymore.
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-13  5:20                     ` Yunsheng Lin
@ 2024-07-13 16:55                       ` Alexander Duyck
       [not found]                         ` <12ff13d9-1f3d-4c1b-a972-2efb6f247e31@gmail.com>
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2024-07-13 16:55 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: Yunsheng Lin, davem, kuba, pabeni, netdev, linux-kernel,
	Andrew Morton, linux-mm

On Fri, Jul 12, 2024 at 10:20 PM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
>
> On 7/13/2024 12:55 AM, Alexander Duyck wrote:
>
> ...
>
> >>
> >> So it is about ensuring the additional room due to alignment requirement
> >> being placed at the end of a fragment, in order to avoid false sharing
> >> between the last fragment and the current fragment as much as possible,
> >> right?
> >>
> >> I am generally agreed with above if we can also ensure skb coaleasing by
> >> doing offset count-up instead of offset countdown.
> >>
> >> If there is conflict between them, I am assuming that enabling skb frag
> >> coaleasing is prefered over avoiding false sharing, right?
> >
> > The question I would have is where do we have opportunities for this
> > to result in coalescing? If we are using this to allocate skb->data
> > then there isn't such a chance as the tailroom gets in the way.
> >
> > If this is for a device allocating for an Rx buffer we won't get that
> > chance as we have to preallocate some fixed size not knowing the
> > buffer that is coming, and it needs to usually be DMA aligned in order
> > to avoid causing partial cacheline reads/writes. The only way these
> > would combine well is if you are doing aligned fixed blocks and are
> > the only consumer of the page frag cache. It is essentially just
> > optimizing for jumbo frames in that case.
>
> And hw-gro or sw-gro.

No and no. The problem is for hw-gro in many cases the device will be
DMA aligning the start of writes for each new frame that comes in. As
such it is possible you won't be able to make use of this unless the
device is either queueing up the entire packet before writing it to
memory, or taking a double penalty for partial cache line writes which
will negatively impact performance. For sw-gro it won't happen since
as I mentioned the device is doing DMA aligned writes usually w/ fixed
size buffers that will be partially empty. As such coalescing will
likely not be possible in either of those scenarios.

This is why I was so comfortable using the reverse ordering for the
allocations. Trying to aggregate frames in this way will be very
difficult and the likelihood of anyone ever needing to do it is
incredibly small.

> >
> > If this is for some software interface why wouldn't it request the
> > coalesced size and do one allocation rather than trying to figure out
> > how to perform a bunch of smaller allocations and then trying to merge
> > them together after the fact.
>
> I am not sure I understand what 'some software interface' is referring
> to, I hope you are not suggesting the below optimizations utilizing of
> skb_can_coalesce() checking is unnecessary:(
>
> https://elixir.bootlin.com/linux/v6.10-rc7/C/ident/skb_can_coalesce
>
> Most of the usecases do that for the reason below as mentioned in the
> Documentation/mm/page_frags.rst as my understanding:
> "There is also a use case that needs minimum memory in order for forward
> progress, but more performant if more memory is available."

No what I am talking about is that it will be expensive to use and
have very little benefit to coalesce frames coming from a NIC as I
called out above. Most NICs won't use page frags anyway they will be
using page pool which is a slightly different beast anyway.

> >
> >>>
> >>>> The above is why I added the below paragraph in the doc to make the semantic
> >>>> more explicit:
> >>>> "Depending on different aligning requirement, the page_frag API caller may call
> >>>> page_frag_alloc*_align*() to ensure the returned virtual address or offset of
> >>>> the page is aligned according to the 'align/alignment' parameter. Note the size
> >>>> of the allocated fragment is not aligned, the caller needs to provide an aligned
> >>>> fragsz if there is an alignment requirement for the size of the fragment."
> >>>>
> >>>> And existing callers of page_frag aligned API does seems to follow the above
> >>>> rule last time I checked.
> >>>>
> >>>> Or did I miss something obvious here?
> >>>
> >>> No you didn't miss anything. It is just that there is now more
> >>> potential for error than there was before.
> >>
> >> I guess the 'error' is referred to the 'false sharing' mentioned above,
> >> right? If it is indeed an error, are we not supposed to fix it instead
> >> of allowing such implicit implication? Allowing implicit implication
> >> seems to make the 'error' harder to reproduce and debug.
> >
> > The concern with the code as it stands is that if I am not mistaken
> > remaining isn't actually aligned. You aligned it, then added fragsz.
> > That becomes the start of the next frame so if fragsz isn't aligned
> > the next requester will be getting an unaligned buffer, or one that is
> > only aligned to the previous caller's alignment.
>
> As mentioned below:
> https://lore.kernel.org/all/3da33d4c-a70e-23a4-8e00-23fe96dd0c1a@huawei.com/
>
> what alignment semantics are we providing here:
> 1. Ensure alignment for both offset and fragsz.
> 2. Ensure alignment for offset only.
> 3. Ensure alignment for fragsz only.
>
> As my understanding, the original code before this patchset is only
> ensuring alignment for offset too. So there may be 'false sharing'
> both before this patchset and after this patchset. It would be better
> not to argue about which implementation having more/less potential
> to avoid the 'false sharing', it is an undefined behavior, the argument
> would be endless depending on usecase and personal preference.
>
> As I said before, I would love to retain the old undefined behavior
> when there is a reasonable way to support the new usecases.

My main concern is that you were aligning "remaining", then adding
fragsz, and storing that. That was then used as the offset for the
next buffer. That isn't aligned since fragsz and the previous offset
aren't guaranteed to align with the new allocation.

So at a minimum the existing code cannot be using nc->remaining when
generating the pointer to the page. Instead it has to be using the
aligned version of that value that you generated before adding fragsz
and verifying there is space.

> >
> >>>
> >>>>> need to align the remaining, then add fragsz, and then I guess you
> >>>>> could store remaining and then subtract fragsz from your final virtual
> >>>>> address to get back to where the starting offset is actually located.
> >>>>
> >>>> remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
> >>>> remaining += fragsz;
> >>>> nc->remaining = remaining;
> >>>> return encoded_page_address(nc->encoded_va) + (size + remaining) - fragsz;
> >>>>
> >>>> If yes, I am not sure what is the point of doing the above yet, it
> >>>> just seem to make thing more complicated and harder to understand.
> >>>
> >>> That isn't right. I am not sure why you are adding size + remaining or
> >>> what those are supposed to represent.
> >>
> >> As the above assumes 'remaining' is a negative value as you suggested,
> >> (size + remaining) is supposed to represent the offset of next fragment
> >> to ensure we have count-up offset for enabling skb frag coaleasing, and
> >> '- fragsz' is used to get the offset of current fragment.
> >>
> >>>
> >>> The issue was that the "remaining" ends up being an unaligned value as
> >>> you were starting by aligning it and then adding fragsz. So by
> >>> subtracting fragsz you can get back to the aliglined start. What this
> >>> patch was doing before was adding the raw unaligned nc->remaining at
> >>> the end of the function.
> >>>
> >>>>>
> >>>>> Basically your "remaining" value isn't a safe number to use for an
> >>>>> offset since it isn't aligned to your starting value at any point.
> >>>>
> >>>> Does using 'aligned_remaining' local variable to make it more obvious
> >>>> seem reasonable to you?
> >>>
> >>> No, as the value you are storing above isn't guaranteed to be aligned.
> >>> If you stored it before adding fragsz then it would be aligned.
> >>
> >> I have a feeling that what you are proposing may be conflict with enabling
> >> skb frag coaleasing, as doing offset count-up seems to need some room to
> >> be reserved at the begin of a allocated fragment due to alignment requirement,
> >> and that may mean we need to do both fragsz and offset aligning.
> >>
> >> Perhaps the 'remaining' changing in this patch does seems to make things
> >> harder to discuss. Anyway, it would be more helpful if there is some pseudo
> >> code to show the steps of how the above can be done in your mind.
> >
> > Basically what you would really need do for all this is:
> >    remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
> >    nc->remaining = remaining + fragsz;
> >    return encoded_page_address(nc->encoded_va) + size + remaining;
>

I might have mixed my explanation up a bit. This is assuming remaining
is a negative value as I mentioned before.

Basically the issue is your current code is using nc->remaining to
generate the current address and that is bad as it isn't aligned to
anything as fragsz was added to it and no alignment check had been
done on that value.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
       [not found]                         ` <12ff13d9-1f3d-4c1b-a972-2efb6f247e31@gmail.com>
@ 2024-07-15 17:55                           ` Alexander Duyck
  2024-07-16 12:58                             ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2024-07-15 17:55 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: Yunsheng Lin, davem, kuba, pabeni, netdev, linux-kernel,
	Andrew Morton, linux-mm

On Sat, Jul 13, 2024 at 9:52 PM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
>
> On 7/14/2024 12:55 AM, Alexander Duyck wrote:
>
> ...
>
> >>>>
> >>>> Perhaps the 'remaining' changing in this patch does seems to make things
> >>>> harder to discuss. Anyway, it would be more helpful if there is some pseudo
> >>>> code to show the steps of how the above can be done in your mind.
> >>>
> >>> Basically what you would really need do for all this is:
> >>>     remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
> >>>     nc->remaining = remaining + fragsz;
> >>>     return encoded_page_address(nc->encoded_va) + size + remaining;
> >>
> >
> > I might have mixed my explanation up a bit. This is assuming remaining
> > is a negative value as I mentioned before.
>
> Let's be more specific about the options here, what you meant is below,
> right? Let's say it is option 1 as below:
> struct page_frag_cache {
>          /* encoded_va consists of the virtual address, pfmemalloc bit
> and order
>           * of a page.
>           */
>          unsigned long encoded_va;
>
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>          __s16 remaining;
>          __u16 pagecnt_bias;
> #else
>          __s32 remaining;
>          __u32 pagecnt_bias;
> #endif
> };
>
> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>                                   unsigned int fragsz, gfp_t gfp_mask,
>                                   unsigned int align_mask)
> {
>          unsigned int size = page_frag_cache_page_size(nc->encoded_va);
>          int remaining;
>
>          remaining = __ALIGN_KERNEL_MASK(nc->remaining, ~align_mask);
>          if (unlikely(remaining + (int)fragsz > 0)) {
>                  if (!__page_frag_cache_refill(nc, gfp_mask))
>                          return NULL;
>
>                  size = page_frag_cache_page_size(nc->encoded_va);
>
>                  remaining = -size;
>                  if (unlikely(remaining + (int)fragsz > 0))
>                          return NULL;
>          }
>
>          nc->pagecnt_bias--;
>          nc->remaining = remaining + fragsz;
>
>          return encoded_page_address(nc->encoded_va) + size + remaining;
> }
>
>
> And let's say what I am proposing in v10 is option 2 as below:
> struct page_frag_cache {
>          /* encoded_va consists of the virtual address, pfmemalloc bit
> and order
>           * of a page.
>           */
>          unsigned long encoded_va;
>
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>          __u16 remaining;
>          __u16 pagecnt_bias;
> #else
>          __u32 remaining;
>          __u32 pagecnt_bias;
> #endif
> };
>
> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>                                   unsigned int fragsz, gfp_t gfp_mask,
>                                   unsigned int align_mask)
> {
>          unsigned int size = page_frag_cache_page_size(nc->encoded_va);
>          int aligned_remaining = nc->remaining & align_mask;
>          int remaining = aligned_remaining - fragsz;
>
>          if (unlikely(remaining < 0)) {
>                  if (!__page_frag_cache_refill(nc, gfp_mask))
>                          return NULL;
>
>                  size = page_frag_cache_page_size(nc->encoded_va);
>
>                  aligned_remaining = size;
>                  remaining = aligned_remaining - fragsz;
>                  if (unlikely(remaining < 0))
>                          return NULL;
>          }
>
>          nc->pagecnt_bias--;
>          nc->remaining = remaining;
>
>          return encoded_page_address(nc->encoded_va) + (size -
> aligned_remaining);
> }
>
> If the option 1 is not what you have in mind, it would be better to be
> more specific about what you have in mind.

Option 1 was more or less what I had in mind.

> If the option 1 is what you have in mind, it seems both option 1 and
> option 2 have the same semantics as my understanding, right? The
> question here seems to be what is your perfer option and why?
>
> I implemented both of them, and the option 1 seems to have a
> bigger generated asm size as below:
> ./scripts/bloat-o-meter vmlinux_non_neg vmlinux
> add/remove: 0/0 grow/shrink: 1/0 up/down: 37/0 (37)
> Function                                     old     new   delta
> __page_frag_alloc_va_align                   414     451     +37

My big complaint is that it seems option 2 is harder for people to
understand and more likely to not be done correctly. In some cases if
the performance difference is negligible it is better to go with the
more maintainable solution.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-15 17:55                           ` Alexander Duyck
@ 2024-07-16 12:58                             ` Yunsheng Lin
  2024-07-17 12:31                               ` Yunsheng Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-16 12:58 UTC (permalink / raw)
  To: Alexander Duyck, Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/7/16 1:55, Alexander Duyck wrote:
> On Sat, Jul 13, 2024 at 9:52 PM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:

...

>>
>> If the option 1 is not what you have in mind, it would be better to be
>> more specific about what you have in mind.
> 
> Option 1 was more or less what I had in mind.
> 
>> If the option 1 is what you have in mind, it seems both option 1 and
>> option 2 have the same semantics as my understanding, right? The
>> question here seems to be what is your perfer option and why?
>>
>> I implemented both of them, and the option 1 seems to have a
>> bigger generated asm size as below:
>> ./scripts/bloat-o-meter vmlinux_non_neg vmlinux
>> add/remove: 0/0 grow/shrink: 1/0 up/down: 37/0 (37)
>> Function                                     old     new   delta
>> __page_frag_alloc_va_align                   414     451     +37
> 
> My big complaint is that it seems option 2 is harder for people to
> understand and more likely to not be done correctly. In some cases if
> the performance difference is negligible it is better to go with the
> more maintainable solution.

Option 1 assuming nc->remaining as a negative value does not seems to
make it a more maintainable solution than option 2. How about something
like below if using a negative value to enable some optimization like LEA
does not have a noticeable performance difference?

struct page_frag_cache {
        /* encoded_va consists of the virtual address, pfmemalloc bit and order
         * of a page.
         */
        unsigned long encoded_va;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
        __u16 remaining;
        __u16 pagecnt_bias;
#else
        __u32 remaining;
        __u32 pagecnt_bias;
#endif
};

void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
                                 unsigned int fragsz, gfp_t gfp_mask,
                                 unsigned int align_mask)
{
        unsigned int size = page_frag_cache_page_size(nc->encoded_va);
        unsigned int remaining;

        remaining = nc->remaining & align_mask;
        if (unlikely(remaining < fragsz)) {
                if (unlikely(fragsz > PAGE_SIZE)) {
                        /*
                         * The caller is trying to allocate a fragment
                         * with fragsz > PAGE_SIZE but the cache isn't big
                         * enough to satisfy the request, this may
                         * happen in low memory conditions.
                         * We don't release the cache page because
                         * it could make memory pressure worse
                         * so we simply return NULL here.
                         */
                        return NULL;
                }

                if (!__page_frag_cache_refill(nc, gfp_mask))
                        return NULL;

                size = page_frag_cache_page_size(nc->encoded_va);
                remaining = size;
        }

        nc->pagecnt_bias--;
        nc->remaining = remaining - fragsz;

        return encoded_page_address(nc->encoded_va) + (size - remaining);
}



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-16 12:58                             ` Yunsheng Lin
@ 2024-07-17 12:31                               ` Yunsheng Lin
  0 siblings, 0 replies; 43+ messages in thread
From: Yunsheng Lin @ 2024-07-17 12:31 UTC (permalink / raw)
  To: Alexander Duyck, Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/7/16 20:58, Yunsheng Lin wrote:

...

> 
> Option 1 assuming nc->remaining as a negative value does not seems to
> make it a more maintainable solution than option 2. How about something
> like below if using a negative value to enable some optimization like LEA
> does not have a noticeable performance difference?

Suppose the below as option 3, it seems the option 3 has better performance
than option 2, and option 2 has better performance than option 1 using the
ko introduced in patch 1.

Option 1:
 Performance counter stats for 'insmod ./page_frag_test.ko test_push_cpu=16 test_pop_cpu=17 test_alloc_len=12 nr_test=5120000'                                                                                   (500 runs):

         17.757768      task-clock (msec)         #    0.001 CPUs utilized            ( +-  0.17% )
                 5      context-switches          #    0.288 K/sec                    ( +-  0.28% )
                 0      cpu-migrations            #    0.007 K/sec                    ( +- 12.36% )
                82      page-faults               #    0.005 M/sec                    ( +-  0.06% )
          46128280      cycles                    #    2.598 GHz                      ( +-  0.17% )
          60938595      instructions              #    1.32  insn per cycle           ( +-  0.02% )
          14783794      branches                  #  832.525 M/sec                    ( +-  0.02% )
             20393      branch-misses             #    0.14% of all branches          ( +-  0.13% )

      24.556644680 seconds time elapsed                                          ( +-  0.07% )

Option 2:
Performance counter stats for 'insmod ./page_frag_test.ko test_push_cpu=16 test_pop_cpu=17 test_alloc_len=12 nr_test=5120000' (500 runs):

         18.443508      task-clock (msec)         #    0.001 CPUs utilized            ( +-  0.61% )
                 6      context-switches          #    0.342 K/sec                    ( +-  0.57% )
                 0      cpu-migrations            #    0.025 K/sec                    ( +-  4.89% )
                82      page-faults               #    0.004 M/sec                    ( +-  0.06% )
          47901207      cycles                    #    2.597 GHz                      ( +-  0.61% )
          60985019      instructions              #    1.27  insn per cycle           ( +-  0.05% )
          14787177      branches                  #  801.755 M/sec                    ( +-  0.05% )
             21099      branch-misses             #    0.14% of all branches          ( +-  0.14% )

      24.413183804 seconds time elapsed                                          ( +-  0.06% )

Option 3:
Performance counter stats for 'insmod ./page_frag_test.ko test_push_cpu=16 test_pop_cpu=17 test_alloc_len=12 nr_test=5120000' (500 runs):

         17.847031      task-clock (msec)         #    0.001 CPUs utilized            ( +-  0.23% )
                 5      context-switches          #    0.305 K/sec                    ( +-  0.55% )
                 0      cpu-migrations            #    0.017 K/sec                    ( +-  6.86% )
                82      page-faults               #    0.005 M/sec                    ( +-  0.06% )
          46355974      cycles                    #    2.597 GHz                      ( +-  0.23% )
          60848779      instructions              #    1.31  insn per cycle           ( +-  0.03% )
          14758941      branches                  #  826.969 M/sec                    ( +-  0.03% )
             20728      branch-misses             #    0.14% of all branches          ( +-  0.15% )

      24.376161069 seconds time elapsed                                          ( +-  0.06% )

> 
> struct page_frag_cache {
>         /* encoded_va consists of the virtual address, pfmemalloc bit and order
>          * of a page.
>          */
>         unsigned long encoded_va;
> 
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>         __u16 remaining;
>         __u16 pagecnt_bias;
> #else
>         __u32 remaining;
>         __u32 pagecnt_bias;
> #endif
> };
> 
> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>                                  unsigned int fragsz, gfp_t gfp_mask,
>                                  unsigned int align_mask)
> {
>         unsigned int size = page_frag_cache_page_size(nc->encoded_va);
>         unsigned int remaining;
> 
>         remaining = nc->remaining & align_mask;
>         if (unlikely(remaining < fragsz)) {
>                 if (unlikely(fragsz > PAGE_SIZE)) {
>                         /*
>                          * The caller is trying to allocate a fragment
>                          * with fragsz > PAGE_SIZE but the cache isn't big
>                          * enough to satisfy the request, this may
>                          * happen in low memory conditions.
>                          * We don't release the cache page because
>                          * it could make memory pressure worse
>                          * so we simply return NULL here.
>                          */
>                         return NULL;
>                 }
> 
>                 if (!__page_frag_cache_refill(nc, gfp_mask))
>                         return NULL;
> 
>                 size = page_frag_cache_page_size(nc->encoded_va);
>                 remaining = size;
>         }
> 
>         nc->pagecnt_bias--;
>         nc->remaining = remaining - fragsz;
> 
>         return encoded_page_address(nc->encoded_va) + (size - remaining);
> }
> 
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2024-07-17 12:31 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20240625135216.47007-1-linyunsheng@huawei.com>
2024-06-25 13:52 ` [PATCH net-next v9 01/13] mm: page_frag: add a test module for page_frag Yunsheng Lin
2024-06-25 13:52 ` [PATCH net-next v9 02/13] mm: move the page fragment allocator from page_alloc into its own file Yunsheng Lin
2024-07-01 23:10   ` Alexander H Duyck
2024-07-02 12:27     ` Yunsheng Lin
2024-06-25 13:52 ` [PATCH net-next v9 03/13] mm: page_frag: use initial zero offset for page_frag_alloc_align() Yunsheng Lin
2024-07-01 23:27   ` Alexander H Duyck
2024-07-02 12:28     ` Yunsheng Lin
2024-07-02 16:00       ` Alexander Duyck
2024-07-03 11:25         ` Yunsheng Lin
2024-06-25 13:52 ` [PATCH net-next v9 04/13] mm: page_frag: add '_va' suffix to page_frag API Yunsheng Lin
2024-06-25 13:52 ` [PATCH net-next v9 05/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly Yunsheng Lin
2024-06-25 13:52 ` [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc' Yunsheng Lin
2024-07-02  0:08   ` Alexander H Duyck
2024-07-02 12:35     ` Yunsheng Lin
2024-07-02 14:55       ` Alexander H Duyck
2024-07-03 12:33         ` Yunsheng Lin
2024-07-10 15:28           ` Alexander H Duyck
2024-07-11  8:16             ` Yunsheng Lin
2024-07-11 16:49               ` Alexander Duyck
2024-07-12  8:42                 ` Yunsheng Lin
2024-07-12 16:55                   ` Alexander Duyck
2024-07-13  5:20                     ` Yunsheng Lin
2024-07-13 16:55                       ` Alexander Duyck
     [not found]                         ` <12ff13d9-1f3d-4c1b-a972-2efb6f247e31@gmail.com>
2024-07-15 17:55                           ` Alexander Duyck
2024-07-16 12:58                             ` Yunsheng Lin
2024-07-17 12:31                               ` Yunsheng Lin
2024-06-25 13:52 ` [PATCH net-next v9 07/13] mm: page_frag: some minor refactoring before adding new API Yunsheng Lin
2024-07-02 15:30   ` Alexander H Duyck
2024-07-03 12:36     ` Yunsheng Lin
2024-06-25 13:52 ` [PATCH net-next v9 08/13] mm: page_frag: use __alloc_pages() to replace alloc_pages_node() Yunsheng Lin
2024-06-25 13:52 ` [PATCH net-next v9 10/13] mm: page_frag: introduce prepare/probe/commit API Yunsheng Lin
2024-06-28 22:35   ` Alexander H Duyck
2024-06-29 11:15     ` Yunsheng Lin
2024-06-29 17:37       ` Alexander Duyck
     [not found]         ` <0a80e362-1eb7-40b0-b1b9-07ec5a6506ea@gmail.com>
2024-06-30 14:35           ` Alexander Duyck
2024-06-30 15:05             ` Yunsheng Lin
2024-07-03 12:40               ` Yunsheng Lin
2024-07-07 17:12                 ` Alexander Duyck
2024-07-08 10:58                   ` Yunsheng Lin
2024-07-08 14:30                     ` Alexander Duyck
2024-07-09  6:57                       ` Yunsheng Lin
2024-07-09 13:40                         ` Alexander Duyck
2024-06-25 13:52 ` [PATCH net-next v9 12/13] mm: page_frag: update documentation for page_frag Yunsheng Lin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).