All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jakub Kicinski <kuba@kernel.org>
To: netdev@vger.kernel.org
Cc: almasrymina@google.com, hawk@kernel.org,
	ilias.apalodimas@linaro.org, edumazet@google.com,
	dsahern@gmail.com, michael.chan@broadcom.com, willemb@google.com,
	Jakub Kicinski <kuba@kernel.org>
Subject: [RFC 02/12] net: create a 1G-huge-page-backed allocator
Date: Fri,  7 Jul 2023 11:39:25 -0700	[thread overview]
Message-ID: <20230707183935.997267-3-kuba@kernel.org> (raw)
In-Reply-To: <20230707183935.997267-1-kuba@kernel.org>

Get 1G pages from CMA, driver will be able to sub-allocate from
those for individual queues. Each Rx queue (taking recycling into
account) needs 32MB - 128MB of memory. With 32 active queues even
with 2MB pages we'll end up using a lot of IOTLB entries.

There are some workarounds for 2MB pages like trying to sort
the buffers (i.e. making sure that the buffers used by the NIC
at any time belong to as few 2MB pages as possible). But 1G pages
seem so much simpler.

Grab 4 pages for now, the real thing will probably need some
Kconfigs and command line params. And a lot more uAPI in general.

Also IDK how to hook this properly into early init :(

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 arch/x86/kernel/setup.c |   6 +-
 include/net/dcalloc.h   |  10 ++
 net/core/dcalloc.c      | 225 ++++++++++++++++++++++++++++++++++++++++
 net/core/dcalloc.h      |   3 +
 4 files changed, 243 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fd975a4a5200..cc6acd1fa67a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -843,6 +843,8 @@ static void __init x86_report_nx(void)
 	}
 }
 
+int __init mep_cma_init(void);
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -1223,8 +1225,10 @@ void __init setup_arch(char **cmdline_p)
 	initmem_init();
 	dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
 
-	if (boot_cpu_has(X86_FEATURE_GBPAGES))
+	if (boot_cpu_has(X86_FEATURE_GBPAGES)) {
 		hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
+		mep_cma_init();
+	}
 
 	/*
 	 * Reserve memory for crash kernel after SRAT is parsed so that it
diff --git a/include/net/dcalloc.h b/include/net/dcalloc.h
index a85c59d7f844..21c0fcaaa163 100644
--- a/include/net/dcalloc.h
+++ b/include/net/dcalloc.h
@@ -15,4 +15,14 @@ void *dma_cocoa_alloc(struct dma_cocoa *cocoa, unsigned long size,
 void dma_cocoa_free(struct dma_cocoa *cocoa, unsigned long size, void *addr,
 		    dma_addr_t dma);
 
+struct mem_provider;
+
+struct mem_provider *mep_create(struct device *dev);
+void mep_destroy(struct mem_provider *mep);
+
+struct page *mep_alloc(struct mem_provider *mep, unsigned int order,
+		       dma_addr_t *dma, gfp_t gfp);
+void mep_free(struct mem_provider *mep, struct page *page,
+	      unsigned int order, dma_addr_t dma);
+
 #endif
diff --git a/net/core/dcalloc.c b/net/core/dcalloc.c
index af9029018353..821b9dbfb655 100644
--- a/net/core/dcalloc.c
+++ b/net/core/dcalloc.c
@@ -388,3 +388,228 @@ void dma_cocoa_free(struct dma_cocoa *cocoa, unsigned long size, void *addr,
 	size = roundup_pow_of_two(size);
 	return dma_sal_free(&cocoa->sal, addr, size, dma);
 }
+
+/*****************************
+ ***   DMA MEP allocator   ***
+ *****************************/
+
+#include <linux/cma.h>
+
+static struct cma *mep_cma;
+static int mep_err;
+
+int __init mep_cma_init(void);
+int __init mep_cma_init(void)
+{
+	int order_per_bit;
+
+	order_per_bit = min(30 - PAGE_SHIFT, MAX_ORDER - 1);
+	order_per_bit = min(order_per_bit, HUGETLB_PAGE_ORDER);
+
+	mep_err = cma_declare_contiguous_nid(0,		/* base */
+					     SZ_4G,	/* size */
+					     0,		/* limit */
+					     SZ_1G,	/* alignment */
+					     order_per_bit,  /* order_per_bit */
+					     false,	/* fixed */
+					     "net_mep",	/* name */
+					     &mep_cma,	/* res_cma */
+					     NUMA_NO_NODE);  /* nid */
+	if (mep_err)
+		pr_warn("Net MEP init failed: %d\n", mep_err);
+	else
+		pr_info("Net MEP reserved 4G of memory\n");
+
+	return 0;
+}
+
+/** ----- MEP (slow / ctrl) allocator ----- */
+
+void mp_huge_split(struct page *page, unsigned int order)
+{
+	int i;
+
+	split_page(page, order);
+	/* The subsequent pages have a poisoned next, and since we only
+	 * OR in the PP_SIGNATURE this will mess up PP detection.
+	 */
+	for (i = 0; i < (1 << order); i++)
+		page[i].pp_magic &= 3UL;
+}
+
+struct mem_provider {
+	struct dma_slow_allocator sal;
+
+	struct work_struct work;
+};
+
+static int
+dma_mep_alloc_fall(struct dma_slow_allocator *sal, struct dma_slow_fall *fb,
+		   unsigned int size, gfp_t gfp)
+{
+	int order = get_order(size);
+
+	fb->addr = alloc_pages(gfp, order);
+	if (!fb->addr)
+		return -ENOMEM;
+
+	fb->dma = dma_map_page_attrs(sal->dev, fb->addr, 0, size,
+				     DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+	if (dma_mapping_error(sal->dev, fb->dma)) {
+		put_page(fb->addr);
+		return -ENOMEM;
+	}
+
+	mp_huge_split(fb->addr, order);
+	return 0;
+}
+
+static void
+dma_mep_free_fall(struct dma_slow_allocator *sal, struct dma_slow_fall *fb)
+{
+	int order = get_order(fb->size);
+	struct page *page;
+	int i;
+
+	page = fb->addr;
+	dma_unmap_page_attrs(sal->dev, fb->dma, fb->size,
+			     DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+	for (i = 0; i < (1 << order); i++)
+		put_page(page + i);
+}
+
+static void mep_release_work(struct work_struct *work)
+{
+	struct mem_provider *mep;
+
+	mep = container_of(work, struct mem_provider, work);
+
+	while (!list_empty(&mep->sal.huge)) {
+		struct dma_slow_buddy *bud;
+		struct dma_slow_huge *shu;
+
+		shu = list_first_entry(&mep->sal.huge, typeof(*shu), huge);
+
+		dma_unmap_page_attrs(mep->sal.dev, shu->dma, SZ_1G,
+				     DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+		cma_release(mep_cma, shu->addr, SZ_1G / PAGE_SIZE);
+
+		bud = list_first_entry_or_null(&shu->buddy_list,
+					       typeof(*bud), list);
+		if (WARN_ON(!bud || bud->size != SZ_1G))
+			continue;
+		kfree(bud);
+
+		list_del(&shu->huge);
+		kfree(shu);
+	}
+	put_device(mep->sal.dev);
+	kfree(mep);
+}
+
+static void dma_mep_release(struct dma_slow_allocator *sal)
+{
+	struct mem_provider *mep;
+
+	mep = container_of(sal, struct mem_provider, sal);
+
+	INIT_WORK(&mep->work, mep_release_work);
+	schedule_work(&mep->work);
+}
+
+struct dma_slow_allocator_ops dma_mep_ops = {
+	.ptr_shf	= PAGE_SHIFT - order_base_2(sizeof(struct page)),
+
+	.alloc_fall	= dma_mep_alloc_fall,
+	.free_fall	= dma_mep_free_fall,
+
+	.release	= dma_mep_release,
+};
+
+struct mem_provider *mep_create(struct device *dev)
+{
+	struct mem_provider *mep;
+	int i;
+
+	mep = kzalloc(sizeof(*mep), GFP_KERNEL);
+	if (!mep)
+		return NULL;
+
+	dma_sal_init(&mep->sal, &dma_mep_ops, dev);
+	get_device(mep->sal.dev);
+
+	if (mep_err)
+		goto done;
+
+	/* Hardcoded for now */
+	for (i = 0; i < 2; i++) {
+		const unsigned int order = 30 - PAGE_SHIFT; /* 1G */
+		struct dma_slow_huge *shu;
+		struct page *page;
+
+		shu = kzalloc(sizeof(*shu), GFP_KERNEL);
+		if (!shu)
+			break;
+
+		page = cma_alloc(mep_cma, SZ_1G / PAGE_SIZE, order, false);
+		if (!page) {
+			pr_err("mep: CMA alloc failed\n");
+			goto err_free_shu;
+		}
+
+		shu->dma = dma_map_page_attrs(mep->sal.dev, page, 0,
+					      PAGE_SIZE << order,
+					      DMA_BIDIRECTIONAL,
+					      DMA_ATTR_SKIP_CPU_SYNC);
+		if (dma_mapping_error(mep->sal.dev, shu->dma)) {
+			pr_err("mep: DMA map failed\n");
+			goto err_free_page;
+		}
+
+		if (dma_slow_huge_init(shu, page, SZ_1G, shu->dma,
+				       GFP_KERNEL)) {
+			pr_err("mep: shu init failed\n");
+			goto err_unmap;
+		}
+
+		mp_huge_split(page, 30 - PAGE_SHIFT);
+
+		list_add(&shu->huge, &mep->sal.huge);
+		continue;
+
+err_unmap:
+		dma_unmap_page_attrs(mep->sal.dev, shu->dma, SZ_1G,
+				     DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+err_free_page:
+		put_page(page);
+err_free_shu:
+		kfree(shu);
+		break;
+	}
+done:
+	if (list_empty(&mep->sal.huge))
+		pr_warn("mep: no huge pages acquired\n");
+
+	return mep;
+}
+EXPORT_SYMBOL_GPL(mep_create);
+
+void mep_destroy(struct mem_provider *mep)
+{
+	dma_slow_put(&mep->sal);
+}
+EXPORT_SYMBOL_GPL(mep_destroy);
+
+struct page *mep_alloc(struct mem_provider *mep, unsigned int order,
+		       dma_addr_t *dma, gfp_t gfp)
+{
+	return dma_sal_alloc(&mep->sal, PAGE_SIZE << order, dma, gfp);
+}
+EXPORT_SYMBOL_GPL(mep_alloc);
+
+void mep_free(struct mem_provider *mep, struct page *page,
+	      unsigned int order, dma_addr_t dma)
+{
+	dma_sal_free(&mep->sal, page, PAGE_SIZE << order, dma);
+}
+EXPORT_SYMBOL_GPL(mep_free);
diff --git a/net/core/dcalloc.h b/net/core/dcalloc.h
index c7e75ef0cb81..2664f933c8e1 100644
--- a/net/core/dcalloc.h
+++ b/net/core/dcalloc.h
@@ -90,4 +90,7 @@ static inline void dma_slow_put(struct dma_slow_allocator *sal)
 		sal->ops->release(sal);
 }
 
+/* misc */
+void mp_huge_split(struct page *page, unsigned int order);
+
 #endif
-- 
2.41.0


  parent reply	other threads:[~2023-07-07 18:39 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-07 18:39 [RFC 00/12] net: huge page backed page_pool Jakub Kicinski
2023-07-07 18:39 ` [RFC 01/12] net: hack together some page sharing Jakub Kicinski
2023-07-07 18:39 ` Jakub Kicinski [this message]
2023-07-07 18:39 ` [RFC 03/12] net: page_pool: hide page_pool_release_page() Jakub Kicinski
2023-07-07 18:39 ` [RFC 04/12] net: page_pool: merge page_pool_release_page() with page_pool_return_page() Jakub Kicinski
2023-07-10 16:07   ` Jesper Dangaard Brouer
2023-07-07 18:39 ` [RFC 05/12] net: page_pool: factor out releasing DMA from releasing the page Jakub Kicinski
2023-07-07 18:39 ` [RFC 06/12] net: page_pool: create hooks for custom page providers Jakub Kicinski
2023-07-07 19:50   ` Mina Almasry
2023-07-07 22:28     ` Jakub Kicinski
2023-07-07 18:39 ` [RFC 07/12] net: page_pool: add huge page backed memory providers Jakub Kicinski
2023-07-07 18:39 ` [RFC 08/12] eth: bnxt: let the page pool manage the DMA mapping Jakub Kicinski
2023-07-10 10:12   ` Jesper Dangaard Brouer
2023-07-26  6:56     ` Ilias Apalodimas
2023-07-07 18:39 ` [RFC 09/12] eth: bnxt: use the page pool for data pages Jakub Kicinski
2023-07-10  4:22   ` Michael Chan
2023-07-10 17:04     ` Jakub Kicinski
2023-07-07 18:39 ` [RFC 10/12] eth: bnxt: make sure we make for recycle skbs before freeing them Jakub Kicinski
2023-07-07 18:39 ` [RFC 11/12] eth: bnxt: wrap coherent allocations into helpers Jakub Kicinski
2023-07-07 18:39 ` [RFC 12/12] eth: bnxt: hack in the use of MEP Jakub Kicinski
2023-07-07 19:45 ` [RFC 00/12] net: huge page backed page_pool Mina Almasry
2023-07-07 22:45   ` Jakub Kicinski
2023-07-10 17:31     ` Mina Almasry
2023-07-11 15:49 ` Jesper Dangaard Brouer
2023-07-12  0:08   ` Jakub Kicinski
2023-07-12 11:47     ` Yunsheng Lin
2023-07-12 12:43       ` Jesper Dangaard Brouer
2023-07-12 17:01         ` Jakub Kicinski
2023-07-14 13:05           ` Yunsheng Lin
2023-07-12 14:00     ` Jesper Dangaard Brouer
2023-07-12 17:19       ` Jakub Kicinski
2023-07-13 10:07         ` Jesper Dangaard Brouer
2023-07-13 16:27           ` Jakub Kicinski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230707183935.997267-3-kuba@kernel.org \
    --to=kuba@kernel.org \
    --cc=almasrymina@google.com \
    --cc=dsahern@gmail.com \
    --cc=edumazet@google.com \
    --cc=hawk@kernel.org \
    --cc=ilias.apalodimas@linaro.org \
    --cc=michael.chan@broadcom.com \
    --cc=netdev@vger.kernel.org \
    --cc=willemb@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.