From: Jakub Kicinski <kuba@kernel.org>
To: netdev@vger.kernel.org
Cc: almasrymina@google.com, hawk@kernel.org,
ilias.apalodimas@linaro.org, edumazet@google.com,
dsahern@gmail.com, michael.chan@broadcom.com, willemb@google.com,
Jakub Kicinski <kuba@kernel.org>
Subject: [RFC 01/12] net: hack together some page sharing
Date: Fri, 7 Jul 2023 11:39:24 -0700 [thread overview]
Message-ID: <20230707183935.997267-2-kuba@kernel.org> (raw)
In-Reply-To: <20230707183935.997267-1-kuba@kernel.org>
Implement a simple buddy allocator with a fallback. It will be
used to split huge pages into smaller pools, and to fall back to
alloc_pages() if huge pages are exhausted.
This code will be used exclusively on slow paths and is generally
"not great" but it doesn't seem to immediately crash which is
good enough for now?
This patch contains a basic "coherent allocator" which splits 2M
coherently mapped pages into smaller chunks. Certain drivers
appear to allocate a few MB in single coherent pages which is not
great for IOTLB pressure (simple iperf test on bnxt with Rx backed
by huge pages goes from 170k IOTLB misses to 60k when using this).
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
include/net/dcalloc.h | 18 ++
net/core/Makefile | 2 +-
net/core/dcalloc.c | 390 ++++++++++++++++++++++++++++++++++++++++++
net/core/dcalloc.h | 93 ++++++++++
4 files changed, 502 insertions(+), 1 deletion(-)
create mode 100644 include/net/dcalloc.h
create mode 100644 net/core/dcalloc.c
create mode 100644 net/core/dcalloc.h
diff --git a/include/net/dcalloc.h b/include/net/dcalloc.h
new file mode 100644
index 000000000000..a85c59d7f844
--- /dev/null
+++ b/include/net/dcalloc.h
@@ -0,0 +1,18 @@
+#ifndef __NET_DCALLOC_H
+#define __NET_DCALLOC_H
+
+#include <linux/types.h>
+
+struct device;
+
+/* Opaque allocator handle; callers only ever hold a pointer to it. */
+struct dma_cocoa;
+
+/* Create an allocator bound to @dev. @gfp is used to allocate the handle.
+ * Returns NULL if the handle cannot be allocated.
+ */
+struct dma_cocoa *dma_cocoa_create(struct device *dev, gfp_t gfp);
+/* Drop the reference taken at creation, warn if allocations are still
+ * outstanding, and free the handle.
+ */
+void dma_cocoa_destroy(struct dma_cocoa *cocoa);
+
+/* Allocate zeroed memory of @size bytes rounded up to a power of two.
+ * The device address is returned through @dma. Returns NULL on failure.
+ */
+void *dma_cocoa_alloc(struct dma_cocoa *cocoa, unsigned long size,
+ dma_addr_t *dma, gfp_t gfp);
+/* Release memory obtained from dma_cocoa_alloc(). @size, @addr and @dma
+ * should be the values of the original allocation; a mismatch in @size
+ * or @dma only triggers a warning.
+ */
+void dma_cocoa_free(struct dma_cocoa *cocoa, unsigned long size, void *addr,
+ dma_addr_t dma);
+
+#endif
diff --git a/net/core/Makefile b/net/core/Makefile
index 731db2eaa610..3a98ad5d2b49 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -13,7 +13,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
fib_notifier.o xdp.o flow_offload.o gro.o \
- netdev-genl.o netdev-genl-gen.o gso.o
+ netdev-genl.o netdev-genl-gen.o gso.o dcalloc.o
obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
diff --git a/net/core/dcalloc.c b/net/core/dcalloc.c
new file mode 100644
index 000000000000..af9029018353
--- /dev/null
+++ b/net/core/dcalloc.c
@@ -0,0 +1,390 @@
+#include "dcalloc.h"
+
+#include <linux/dma-mapping.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+
+/* Report whether @sal still has users, i.e. its user count is non-zero. */
+static bool dma_sal_in_use(struct dma_slow_allocator *sal)
+{
+	unsigned int users = refcount_read(&sal->user_cnt);
+
+	return users != 0;
+}
+
+/* Make @shu describe the @size byte region at @addr / @dma and seed its
+ * buddy list with a single free buddy spanning the whole region.
+ * @gfp is used to allocate the buddy descriptor.
+ * Returns 0 on success or -ENOMEM, in which case @shu is left untouched.
+ */
+int dma_slow_huge_init(struct dma_slow_huge *shu, void *addr,
+		       unsigned int size, dma_addr_t dma, gfp_t gfp)
+{
+	struct dma_slow_buddy *whole = kzalloc(sizeof(*whole), gfp);
+
+	if (!whole)
+		return -ENOMEM;
+
+	/* kzalloc() leaves ->offset at 0, the start of the region */
+	whole->free = true;
+	whole->size = size;
+
+	shu->dma = dma;
+	shu->size = size;
+	shu->addr = addr;
+
+	INIT_LIST_HEAD(&shu->buddy_list);
+	list_add(&whole->list, &shu->buddy_list);
+
+	return 0;
+}
+
+/* Halve @bud: it keeps the lower half and a newly allocated free buddy
+ * takes the upper half, linked into the list directly after @bud.
+ * Returns @bud, or NULL (with @bud unchanged) if the descriptor for the
+ * upper half cannot be allocated with @gfp.
+ */
+static struct dma_slow_buddy *
+dma_slow_bud_split(struct dma_slow_buddy *bud, gfp_t gfp)
+{
+	unsigned int half = bud->size / 2;
+	struct dma_slow_buddy *upper;
+
+	upper = kzalloc(sizeof(*upper), gfp);
+	if (!upper)
+		return NULL;
+
+	upper->free = true;
+	upper->size = half;
+	upper->offset = bud->offset + half;
+
+	bud->size = half;
+	list_add(&upper->list, &bud->list);
+
+	return bud;
+}
+
+/* Merge the first pair of list-adjacent buddies of @shu that are both
+ * free, have equal size, and pass the offset test below. The earlier
+ * buddy doubles in size and the later one is unlinked and freed.
+ * Returns true if a merge happened, so callers can loop until false.
+ */
+static bool dma_slow_bud_coalesce(struct dma_slow_huge *shu)
+{
+	struct dma_slow_buddy *cur, *prev = NULL;
+
+	list_for_each_entry(cur, &shu->buddy_list, list) {
+		/* The mask test holds when every bit set in prev->offset
+		 * is also set in cur->offset.
+		 */
+		if (prev && prev->free && cur->free &&
+		    prev->size == cur->size &&
+		    (prev->offset & cur->offset) == prev->offset) {
+			prev->size *= 2;
+			list_del(&cur->list);
+			kfree(cur);
+			return true;
+		}
+		prev = cur;
+	}
+
+	return false;
+}
+
+/* Try to carve @size bytes out of @shu.
+ *
+ * Picks the smallest free buddy that can hold @size, then splits it in
+ * half repeatedly while a half would still be big enough. On success the
+ * chosen buddy is marked used, its device address is stored in @dma and
+ * its CPU address is returned.
+ *
+ * Returns NULL if @size is zero, @shu is too small, no free buddy fits,
+ * or a split fails to allocate with @gfp. After a failed split, buddies
+ * already split stay split and free.
+ */
+static void *
+__dma_sal_alloc_buddy(struct dma_slow_allocator *sal, struct dma_slow_huge *shu,
+		      unsigned int size, dma_addr_t *dma, gfp_t gfp)
+{
+	struct dma_slow_buddy *small_fit = NULL;
+	struct dma_slow_buddy *bud;
+
+	/* A zero @size would make the split loop below run until kzalloc()
+	 * fails, because "bud->size >= 0" is always true.
+	 */
+	if (!size || shu->size < size)
+		return NULL;
+
+	list_for_each_entry(bud, &shu->buddy_list, list) {
+		if (!bud->free || bud->size < size)
+			continue;
+
+		if (!small_fit || small_fit->size > bud->size)
+			small_fit = bud;
+		/* an exact fit cannot be improved on */
+		if (bud->size == size)
+			break;
+	}
+	if (!small_fit)
+		return NULL;
+	bud = small_fit;
+
+	while (bud->size >= size * 2) {
+		bud = dma_slow_bud_split(bud, gfp);
+		if (!bud)
+			return NULL;
+	}
+
+	bud->free = false;
+	*dma = shu->dma + bud->offset;
+	/* the CPU-side offset is the buddy offset scaled down by ->ptr_shf */
+	return shu->addr + (bud->offset >> sal->ops->ptr_shf);
+}
+
+/* Allocate @size bytes from the huge regions of @sal.
+ *
+ * Every region already on the list is tried first. If none can satisfy
+ * the request and the ops provide ->alloc_huge, a new region is obtained,
+ * put on the list, and the allocation is retried from it.
+ * Returns the CPU address (device address in @dma) or NULL.
+ */
+static void *
+dma_sal_alloc_buddy(struct dma_slow_allocator *sal, unsigned int size,
+		    dma_addr_t *dma, gfp_t gfp)
+{
+	struct dma_slow_huge *shu;
+	void *mem;
+	int err;
+
+	list_for_each_entry(shu, &sal->huge, huge) {
+		mem = __dma_sal_alloc_buddy(sal, shu, size, dma, gfp);
+		if (mem)
+			return mem;
+	}
+
+	if (!sal->ops->alloc_huge)
+		return NULL;
+
+	shu = kzalloc(sizeof(*shu), gfp);
+	if (!shu)
+		return NULL;
+
+	err = sal->ops->alloc_huge(sal, shu, size, gfp);
+	if (err) {
+		kfree(shu);
+		return NULL;
+	}
+
+	/* the new region stays on the list even if the retry below fails */
+	list_add(&shu->huge, &sal->huge);
+	return __dma_sal_alloc_buddy(sal, shu, size, dma, gfp);
+}
+
+/* Mark the buddy of @shu that starts at CPU address @addr as free.
+ *
+ * @size and @dma are only used for sanity checks: a mismatch with the
+ * buddy's recorded values, or freeing a buddy that is already free,
+ * produces a warning but the buddy is still marked free.
+ *
+ * Returns true if a buddy starting at @addr was found, false otherwise.
+ * NOTE(review): a double free also returns true, so dma_sal_free() drops
+ * a user reference for it as well.
+ */
+static bool
+__dma_sal_free_buddy(struct dma_slow_allocator *sal, struct dma_slow_huge *shu,
+		     void *addr, unsigned int size, dma_addr_t dma)
+{
+	struct dma_slow_buddy *bud;
+	dma_addr_t exp_dma;
+	void *exp_addr;
+
+	list_for_each_entry(bud, &shu->buddy_list, list) {
+		exp_dma = shu->dma + bud->offset;
+		exp_addr = shu->addr + (bud->offset >> sal->ops->ptr_shf);
+
+		if (exp_addr != addr)
+			continue;
+
+		if (exp_dma != dma || bud->size != size)
+			pr_warn("mep param mismatch: %u %u, %lu %lu\n",
+				bud->size, size, (ulong)exp_dma, (ulong)dma);
+		/* @size is unsigned, print it with %u like the message above */
+		if (bud->free)
+			pr_warn("double free: %u %lu\n", size, (ulong)dma);
+		bud->free = true;
+		return true;
+	}
+
+	return false;
+}
+
+/* Hand @shu back through ->free_huge once it is completely unused, i.e.
+ * its first buddy is free and spans the whole region. Otherwise do nothing.
+ */
+static void
+dma_slow_maybe_free_huge(struct dma_slow_allocator *sal,
+ struct dma_slow_huge *shu)
+{
+ struct dma_slow_buddy *bud;
+
+ bud = list_first_entry(&shu->buddy_list, typeof(*bud), list);
+ if (!bud->free || bud->size != shu->size)
+ return;
+
+ /* NOTE(review): presumably an allocator without ->alloc_huge got its
+  * regions some other way and keeps them for good - confirm. Note that
+  * ->free_huge itself is called below without a NULL check.
+  */
+ if (!sal->ops->alloc_huge)
+ return;
+
+ /* The last descriptor is freed without list_del(); the list head it
+  * hangs off lives in @shu, which is freed just below.
+  */
+ kfree(bud);
+
+ sal->ops->free_huge(sal, shu);
+ list_del(&shu->huge);
+ kfree(shu);
+}
+
+/* Free the buddy at @addr from whichever huge region of @sal owns it.
+ *
+ * @order is passed on as the size argument of the per-region free helper.
+ * After a successful free the region is coalesced as far as possible and
+ * released if it became entirely unused.
+ * Returns true if some region owned @addr, false otherwise.
+ */
+static bool
+dma_sal_free_buddy(struct dma_slow_allocator *sal, void *addr,
+		   unsigned int order, dma_addr_t dma)
+{
+	struct dma_slow_huge *shu;
+
+	list_for_each_entry(shu, &sal->huge, huge) {
+		if (!__dma_sal_free_buddy(sal, shu, addr, order, dma))
+			continue;
+
+		/* One merge per call is not efficient, but all of SAL is
+		 * on the config path.
+		 */
+		while (dma_slow_bud_coalesce(shu))
+			;
+
+		/* may free @shu; the walk is abandoned right after */
+		dma_slow_maybe_free_huge(sal, shu);
+		return true;
+	}
+
+	return false;
+}
+
+/* Allocate @size bytes through the ->alloc_fall hook of @sal and track
+ * the allocation on the fallback list.
+ * Returns the CPU address (device address in @dma), or NULL if either
+ * the tracking entry or the backing memory cannot be allocated.
+ */
+static void *
+dma_sal_alloc_fb(struct dma_slow_allocator *sal, unsigned int size,
+		 dma_addr_t *dma, gfp_t gfp)
+{
+	struct dma_slow_fall *entry = kzalloc(sizeof(*entry), gfp);
+	int err;
+
+	if (!entry)
+		return NULL;
+
+	entry->size = size;
+	err = sal->ops->alloc_fall(sal, entry, size, gfp);
+	if (err) {
+		kfree(entry);
+		return NULL;
+	}
+
+	list_add(&entry->fb, &sal->fallback);
+	*dma = entry->dma;
+
+	return entry->addr;
+}
+
+/* Free the fallback allocation of @sal whose CPU address is @addr.
+ *
+ * Warns and returns false if no such allocation is tracked. A @size or
+ * @dma that differs from the recorded values is warned about, but the
+ * allocation is still released through ->free_fall.
+ * Returns true if an allocation was released.
+ */
+static bool dma_sal_free_fb(struct dma_slow_allocator *sal, void *addr,
+			    unsigned int size, dma_addr_t dma)
+{
+	struct dma_slow_fall *cur, *match = NULL;
+
+	list_for_each_entry(cur, &sal->fallback, fb) {
+		if (cur->addr != addr)
+			continue;
+		match = cur;
+		break;
+	}
+
+	if (!match) {
+		pr_warn("free: address %px not found\n", addr);
+		return false;
+	}
+
+	if (match->size != size || match->dma != dma)
+		pr_warn("free: param mismatch: %u %u, %lu %lu\n",
+			match->size, size, (ulong)match->dma, (ulong)dma);
+
+	list_del(&match->fb);
+	sal->ops->free_fall(sal, match);
+	kfree(match);
+
+	return true;
+}
+
+/* Allocate @size bytes from @sal: huge regions first, then the fallback
+ * path. Each successful allocation takes a user reference on @sal.
+ * Returns the CPU address (device address in @dma) or NULL.
+ */
+void *dma_sal_alloc(struct dma_slow_allocator *sal, unsigned int size,
+		    dma_addr_t *dma, gfp_t gfp)
+{
+	void *mem = dma_sal_alloc_buddy(sal, size, dma, gfp);
+
+	if (!mem) {
+		mem = dma_sal_alloc_fb(sal, size, dma, gfp);
+		if (!mem)
+			return NULL;
+	}
+
+	dma_slow_get(sal);
+	return mem;
+}
+
+/* Free an allocation made with dma_sal_alloc(). The huge regions are
+ * searched first and the fallback list only if they do not own @addr.
+ * The user reference on @sal is dropped only when something was freed.
+ */
+void dma_sal_free(struct dma_slow_allocator *sal, void *addr,
+		  unsigned int size, dma_addr_t dma)
+{
+	bool released;
+
+	released = dma_sal_free_buddy(sal, addr, size, dma);
+	if (!released)
+		released = dma_sal_free_fb(sal, addr, size, dma);
+
+	if (released)
+		dma_slow_put(sal);
+}
+
+/* Initialise @sal for @dev with backend @ops: both tracking lists start
+ * empty and the user count starts at 1 (the owner's reference).
+ */
+void dma_sal_init(struct dma_slow_allocator *sal,
+		  const struct dma_slow_allocator_ops *ops,
+		  struct device *dev)
+{
+	INIT_LIST_HEAD(&sal->fallback);
+	INIT_LIST_HEAD(&sal->huge);
+
+	refcount_set(&sal->user_cnt, 1);
+
+	sal->dev = dev;
+	sal->ops = ops;
+}
+
+/*****************************
+ *** DMA COCOA allocator ***
+ *****************************/
+/* ->alloc_huge hook: back @shu with a 2M coherent allocation.
+ * Requests of 2M or more are refused so that they go to the fallback
+ * path. Returns 0 on success or -ENOMEM.
+ */
+static int
+dma_cocoa_alloc_huge(struct dma_slow_allocator *sal, struct dma_slow_huge *shu,
+		     unsigned int size, gfp_t gfp)
+{
+	if (size >= SZ_2M)
+		return -ENOMEM;
+
+	shu->addr = dma_alloc_coherent(sal->dev, SZ_2M, &shu->dma, gfp);
+	if (!shu->addr)
+		return -ENOMEM;
+
+	if (!dma_slow_huge_init(shu, shu->addr, SZ_2M, shu->dma, gfp))
+		return 0;
+
+	/* the buddy list could not be set up, give the memory back */
+	dma_free_coherent(sal->dev, SZ_2M, shu->addr, shu->dma);
+	return -ENOMEM;
+}
+
+/* ->free_huge hook: return the 2M coherent allocation backing @shu. */
+static void
+dma_cocoa_free_huge(struct dma_slow_allocator *sal, struct dma_slow_huge *shu)
+{
+ dma_free_coherent(sal->dev, SZ_2M, shu->addr, shu->dma);
+}
+
+/* ->alloc_fall hook: satisfy a request with a dedicated coherent
+ * allocation of @size bytes, recorded in @fb.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int
+dma_cocoa_alloc_fall(struct dma_slow_allocator *sal, struct dma_slow_fall *fb,
+		     unsigned int size, gfp_t gfp)
+{
+	void *cpu = dma_alloc_coherent(sal->dev, size, &fb->dma, gfp);
+
+	fb->addr = cpu;
+	return cpu ? 0 : -ENOMEM;
+}
+
+/* ->free_fall hook: release the coherent allocation recorded in @fb. */
+static void
+dma_cocoa_free_fall(struct dma_slow_allocator *sal, struct dma_slow_fall *fb)
+{
+ dma_free_coherent(sal->dev, fb->size, fb->addr, fb->dma);
+}
+
+/* Backend hooks of the coherent allocator. The table is only referenced
+ * from this file and never modified, hence static const.
+ * .ptr_shf is left at 0, so buddy offsets are applied to the CPU address
+ * unshifted; .release is left NULL.
+ */
+static const struct dma_slow_allocator_ops dma_cocoa_ops = {
+	.alloc_huge	= dma_cocoa_alloc_huge,
+	.free_huge	= dma_cocoa_free_huge,
+	.alloc_fall	= dma_cocoa_alloc_fall,
+	.free_fall	= dma_cocoa_free_fall,
+};
+
+/* struct dma_cocoa - coherent allocator handle, a thin wrapper around the
+ * generic slow allocator state.
+ */
+struct dma_cocoa {
+ struct dma_slow_allocator sal;
+};
+
+/* Allocate (with @gfp) and initialise a coherent allocator for @dev.
+ * Returns the new handle, or NULL if it cannot be allocated.
+ */
+struct dma_cocoa *dma_cocoa_create(struct device *dev, gfp_t gfp)
+{
+	struct dma_cocoa *cocoa = kzalloc(sizeof(*cocoa), gfp);
+
+	if (cocoa)
+		dma_sal_init(&cocoa->sal, &dma_cocoa_ops, dev);
+
+	return cocoa;
+}
+
+/* Tear down @cocoa: drop the initial reference, warn if the user count
+ * is still non-zero afterwards (allocations outstanding), then free the
+ * handle regardless.
+ */
+void dma_cocoa_destroy(struct dma_cocoa *cocoa)
+{
+	struct dma_slow_allocator *sal = &cocoa->sal;
+
+	dma_slow_put(sal);
+	WARN_ON(dma_sal_in_use(sal));
+
+	kfree(cocoa);
+}
+
+/* Allocate zeroed coherent memory from @cocoa.
+ *
+ * @size is rounded up to a power of two before being handed to the slow
+ * allocator. The device address is returned through @dma.
+ * Returns the CPU address, or NULL on failure or if @size is 0 or
+ * larger than 2G.
+ */
+void *dma_cocoa_alloc(struct dma_cocoa *cocoa, unsigned long size,
+		      dma_addr_t *dma, gfp_t gfp)
+{
+	void *addr;
+
+	/* roundup_pow_of_two(0) is undefined, and the slow allocator takes
+	 * an unsigned int size: anything above 2G would be truncated once
+	 * rounded up.
+	 */
+	if (!size || size > SZ_2G)
+		return NULL;
+
+	size = roundup_pow_of_two(size);
+	addr = dma_sal_alloc(&cocoa->sal, size, dma, gfp);
+	if (!addr)
+		return NULL;
+	/* chunks may be recycled from earlier allocations, so clear them */
+	memset(addr, 0, size);
+	return addr;
+}
+
+/* Free memory obtained from dma_cocoa_alloc().
+ *
+ * @size is rounded up to a power of two, mirroring the allocation side,
+ * so callers pass the size they originally asked for.
+ */
+void dma_cocoa_free(struct dma_cocoa *cocoa, unsigned long size, void *addr,
+		    dma_addr_t dma)
+{
+	size = roundup_pow_of_two(size);
+	/* no "return <expr>;" here: this function returns void */
+	dma_sal_free(&cocoa->sal, addr, size, dma);
+}
diff --git a/net/core/dcalloc.h b/net/core/dcalloc.h
new file mode 100644
index 000000000000..c7e75ef0cb81
--- /dev/null
+++ b/net/core/dcalloc.h
@@ -0,0 +1,93 @@
+#ifndef __DCALLOC_H
+#define __DCALLOC_H
+
+#include <linux/dma-mapping.h>
+#include <net/dcalloc.h>
+
+struct device;
+
+/* struct dma_slow_huge - AKA @shu, large block which will get chopped up */
+struct dma_slow_huge {
+ void *addr; /* CPU address of the start of the block */
+ unsigned int size; /* size of the whole block */
+ dma_addr_t dma; /* device address of the start of the block */
+
+ struct list_head huge; /* entry in dma_slow_allocator::huge */
+ struct list_head buddy_list; /* struct dma_slow_buddy */
+};
+
+/* Single allocation piece */
+struct dma_slow_buddy {
+ unsigned int offset; /* position of the piece within its huge block */
+ unsigned int size; /* size of the piece */
+
+ bool free; /* true while the piece is not handed out */
+
+ struct list_head list; /* entry in dma_slow_huge::buddy_list */
+};
+
+/* struct dma_slow_fall - AKA @fb, fallback when huge can't be allocated */
+struct dma_slow_fall {
+ void *addr; /* CPU address of the allocation */
+ unsigned int size; /* requested size, recorded before ->alloc_fall runs */
+ dma_addr_t dma; /* device address of the allocation */
+
+ struct list_head fb; /* entry in dma_slow_allocator::fallback */
+};
+
+/* struct dma_slow_allocator - AKA @sal, per device allocator */
+struct dma_slow_allocator {
+ const struct dma_slow_allocator_ops *ops; /* backend hooks */
+ struct device *dev; /* device the memory is allocated for */
+
+ /* NOTE(review): the code in dcalloc.c reads ops->ptr_shf, not this
+  * field - confirm whether this copy is needed.
+  */
+ unsigned int ptr_shf;
+ /* starts at 1 in dma_sal_init(), +1 for every live allocation */
+ refcount_t user_cnt;
+
+ struct list_head huge; /* struct dma_slow_huge */
+ struct list_head fallback; /* struct dma_slow_fall */
+};
+
+/* struct dma_slow_allocator_ops - backend hooks of a slow allocator
+ *
+ * @alloc_huge may be NULL, in which case no new huge blocks are obtained
+ * and existing ones are never released. @release may be NULL as well.
+ * @alloc_fall and @free_fall are called unconditionally.
+ */
+struct dma_slow_allocator_ops {
+	/* right shift applied to a buddy offset before it is added to the
+	 * CPU address of its huge block
+	 */
+	u8 ptr_shf;
+
+	/* fill in @shu with a new huge block; returns 0 or a negative errno */
+	int (*alloc_huge)(struct dma_slow_allocator *sal,
+			  struct dma_slow_huge *shu,
+			  unsigned int size, gfp_t gfp);
+	/* release the memory backing the huge block @shu */
+	void (*free_huge)(struct dma_slow_allocator *sal,
+			  struct dma_slow_huge *shu);
+	/* fill in @fb with a standalone allocation of @size bytes;
+	 * returns 0 or a negative errno
+	 */
+	int (*alloc_fall)(struct dma_slow_allocator *sal,
+			  struct dma_slow_fall *fb,
+			  unsigned int size, gfp_t gfp);
+	/* release the memory recorded in @fb */
+	void (*free_fall)(struct dma_slow_allocator *sal,
+			  struct dma_slow_fall *fb);
+
+	/* called by dma_slow_put() when the user count drops to zero */
+	void (*release)(struct dma_slow_allocator *sal);
+};
+
+/* Describe the region @addr / @dma of @size bytes in @shu and give it a
+ * single free buddy covering all of it. Returns 0 or -ENOMEM.
+ */
+int dma_slow_huge_init(struct dma_slow_huge *shu, void *addr,
+ unsigned int size, dma_addr_t dma, gfp_t gfp);
+
+/* Initialise @sal with backend @ops for @dev; the user count starts at 1. */
+void dma_sal_init(struct dma_slow_allocator *sal,
+ const struct dma_slow_allocator_ops *ops,
+ struct device *dev);
+
+/* Allocate @size bytes, from a huge block if possible, else via the
+ * fallback hook. Takes a user reference on success. Returns NULL on failure.
+ */
+void *dma_sal_alloc(struct dma_slow_allocator *sal, unsigned int size,
+ dma_addr_t *dma, gfp_t gfp);
+/* Free an allocation made by dma_sal_alloc() and drop its user reference. */
+void dma_sal_free(struct dma_slow_allocator *sal, void *addr,
+ unsigned int size, dma_addr_t dma);
+
+/* Take a user reference on @sal. */
+static inline void dma_slow_get(struct dma_slow_allocator *sal)
+{
+ refcount_inc(&sal->user_cnt);
+}
+
+/* Drop a user reference on @sal. When the last reference goes away the
+ * optional ->release hook is invoked.
+ */
+static inline void dma_slow_put(struct dma_slow_allocator *sal)
+{
+	if (refcount_dec_and_test(&sal->user_cnt) && sal->ops->release)
+		sal->ops->release(sal);
+}
+
+#endif
--
2.41.0
next prev parent reply other threads:[~2023-07-07 18:39 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-07-07 18:39 [RFC 00/12] net: huge page backed page_pool Jakub Kicinski
2023-07-07 18:39 ` Jakub Kicinski [this message]
2023-07-07 18:39 ` [RFC 02/12] net: create a 1G-huge-page-backed allocator Jakub Kicinski
2023-07-07 18:39 ` [RFC 03/12] net: page_pool: hide page_pool_release_page() Jakub Kicinski
2023-07-07 18:39 ` [RFC 04/12] net: page_pool: merge page_pool_release_page() with page_pool_return_page() Jakub Kicinski
2023-07-10 16:07 ` Jesper Dangaard Brouer
2023-07-07 18:39 ` [RFC 05/12] net: page_pool: factor out releasing DMA from releasing the page Jakub Kicinski
2023-07-07 18:39 ` [RFC 06/12] net: page_pool: create hooks for custom page providers Jakub Kicinski
2023-07-07 19:50 ` Mina Almasry
2023-07-07 22:28 ` Jakub Kicinski
2023-07-07 18:39 ` [RFC 07/12] net: page_pool: add huge page backed memory providers Jakub Kicinski
2023-07-07 18:39 ` [RFC 08/12] eth: bnxt: let the page pool manage the DMA mapping Jakub Kicinski
2023-07-10 10:12 ` Jesper Dangaard Brouer
2023-07-26 6:56 ` Ilias Apalodimas
2023-07-07 18:39 ` [RFC 09/12] eth: bnxt: use the page pool for data pages Jakub Kicinski
2023-07-10 4:22 ` Michael Chan
2023-07-10 17:04 ` Jakub Kicinski
2023-07-07 18:39 ` [RFC 10/12] eth: bnxt: make sure we make for recycle skbs before freeing them Jakub Kicinski
2023-07-07 18:39 ` [RFC 11/12] eth: bnxt: wrap coherent allocations into helpers Jakub Kicinski
2023-07-07 18:39 ` [RFC 12/12] eth: bnxt: hack in the use of MEP Jakub Kicinski
2023-07-07 19:45 ` [RFC 00/12] net: huge page backed page_pool Mina Almasry
2023-07-07 22:45 ` Jakub Kicinski
2023-07-10 17:31 ` Mina Almasry
2023-07-11 15:49 ` Jesper Dangaard Brouer
2023-07-12 0:08 ` Jakub Kicinski
2023-07-12 11:47 ` Yunsheng Lin
2023-07-12 12:43 ` Jesper Dangaard Brouer
2023-07-12 17:01 ` Jakub Kicinski
2023-07-14 13:05 ` Yunsheng Lin
2023-07-12 14:00 ` Jesper Dangaard Brouer
2023-07-12 17:19 ` Jakub Kicinski
2023-07-13 10:07 ` Jesper Dangaard Brouer
2023-07-13 16:27 ` Jakub Kicinski
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230707183935.997267-2-kuba@kernel.org \
--to=kuba@kernel.org \
--cc=almasrymina@google.com \
--cc=dsahern@gmail.com \
--cc=edumazet@google.com \
--cc=hawk@kernel.org \
--cc=ilias.apalodimas@linaro.org \
--cc=michael.chan@broadcom.com \
--cc=netdev@vger.kernel.org \
--cc=willemb@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.