From: Tony Battersby <tonyb@cybernetics.com>
To: Christoph Hellwig <hch@lst.de>,
Marek Szyprowski <m.szyprowski@samsung.com>,
Matthew Wilcox <willy@infradead.org>,
Sathya Prakash <sathya.prakash@broadcom.com>,
Chaitra P B <chaitra.basappa@broadcom.com>,
Suganath Prabu Subramani <suganath-prabu.subramani@broadcom.com>,
iommu@lists.linux-foundation.org, linux-mm@kvack.org,
linux-scsi <linux-scsi@vger.kernel.org>,
MPT-FusionLinux.pdl@broadcom.com
Subject: [PATCH 2/3] dmapool: improve scalability of dma_pool_free
Date: Thu, 26 Jul 2018 14:54:56 -0400
Message-ID: <1288e597-a67a-25b3-b7c6-db883ca67a25@cybernetics.com>

dma_pool_free() scales poorly when the pool contains many pages because
pool_find_page() does a linear scan of all allocated pages. Improve its
scalability by replacing the linear scan with a red-black tree lookup
keyed by DMA address. Each free then costs O(log n) instead of O(n) in
the number of allocated pages, so freeing every block in a pool that has
grown to n pages improves from O(n^2) to O(n * log n) overall.
Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
---
I moved some code from dma_pool_destroy() into pool_free_page() to avoid
duplicating the busy-page error handling between the two paths.
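
For context (illustrative only, not part of the patch): the slowdown shows up
in driver-style workloads that keep many blocks outstanding at once, so the
pool grows to a large number of pages. Below is a minimal sketch of such a
workload; the device pointer, block size, and EXAMPLE_NR_BUFS count are
made-up values for illustration, and only the standard <linux/dmapool.h>
API is assumed.

#include <linux/dmapool.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/errno.h>

#define EXAMPLE_NR_BUFS 4096	/* hypothetical: enough blocks to span many pool pages */

static int example_workload(struct device *dev)
{
	struct dma_pool *pool;
	void **vaddr;
	dma_addr_t *daddr;
	int i, n, ret = -ENOMEM;

	vaddr = kcalloc(EXAMPLE_NR_BUFS, sizeof(*vaddr), GFP_KERNEL);
	daddr = kcalloc(EXAMPLE_NR_BUFS, sizeof(*daddr), GFP_KERNEL);
	if (!vaddr || !daddr)
		goto out;

	/* 512-byte blocks: only a few fit per page, so the pool grows to
	 * hundreds of pages once thousands of blocks are in flight. */
	pool = dma_pool_create("example", dev, 512, 8, 0);
	if (!pool)
		goto out;

	for (n = 0; n < EXAMPLE_NR_BUFS; n++) {
		vaddr[n] = dma_pool_alloc(pool, GFP_KERNEL, &daddr[n]);
		if (!vaddr[n])
			break;
	}

	/*
	 * Before this patch, each dma_pool_free() walked the whole page
	 * list to find the page owning daddr[i], making this loop roughly
	 * quadratic in the number of pool pages; with the red-black tree
	 * each lookup is O(log n).
	 */
	for (i = 0; i < n; i++)
		dma_pool_free(pool, vaddr[i], daddr[i]);

	dma_pool_destroy(pool);
	ret = 0;
out:
	kfree(vaddr);
	kfree(daddr);
	return ret;
}
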
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -15,11 +15,12 @@
* Many older drivers still have their own code to do this.
*
* The current design of this allocator is fairly simple. The pool is
- * represented by the 'struct dma_pool' which keeps a doubly-linked list of
- * allocated pages. Each page in the page_list is split into blocks of at
- * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
- * list of free blocks within the page. Used blocks aren't tracked, but we
- * keep a count of how many are currently allocated from each page.
+ * represented by the 'struct dma_pool' which keeps a red-black tree of all
+ * allocated pages, keyed by DMA address for fast lookup when freeing.
+ * Each page in the page_tree is split into blocks of at least 'size' bytes.
+ * Free blocks are tracked in an unsorted singly-linked list of free blocks
+ * within the page. Used blocks aren't tracked, but we keep a count of how
+ * many are currently allocated from each page.
*
* The avail_page_list keeps track of pages that have one or more free blocks
* available to (re)allocate. Pages are moved in and out of avail_page_list
@@ -41,13 +42,14 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/wait.h>
+#include <linux/rbtree.h>
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
#define DMAPOOL_DEBUG 1
#endif
struct dma_pool { /* the pool */
- struct list_head page_list;
+ struct rb_root page_tree;
struct list_head avail_page_list;
spinlock_t lock;
size_t size;
@@ -59,7 +61,7 @@ struct dma_pool { /* the pool */
};
struct dma_page { /* cacheable header for 'allocation' bytes */
- struct list_head page_list;
+ struct rb_node page_node;
struct list_head avail_page_link;
void *vaddr;
dma_addr_t dma;
@@ -78,6 +80,7 @@ show_pools(struct device *dev, struct de
char *next;
struct dma_page *page;
struct dma_pool *pool;
+ struct rb_node *node;
next = buf;
size = PAGE_SIZE;
@@ -92,7 +95,10 @@ show_pools(struct device *dev, struct de
unsigned blocks = 0;
spin_lock_irq(&pool->lock);
- list_for_each_entry(page, &pool->page_list, page_list) {
+ for (node = rb_first(&pool->page_tree);
+ node;
+ node = rb_next(node)) {
+ page = rb_entry(node, struct dma_page, page_node);
pages++;
blocks += page->in_use;
}
@@ -169,7 +175,7 @@ struct dma_pool *dma_pool_create(const c
retval->dev = dev;
- INIT_LIST_HEAD(&retval->page_list);
+ retval->page_tree = RB_ROOT;
INIT_LIST_HEAD(&retval->avail_page_list);
spin_lock_init(&retval->lock);
retval->size = size;
@@ -210,6 +216,65 @@ struct dma_pool *dma_pool_create(const c
}
EXPORT_SYMBOL(dma_pool_create);
+/*
+ * Find the dma_page that manages the given DMA address.
+ */
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+ struct rb_node *node = pool->page_tree.rb_node;
+
+ while (node) {
+ struct dma_page *page =
+ container_of(node, struct dma_page, page_node);
+
+ if (dma < page->dma)
+ node = node->rb_left;
+ else if ((dma - page->dma) >= pool->allocation)
+ node = node->rb_right;
+ else
+ return page;
+ }
+ return NULL;
+}
+
+/*
+ * Insert a dma_page into the page_tree.
+ */
+static int pool_insert_page(struct dma_pool *pool, struct dma_page *new_page)
+{
+ dma_addr_t dma = new_page->dma;
+ struct rb_node **node = &(pool->page_tree.rb_node), *parent = NULL;
+
+ while (*node) {
+ struct dma_page *this_page =
+ container_of(*node, struct dma_page, page_node);
+
+ parent = *node;
+ if (dma < this_page->dma)
+ node = &((*node)->rb_left);
+ else if (likely((dma - this_page->dma) >= pool->allocation))
+ node = &((*node)->rb_right);
+ else {
+ /*
+ * A page that overlaps the new DMA range is already
+ * present in the tree. This should not happen.
+ */
+ WARN(1,
+ "%s: %s: DMA address overlap: old 0x%llx new 0x%llx len %zu\n",
+ pool->dev ? dev_name(pool->dev) : "(nodev)",
+ pool->name, (u64) this_page->dma, (u64) dma,
+ pool->allocation);
+ return -1;
+ }
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&new_page->page_node, parent, node);
+ rb_insert_color(&new_page->page_node, &pool->page_tree);
+
+ return 0;
+}
+
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
unsigned int offset = 0;
@@ -254,15 +319,36 @@ static inline bool is_page_busy(struct d
return page->in_use != 0;
}
-static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
+static void pool_free_page(struct dma_pool *pool,
+ struct dma_page *page,
+ bool destroying_pool)
{
- dma_addr_t dma = page->dma;
-
+ if (destroying_pool && is_page_busy(page)) {
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_destroy %s, %p busy\n",
+ pool->name, page->vaddr);
+ else
+ pr_err("dma_pool_destroy %s, %p busy\n",
+ pool->name, page->vaddr);
+ /* leak the still-in-use consistent memory */
+ } else {
#ifdef DMAPOOL_DEBUG
- memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+ memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
#endif
- dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
- list_del(&page->page_list);
+ dma_free_coherent(pool->dev,
+ pool->allocation,
+ page->vaddr,
+ page->dma);
+ }
+
+ /*
+ * If the pool is being destroyed, it is not safe to modify the
+ * page_tree while iterating over it, and it is also unnecessary since
+ * the whole tree will be discarded anyway.
+ */
+ if (!destroying_pool)
+ rb_erase(&page->page_node, &pool->page_tree);
list_del(&page->avail_page_link);
kfree(page);
}
@@ -277,6 +363,7 @@ static void pool_free_page(struct dma_po
*/
void dma_pool_destroy(struct dma_pool *pool)
{
+ struct dma_page *page, *tmp_page;
bool empty = false;
if (unlikely(!pool))
@@ -292,24 +379,11 @@ void dma_pool_destroy(struct dma_pool *p
device_remove_file(pool->dev, &dev_attr_pools);
mutex_unlock(&pools_reg_lock);
- while (!list_empty(&pool->page_list)) {
- struct dma_page *page;
- page = list_entry(pool->page_list.next,
- struct dma_page, page_list);
- if (is_page_busy(page)) {
- if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_destroy %s, %p busy\n",
- pool->name, page->vaddr);
- else
- pr_err("dma_pool_destroy %s, %p busy\n",
- pool->name, page->vaddr);
- /* leak the still-in-use consistent memory */
- list_del(&page->page_list);
- list_del(&page->avail_page_link);
- kfree(page);
- } else
- pool_free_page(pool, page);
+ rbtree_postorder_for_each_entry_safe(page,
+ tmp_page,
+ &pool->page_tree,
+ page_node) {
+ pool_free_page(pool, page, true);
}
kfree(pool);
@@ -353,7 +427,15 @@ void *dma_pool_alloc(struct dma_pool *po
spin_lock_irqsave(&pool->lock, flags);
- list_add(&page->page_list, &pool->page_list);
+ if (unlikely(pool_insert_page(pool, page))) {
+ /*
+ * This should not happen, so something must have gone horribly
+ * wrong. Instead of crashing, intentionally leak the memory
+ * and make for the exit.
+ */
+ spin_unlock_irqrestore(&pool->lock, flags);
+ return NULL;
+ }
list_add(&page->avail_page_link, &pool->avail_page_list);
ready:
page->in_use++;
@@ -400,19 +482,6 @@ void *dma_pool_alloc(struct dma_pool *po
}
EXPORT_SYMBOL(dma_pool_alloc);
-static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
-{
- struct dma_page *page;
-
- list_for_each_entry(page, &pool->page_list, page_list) {
- if (dma < page->dma)
- continue;
- if ((dma - page->dma) < pool->allocation)
- return page;
- }
- return NULL;
-}
-
/**
* dma_pool_free - put block back into dma pool
* @pool: the dma pool holding the block
@@ -484,7 +553,7 @@ void dma_pool_free(struct dma_pool *pool
page->offset = offset;
/*
* Resist a temptation to do
- * if (!is_page_busy(page)) pool_free_page(pool, page);
+ * if (!is_page_busy(page)) pool_free_page(pool, page, false);
* Better have a few empty pages hang around.
*/
spin_unlock_irqrestore(&pool->lock, flags);