From: Chris Mason <chris.mason@fusionio.com>
To: Linux FS Devel <linux-fsdevel@vger.kernel.org>,
David Woodhouse <David.Woodhouse@intel.com>,
"dchinner@redhat.com" <dchinner@redhat.com>,
<bo.li.liu@oracle.com>
Subject: [PATCH RFC 2/2] skiplists for the IOMMU
Date: Thu, 2 May 2013 22:10:00 -0400
Message-ID: <20130503021000.GB4271@shiny.masoncoding.com>
In-Reply-To: <20130503020211.3599.72404@localhost.localdomain>
This converts the IOVA allocator in the Intel IOMMU from the rbtree
over to the skiplists introduced in patch 1. It is only for
experimental use, and it is only lightly tested.
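
The allocation path no longer walks a tree backwards under a single
lock. __alloc_and_insert_iova_range() picks a random starting hint,
which spreads concurrent allocators across the keyspace, and asks
skiplist_insert_hole() to claim a free range. -EAGAIN means someone
raced in and took the hole we found, so we pick a new hint and try
again.

Lookups run without the allocation lock, so a reader can find an iova
whose slot is already linked but whose pfn_lo/pfn_hi are not final
yet. Both fields start out as IOVA_PENDING_INIT and the reader spins
until the allocator publishes them. A minimal sketch of that
handshake, pulled out of the patch below purely for illustration:

	/* writer side, from __alloc_and_insert_iova_range() */
	new->pfn_lo = IOVA_PENDING_INIT;
	new->pfn_hi = IOVA_PENDING_INIT;
	smp_wmb();	/* pending marker visible before the slot is linked */

	/* ... skiplist_insert_hole() links new->slot ... */

	new->pfn_lo = new->slot.key;
	new->pfn_hi = new->slot.key + new->slot.size - 1;
	smp_wmb();	/* publish the final range */

	/* reader side, from find_iova() after skiplist_lookup() */
	iova = sl_slot_entry(slot, struct iova, slot);
	while (iova->pfn_lo == IOVA_PENDING_INIT ||
	       iova->pfn_hi == IOVA_PENDING_INIT) {
		cpu_relax();
		smp_rmb();	/* pick up the writer's stores */
	}

reserve_iova() and copy_reserved_iova() still iterate everything, but
they are only expected to run while there is no concurrent IO, so
they get away with much weaker locking (see the comments below).
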
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 0099667..32d2920 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1403,7 +1403,6 @@ static void iommu_detach_domain(struct dmar_domain *domain,
}
static struct iova_domain reserved_iova_list;
-static struct lock_class_key reserved_rbtree_key;
static int dmar_init_reserved_ranges(void)
{
@@ -1413,9 +1412,6 @@ static int dmar_init_reserved_ranges(void)
init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
- lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
- &reserved_rbtree_key);
-
/* IOAPIC ranges shouldn't be accessed by DMA */
iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
IOVA_PFN(IOAPIC_RANGE_END));
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 67da6cff..efd5518 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -18,182 +18,113 @@
*/
#include <linux/iova.h>
+#include <linux/random.h>
+#define IOVA_PENDING_INIT ((unsigned long) -1)
void
init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
{
- spin_lock_init(&iovad->iova_rbtree_lock);
- iovad->rbroot = RB_ROOT;
- iovad->cached32_node = NULL;
+ spin_lock_init(&iovad->del_skiplist_lock);
+ spin_lock_init(&iovad->skiplist_lock);
+ sl_init_list(&iovad->skiplist, GFP_ATOMIC);
iovad->dma_32bit_pfn = pfn_32bit;
}
-static struct rb_node *
-__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
+static unsigned long
+__get_cached_addr(struct iova_domain *iovad, unsigned long limit_pfn)
{
- if ((*limit_pfn != iovad->dma_32bit_pfn) ||
- (iovad->cached32_node == NULL))
- return rb_last(&iovad->rbroot);
- else {
- struct rb_node *prev_node = rb_prev(iovad->cached32_node);
- struct iova *curr_iova =
- container_of(iovad->cached32_node, struct iova, node);
- *limit_pfn = curr_iova->pfn_lo - 1;
- return prev_node;
- }
-}
-
-static void
-__cached_rbnode_insert_update(struct iova_domain *iovad,
- unsigned long limit_pfn, struct iova *new)
-{
- if (limit_pfn != iovad->dma_32bit_pfn)
- return;
- iovad->cached32_node = &new->node;
-}
+ unsigned long guess;
-static void
-__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
-{
- struct iova *cached_iova;
- struct rb_node *curr;
+ prandom_bytes(&guess, sizeof(guess));
- if (!iovad->cached32_node)
- return;
- curr = iovad->cached32_node;
- cached_iova = container_of(curr, struct iova, node);
-
- if (free->pfn_lo >= cached_iova->pfn_lo) {
- struct rb_node *node = rb_next(&free->node);
- struct iova *iova = container_of(node, struct iova, node);
-
- /* only cache if it's below 32bit pfn */
- if (node && iova->pfn_lo < iovad->dma_32bit_pfn)
- iovad->cached32_node = node;
- else
- iovad->cached32_node = NULL;
+ /* the skiplist code is fastest when we spread out the
+ * key range as much as possible. Instead of caching the
+ * last freed or allocated number, return random guesses
+ * in hopes of fanning out our locking attempts.
+ */
+ if (limit_pfn == iovad->dma_32bit_pfn) {
+ guess = guess % iovad->dma_32bit_pfn;
+ guess = max_t(unsigned long, guess, IOVA_START_PFN);
+ return guess;
+ } else {
+ guess = max_t(unsigned long, guess, IOVA_START_PFN);
+ guess = min_t(unsigned long, guess, (~0UL) >> 1);
+ return guess;
}
}
-/* Computes the padding size required, to make the
- * the start address naturally aligned on its size
- */
-static int
-iova_get_pad_size(int size, unsigned int limit_pfn)
-{
- unsigned int pad_size = 0;
- unsigned int order = ilog2(size);
-
- if (order)
- pad_size = (limit_pfn + 1) % (1 << order);
-
- return pad_size;
-}
-
static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long size, unsigned long limit_pfn,
struct iova *new, bool size_aligned)
{
- struct rb_node *prev, *curr = NULL;
unsigned long flags;
- unsigned long saved_pfn;
- unsigned int pad_size = 0;
-
- /* Walk the tree backwards */
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- saved_pfn = limit_pfn;
- curr = __get_cached_rbnode(iovad, &limit_pfn);
- prev = curr;
- while (curr) {
- struct iova *curr_iova = container_of(curr, struct iova, node);
-
- if (limit_pfn < curr_iova->pfn_lo)
- goto move_left;
- else if (limit_pfn < curr_iova->pfn_hi)
- goto adjust_limit_pfn;
- else {
- if (size_aligned)
- pad_size = iova_get_pad_size(size, limit_pfn);
- if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
- break; /* found a free slot */
- }
-adjust_limit_pfn:
- limit_pfn = curr_iova->pfn_lo - 1;
-move_left:
- prev = curr;
- curr = rb_prev(curr);
- }
+ unsigned long align = 1;
+ unsigned long hint;
- if (!curr) {
- if (size_aligned)
- pad_size = iova_get_pad_size(size, limit_pfn);
- if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- return -ENOMEM;
- }
- }
+ int ret;
- /* pfn_lo will point to size aligned address if size_aligned is set */
- new->pfn_lo = limit_pfn - (size + pad_size) + 1;
- new->pfn_hi = new->pfn_lo + size - 1;
-
- /* Insert the new_iova into domain rbtree by holding writer lock */
- /* Add new node and rebalance tree. */
- {
- struct rb_node **entry, *parent = NULL;
-
- /* If we have 'prev', it's a valid place to start the
- insertion. Otherwise, start from the root. */
- if (prev)
- entry = &prev;
- else
- entry = &iovad->rbroot.rb_node;
-
- /* Figure out where to put new node */
- while (*entry) {
- struct iova *this = container_of(*entry,
- struct iova, node);
- parent = *entry;
-
- if (new->pfn_lo < this->pfn_lo)
- entry = &((*entry)->rb_left);
- else if (new->pfn_lo > this->pfn_lo)
- entry = &((*entry)->rb_right);
- else
- BUG(); /* this should not happen */
- }
+ if (size_aligned)
+ align = size;
- /* Add new node and rebalance tree. */
- rb_link_node(&new->node, parent, entry);
- rb_insert_color(&new->node, &iovad->rbroot);
+ /*
+ * make sure that a lockless search into the skiplist
+ * understands we're still doing setup on this iova
+ */
+ new->pfn_lo = IOVA_PENDING_INIT;
+ new->pfn_hi = IOVA_PENDING_INIT;
+ smp_wmb();
+again:
+ local_irq_save(flags);
+ hint = __get_cached_addr(iovad, limit_pfn);
+ ret = skiplist_insert_hole(&iovad->skiplist,
+ hint,
+ limit_pfn, size, align,
+ &new->slot, GFP_ATOMIC);
+ /*
+ * skiplist_insert_hole returns -EAGAIN when it found a good
+ * spot but someone raced in and stole it.  If that
+ * happens, pick a new hint and try again
+ */
+ if (ret == -EAGAIN) {
+ local_irq_restore(flags);
+ goto again;
}
- __cached_rbnode_insert_update(iovad, saved_pfn, new);
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ /* we're fully inserted, set our lo/hi */
+ new->pfn_lo = new->slot.key;
+ new->pfn_hi = new->slot.key + new->slot.size - 1;
+ smp_wmb();
+ local_irq_restore(flags);
+ if (ret)
+ return -ENOMEM;
return 0;
}
-static void
-iova_insert_rbtree(struct rb_root *root, struct iova *iova)
+static int
+iova_insert_skiplist(struct sl_list *skiplist, struct iova *iova)
{
- struct rb_node **new = &(root->rb_node), *parent = NULL;
- /* Figure out where to put new node */
- while (*new) {
- struct iova *this = container_of(*new, struct iova, node);
- parent = *new;
-
- if (iova->pfn_lo < this->pfn_lo)
- new = &((*new)->rb_left);
- else if (iova->pfn_lo > this->pfn_lo)
- new = &((*new)->rb_right);
- else
- BUG(); /* this should not happen */
+ int ret;
+ int preload_token;
+ unsigned long flags;
+
+ local_irq_save(flags);
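+ /*
+ * a successful preload leaves preemption disabled; the
+ * preempt_enable() below pairs with it
+ */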
+ preload_token = skiplist_preload(skiplist, GFP_ATOMIC);
+ if (preload_token < 0) {
+ ret = preload_token;
+ local_irq_restore(flags);
+ goto out;
}
- /* Add new node and rebalance tree. */
- rb_link_node(&iova->node, parent, new);
- rb_insert_color(&iova->node, root);
+
+ iova->slot.key = iova->pfn_lo;
+ iova->slot.size = iova->pfn_hi - iova->pfn_lo + 1;
+
+ ret = skiplist_insert(skiplist, &iova->slot, preload_token);
+ local_irq_restore(flags);
+ preempt_enable();
+out:
+ return ret;
}
/**
@@ -245,53 +176,24 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
*/
struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
{
+ struct sl_slot *slot;
unsigned long flags;
- struct rb_node *node;
-
- /* Take the lock so that no other thread is manipulating the rbtree */
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- node = iovad->rbroot.rb_node;
- while (node) {
- struct iova *iova = container_of(node, struct iova, node);
-
- /* If pfn falls within iova's range, return iova */
- if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- /* We are not holding the lock while this iova
- * is referenced by the caller as the same thread
- * which called this function also calls __free_iova()
- * and it is by design that only one thread can possibly
- * reference a particular iova and hence no conflict.
- */
- return iova;
- }
+ struct iova *iova;
- if (pfn < iova->pfn_lo)
- node = node->rb_left;
- else if (pfn > iova->pfn_lo)
- node = node->rb_right;
+ local_irq_save(flags);
+ slot = skiplist_lookup(&iovad->skiplist, pfn, 1);
+ if (!slot) {
+ local_irq_restore(flags);
+ return NULL;
}
-
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- return NULL;
-}
-
-/**
- * __free_iova - frees the given iova
- * @iovad: iova domain in question.
- * @iova: iova in question.
- * Frees the given iova belonging to the giving domain
- */
-void
-__free_iova(struct iova_domain *iovad, struct iova *iova)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- __cached_rbnode_delete_update(iovad, iova);
- rb_erase(&iova->node, &iovad->rbroot);
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
- free_iova_mem(iova);
+ iova = sl_slot_entry(slot, struct iova, slot);
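+ /*
+ * the allocation side links the slot before pfn_lo/pfn_hi
+ * are final, so spin until it publishes them.  Pairs with
+ * the smp_wmb()s in __alloc_and_insert_iova_range()
+ */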
+ while (iova->pfn_lo == IOVA_PENDING_INIT ||
+ iova->pfn_hi == IOVA_PENDING_INIT) {
+ cpu_relax();
+ smp_rmb();
+ }
+ local_irq_restore(flags);
+ return iova;
}
/**
@@ -304,12 +206,29 @@ __free_iova(struct iova_domain *iovad, struct iova *iova)
void
free_iova(struct iova_domain *iovad, unsigned long pfn)
{
- struct iova *iova = find_iova(iovad, pfn);
- if (iova)
- __free_iova(iovad, iova);
+ struct iova *iova;
+ struct sl_slot *slot;
+ unsigned long flags;
+
+ local_irq_save(flags);
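+ /*
+ * skiplist_delete finds the slot covering pfn and unlinks
+ * it in one pass, no separate find + erase
+ */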
+ slot = skiplist_delete(&iovad->skiplist, pfn, 1);
+ local_irq_restore(flags);
+
+ if (!slot)
+ return;
+ iova = sl_slot_entry(slot, struct iova, slot);
+ free_iova_mem(iova);
}
+void
+__free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+ unsigned long pfn = iova->pfn_lo;
+ free_iova(iovad, pfn);
+}
+
+
/**
* put_iova_domain - destroys the iova doamin
* @iovad: - iova domain in question.
@@ -317,29 +236,37 @@ free_iova(struct iova_domain *iovad, unsigned long pfn)
*/
void put_iova_domain(struct iova_domain *iovad)
{
- struct rb_node *node;
+ struct sl_list *skiplist = &iovad->skiplist;
+ struct sl_node *p;
+ struct sl_leaf *leaf;
unsigned long flags;
+ struct iova *iova;
+ struct sl_slot *slot;
+ int i;
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- node = rb_first(&iovad->rbroot);
- while (node) {
- struct iova *iova = container_of(node, struct iova, node);
- rb_erase(node, &iovad->rbroot);
- free_iova_mem(iova);
- node = rb_first(&iovad->rbroot);
+ /*
+ * the skiplist code needs some helpers for iteration. For now
+ * roll our own
+ */
+ local_irq_save(flags);
+ spin_lock(&iovad->skiplist_lock);
+ sl_lock_node(skiplist->head);
+ p = skiplist->head->ptrs[0].next;
+ while (p) {
+ leaf = sl_entry(p);
+ for (i = 0; i < leaf->nr; i++) {
+ slot = leaf->ptrs[i];
+ iova = sl_slot_entry(slot, struct iova, slot);
+ free_iova_mem(iova);
+ }
+ p = leaf->node.ptrs[0].next;
+ sl_free_leaf(leaf);
}
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-}
-
-static int
-__is_range_overlap(struct rb_node *node,
- unsigned long pfn_lo, unsigned long pfn_hi)
-{
- struct iova *iova = container_of(node, struct iova, node);
-
- if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
- return 1;
- return 0;
+ /* FIXME call a helper here */
+ memset(skiplist->head->ptrs, 0, sl_node_size(SKIP_MAXLEVEL));
+ sl_unlock_node(skiplist->head);
+ spin_unlock(&iovad->skiplist_lock);
+ local_irq_restore(flags);
}
static struct iova *
@@ -347,6 +274,7 @@ __insert_new_range(struct iova_domain *iovad,
unsigned long pfn_lo, unsigned long pfn_hi)
{
struct iova *iova;
+ int ret;
iova = alloc_iova_mem();
if (!iova)
@@ -354,18 +282,16 @@ __insert_new_range(struct iova_domain *iovad,
iova->pfn_hi = pfn_hi;
iova->pfn_lo = pfn_lo;
- iova_insert_rbtree(&iovad->rbroot, iova);
- return iova;
-}
+ ret = iova_insert_skiplist(&iovad->skiplist, iova);
-static void
-__adjust_overlap_range(struct iova *iova,
- unsigned long *pfn_lo, unsigned long *pfn_hi)
-{
- if (*pfn_lo < iova->pfn_lo)
- iova->pfn_lo = *pfn_lo;
- if (*pfn_hi > iova->pfn_hi)
- *pfn_lo = iova->pfn_hi + 1;
+ if (ret == -ENOMEM) {
+ free_iova_mem(iova);
+ return NULL;
+ } else {
+ BUG_ON(ret);
+ }
+
+ return iova;
}
/**
@@ -380,32 +306,42 @@ struct iova *
reserve_iova(struct iova_domain *iovad,
unsigned long pfn_lo, unsigned long pfn_hi)
{
- struct rb_node *node;
+ struct sl_slot *slot;
unsigned long flags;
- struct iova *iova;
- unsigned int overlap = 0;
-
- spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
- if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
- iova = container_of(node, struct iova, node);
- __adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
- if ((pfn_lo >= iova->pfn_lo) &&
- (pfn_hi <= iova->pfn_hi))
- goto finish;
- overlap = 1;
-
- } else if (overlap)
- break;
- }
-
- /* We are here either because this is the first reserver node
- * or need to insert remaining non overlap addr range
+ struct iova *iova = NULL;
+ struct iova *found = NULL;
+ unsigned long size = pfn_hi - pfn_lo + 1;
+ unsigned long min_pfn = pfn_lo;
+ unsigned long max_pfn = pfn_hi;
+
+ /*
+ * this is not locking safe. It only happens while there are no
+ * concurrent IO requests (I hope!)
*/
- iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
-finish:
-
- spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ local_irq_save(flags);
+ spin_lock(&iovad->skiplist_lock);
+ while (1) {
+ /*
+ * really ugly, just delete anything overlapping and
+ * reinsert the new full range
+ */
+ slot = skiplist_delete(&iovad->skiplist, pfn_lo, size);
+ if (!slot)
+ break;
+
+ found = sl_slot_entry(slot, struct iova, slot);
+ while (found->pfn_lo == IOVA_PENDING_INIT ||
+ found->pfn_hi == IOVA_PENDING_INIT) {
+ cpu_relax();
+ smp_rmb();
+ }
+ min_pfn = min(found->pfn_lo, min_pfn);
+ max_pfn = max(found->pfn_hi, max_pfn);
+ free_iova_mem(found);
+ }
+ iova = __insert_new_range(iovad, min_pfn, max_pfn);
+ spin_unlock(&iovad->skiplist_lock);
+ local_irq_restore(flags);
return iova;
}
@@ -419,17 +355,33 @@ finish:
void
copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
{
+ struct sl_node *p;
+ struct sl_leaf *leaf;
unsigned long flags;
- struct rb_node *node;
-
- spin_lock_irqsave(&from->iova_rbtree_lock, flags);
- for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
- struct iova *iova = container_of(node, struct iova, node);
- struct iova *new_iova;
- new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
- if (!new_iova)
- printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
- iova->pfn_lo, iova->pfn_lo);
+ struct iova *iova;
+ struct iova *new_iova;
+ struct sl_slot *slot;
+ int i;
+
+ /*
+ * this is not locking safe. It only happens while there are no
+ * concurrent IO requests (I hope!)
+ */
+ local_irq_save(flags);
+ sl_lock_node(from->skiplist.head);
+ p = from->skiplist.head->ptrs[0].next;
+ while (p) {
+ leaf = sl_entry(p);
+ for (i = 0; i < leaf->nr; i++) {
+ slot = leaf->ptrs[i];
+ iova = sl_slot_entry(slot, struct iova, slot);
+ new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
+ if (!new_iova)
+ printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
+ iova->pfn_lo, iova->pfn_lo);
+ }
+ p = leaf->node.ptrs[0].next;
}
- spin_unlock_irqrestore(&from->iova_rbtree_lock, flags);
+ sl_unlock_node(from->skiplist.head);
+ local_irq_restore(flags);
}
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 76a0759..b514c18 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -13,7 +13,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/rbtree.h>
+#include <linux/skiplist.h>
#include <linux/dma-mapping.h>
/* IO virtual address start page frame number */
@@ -21,16 +21,16 @@
/* iova structure */
struct iova {
- struct rb_node node;
+ struct sl_slot slot;
unsigned long pfn_hi; /* IOMMU dish out addr hi */
unsigned long pfn_lo; /* IOMMU dish out addr lo */
};
/* holds all the iova translations for a domain */
struct iova_domain {
- spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
- struct rb_root rbroot; /* iova domain rbtree root */
- struct rb_node *cached32_node; /* Save last alloced node */
+ spinlock_t skiplist_lock;
+ spinlock_t del_skiplist_lock;
+ struct sl_list skiplist; /* iova domain skiplist */
unsigned long dma_32bit_pfn;
};
--
1.8.2