From: Dave Chinner <david@fromorbit.com>
To: xfs@oss.sgi.com
Subject: [PATCH 17/18] xfs: convert buffer cache hash to rbtree
Date: Fri, 24 Sep 2010 22:31:15 +1000 [thread overview]
Message-ID: <1285331476-23015-18-git-send-email-david@fromorbit.com> (raw)
In-Reply-To: <1285331476-23015-1-git-send-email-david@fromorbit.com>
From: Dave Chinner <dchinner@redhat.com>
The buffer cache hash is showing typical hash scalability problems.
In large scale testing the number of cached items growing far larger
than the hash can efficiently handle. Hence we need to move to a
self-scaling cache indexing mechanism.
I have selected rbtrees for indexing becuse they can have O(log n)
search scalability, and insert and remove cost is not excessive,
even on large trees. Hence we should be able to cache large numbers
of buffers without incurring the excessive cache miss search
penalties that the hash is imposing on us.
To ensure we still have parallel access to the cache, we need
multiple trees. Rather than hashing the buffers by disk address to
select a tree, it seems more sensible to separate trees by typical
access patterns. Most operations use buffers from within a single AG
at a time, so rather than searching lots of different lists,
separate the buffer indexes out into per-AG rbtrees. This means that
searches during metadata operation have a much higher chance of
hitting cache resident nodes, and that updates of the tree are less
likely to disturb trees being accessed on other CPUs doing
independent operations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
fs/xfs/linux-2.6/xfs_buf.c | 138 +++++++++++++++++++++----------------------
fs/xfs/linux-2.6/xfs_buf.h | 8 +--
fs/xfs/xfs_ag.h | 4 +
fs/xfs/xfs_mount.c | 2 +
4 files changed, 75 insertions(+), 77 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 251bcdc..749d7d3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -188,7 +188,7 @@ _xfs_buf_initialize(
atomic_set(&bp->b_hold, 1);
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_list);
- INIT_LIST_HEAD(&bp->b_hash_list);
+ RB_CLEAR_NODE(&bp->b_rbnode);
init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
XB_SET_OWNER(bp);
bp->b_target = target;
@@ -262,8 +262,6 @@ xfs_buf_free(
{
trace_xfs_buf_free(bp, _RET_IP_);
- ASSERT(list_empty(&bp->b_hash_list));
-
if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
uint i;
@@ -422,8 +420,10 @@ _xfs_buf_find(
{
xfs_off_t range_base;
size_t range_length;
- xfs_bufhash_t *hash;
- xfs_buf_t *bp, *n;
+ struct xfs_perag *pag;
+ struct rb_node **rbp;
+ struct rb_node *parent;
+ xfs_buf_t *bp;
range_base = (ioff << BBSHIFT);
range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
ASSERT(!(range_length < (1 << btp->bt_sshift)));
ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
- hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
- spin_lock(&hash->bh_lock);
-
- list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
- ASSERT(btp == bp->b_target);
- if (bp->b_file_offset == range_base &&
- bp->b_buffer_length == range_length) {
+ /* get tree root */
+ pag = xfs_perag_get(btp->bt_mount,
+ xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+ /* walk tree */
+ spin_lock(&pag->pag_buf_lock);
+ rbp = &pag->pag_buf_tree.rb_node;
+ parent = NULL;
+ bp = NULL;
+ while (*rbp) {
+ parent = *rbp;
+ bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+ if (range_base < bp->b_file_offset)
+ rbp = &(*rbp)->rb_left;
+ else if (range_base > bp->b_file_offset)
+ rbp = &(*rbp)->rb_right;
+ else {
+ /*
+ * found a block offset match. If the range doesn't
+ * match, the only way this is allowed is if the buffer
+ * in the cache is stale and the transaction that made
+ * it stale has not yet committed. i.e. we are
+ * reallocating a busy extent. Skip this buffer and
+ * continue searching to the right for an exact match.
+ */
+ if (bp->b_buffer_length != range_length) {
+ ASSERT(bp->b_flags & XBF_STALE);
+ rbp = &(*rbp)->rb_right;
+ continue;
+ }
atomic_inc(&bp->b_hold);
goto found;
}
@@ -449,17 +472,21 @@ _xfs_buf_find(
if (new_bp) {
_xfs_buf_initialize(new_bp, btp, range_base,
range_length, flags);
- new_bp->b_hash = hash;
- list_add(&new_bp->b_hash_list, &hash->bh_list);
+ rb_link_node(&new_bp->b_rbnode, parent, rbp);
+ rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+ /* the buffer keeps the perag reference until it is freed */
+ new_bp->b_pag = pag;
+ spin_unlock(&pag->pag_buf_lock);
} else {
XFS_STATS_INC(xb_miss_locked);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
}
-
- spin_unlock(&hash->bh_lock);
return new_bp;
found:
- spin_unlock(&hash->bh_lock);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
/* Attempt to get the semaphore without sleeping,
* if this does not work then we need to drop the
@@ -809,27 +836,30 @@ void
xfs_buf_rele(
xfs_buf_t *bp)
{
- xfs_bufhash_t *hash = bp->b_hash;
+ struct xfs_perag *pag = bp->b_pag;
trace_xfs_buf_rele(bp, _RET_IP_);
- if (unlikely(!hash)) {
+ if (!pag) {
ASSERT(!bp->b_relse);
+ ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold))
xfs_buf_free(bp);
return;
}
+ ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
ASSERT(atomic_read(&bp->b_hold) > 0);
- if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+ if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
if (bp->b_relse) {
atomic_inc(&bp->b_hold);
- spin_unlock(&hash->bh_lock);
- (*(bp->b_relse)) (bp);
+ spin_unlock(&pag->pag_buf_lock);
+ bp->b_relse(bp);
} else {
ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
- list_del_init(&bp->b_hash_list);
- spin_unlock(&hash->bh_lock);
+ rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
xfs_buf_free(bp);
}
}
@@ -1429,56 +1459,24 @@ xfs_buf_iomove(
*/
void
xfs_wait_buftarg(
- xfs_buftarg_t *btp)
+ struct xfs_buftarg *btp)
{
- xfs_bufhash_t *hash;
- uint i;
+ struct xfs_perag *pag;
+ uint i;
- for (i = 0; i < (1 << btp->bt_hashshift); i++) {
- hash = &btp->bt_hash[i];
- spin_lock(&hash->bh_lock);
- while (!list_empty(&hash->bh_list)) {
- spin_unlock(&hash->bh_lock);
+ for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+ pag = xfs_perag_get(btp->bt_mount, i);
+ spin_lock(&pag->pag_buf_lock);
+ while (rb_first(&pag->pag_buf_tree)) {
+ spin_unlock(&pag->pag_buf_lock);
delay(100);
- spin_lock(&hash->bh_lock);
+ spin_lock(&pag->pag_buf_lock);
}
- spin_unlock(&hash->bh_lock);
- }
-}
-
-/*
- * Allocate buffer hash table for a given target.
- * For devices containing metadata (i.e. not the log/realtime devices)
- * we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
- xfs_buftarg_t *btp,
- int external)
-{
- unsigned int i;
-
- if (external) {
- btp->bt_hash = NULL;
- return;
- }
- btp->bt_hashshift = 12; /* 4096 buckets */
- btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
- sizeof(xfs_bufhash_t));
- for (i = 0; i < (1 << btp->bt_hashshift); i++) {
- spin_lock_init(&btp->bt_hash[i].bh_lock);
- INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
}
}
-STATIC void
-xfs_free_bufhash(
- xfs_buftarg_t *btp)
-{
- kmem_free_large(btp->bt_hash);
- btp->bt_hash = NULL;
-}
-
/*
* buftarg list for delwrite queue processing
*/
@@ -1511,7 +1509,6 @@ xfs_free_buftarg(
xfs_flush_buftarg(btp, 1);
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp);
- xfs_free_bufhash(btp);
iput(btp->bt_mapping->host);
/* Unregister the buftarg first so that we don't get a
@@ -1651,7 +1648,6 @@ xfs_alloc_buftarg(
goto error;
if (xfs_alloc_delwrite_queue(btp, fsname))
goto error;
- xfs_alloc_bufhash(btp, external);
return btp;
error:
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 1f109ce..c79882e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -135,10 +135,6 @@ typedef struct xfs_buftarg {
unsigned int bt_sshift;
size_t bt_smask;
- /* per device buffer hash table */
- uint bt_hashshift;
- xfs_bufhash_t *bt_hash;
-
/* per device delwri queue */
struct task_struct *bt_task;
struct list_head bt_list;
@@ -172,8 +168,8 @@ typedef struct xfs_buf {
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
xfs_buf_flags_t b_flags; /* status flags */
- struct list_head b_hash_list; /* hash table list */
- xfs_bufhash_t *b_hash; /* hash table list start */
+ struct rb_node b_rbnode; /* rbtree node */
+ struct xfs_perag *b_pag; /* contains rbtree root */
xfs_buftarg_t *b_target; /* buffer target (device) */
atomic_t b_hold; /* reference count */
xfs_daddr_t b_bn; /* block number for I/O */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index baeec83..63c7a1a 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -233,6 +233,10 @@ typedef struct xfs_perag {
struct mutex pag_ici_reclaim_lock; /* serialisation point */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
+ /* buffer cache index */
+ spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
+ struct rb_root pag_buf_tree; /* ordered tree of active buffers */
+
/* for rcu-safe freeing */
struct rcu_head rcu_head;
#endif
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 59859c3..cfa2fb4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -479,6 +479,8 @@ xfs_initialize_perag(
rwlock_init(&pag->pag_ici_lock);
mutex_init(&pag->pag_ici_reclaim_lock);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+ spin_lock_init(&pag->pag_buf_lock);
+ pag->pag_buf_tree = RB_ROOT;
if (radix_tree_preload(GFP_NOFS))
goto out_unwind;
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
next prev parent reply other threads:[~2010-09-24 12:30 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-09-24 12:30 [PATCH 0/18] xfs: metadata scalability V3 Dave Chinner
2010-09-24 12:30 ` [PATCH 01/18] xfs: force background CIL push under sustained load Dave Chinner
2010-09-24 12:31 ` [PATCH 02/18] xfs: reduce the number of CIL lock round trips during commit Dave Chinner
2010-09-24 12:31 ` [PATCH 03/18] xfs: remove debug assert for per-ag reference counting Dave Chinner
2010-09-24 12:31 ` [PATCH 04/18] xfs: lockless per-ag lookups Dave Chinner
2010-09-24 12:31 ` [PATCH 05/18] xfs: don't use vfs writeback for pure metadata modifications Dave Chinner
2010-09-25 23:42 ` Christoph Hellwig
2010-09-27 1:09 ` Dave Chinner
2010-09-24 12:31 ` [PATCH 06/18] xfs: rename xfs_buf_get_nodaddr to be more appropriate Dave Chinner
2010-09-24 12:31 ` [PATCH 07/18] xfs: introduced uncached buffer read primitve Dave Chinner
2010-09-24 12:31 ` [PATCH 08/18] xfs: store xfs_mount in the buftarg instead of in the xfs_buf Dave Chinner
2010-09-24 12:31 ` [PATCH 09/18] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
2010-09-24 12:31 ` [PATCH 10/18] xfs: use unhashed buffers for size checks Dave Chinner
2010-09-24 12:31 ` [PATCH 11/18] xfs: remove buftarg hash for external devices Dave Chinner
2010-09-24 12:31 ` [PATCH 12/18] xfs: split inode AG walking into separate code for reclaim Dave Chinner
2010-09-24 12:31 ` [PATCH 13/18] xfs: split out inode walk inode grabbing Dave Chinner
2010-09-25 16:31 ` Christoph Hellwig
2010-09-24 12:31 ` [PATCH 14/18] xfs: implement batched inode lookups for AG walking Dave Chinner
2010-09-25 16:32 ` Christoph Hellwig
2010-09-24 12:31 ` [PATCH 15/18] xfs: batch inode reclaim lookup Dave Chinner
2010-09-24 12:31 ` [PATCH 16/18] xfs: serialise inode reclaim within an AG Dave Chinner
2010-09-25 23:49 ` Christoph Hellwig
2010-09-27 0:56 ` Dave Chinner
2010-09-24 12:31 ` Dave Chinner [this message]
2010-09-24 12:31 ` [PATCH 18/18] xfs: pack xfs_buf structure more tightly Dave Chinner
-- strict thread matches above, loose matches on Subject: below --
2010-09-27 1:47 [PATCH 0/18] xfs: metadata scalability V4 Dave Chinner
2010-09-27 1:47 ` [PATCH 17/18] xfs: convert buffer cache hash to rbtree Dave Chinner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1285331476-23015-18-git-send-email-david@fromorbit.com \
--to=david@fromorbit.com \
--cc=xfs@oss.sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox