* [PATCH 01/16] xfs: reduce the number of CIL lock round trips during commit
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 16:51 ` Christoph Hellwig
2010-09-22 19:57 ` Alex Elder
2010-09-22 6:44 ` [PATCH 02/16] xfs: remove debug assert for per-ag reference counting Dave Chinner
` (15 subsequent siblings)
16 siblings, 2 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
When committing a transaction, we do a CIL state lock round trip
on every single log vector we insert into the CIL. This is resulting
in the lock being as hot as the inode and dcache locks on 8-way
create workloads. Rework the insertion loops to bring the number
of lock round trips to one per transaction for log vectors, and one
more for the busy extents.
Also change the allocation of the log vector buffer not to zero it
as we copy over the entire allocated buffer anyway.
This patch also includes a structural cleanup to the CIL item
insertion provided by Christoph Hellwig.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/xfs/xfs_log_cil.c | 232 +++++++++++++++++++++++++++-----------------------
1 files changed, 127 insertions(+), 105 deletions(-)
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ed575fb..3c78a17 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -146,102 +146,6 @@ xlog_cil_init_post_recovery(
}
/*
- * Insert the log item into the CIL and calculate the difference in space
- * consumed by the item. Add the space to the checkpoint ticket and calculate
- * if the change requires additional log metadata. If it does, take that space
- * as well. Remove the amount of space we addded to the checkpoint ticket from
- * the current transaction ticket so that the accounting works out correctly.
- *
- * If this is the first time the item is being placed into the CIL in this
- * context, pin it so it can't be written to disk until the CIL is flushed to
- * the iclog and the iclog written to disk.
- */
-static void
-xlog_cil_insert(
- struct log *log,
- struct xlog_ticket *ticket,
- struct xfs_log_item *item,
- struct xfs_log_vec *lv)
-{
- struct xfs_cil *cil = log->l_cilp;
- struct xfs_log_vec *old = lv->lv_item->li_lv;
- struct xfs_cil_ctx *ctx = cil->xc_ctx;
- int len;
- int diff_iovecs;
- int iclog_space;
-
- if (old) {
- /* existing lv on log item, space used is a delta */
- ASSERT(!list_empty(&item->li_cil));
- ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
-
- len = lv->lv_buf_len - old->lv_buf_len;
- diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
- kmem_free(old->lv_buf);
- kmem_free(old);
- } else {
- /* new lv, must pin the log item */
- ASSERT(!lv->lv_item->li_lv);
- ASSERT(list_empty(&item->li_cil));
-
- len = lv->lv_buf_len;
- diff_iovecs = lv->lv_niovecs;
- IOP_PIN(lv->lv_item);
-
- }
- len += diff_iovecs * sizeof(xlog_op_header_t);
-
- /* attach new log vector to log item */
- lv->lv_item->li_lv = lv;
-
- spin_lock(&cil->xc_cil_lock);
- list_move_tail(&item->li_cil, &cil->xc_cil);
- ctx->nvecs += diff_iovecs;
-
- /*
- * If this is the first time the item is being committed to the CIL,
- * store the sequence number on the log item so we can tell
- * in future commits whether this is the first checkpoint the item is
- * being committed into.
- */
- if (!item->li_seq)
- item->li_seq = ctx->sequence;
-
- /*
- * Now transfer enough transaction reservation to the context ticket
- * for the checkpoint. The context ticket is special - the unit
- * reservation has to grow as well as the current reservation as we
- * steal from tickets so we can correctly determine the space used
- * during the transaction commit.
- */
- if (ctx->ticket->t_curr_res == 0) {
- /* first commit in checkpoint, steal the header reservation */
- ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
- ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
- ticket->t_curr_res -= ctx->ticket->t_unit_res;
- }
-
- /* do we need space for more log record headers? */
- iclog_space = log->l_iclog_size - log->l_iclog_hsize;
- if (len > 0 && (ctx->space_used / iclog_space !=
- (ctx->space_used + len) / iclog_space)) {
- int hdrs;
-
- hdrs = (len + iclog_space - 1) / iclog_space;
- /* need to take into account split region headers, too */
- hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
- ctx->ticket->t_unit_res += hdrs;
- ctx->ticket->t_curr_res += hdrs;
- ticket->t_curr_res -= hdrs;
- ASSERT(ticket->t_curr_res >= len);
- }
- ticket->t_curr_res -= len;
- ctx->space_used += len;
-
- spin_unlock(&cil->xc_cil_lock);
-}
-
-/*
* Format log item into a flat buffers
*
* For delayed logging, we need to hold a formatted buffer containing all the
@@ -286,7 +190,7 @@ xlog_cil_format_items(
len += lv->lv_iovecp[index].i_len;
lv->lv_buf_len = len;
- lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+ lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
ptr = lv->lv_buf;
for (index = 0; index < lv->lv_niovecs; index++) {
@@ -300,21 +204,136 @@ xlog_cil_format_items(
}
}
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+ struct log *log,
+ struct xfs_log_vec *lv,
+ int *len,
+ int *diff_iovecs)
+{
+ struct xfs_log_vec *old = lv->lv_item->li_lv;
+
+ if (old) {
+ /* existing lv on log item, space used is a delta */
+ ASSERT(!list_empty(&lv->lv_item->li_cil));
+ ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+ *len += lv->lv_buf_len - old->lv_buf_len;
+ *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
+ kmem_free(old->lv_buf);
+ kmem_free(old);
+ } else {
+ /* new lv, must pin the log item */
+ ASSERT(!lv->lv_item->li_lv);
+ ASSERT(list_empty(&lv->lv_item->li_cil));
+
+ *len += lv->lv_buf_len;
+ *diff_iovecs += lv->lv_niovecs;
+ IOP_PIN(lv->lv_item);
+
+ }
+
+ /* attach new log vector to log item */
+ lv->lv_item->li_lv = lv;
+
+ /*
+ * If this is the first time the item is being committed to the
+ * CIL, store the sequence number on the log item so we can
+ * tell in future commits whether this is the first checkpoint
+ * the item is being committed into.
+ */
+ if (!lv->lv_item->li_seq)
+ lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+
+/*
+ * Insert the log items into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we addded to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ */
static void
xlog_cil_insert_items(
struct log *log,
struct xfs_log_vec *log_vector,
- struct xlog_ticket *ticket,
- xfs_lsn_t *start_lsn)
+ struct xlog_ticket *ticket)
{
- struct xfs_log_vec *lv;
-
- if (start_lsn)
- *start_lsn = log->l_cilp->xc_ctx->sequence;
+ struct xfs_cil *cil = log->l_cilp;
+ struct xfs_cil_ctx *ctx = cil->xc_ctx;
+ struct xfs_log_vec *lv;
+ int len = 0;
+ int diff_iovecs = 0;
+ int iclog_space;
ASSERT(log_vector);
+
+ /*
+ * Do all the accounting aggregation and switching of log vectors
+ * around in a separate loop to the insertion of items into the CIL.
+ * Then we can do a separate loop to update the CIL within a single
+ * lock/unlock pair. This reduces the number of round trips on the CIL
+ * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
+ * hold time for the transaction commit.
+ *
+ * If this is the first time the item is being placed into the CIL in
+ * this context, pin it so it can't be written to disk until the CIL is
+ * flushed to the iclog and the iclog written to disk.
+ *
+ * We can do this safely because the context can't checkpoint until we
+ * are done so it doesn't matter exactly how we update the CIL.
+ */
+ for (lv = log_vector; lv; lv = lv->lv_next)
+ xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+
+ /* account for space used by new iovec headers */
+ len += diff_iovecs * sizeof(xlog_op_header_t);
+
+ spin_lock(&cil->xc_cil_lock);
+
+ /* move the items to the tail of the CIL */
for (lv = log_vector; lv; lv = lv->lv_next)
- xlog_cil_insert(log, ticket, lv->lv_item, lv);
+ list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+
+ ctx->nvecs += diff_iovecs;
+
+ /*
+ * Now transfer enough transaction reservation to the context ticket
+ * for the checkpoint. The context ticket is special - the unit
+ * reservation has to grow as well as the current reservation as we
+ * steal from tickets so we can correctly determine the space used
+ * during the transaction commit.
+ */
+ if (ctx->ticket->t_curr_res == 0) {
+ /* first commit in checkpoint, steal the header reservation */
+ ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+ ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+ ticket->t_curr_res -= ctx->ticket->t_unit_res;
+ }
+
+ /* do we need space for more log record headers? */
+ iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+ if (len > 0 && (ctx->space_used / iclog_space !=
+ (ctx->space_used + len) / iclog_space)) {
+ int hdrs;
+
+ hdrs = (len + iclog_space - 1) / iclog_space;
+ /* need to take into account split region headers, too */
+ hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+ ctx->ticket->t_unit_res += hdrs;
+ ctx->ticket->t_curr_res += hdrs;
+ ticket->t_curr_res -= hdrs;
+ ASSERT(ticket->t_curr_res >= len);
+ }
+ ticket->t_curr_res -= len;
+ ctx->space_used += len;
+
+ spin_unlock(&cil->xc_cil_lock);
}
static void
@@ -632,7 +651,10 @@ xfs_log_commit_cil(
/* lock out background commit */
down_read(&log->l_cilp->xc_ctx_lock);
- xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
+ if (commit_lsn)
+ *commit_lsn = log->l_cilp->xc_ctx->sequence;
+
+ xlog_cil_insert_items(log, log_vector, tp->t_ticket);
/* check we didn't blow the reservation */
if (tp->t_ticket->t_curr_res < 0)
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* [PATCH 02/16] xfs: remove debug assert for per-ag reference counting
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
2010-09-22 6:44 ` [PATCH 01/16] xfs: reduce the number of CIL lock round trips during commit Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 6:44 ` [PATCH 03/16] xfs: lockless per-ag lookups Dave Chinner
` (14 subsequent siblings)
16 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
When we start taking references per cached buffer to the perag
it is cached on, it will blow the current debug maximum reference
count assert out of the water. The assert has never caught a bug,
and we have tracing to track changes if there ever is a problem,
so just remove it.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
fs/xfs/xfs_mount.c | 2 --
1 files changed, 0 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aeb9d72..00c7a87 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -210,8 +210,6 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
pag = radix_tree_lookup(&mp->m_perag_tree, agno);
if (pag) {
ASSERT(atomic_read(&pag->pag_ref) >= 0);
- /* catch leaks in the positive direction during testing */
- ASSERT(atomic_read(&pag->pag_ref) < 1000);
ref = atomic_inc_return(&pag->pag_ref);
}
spin_unlock(&mp->m_perag_lock);
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* [PATCH 03/16] xfs: lockless per-ag lookups
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
2010-09-22 6:44 ` [PATCH 01/16] xfs: reduce the number of CIL lock round trips during commit Dave Chinner
2010-09-22 6:44 ` [PATCH 02/16] xfs: remove debug assert for per-ag reference counting Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 6:44 ` [PATCH 04/16] xfs: don't use vfs writeback for pure metadata modifications Dave Chinner
` (13 subsequent siblings)
16 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
When we start taking a reference to the per-ag for every cached
buffer in the system, kernel lockstat profiling on an 8-way create
workload shows the mp->m_perag_lock has higher acquisition rates
than the inode lock and has significantly more contention. That is,
it becomes the highest contended lock in the system.
The perag lookup is trivial to convert to lock-less RCU lookups
because perag structures never go away. Hence the only thing we need
to protect against is tree structure changes during a grow. This can
be done simply by replacing the locking in xfs_perag_get() with RCU
read locking. This removes the mp->m_perag_lock completely from this
path.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
fs/xfs/linux-2.6/xfs_sync.c | 6 +++---
fs/xfs/xfs_ag.h | 3 +++
fs/xfs/xfs_mount.c | 25 +++++++++++++++++--------
3 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index d59c4a6..ddeaff9 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -150,17 +150,17 @@ xfs_inode_ag_iter_next_pag(
int found;
int ref;
- spin_lock(&mp->m_perag_lock);
+ rcu_read_lock();
found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
(void **)&pag, *first, 1, tag);
if (found <= 0) {
- spin_unlock(&mp->m_perag_lock);
+ rcu_read_unlock();
return NULL;
}
*first = pag->pag_agno + 1;
/* open coded pag reference increment */
ref = atomic_inc_return(&pag->pag_ref);
- spin_unlock(&mp->m_perag_lock);
+ rcu_read_unlock();
trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
} else {
pag = xfs_perag_get(mp, *first);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4e..51c42c2 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,9 @@ typedef struct xfs_perag {
rwlock_t pag_ici_lock; /* incore inode lock */
struct radix_tree_root pag_ici_root; /* incore inode cache root */
int pag_ici_reclaimable; /* reclaimable inodes */
+
+ /* for rcu-safe freeing */
+ struct rcu_head rcu_head;
#endif
int pagb_count; /* pagb slots in use */
} xfs_perag_t;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 00c7a87..14fc6e9 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -199,6 +199,8 @@ xfs_uuid_unmount(
/*
* Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
*/
struct xfs_perag *
xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -206,13 +208,13 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
struct xfs_perag *pag;
int ref = 0;
- spin_lock(&mp->m_perag_lock);
+ rcu_read_lock();
pag = radix_tree_lookup(&mp->m_perag_tree, agno);
if (pag) {
ASSERT(atomic_read(&pag->pag_ref) >= 0);
ref = atomic_inc_return(&pag->pag_ref);
}
- spin_unlock(&mp->m_perag_lock);
+ rcu_read_unlock();
trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
return pag;
}
@@ -227,10 +229,18 @@ xfs_perag_put(struct xfs_perag *pag)
trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
}
+STATIC void
+__xfs_free_perag(
+ struct rcu_head *head)
+{
+ struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+
+ ASSERT(atomic_read(&pag->pag_ref) == 0);
+ kmem_free(pag);
+}
+
/*
- * Free up the resources associated with a mount structure. Assume that
- * the structure was initially zeroed, so we can tell which fields got
- * initialized.
+ * Free up the per-ag resources associated with the mount structure.
*/
STATIC void
xfs_free_perag(
@@ -242,10 +252,9 @@ xfs_free_perag(
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
spin_lock(&mp->m_perag_lock);
pag = radix_tree_delete(&mp->m_perag_tree, agno);
- ASSERT(pag);
- ASSERT(atomic_read(&pag->pag_ref) == 0);
spin_unlock(&mp->m_perag_lock);
- kmem_free(pag);
+ ASSERT(pag);
+ call_rcu(&pag->rcu_head, __xfs_free_perag);
}
}
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* [PATCH 04/16] xfs: don't use vfs writeback for pure metadata modifications
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (2 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 03/16] xfs: lockless per-ag lookups Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 17:24 ` Christoph Hellwig
2010-09-23 16:19 ` Alex Elder
2010-09-22 6:44 ` [PATCH 05/16] xfs: rename xfs_buf_get_nodaddr to be more appropriate Dave Chinner
` (12 subsequent siblings)
16 siblings, 2 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
Under heavy multi-way parallel create workloads, the VFS struggles
to write back all the inodes that have been changed in age order.
The bdi flusher thread becomes CPU bound, spending 85% of its time
in the VFS code, mostly traversing the superblock dirty inode list
to separate dirty inodes old enough to flush.
We already keep an index of all metadata changes in age order - in
the AIL - and continued log pressure will do age ordered writeback
without any extra overhead at all. If there is no pressure on the
log, the xfssyncd will periodically write back metadata in ascending
disk address offset order so will be very efficient.
Hence we can stop marking VFS inodes dirty during transaction commit
or when changing timestamps during transactions. This will keep the
inodes in the superblock dirty list to those containing data or
unlogged metadata changes.
However, the timestamp changes are slightly more complex than this -
there are a couple of places that do unlogged updates of the
timestamps, and the VFS needs to be informed of these. Hence add a
new function xfs_trans_inode_chgtime() for transactional changes,
and leave xfs_ichgtime() for the non-transactional changes.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/xfs/linux-2.6/xfs_ioctl.c | 2 +-
fs/xfs/linux-2.6/xfs_iops.c | 55 ++++++++++++++++++++++++++++-----------
fs/xfs/quota/xfs_qm_syscalls.c | 2 +-
fs/xfs/xfs_attr.c | 36 +++++++++++++-------------
fs/xfs/xfs_inode.h | 1 +
fs/xfs/xfs_inode_item.c | 9 ------
fs/xfs/xfs_rename.c | 12 ++++++---
fs/xfs/xfs_utils.c | 4 +-
fs/xfs/xfs_vnodeops.c | 10 +++---
9 files changed, 75 insertions(+), 56 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 03aa908..10206be 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1088,8 +1088,8 @@ xfs_ioctl_setattr(
xfs_diflags_to_linux(ip);
}
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
XFS_STATS_INC(xs_ig_attrchg);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index b1fc2a6..37918f4 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -96,40 +96,63 @@ xfs_mark_inode_dirty(
/*
* Change the requested timestamp in the given inode.
- * We don't lock across timestamp updates, and we don't log them but
- * we do record the fact that there is dirty information in core.
*/
-void
-xfs_ichgtime(
- xfs_inode_t *ip,
- int flags)
+static int
+xfs_ichgtime_int(
+ struct xfs_inode *ip,
+ int flags)
{
- struct inode *inode = VFS_I(ip);
- timespec_t tv;
- int sync_it = 0;
+ struct inode *inode = VFS_I(ip);
+ timespec_t tv;
+ int dirty = 0;
tv = current_fs_time(inode->i_sb);
if ((flags & XFS_ICHGTIME_MOD) &&
!timespec_equal(&inode->i_mtime, &tv)) {
inode->i_mtime = tv;
- sync_it = 1;
+ dirty = 1;
}
if ((flags & XFS_ICHGTIME_CHG) &&
!timespec_equal(&inode->i_ctime, &tv)) {
inode->i_ctime = tv;
- sync_it = 1;
+ dirty = 1;
}
+ return dirty;
+}
- /*
- * Update complete - now make sure everyone knows that the inode
- * is dirty.
- */
- if (sync_it)
+/*
+ * Non-transactional inode timestamp update. Does not require locks to be held,
+ * and marks the inode dirty at the VFS level so that the change is not lost.
+ */
+void
+xfs_ichgtime(
+ struct xfs_inode *ip,
+ int flags)
+{
+ if (xfs_ichgtime_int(ip, flags))
xfs_mark_inode_dirty_sync(ip);
}
/*
+ * Transactional inode timestamp update. requires inod to be locked and joined
+ * to the transaction supplied. Relies on the transaction subsystem to track
+ * dirty state and update/writeback the inode accordingly.
+ */
+void
+xfs_trans_ichgtime(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int flags)
+{
+ ASSERT(tp);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(ip->i_transp == tp);
+
+ xfs_ichgtime_int(ip, flags);
+}
+
+/*
* Hook in SELinux. This is not quite correct yet, what we really need
* here (as we do for default ACLs) is a mechanism by which creation of
* these attrs can be journalled at inode creation time (along with the
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 45e5849..7a71336 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -276,7 +276,7 @@ xfs_qm_scall_trunc_qfile(
goto out_unlock;
}
- xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
out_unlock:
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index c256824..1effc19 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -355,16 +355,18 @@ xfs_attr_set_int(
if (mp->m_flags & XFS_MOUNT_WSYNC) {
xfs_trans_set_sync(args.trans);
}
- err2 = xfs_trans_commit(args.trans,
- XFS_TRANS_RELEASE_LOG_RES);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
/*
* Hit the inode change time.
*/
if (!error && (flags & ATTR_KERNOTIME) == 0) {
- xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(args.trans, dp,
+ XFS_ICHGTIME_CHG);
}
+ err2 = xfs_trans_commit(args.trans,
+ XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
return(error == 0 ? err2 : error);
}
@@ -421,19 +423,18 @@ xfs_attr_set_int(
}
/*
+ * Hit the inode change time.
+ */
+ if ((flags & ATTR_KERNOTIME) == 0)
+ xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
+ /*
* Commit the last in the sequence of transactions.
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- /*
- * Hit the inode change time.
- */
- if (!error && (flags & ATTR_KERNOTIME) == 0) {
- xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
- }
-
return(error);
out:
@@ -568,19 +569,18 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
}
/*
+ * Hit the inode change time.
+ */
+ if ((flags & ATTR_KERNOTIME) == 0)
+ xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
+ /*
* Commit the last in the sequence of transactions.
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- /*
- * Hit the inode change time.
- */
- if (!error && (flags & ATTR_KERNOTIME) == 0) {
- xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
- }
-
return(error);
out:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0898c54..37deff1 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -472,6 +472,7 @@ void xfs_iext_realloc(xfs_inode_t *, int, int);
void xfs_iunpin_wait(xfs_inode_t *);
int xfs_iflush(xfs_inode_t *, uint);
void xfs_ichgtime(xfs_inode_t *, int);
+void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fe00777..c7ac020 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -223,15 +223,6 @@ xfs_inode_item_format(
nvecs = 1;
/*
- * Make sure the linux inode is dirty. We do this before
- * clearing i_update_core as the VFS will call back into
- * XFS here and set i_update_core, so we need to dirty the
- * inode first so that the ordering of i_update_core and
- * unlogged modifications still works as described below.
- */
- xfs_mark_inode_dirty_sync(ip);
-
- /*
* Clear i_update_core if the timestamps (or any other
* non-transactional modification) need flushing/logging
* and we're about to log them with the rest of the core.
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 8fca957..9028733 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -211,7 +211,9 @@ xfs_rename(
goto error_return;
if (error)
goto abort_return;
- xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
if (new_parent && src_is_directory) {
error = xfs_bumplink(tp, target_dp);
@@ -249,7 +251,9 @@ xfs_rename(
&first_block, &free_list, spaceres);
if (error)
goto abort_return;
- xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
/*
* Decrement the link count on the target since the target
@@ -292,7 +296,7 @@ xfs_rename(
* inode isn't really being changed, but old unix file systems did
* it and some incremental backup programs won't work without it.
*/
- xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
/*
* Adjust the link count on src_dp. This is necessary when
@@ -315,7 +319,7 @@ xfs_rename(
if (error)
goto abort_return;
- xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
if (new_parent)
xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index b7d5769..4c2ba6f 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -235,7 +235,7 @@ xfs_droplink(
{
int error;
- xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
ASSERT (ip->i_d.di_nlink > 0);
ip->i_d.di_nlink--;
@@ -299,7 +299,7 @@ xfs_bumplink(
{
if (ip->i_d.di_nlink >= XFS_MAXLINK)
return XFS_ERROR(EMLINK);
- xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
ASSERT(ip->i_d.di_nlink > 0);
ip->i_d.di_nlink++;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index dc6e4fb..7413a02 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1391,7 +1391,7 @@ xfs_create(
ASSERT(error != ENOSPC);
goto out_trans_abort;
}
- xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
if (is_dir) {
@@ -1742,7 +1742,7 @@ xfs_remove(
ASSERT(error != ENOENT);
goto out_bmap_cancel;
}
- xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
if (is_dir) {
/*
@@ -1895,7 +1895,7 @@ xfs_link(
&first_block, &free_list, resblks);
if (error)
goto abort_return;
- xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
error = xfs_bumplink(tp, sip);
@@ -2129,7 +2129,7 @@ xfs_symlink(
&first_block, &free_list, resblks);
if (error)
goto error1;
- xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
/*
@@ -2833,7 +2833,7 @@ xfs_change_file_space(
if (ip->i_d.di_mode & S_IXGRP)
ip->i_d.di_mode &= ~S_ISGID;
- xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
}
if (setprealloc)
ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [PATCH 04/16] xfs: don't use vfs writeback for pure metadata modifications
2010-09-22 6:44 ` [PATCH 04/16] xfs: don't use vfs writeback for pure metadata modifications Dave Chinner
@ 2010-09-22 17:24 ` Christoph Hellwig
2010-09-23 0:36 ` Dave Chinner
2010-09-23 16:19 ` Alex Elder
1 sibling, 1 reply; 39+ messages in thread
From: Christoph Hellwig @ 2010-09-22 17:24 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
> However, the timstamp changes are slightly more complex than this -
> there are a couple of places that do unlogged updates of the
> timestamps, and the VFS need to be informed of these. Hence add a
> new function xfs_trans_inode_chgtime() for transactional changes,
> and leave xfs_ichgtime() for the non-transactional changes.
The only user of xfs_ichgtime after this patch is a special case in
truncate for the case of a zero-sized file that's also truncated to size
zero. I think we should just remove this special case and not require
xfs_ichgtime at all. I'll prepare patches to clean up xfs_setattr
and remove this non-transaction update and once this patch is rebased
on top of that it can be simplified again.
That leaves the timestamp updates from the data I/O path special as
they still get updated via direct writes to inode->i_*time and
mark_inode_dirty. I guess we'll have to live with that for now.
> + * Transactional inode timestamp update. requires inod to be locked and joined
> + * to the transaction supplied. Relies on the transaction subsystem to track
> + * dirty state and update/writeback the inode accordingly.
s/inod/the inode/
Also I wonder if xfs_trans_ichgtime should be in xfs_trans_inode.c with
a prototype in xfs_trans.h, just like all the other xfs_trans*
functions.
> /*
> + * Hit the inode change time.
> + */
All these comments are utterly pointless. I'd suggest removing them
when touching the surrounding areas.
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -223,15 +223,6 @@ xfs_inode_item_format(
> nvecs = 1;
>
> /*
> - * Make sure the linux inode is dirty. We do this before
> - * clearing i_update_core as the VFS will call back into
> - * XFS here and set i_update_core, so we need to dirty the
> - * inode first so that the ordering of i_update_core and
> - * unlogged modifications still works as described below.
> - */
> - xfs_mark_inode_dirty_sync(ip);
> -
With this gone the comment above xfs_fs_dirty_inode will need an update.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 04/16] xfs: don't use vfs writeback for pure metadata modifications
2010-09-22 17:24 ` Christoph Hellwig
@ 2010-09-23 0:36 ` Dave Chinner
0 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-23 0:36 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs
On Wed, Sep 22, 2010 at 01:24:01PM -0400, Christoph Hellwig wrote:
> > However, the timestamp changes are slightly more complex than this -
> > there are a couple of places that do unlogged updates of the
> > timestamps, and the VFS needs to be informed of these. Hence add a
> > new function xfs_trans_inode_chgtime() for transactional changes,
> > and leave xfs_ichgtime() for the non-transactional changes.
>
> The only user of xfs_ichgtime after this patch is a special case in
> truncate for the case of a zero-sized file that's also truncated to size
> zero. I think we should just remove this special case and not require
> xfs_ichgtime at all. I'll prepare patches to clean up xfs_setattr
> and remove this non-transaction update and once this patch is rebased
> on top of that it can be simplified again.
>
> That leaves the timestamp updates from the data I/O path special as
> they still get updated via direct writes to inode->i_*time and
> mark_inode_dirty. I guess we'll have to live with that for now.
>
>
> > + * Transactional inode timestamp update. requires inod to be locked and joined
> > + * to the transaction supplied. Relies on the transaction subsystem to track
> > + * dirty state and update/writeback the inode accordingly.
>
> s/inod/the inode/
>
> Also I wonder if xfs_trans_ichgtime should be in xfs_trans_inode.c with
> a prototype in xfs_trans.h, just like all the other xfs_trans*
> functions.
If we get rid of the special setattr case, then yes, it should be
moved to a transaction specific file.
>
> > /*
> > + * Hit the inode change time.
> > + */
>
> All these comments are utterly pointless. I'd suggest removing them
> when touching the surrounding areas.
Ok, will do.
>
> > +++ b/fs/xfs/xfs_inode_item.c
> > @@ -223,15 +223,6 @@ xfs_inode_item_format(
> > nvecs = 1;
> >
> > /*
> > - * Make sure the linux inode is dirty. We do this before
> > - * clearing i_update_core as the VFS will call back into
> > - * XFS here and set i_update_core, so we need to dirty the
> > - * inode first so that the ordering of i_update_core and
> > - * unlogged modifications still works as described below.
> > - */
> > - xfs_mark_inode_dirty_sync(ip);
> > -
>
> With this gone the comment above xfs_fs_dirty_inode will need an update.
OK.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 04/16] xfs: don't use vfs writeback for pure metadata modifications
2010-09-22 6:44 ` [PATCH 04/16] xfs: don't use vfs writeback for pure metadata modifications Dave Chinner
2010-09-22 17:24 ` Christoph Hellwig
@ 2010-09-23 16:19 ` Alex Elder
1 sibling, 0 replies; 39+ messages in thread
From: Alex Elder @ 2010-09-23 16:19 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> Under heavy multi-way parallel create workloads, the VFS struggles
> to write back all the inodes that have been changed in age order.
> The bdi flusher thread becomes CPU bound, spending 85% of its time
> in the VFS code, mostly traversing the superblock dirty inode list
> to separate dirty inodes old enough to flush.
>
> We already keep an index of all metadata changes in age order - in
> the AIL - and continued log pressure will do age ordered writeback
> without any extra overhead at all. If there is no pressure on the
> log, the xfssyncd will periodically write back metadata in ascending
> disk address offset order so will be very efficient.
>
> Hence we can stop marking VFS inodes dirty during transaction commit
> or when changing timestamps during transactions. This will keep the
> inodes in the superblock dirty list to those containing data or
> unlogged metadata changes.
This looks good. There is a minor typo I'll highlight
below in case you want to fix it.
> However, the timestamp changes are slightly more complex than this -
> there are a couple of places that do unlogged updates of the
> timestamps, and the VFS needs to be informed of these. Hence add a
> new function xfs_trans_inode_chgtime() for transactional changes,
You actually used the name "xfs_trans_ichgtime".
> and leave xfs_ichgtime() for the non-transactional changes.
I haven't updated my cscope database, but it looks to me like
this leaves just one spot where xfs_ichgtime() is still used.
Namely, xfs_setattr(), when truncating a zero-length file.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
. . .
> diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
> index b1fc2a6..37918f4 100644
> --- a/fs/xfs/linux-2.6/xfs_iops.c
> +++ b/fs/xfs/linux-2.6/xfs_iops.c
> @@ -96,40 +96,63 @@ xfs_mark_inode_dirty(
>
> /*
. . .
> + if (xfs_ichgtime_int(ip, flags))
> xfs_mark_inode_dirty_sync(ip);
> }
>
> /*
> + * Transactional inode timestamp update. requires inod to be locked and joined
s/inod /inode /
> + * to the transaction supplied. Relies on the transaction subsystem to track
. . .
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* [PATCH 05/16] xfs: rename xfs_buf_get_nodaddr to be more appropriate
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (3 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 04/16] xfs: don't use vfs writeback for pure metadata modifications Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 17:25 ` Christoph Hellwig
2010-09-23 16:22 ` Alex Elder
2010-09-22 6:44 ` [PATCH 06/16] xfs: introduced uncached buffer read primitve Dave Chinner
` (11 subsequent siblings)
16 siblings, 2 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
xfs_buf_get_nodaddr() is really used to allocate a buffer that is
uncached. While it is not directly assigned a disk address, the fact
that they are not cached is a more important distinction. With the
upcoming uncached buffer read primitive, we should be consistent
with this distinction.
While there, make page allocation in xfs_buf_get_nodaddr() safe
against memory reclaim re-entrancy into the filesystem by changing
the allocation to GFP_NOFS.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/xfs/linux-2.6/xfs_buf.c | 7 ++++---
fs/xfs/linux-2.6/xfs_buf.h | 2 +-
fs/xfs/linux-2.6/xfs_trace.h | 2 +-
fs/xfs/xfs_log.c | 3 ++-
fs/xfs/xfs_log_recover.c | 3 ++-
fs/xfs/xfs_vnodeops.c | 2 +-
6 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e..fbbc6d3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -707,8 +707,9 @@ xfs_buf_associate_memory(
}
xfs_buf_t *
-xfs_buf_get_noaddr(
+xfs_buf_get_uncached(
size_t len,
+ int flags,
xfs_buftarg_t *target)
{
unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
@@ -725,7 +726,7 @@ xfs_buf_get_noaddr(
goto fail_free_buf;
for (i = 0; i < page_count; i++) {
- bp->b_pages[i] = alloc_page(GFP_KERNEL);
+ bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
if (!bp->b_pages[i])
goto fail_free_mem;
}
@@ -740,7 +741,7 @@ xfs_buf_get_noaddr(
xfs_buf_unlock(bp);
- trace_xfs_buf_get_noaddr(bp, _RET_IP_);
+ trace_xfs_buf_get_uncached(bp, _RET_IP_);
return bp;
fail_free_mem:
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614..fbcc77b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -213,7 +213,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
xfs_buf_flags_t);
extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
-extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
+extern xfs_buf_t *xfs_buf_get_uncached(size_t, int, xfs_buftarg_t *);
extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
extern void xfs_buf_hold(xfs_buf_t *);
extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd..2a1d4fb 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -331,7 +331,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
-DEFINE_BUF_EVENT(xfs_buf_get_noaddr);
+DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_bdstrat_shut);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_item_iodone);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 33f718f..6119a9e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1131,7 +1131,8 @@ xlog_alloc_log(xfs_mount_t *mp,
iclog->ic_prev = prev_iclog;
prev_iclog = iclog;
- bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
+ bp = xfs_buf_get_uncached(log->l_iclog_size, 0,
+ mp->m_logdev_targp);
if (!bp)
goto out_free_iclog;
if (!XFS_BUF_CPSEMA(bp))
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6f3f5fa..2f57be0 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -107,7 +107,8 @@ xlog_get_bp(
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
+ return xfs_buf_get_uncached(BBTOB(nbblks), 0,
+ log->l_mp->m_logdev_targp);
}
STATIC void
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7413a02..d0dc6d0 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2431,7 +2431,7 @@ xfs_zero_remaining_bytes(
if (endoff > ip->i_size)
endoff = ip->i_size;
- bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
+ bp = xfs_buf_get_uncached(mp->m_sb.sb_blocksize, XBF_DONT_BLOCK,
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp);
if (!bp)
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [PATCH 05/16] xfs: rename xfs_buf_get_nodaddr to be more appropriate
2010-09-22 6:44 ` [PATCH 05/16] xfs: rename xfs_buf_get_nodaddr to be more appropriate Dave Chinner
@ 2010-09-22 17:25 ` Christoph Hellwig
2010-09-23 0:37 ` Dave Chinner
2010-09-23 16:22 ` Alex Elder
1 sibling, 1 reply; 39+ messages in thread
From: Christoph Hellwig @ 2010-09-22 17:25 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
> xfs_buf_t *
> -xfs_buf_get_noaddr(
> +xfs_buf_get_uncached(
> size_t len,
> + int flags,
> xfs_buftarg_t *target)
Sorry for coming up with minor details like this again, but if we change
the name and signature anyway, can we please make the buftarg the first
argument, just like for the other buf helpers?
Otherwise looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 05/16] xfs: rename xfs_buf_get_nodaddr to be more appropriate
2010-09-22 17:25 ` Christoph Hellwig
@ 2010-09-23 0:37 ` Dave Chinner
0 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-23 0:37 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs
On Wed, Sep 22, 2010 at 01:25:26PM -0400, Christoph Hellwig wrote:
> > xfs_buf_t *
> > -xfs_buf_get_noaddr(
> > +xfs_buf_get_uncached(
> > size_t len,
> > + int flags,
> > xfs_buftarg_t *target)
>
> Sorry for coming up with minor details like this again, but if we change
> the name and signature anyway, can we please make the buftarg the first
> argument, just like for the other buf helpers?
Yup, I'll fix it. I should have done that in the first place.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 05/16] xfs: rename xfs_buf_get_nodaddr to be more appropriate
2010-09-22 6:44 ` [PATCH 05/16] xfs: rename xfs_buf_get_nodaddr to be more appropriate Dave Chinner
2010-09-22 17:25 ` Christoph Hellwig
@ 2010-09-23 16:22 ` Alex Elder
1 sibling, 0 replies; 39+ messages in thread
From: Alex Elder @ 2010-09-23 16:22 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> xfs_buf_get_nodaddr() is really used to allocate a buffer that is
> uncached. While it is not directly assigned a disk address, the fact
> that they are not cached is a more important distinction. With the
> upcoming uncached buffer read primitive, we should be consistent
> > with this distinction.
>
> While there, make page allocation in xfs_buf_get_nodaddr() safe
> against memory reclaim re-entrancy into the filesystem by changing
> the allocation to GFP_NOFS.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
Looks good.
Reviewed-by: Alex Elder <aelder@sgi.com>
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* [PATCH 06/16] xfs: introduced uncached buffer read primitve
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (4 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 05/16] xfs: rename xfs_buf_get_nodaddr to be more appropriate Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 6:44 ` [PATCH 07/16] xfs: store xfs_mount in the buftarg instead of in the xfs_buf Dave Chinner
` (10 subsequent siblings)
16 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
To avoid the need to use cached buffers for single-shot or buffers
cached at the filesystem level, introduce a new buffer read
primitive that bypasses the cache and reads directly from disk.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
fs/xfs/linux-2.6/xfs_buf.c | 34 ++++++++++++++++++++++++++++++++++
fs/xfs/linux-2.6/xfs_buf.h | 3 +++
2 files changed, 37 insertions(+), 0 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index fbbc6d3..c8420c0 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -638,6 +638,40 @@ xfs_buf_readahead(
xfs_buf_read(target, ioff, isize, flags);
}
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing.
+ */
+struct xfs_buf *
+xfs_buf_read_uncached(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *target,
+ xfs_daddr_t daddr,
+ size_t length,
+ int flags)
+{
+ xfs_buf_t *bp;
+ int error;
+
+ bp = xfs_buf_get_uncached(length, flags, target);
+ if (!bp)
+ return NULL;
+
+ /* set up the buffer for a read IO */
+ xfs_buf_lock(bp);
+ XFS_BUF_SET_ADDR(bp, daddr);
+ XFS_BUF_READ(bp);
+ XFS_BUF_BUSY(bp);
+
+ xfsbdstrat(mp, bp);
+ error = xfs_iowait(bp);
+ if (error || bp->b_error) {
+ xfs_buf_relse(bp);
+ return NULL;
+ }
+ return bp;
+}
+
xfs_buf_t *
xfs_buf_get_empty(
size_t len,
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index fbcc77b..23da2aa 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -218,6 +218,9 @@ extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
extern void xfs_buf_hold(xfs_buf_t *);
extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
xfs_buf_flags_t);
+struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
+ struct xfs_buftarg *target,
+ xfs_daddr_t daddr, size_t length, int flags);
/* Releasing Buffers */
extern void xfs_buf_free(xfs_buf_t *);
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* [PATCH 07/16] xfs: store xfs_mount in the buftarg instead of in the xfs_buf
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (5 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 06/16] xfs: introduced uncached buffer read primitve Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 6:44 ` [PATCH 08/16] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
` (9 subsequent siblings)
16 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
Each buffer contains both a buftarg pointer and a mount pointer. If
we add a mount pointer into the buftarg, we can avoid needing the
b_mount field in every buffer and grab it from the buftarg when
needed instead. This shrinks the xfs_buf by 8 bytes.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
fs/xfs/linux-2.6/xfs_buf.c | 9 ++++-----
fs/xfs/linux-2.6/xfs_buf.h | 5 +++--
fs/xfs/linux-2.6/xfs_super.c | 8 +++++---
fs/xfs/xfs_buf_item.c | 3 +--
fs/xfs/xfs_log_recover.c | 16 +++++++---------
5 files changed, 20 insertions(+), 21 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index c8420c0..49374ae 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -894,7 +894,7 @@ xfs_buf_lock(
trace_xfs_buf_lock(bp, _RET_IP_);
if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
- xfs_log_force(bp->b_mount, 0);
+ xfs_log_force(bp->b_target->bt_mount, 0);
if (atomic_read(&bp->b_io_remaining))
blk_run_address_space(bp->b_target->bt_mapping);
down(&bp->b_sema);
@@ -1017,7 +1017,6 @@ xfs_bwrite(
{
int error;
- bp->b_mount = mp;
bp->b_flags |= XBF_WRITE;
bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
@@ -1038,8 +1037,6 @@ xfs_bdwrite(
{
trace_xfs_buf_bdwrite(bp, _RET_IP_);
- bp->b_mount = mp;
-
bp->b_flags &= ~XBF_READ;
bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
@@ -1128,7 +1125,7 @@ int
xfs_bdstrat_cb(
struct xfs_buf *bp)
{
- if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+ if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
trace_xfs_bdstrat_shut(bp, _RET_IP_);
/*
* Metadata write that didn't get logged but
@@ -1644,6 +1641,7 @@ out_error:
xfs_buftarg_t *
xfs_alloc_buftarg(
+ struct xfs_mount *mp,
struct block_device *bdev,
int external,
const char *fsname)
@@ -1652,6 +1650,7 @@ xfs_alloc_buftarg(
btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
+ btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
if (xfs_setsize_buftarg_early(btp, bdev))
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 23da2aa..e3aa8c0 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -132,6 +132,7 @@ typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
struct address_space *bt_mapping;
+ struct xfs_mount *bt_mount;
unsigned int bt_bsize;
unsigned int bt_sshift;
size_t bt_smask;
@@ -189,7 +190,6 @@ typedef struct xfs_buf {
struct completion b_iowait; /* queue for I/O waiters */
void *b_fspriv;
void *b_fspriv2;
- struct xfs_mount *b_mount;
unsigned short b_error; /* error code on I/O */
unsigned int b_page_count; /* size of page array */
unsigned int b_offset; /* page offset in first page */
@@ -377,7 +377,8 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
/*
* Handling of buftargs.
*/
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
+ struct block_device *, int, const char *);
extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e0797..62501ee 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -758,18 +758,20 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
+ mp->m_fsname);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
+ mp->m_fsname);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1b09d7a..ee75576 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -692,8 +692,7 @@ xfs_buf_item_init(
* the first. If we do already have one, there is
* nothing to do here so return.
*/
- if (bp->b_mount != mp)
- bp->b_mount = mp;
+ ASSERT(bp->b_target->bt_mount == mp);
if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
if (lip->li_type == XFS_LI_BUF) {
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 2f57be0..0e08e30 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -322,10 +322,11 @@ xlog_recover_iodone(
* this during recovery. One strike!
*/
xfs_ioerror_alert("xlog_recover_iodone",
- bp->b_mount, bp, XFS_BUF_ADDR(bp));
- xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
+ bp->b_target->bt_mount, bp,
+ XFS_BUF_ADDR(bp));
+ xfs_force_shutdown(bp->b_target->bt_mount,
+ SHUTDOWN_META_IO_ERROR);
}
- bp->b_mount = NULL;
XFS_BUF_CLR_IODONE_FUNC(bp);
xfs_biodone(bp);
}
@@ -2276,8 +2277,7 @@ xlog_recover_do_buffer_trans(
XFS_BUF_STALE(bp);
error = xfs_bwrite(mp, bp);
} else {
- ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
- bp->b_mount = mp;
+ ASSERT(bp->b_target->bt_mount == mp);
XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
xfs_bdwrite(mp, bp);
}
@@ -2541,8 +2541,7 @@ xlog_recover_do_inode_trans(
}
write_inode_buffer:
- ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
- bp->b_mount = mp;
+ ASSERT(bp->b_target->bt_mount == mp);
XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
xfs_bdwrite(mp, bp);
error:
@@ -2679,8 +2678,7 @@ xlog_recover_do_dquot_trans(
memcpy(ddq, recddq, item->ri_buf[1].i_len);
ASSERT(dq_f->qlf_size == 2);
- ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
- bp->b_mount = mp;
+ ASSERT(bp->b_target->bt_mount == mp);
XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
xfs_bdwrite(mp, bp);
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* [PATCH 08/16] xfs: kill XBF_FS_MANAGED buffers
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (6 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 07/16] xfs: store xfs_mount in the buftarg instead of in the xfs_buf Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 6:44 ` [PATCH 09/16] xfs: use unhashed buffers for size checks Dave Chinner
` (8 subsequent siblings)
16 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
Filesystem level managed buffers are buffers that have their
lifecycle controlled by the filesystem layer, not the buffer cache.
We currently cache these buffers, which makes cleanup and cache
walking somewhat troublesome. Convert the fs managed buffers to
uncached buffers obtained via xfs_buf_get_uncached(), and remove
the XBF_FS_MANAGED special cases from the buffer cache.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
fs/xfs/linux-2.6/xfs_buf.c | 20 +++------------
fs/xfs/linux-2.6/xfs_buf.h | 4 ---
fs/xfs/xfs_mount.c | 57 +++++++++++++------------------------------
3 files changed, 21 insertions(+), 60 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 49374ae..1497362 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -826,8 +826,6 @@ xfs_buf_rele(
atomic_inc(&bp->b_hold);
spin_unlock(&hash->bh_lock);
(*(bp->b_relse)) (bp);
- } else if (bp->b_flags & XBF_FS_MANAGED) {
- spin_unlock(&hash->bh_lock);
} else {
ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
list_del_init(&bp->b_hash_list);
@@ -1433,26 +1431,16 @@ void
xfs_wait_buftarg(
xfs_buftarg_t *btp)
{
- xfs_buf_t *bp, *n;
xfs_bufhash_t *hash;
uint i;
for (i = 0; i < (1 << btp->bt_hashshift); i++) {
hash = &btp->bt_hash[i];
-again:
spin_lock(&hash->bh_lock);
- list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
- ASSERT(btp == bp->b_target);
- if (!(bp->b_flags & XBF_FS_MANAGED)) {
- spin_unlock(&hash->bh_lock);
- /*
- * Catch superblock reference count leaks
- * immediately
- */
- BUG_ON(bp->b_bn == 0);
- delay(100);
- goto again;
- }
+ while (!list_empty(&hash->bh_list)) {
+ spin_unlock(&hash->bh_lock);
+ delay(100);
+ spin_lock(&hash->bh_lock);
}
spin_unlock(&hash->bh_lock);
}
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index e3aa8c0..d10a954 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -51,7 +51,6 @@ typedef enum {
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
-#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */
#define XBF_ORDERED (1 << 11)/* use ordered writes */
#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
@@ -104,7 +103,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_DONE, "DONE" }, \
{ XBF_DELWRI, "DELWRI" }, \
{ XBF_STALE, "STALE" }, \
- { XBF_FS_MANAGED, "FS_MANAGED" }, \
{ XBF_ORDERED, "ORDERED" }, \
{ XBF_READ_AHEAD, "READ_AHEAD" }, \
{ XBF_LOCK, "LOCK" }, /* should never be set */\
@@ -279,8 +277,6 @@ extern void xfs_buf_terminate(void);
XFS_BUF_DONE(bp); \
} while (0)
-#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
-
#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 14fc6e9..fbca293 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -646,7 +646,6 @@ int
xfs_readsb(xfs_mount_t *mp, int flags)
{
unsigned int sector_size;
- unsigned int extra_flags;
xfs_buf_t *bp;
int error;
@@ -659,28 +658,24 @@ xfs_readsb(xfs_mount_t *mp, int flags)
* access to the superblock.
*/
sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
- extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
- bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
- extra_flags);
- if (!bp || XFS_BUF_ISERROR(bp)) {
- xfs_fs_mount_cmn_err(flags, "SB read failed");
- error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
- goto fail;
+reread:
+ bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+ XFS_SB_DADDR, sector_size, 0);
+ if (!bp) {
+ xfs_fs_mount_cmn_err(flags, "SB buffer read failed");
+ return EIO;
}
- ASSERT(XFS_BUF_ISBUSY(bp));
- ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
/*
* Initialize the mount structure from the superblock.
* But first do some basic consistency checking.
*/
xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
-
error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
if (error) {
xfs_fs_mount_cmn_err(flags, "SB validate failed");
- goto fail;
+ goto release_buf;
}
/*
@@ -691,7 +686,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
"device supports only %u byte sectors (not %u)",
sector_size, mp->m_sb.sb_sectsize);
error = ENOSYS;
- goto fail;
+ goto release_buf;
}
/*
@@ -699,33 +694,20 @@ xfs_readsb(xfs_mount_t *mp, int flags)
* re-read the superblock so the buffer is correctly sized.
*/
if (sector_size < mp->m_sb.sb_sectsize) {
- XFS_BUF_UNMANAGE(bp);
xfs_buf_relse(bp);
sector_size = mp->m_sb.sb_sectsize;
- bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), extra_flags);
- if (!bp || XFS_BUF_ISERROR(bp)) {
- xfs_fs_mount_cmn_err(flags, "SB re-read failed");
- error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
- goto fail;
- }
- ASSERT(XFS_BUF_ISBUSY(bp));
- ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+ goto reread;
}
/* Initialize per-cpu counters */
xfs_icsb_reinit_counters(mp);
mp->m_sb_bp = bp;
- xfs_buf_relse(bp);
- ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
+ xfs_buf_unlock(bp);
return 0;
- fail:
- if (bp) {
- XFS_BUF_UNMANAGE(bp);
- xfs_buf_relse(bp);
- }
+release_buf:
+ xfs_buf_relse(bp);
return error;
}
@@ -2005,18 +1987,13 @@ xfs_getsb(
*/
void
xfs_freesb(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_buf_t *bp;
+ struct xfs_buf *bp = mp->m_sb_bp;
- /*
- * Use xfs_getsb() so that the buffer will be locked
- * when we call xfs_buf_relse().
- */
- bp = xfs_getsb(mp, 0);
- XFS_BUF_UNMANAGE(bp);
- xfs_buf_relse(bp);
+ xfs_buf_lock(bp);
mp->m_sb_bp = NULL;
+ xfs_buf_relse(bp);
}
/*
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* [PATCH 09/16] xfs: use unhashed buffers for size checks
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (7 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 08/16] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 6:44 ` [PATCH 10/16] xfs: remove buftarg hash for external devices Dave Chinner
` (7 subsequent siblings)
16 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
When we are checking we can access the last block of each device, we
do not need to use cached buffers as they will be tossed away
immediately. Use uncached buffers for size checks so that all IO
prior to full in-memory structure initialisation does not use the
buffer cache.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
fs/xfs/xfs_fsops.c | 11 +++++------
fs/xfs/xfs_mount.c | 39 ++++++++++++++++-----------------------
fs/xfs/xfs_rtalloc.c | 29 +++++++++++++----------------
3 files changed, 34 insertions(+), 45 deletions(-)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 43b1d56..6a1edb1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -144,12 +144,11 @@ xfs_growfs_data_private(
if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
return error;
dpct = pct - mp->m_sb.sb_imax_pct;
- error = xfs_read_buf(mp, mp->m_ddev_targp,
- XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
- if (error)
- return error;
- ASSERT(bp);
+ bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+ XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+ BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+ if (!bp)
+ return EIO;
xfs_buf_relse(bp);
new = nb; /* use new as a temporary here */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fbca293..912101d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -980,42 +980,35 @@ xfs_check_sizes(xfs_mount_t *mp)
{
xfs_buf_t *bp;
xfs_daddr_t d;
- int error;
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
- cmn_err(CE_WARN, "XFS: size check 1 failed");
+ cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
return XFS_ERROR(EFBIG);
}
- error = xfs_read_buf(mp, mp->m_ddev_targp,
- d - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
- if (!error) {
- xfs_buf_relse(bp);
- } else {
- cmn_err(CE_WARN, "XFS: size check 2 failed");
- if (error == ENOSPC)
- error = XFS_ERROR(EFBIG);
- return error;
+ bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+ d - XFS_FSS_TO_BB(mp, 1),
+ BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+ if (!bp) {
+ cmn_err(CE_WARN, "XFS: last sector read failed");
+ return EIO;
}
+ xfs_buf_relse(bp);
if (mp->m_logdev_targp != mp->m_ddev_targp) {
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
- cmn_err(CE_WARN, "XFS: size check 3 failed");
+ cmn_err(CE_WARN, "XFS: log size mismatch detected");
return XFS_ERROR(EFBIG);
}
- error = xfs_read_buf(mp, mp->m_logdev_targp,
- d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp);
- if (!error) {
- xfs_buf_relse(bp);
- } else {
- cmn_err(CE_WARN, "XFS: size check 3 failed");
- if (error == ENOSPC)
- error = XFS_ERROR(EFBIG);
- return error;
+ bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
+ d - XFS_FSB_TO_BB(mp, 1),
+ XFS_FSB_TO_B(mp, 1), 0);
+ if (!bp) {
+ cmn_err(CE_WARN, "XFS: log device read failed");
+ return EIO;
}
+ xfs_buf_relse(bp);
}
return 0;
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 891260f..12a1913 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -39,6 +39,7 @@
#include "xfs_trans_space.h"
#include "xfs_utils.h"
#include "xfs_trace.h"
+#include "xfs_buf.h"
/*
@@ -1883,13 +1884,13 @@ xfs_growfs_rt(
/*
* Read in the last block of the device, make sure it exists.
*/
- error = xfs_read_buf(mp, mp->m_rtdev_targp,
- XFS_FSB_TO_BB(mp, nrblocks - 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp);
- if (error)
- return error;
- ASSERT(bp);
+ bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+ XFS_FSB_TO_BB(mp, nrblocks - 1),
+ XFS_FSB_TO_B(mp, 1), 0);
+ if (!bp)
+ return EIO;
xfs_buf_relse(bp);
+
/*
* Calculate new parameters. These are the final values to be reached.
*/
@@ -2215,7 +2216,6 @@ xfs_rtmount_init(
{
xfs_buf_t *bp; /* buffer for last block of subvolume */
xfs_daddr_t d; /* address of last block of subvolume */
- int error; /* error return value */
xfs_sb_t *sbp; /* filesystem superblock copy in mount */
sbp = &mp->m_sb;
@@ -2242,15 +2242,12 @@ xfs_rtmount_init(
(unsigned long long) mp->m_sb.sb_rblocks);
return XFS_ERROR(EFBIG);
}
- error = xfs_read_buf(mp, mp->m_rtdev_targp,
- d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp);
- if (error) {
- cmn_err(CE_WARN,
- "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
- if (error == ENOSPC)
- return XFS_ERROR(EFBIG);
- return error;
+ bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+ d - XFS_FSB_TO_BB(mp, 1),
+ XFS_FSB_TO_B(mp, 1), 0);
+ if (!bp) {
+ cmn_err(CE_WARN, "XFS: realtime device size check failed");
+ return EIO;
}
xfs_buf_relse(bp);
return 0;
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* [PATCH 10/16] xfs: remove buftarg hash for external devices
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (8 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 09/16] xfs: use unhashed buffers for size checks Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 6:44 ` [PATCH 11/16] xfs: split inode AG walking into separate code for reclaim Dave Chinner
` (6 subsequent siblings)
16 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
For RT and external log devices, we never use hashed buffers on them
now. Remove the buftarg hash tables that are set up for them.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
fs/xfs/linux-2.6/xfs_buf.c | 6 +++++-
1 files changed, 5 insertions(+), 1 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 1497362..6de9513 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1458,7 +1458,11 @@ xfs_alloc_bufhash(
{
unsigned int i;
- btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */
+ if (external) {
+ btp->bt_hash = NULL;
+ return;
+ }
+ btp->bt_hashshift = 12; /* 4096 buckets */
btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
sizeof(xfs_bufhash_t));
for (i = 0; i < (1 << btp->bt_hashshift); i++) {
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* [PATCH 11/16] xfs: split inode AG walking into separate code for reclaim
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (9 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 10/16] xfs: remove buftarg hash for external devices Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 17:28 ` Christoph Hellwig
2010-09-23 16:45 ` Alex Elder
2010-09-22 6:44 ` [PATCH 12/16] xfs: implement batched inode lookups for AG walking Dave Chinner
` (5 subsequent siblings)
16 siblings, 2 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
The reclaim walk requires different locking and has a slightly
different walk algorithm, so separate it out so that it can be
optimised separately.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/xfs/linux-2.6/xfs_sync.c | 200 ++++++++++++++++++----------------------
fs/xfs/linux-2.6/xfs_sync.h | 2 +-
fs/xfs/linux-2.6/xfs_trace.h | 2 +-
fs/xfs/quota/xfs_qm_syscalls.c | 3 +-
fs/xfs/xfs_mount.c | 26 +++++
fs/xfs/xfs_mount.h | 2 +
6 files changed, 120 insertions(+), 115 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index ddeaff9..7737a13 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -40,78 +40,46 @@
#include <linux/freezer.h>
-STATIC xfs_inode_t *
-xfs_inode_ag_lookup(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
- uint32_t *first_index,
- int tag)
-{
- int nr_found;
- struct xfs_inode *ip;
-
- /*
- * use a gang lookup to find the next inode in the tree
- * as the tree is sparse and a gang lookup walks to find
- * the number of objects requested.
- */
- if (tag == XFS_ICI_NO_TAG) {
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
- (void **)&ip, *first_index, 1);
- } else {
- nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
- (void **)&ip, *first_index, 1, tag);
- }
- if (!nr_found)
- return NULL;
-
- /*
- * Update the index for the next lookup. Catch overflows
- * into the next AG range which can occur if we have inodes
- * in the last block of the AG and we are currently
- * pointing to the last inode.
- */
- *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
- if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- return NULL;
- return ip;
-}
-
STATIC int
xfs_inode_ag_walk(
struct xfs_mount *mp,
struct xfs_perag *pag,
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
- int flags,
- int tag,
- int exclusive,
- int *nr_to_scan)
+ int flags)
{
uint32_t first_index;
int last_error = 0;
int skipped;
+ int done;
restart:
+ done = 0;
skipped = 0;
first_index = 0;
do {
int error = 0;
+ int nr_found;
xfs_inode_t *ip;
- if (exclusive)
- write_lock(&pag->pag_ici_lock);
- else
- read_lock(&pag->pag_ici_lock);
- ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
- if (!ip) {
- if (exclusive)
- write_unlock(&pag->pag_ici_lock);
- else
- read_unlock(&pag->pag_ici_lock);
+ read_lock(&pag->pag_ici_lock);
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+ (void **)&ip, first_index, 1);
+ if (!nr_found) {
+ read_unlock(&pag->pag_ici_lock);
break;
}
+ /*
+ * Update the index for the next lookup. Catch overflows
+ * into the next AG range which can occur if we have inodes
+ * in the last block of the AG and we are currently
+ * pointing to the last inode.
+ */
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = 1;
+
/* execute releases pag->pag_ici_lock */
error = execute(ip, pag, flags);
if (error == EAGAIN) {
@@ -125,7 +93,7 @@ restart:
if (error == EFSCORRUPTED)
break;
- } while ((*nr_to_scan)--);
+ } while (!done);
if (skipped) {
delay(1);
@@ -134,73 +102,29 @@ restart:
return last_error;
}
-/*
- * Select the next per-ag structure to iterate during the walk. The reclaim
- * walk is optimised only to walk AGs with reclaimable inodes in them.
- */
-static struct xfs_perag *
-xfs_inode_ag_iter_next_pag(
- struct xfs_mount *mp,
- xfs_agnumber_t *first,
- int tag)
-{
- struct xfs_perag *pag = NULL;
-
- if (tag == XFS_ICI_RECLAIM_TAG) {
- int found;
- int ref;
-
- rcu_read_lock();
- found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
- (void **)&pag, *first, 1, tag);
- if (found <= 0) {
- rcu_read_unlock();
- return NULL;
- }
- *first = pag->pag_agno + 1;
- /* open coded pag reference increment */
- ref = atomic_inc_return(&pag->pag_ref);
- rcu_read_unlock();
- trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
- } else {
- pag = xfs_perag_get(mp, *first);
- (*first)++;
- }
- return pag;
-}
-
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
- int flags,
- int tag,
- int exclusive,
- int *nr_to_scan)
+ int flags)
{
struct xfs_perag *pag;
int error = 0;
int last_error = 0;
xfs_agnumber_t ag;
- int nr;
- nr = nr_to_scan ? *nr_to_scan : INT_MAX;
ag = 0;
- while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
- error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
- exclusive, &nr);
+ while ((pag = xfs_perag_get(mp, ag))) {
+ ag = pag->pag_agno + 1;
+ error = xfs_inode_ag_walk(mp, pag, execute, flags);
xfs_perag_put(pag);
if (error) {
last_error = error;
if (error == EFSCORRUPTED)
break;
}
- if (nr <= 0)
- break;
}
- if (nr_to_scan)
- *nr_to_scan = nr;
return XFS_ERROR(last_error);
}
@@ -318,8 +242,7 @@ xfs_sync_data(
ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
- error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
- XFS_ICI_NO_TAG, 0, NULL);
+ error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
if (error)
return XFS_ERROR(error);
@@ -337,8 +260,7 @@ xfs_sync_attr(
{
ASSERT((flags & ~SYNC_WAIT) == 0);
- return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
- XFS_ICI_NO_TAG, 0, NULL);
+ return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
}
STATIC int
@@ -859,13 +781,70 @@ reclaim:
}
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes.
+ */
+int
+xfs_reclaim_inodes_ag(
+ struct xfs_mount *mp,
+ int flags,
+ int *nr_to_scan)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t ag;
+
+ ag = 0;
+ while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+ unsigned long first_index = 0;
+ int done = 0;
+
+ ag = pag->pag_agno + 1;
+
+ do {
+ struct xfs_inode *ip;
+ int nr_found;
+
+ write_lock(&pag->pag_ici_lock);
+ nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+ (void **)&ip, first_index, 1,
+ XFS_ICI_RECLAIM_TAG);
+ if (!nr_found) {
+ write_unlock(&pag->pag_ici_lock);
+ break;
+ }
+
+ /*
+ * Update the index for the next lookup. Catch overflows
+ * into the next AG range which can occur if we have inodes
+ * in the last block of the AG and we are currently
+ * pointing to the last inode.
+ */
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = 1;
+
+ error = xfs_reclaim_inode(ip, pag, flags);
+ if (error && last_error != EFSCORRUPTED)
+ last_error = error;
+
+ } while (!done && (*nr_to_scan)--);
+
+ xfs_perag_put(pag);
+ }
+ return XFS_ERROR(last_error);
+}
+
int
xfs_reclaim_inodes(
xfs_mount_t *mp,
int mode)
{
- return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
- XFS_ICI_RECLAIM_TAG, 1, NULL);
+ int nr_to_scan = INT_MAX;
+
+ return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
}
/*
@@ -887,17 +866,16 @@ xfs_reclaim_inode_shrink(
if (!(gfp_mask & __GFP_FS))
return -1;
- xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
- XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
- /* if we don't exhaust the scan, don't bother coming back */
+ xfs_reclaim_inodes_ag(mp, 0, &nr_to_scan);
+ /* terminate if we don't exhaust the scan */
if (nr_to_scan > 0)
return -1;
}
reclaimable = 0;
ag = 0;
- while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
- XFS_ICI_RECLAIM_TAG))) {
+ while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+ ag = pag->pag_agno + 1;
reclaimable += pag->pag_ici_reclaimable;
xfs_perag_put(pag);
}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index fe78726..e8a3528 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -50,7 +50,7 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
- int flags, int tag, int write_lock, int *nr_to_scan);
+ int flags);
void xfs_inode_shrinker_register(struct xfs_mount *mp);
void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 2a1d4fb..286dc20 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \
unsigned long caller_ip), \
TP_ARGS(mp, agno, refcount, caller_ip))
DEFINE_PERAG_REF_EVENT(xfs_perag_get);
-DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_put);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 7a71336..ac11fbe 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -918,8 +918,7 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
- XFS_ICI_NO_TAG, 0, NULL);
+ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
}
/*------------------------------------------------------------------------*/
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 912101d..d66e87c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -219,6 +219,32 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
return pag;
}
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t first,
+ int tag)
+{
+ struct xfs_perag *pag;
+ int found;
+ int ref;
+
+ rcu_read_lock();
+ found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+ (void **)&pag, first, 1, tag);
+ if (found <= 0) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ ref = atomic_inc_return(&pag->pag_ref);
+ rcu_read_unlock();
+ trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+ return pag;
+}
+
void
xfs_perag_put(struct xfs_perag *pag)
{
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 622da21..7ab2409 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -327,6 +327,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
* perag get/put wrappers for ref counting
*/
struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int tag);
void xfs_perag_put(struct xfs_perag *pag);
/*
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [PATCH 11/16] xfs: split inode AG walking into separate code for reclaim
2010-09-22 6:44 ` [PATCH 11/16] xfs: split inode AG walking into separate code for reclaim Dave Chinner
@ 2010-09-22 17:28 ` Christoph Hellwig
2010-09-23 16:45 ` Alex Elder
1 sibling, 0 replies; 39+ messages in thread
From: Christoph Hellwig @ 2010-09-22 17:28 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Wed, Sep 22, 2010 at 04:44:24PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> The reclaim walk requires different locking and has a slightly
> different walk algorithm, so separate it out so that it can be
> optimised separately.
Yeah, the code was getting far too crufty already with all the
conditionals.
Looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 11/16] xfs: split inode AG walking into separate code for reclaim
2010-09-22 6:44 ` [PATCH 11/16] xfs: split inode AG walking into separate code for reclaim Dave Chinner
2010-09-22 17:28 ` Christoph Hellwig
@ 2010-09-23 16:45 ` Alex Elder
1 sibling, 0 replies; 39+ messages in thread
From: Alex Elder @ 2010-09-23 16:45 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> The reclaim walk requires different locking and has a slightly
> different walk algorithm, so separate it out so that it can be
> optimised separately.
You assert this above xfs_reclaim_inodes_ag():
Walk the AGs and reclaim the inodes in them. Even if the filesystem is
corrupted, we still want to try to reclaim all the inodes.
I don't refute that statement, but if I'm not mistaken
this is new handling. I just felt it should not slip in
without being mentioned. I also suggest a short sentence
along with that comment to justify it might be in order.
I have one other minor comment below, but otherwise this
looks good.
Reviewed-by: Alex Elder <aelder@sgi.com>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
> fs/xfs/linux-2.6/xfs_sync.c | 200 ++++++++++++++++++----------------------
> fs/xfs/linux-2.6/xfs_sync.h | 2 +-
> fs/xfs/linux-2.6/xfs_trace.h | 2 +-
> fs/xfs/quota/xfs_qm_syscalls.c | 3 +-
> fs/xfs/xfs_mount.c | 26 +++++
> fs/xfs/xfs_mount.h | 2 +
> 6 files changed, 120 insertions(+), 115 deletions(-)
>
> diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
> index ddeaff9..7737a13 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.c
> +++ b/fs/xfs/linux-2.6/xfs_sync.c
. . .
> @@ -859,13 +781,70 @@ reclaim:
>
> }
>
> +/*
> + * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
> + * corrupted, we still want to try to reclaim all the inodes.
> + */
> +int
> +xfs_reclaim_inodes_ag(
> + struct xfs_mount *mp,
> + int flags,
> + int *nr_to_scan)
> +{
> + struct xfs_perag *pag;
> + int error = 0;
> + int last_error = 0;
> + xfs_agnumber_t ag;
> +
> + ag = 0;
> + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
> + unsigned long first_index = 0;
> + int done = 0;
> +
> + ag = pag->pag_agno + 1;
> +
> + do {
> + struct xfs_inode *ip;
> + int nr_found;
> +
> + write_lock(&pag->pag_ici_lock);
> + nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
> + (void **)&ip, first_index, 1,
> + XFS_ICI_RECLAIM_TAG);
> + if (!nr_found) {
> + write_unlock(&pag->pag_ici_lock);
> + break;
> + }
> +
> + /*
> + * Update the index for the next lookup. Catch overflows
> + * into the next AG range which can occur if we have inodes
> + * in the last block of the AG and we are currently
> + * pointing to the last inode.
> + */
> + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
> + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
> + done = 1;
> +
> + error = xfs_reclaim_inode(ip, pag, flags);
> + if (error && last_error != EFSCORRUPTED)
> + last_error = error;
> +
> + } while (!done && (*nr_to_scan)--);
I know you guys disagreed with me before on this point.
But since there is no reason anybody outside this function
has a need to see the intermediate values of *nr_to_scan,
you should use a local variable to keep track of it instead,
and then assign its final value to *nr_to_scan before returning.
It allows registers to be used rather than updating real
memory at each iteration.
> +
> + xfs_perag_put(pag);
> + }
> + return XFS_ERROR(last_error);
> +}
> +
> int
> xfs_reclaim_inodes(
> xfs_mount_t *mp,
> int mode)
> {
. . .
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* [PATCH 12/16] xfs: implement batched inode lookups for AG walking
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (10 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 11/16] xfs: split inode AG walking into separate code for reclaim Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 17:33 ` Christoph Hellwig
2010-09-23 17:17 ` Alex Elder
2010-09-22 6:44 ` [PATCH 13/16] xfs: batch inode reclaim lookup Dave Chinner
` (4 subsequent siblings)
16 siblings, 2 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
With the reclaim code separated from the generic walking code, it is
simple to implement batched lookups for the generic walk code.
Separate out the inode validation from the execute operations and
modify the tree lookups to get a batch of inodes at a time.
Reclaim operations will be optimised separately.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/xfs/linux-2.6/xfs_sync.c | 104 +++++++++++++++++++++++-----------------
fs/xfs/linux-2.6/xfs_sync.h | 3 +-
fs/xfs/quota/xfs_qm_syscalls.c | 26 +++++-----
3 files changed, 75 insertions(+), 58 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 7737a13..227ecde 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -39,11 +39,19 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH 32
STATIC int
xfs_inode_ag_walk(
struct xfs_mount *mp,
struct xfs_perag *pag,
+ int (*grab)(struct xfs_inode *ip),
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags)
@@ -52,48 +60,68 @@ xfs_inode_ag_walk(
int last_error = 0;
int skipped;
int done;
+ int nr_found;
restart:
done = 0;
skipped = 0;
first_index = 0;
+ nr_found = 0;
do {
int error = 0;
- int nr_found;
- xfs_inode_t *ip;
+ int i;
+ struct xfs_inode *batch[XFS_LOOKUP_BATCH];
read_lock(&pag->pag_ici_lock);
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
- (void **)&ip, first_index, 1);
+ (void **)batch, first_index,
+ XFS_LOOKUP_BATCH);
if (!nr_found) {
read_unlock(&pag->pag_ici_lock);
break;
}
/*
- * Update the index for the next lookup. Catch overflows
- * into the next AG range which can occur if we have inodes
- * in the last block of the AG and we are currently
- * pointing to the last inode.
+ * Grab the inodes before we drop the lock. if we found
+ * nothing, nr == 0 and the loop will be skipped.
*/
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
- if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- done = 1;
-
- /* execute releases pag->pag_ici_lock */
- error = execute(ip, pag, flags);
- if (error == EAGAIN) {
- skipped++;
- continue;
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = batch[i];
+
+ if (done || grab(ip))
+ batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch overflows
+ * into the next AG range which can occur if we have inodes
+ * in the last block of the AG and we are currently
+ * pointing to the last inode.
+ */
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = 1;
+ }
+
+ /* unlock now we've grabbed the inodes. */
+ read_unlock(&pag->pag_ici_lock);
+
+ for (i = 0; i < nr_found; i++) {
+ if (!batch[i])
+ continue;
+ error = execute(batch[i], pag, flags);
+ if (error == EAGAIN) {
+ skipped++;
+ continue;
+ }
+ if (error && last_error != EFSCORRUPTED)
+ last_error = error;
}
- if (error)
- last_error = error;
/* bail out if the filesystem is corrupted. */
if (error == EFSCORRUPTED)
break;
- } while (!done);
+ } while (nr_found && !done);
if (skipped) {
delay(1);
@@ -105,6 +133,7 @@ restart:
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
+ int (*grab)(struct xfs_inode *ip),
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags)
@@ -117,7 +146,7 @@ xfs_inode_ag_iterator(
ag = 0;
while ((pag = xfs_perag_get(mp, ag))) {
ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags);
+ error = xfs_inode_ag_walk(mp, pag, grab, execute, flags);
xfs_perag_put(pag);
if (error) {
last_error = error;
@@ -128,38 +157,31 @@ xfs_inode_ag_iterator(
return XFS_ERROR(last_error);
}
-/* must be called with pag_ici_lock held and releases it */
int
-xfs_sync_inode_valid(
- struct xfs_inode *ip,
- struct xfs_perag *pag)
+xfs_sync_inode_grab(
+ struct xfs_inode *ip)
{
struct inode *inode = VFS_I(ip);
- int error = EFSCORRUPTED;
/* nothing to sync during shutdown */
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- goto out_unlock;
+ return EFSCORRUPTED;
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
- error = ENOENT;
if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
- goto out_unlock;
+ return ENOENT;
/* If we can't grab the inode, it must on it's way to reclaim. */
if (!igrab(inode))
- goto out_unlock;
+ return ENOENT;
if (is_bad_inode(inode)) {
IRELE(ip);
- goto out_unlock;
+ return ENOENT;
}
/* inode is valid */
- error = 0;
-out_unlock:
- read_unlock(&pag->pag_ici_lock);
- return error;
+ return 0;
}
STATIC int
@@ -172,10 +194,6 @@ xfs_sync_inode_data(
struct address_space *mapping = inode->i_mapping;
int error = 0;
- error = xfs_sync_inode_valid(ip, pag);
- if (error)
- return error;
-
if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
goto out_wait;
@@ -204,10 +222,6 @@ xfs_sync_inode_attr(
{
int error = 0;
- error = xfs_sync_inode_valid(ip, pag);
- if (error)
- return error;
-
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_inode_clean(ip))
goto out_unlock;
@@ -242,7 +256,8 @@ xfs_sync_data(
ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
- error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
+ error = xfs_inode_ag_iterator(mp, xfs_sync_inode_grab,
+ xfs_sync_inode_data, flags);
if (error)
return XFS_ERROR(error);
@@ -260,7 +275,8 @@ xfs_sync_attr(
{
ASSERT((flags & ~SYNC_WAIT) == 0);
- return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
+ return xfs_inode_ag_iterator(mp, xfs_sync_inode_grab,
+ xfs_sync_inode_attr, flags);
}
STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index e8a3528..8b73fb4 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -47,8 +47,9 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
struct xfs_inode *ip);
-int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
+int xfs_sync_inode_grab(struct xfs_inode *ip);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
+ int (*grab)(struct xfs_inode *ip),
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
int flags);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index ac11fbe..8d2fdbe 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -868,28 +868,27 @@ xfs_qm_export_flags(
return (uflags);
}
-
STATIC int
-xfs_dqrele_inode(
- struct xfs_inode *ip,
- struct xfs_perag *pag,
- int flags)
+xfs_dqrele_inode_grab(
+ struct xfs_inode *ip)
{
- int error;
-
/* skip quota inodes */
if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
ASSERT(ip->i_udquot == NULL);
ASSERT(ip->i_gdquot == NULL);
- read_unlock(&pag->pag_ici_lock);
- return 0;
+ return ENOENT;
}
- error = xfs_sync_inode_valid(ip, pag);
- if (error)
- return error;
+ return xfs_sync_inode_grab(ip);
+}
+STATIC int
+xfs_dqrele_inode(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int flags)
+{
xfs_ilock(ip, XFS_ILOCK_EXCL);
if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
xfs_qm_dqrele(ip->i_udquot);
@@ -918,7 +917,8 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
+ xfs_inode_ag_iterator(mp, xfs_dqrele_inode_grab,
+ xfs_dqrele_inode, flags);
}
/*------------------------------------------------------------------------*/
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [PATCH 12/16] xfs: implement batched inode lookups for AG walking
2010-09-22 6:44 ` [PATCH 12/16] xfs: implement batched inode lookups for AG walking Dave Chinner
@ 2010-09-22 17:33 ` Christoph Hellwig
2010-09-23 0:40 ` Dave Chinner
2010-09-23 17:17 ` Alex Elder
1 sibling, 1 reply; 39+ messages in thread
From: Christoph Hellwig @ 2010-09-22 17:33 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
> + int (*grab)(struct xfs_inode *ip),
I think we can do without this callback. The inode verification that
needs to be done is always the same. The only difference is that the
dqrele code skips the quota inodes - but this can easily be done in
the execute callback, and given that it's a slow path the additional
two igrab calls won't hurt either.
To be symmetric with that, the IRELE call should also be moved from the
execute callbacks into the caller. All this is stuff enabled by
splitting out the reclaim code earlier which was pretty different in
this area. In fact just moving the validation + igrab and IRELE into
common code might just be done in a separate patch between the last one
and this.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 12/16] xfs: implement batched inode lookups for AG walking
2010-09-22 17:33 ` Christoph Hellwig
@ 2010-09-23 0:40 ` Dave Chinner
0 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-23 0:40 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs
On Wed, Sep 22, 2010 at 01:33:06PM -0400, Christoph Hellwig wrote:
> > + int (*grab)(struct xfs_inode *ip),
>
> I think we can do without this callback. The inode verification that
> needs to be done is always the same. The only difference is that the
> dqrele code skips the quota inodes - but this can easily be done in
> the execute callback, and given that it's a slow path the additional
> two igrab calls won't hurt either.
Seems fair - the grab callback is from the first version where the
reclaim walk was not split out so there were significant differences.
I'll clean that up.
> To be symmetic with that the IRELE call should also be moved from the
> execute callbacks into the caller. All this is stuff enabled by
> splitting out the reclaim code earlier which was pretty different in
> this area. In fact just moving the validation + igrab and IRELE into
> common code might just be done in a separate patch between the last one
> and this.
Yup, that makes it a lot cleaner. Will do.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 12/16] xfs: implement batched inode lookups for AG walking
2010-09-22 6:44 ` [PATCH 12/16] xfs: implement batched inode lookups for AG walking Dave Chinner
2010-09-22 17:33 ` Christoph Hellwig
@ 2010-09-23 17:17 ` Alex Elder
2010-09-24 9:15 ` Dave Chinner
1 sibling, 1 reply; 39+ messages in thread
From: Alex Elder @ 2010-09-23 17:17 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> With the reclaim code separated from the generic walking code, it is
> simple to implement batched lookups for the generic walk code.
> Separate out the inode validation from the execute operations and
> modify the tree lookups to get a batch of inodes at a time.
Two comments below. I noticed your discussion with Christoph
so I'll look for the new version before I stamp it "reviewed."
> Reclaim operations will be optimised separately.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
> fs/xfs/linux-2.6/xfs_sync.c | 104 +++++++++++++++++++++++-----------------
> fs/xfs/linux-2.6/xfs_sync.h | 3 +-
> fs/xfs/quota/xfs_qm_syscalls.c | 26 +++++-----
> 3 files changed, 75 insertions(+), 58 deletions(-)
>
> diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
> index 7737a13..227ecde 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.c
> +++ b/fs/xfs/linux-2.6/xfs_sync.c
> @@ -39,11 +39,19 @@
> #include <linux/kthread.h>
> #include <linux/freezer.h>
>
> +/*
> + * The inode lookup is done in batches to keep the amount of lock traffic and
> + * radix tree lookups to a minimum. The batch size is a trade off between
> + * lookup reduction and stack usage. This is in the reclaim path, so we can't
> + * be too greedy.
> + */
> +#define XFS_LOOKUP_BATCH 32
Did you come up with 32 empirically? As the OS evolves might another
value be better? And if a larger value would improve things, how
would allocating the arrays rather than making them automatic (stack)
affect things? (Just a discussion point, I think it's fine as-is.)
> STATIC int
> xfs_inode_ag_walk(
> struct xfs_mount *mp,
> struct xfs_perag *pag,
> + int (*grab)(struct xfs_inode *ip),
> int (*execute)(struct xfs_inode *ip,
> struct xfs_perag *pag, int flags),
> int flags)
> @@ -52,48 +60,68 @@ xfs_inode_ag_walk(
> int last_error = 0;
> int skipped;
> int done;
> + int nr_found;
>
> restart:
> done = 0;
> skipped = 0;
> first_index = 0;
> + nr_found = 0;
> do {
> int error = 0;
> - int nr_found;
> - xfs_inode_t *ip;
> + int i;
> + struct xfs_inode *batch[XFS_LOOKUP_BATCH];
>
> read_lock(&pag->pag_ici_lock);
> nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
> - (void **)&ip, first_index, 1);
> + (void **)batch, first_index,
> + XFS_LOOKUP_BATCH);
> if (!nr_found) {
> read_unlock(&pag->pag_ici_lock);
> break;
> }
>
> /*
> - * Update the index for the next lookup. Catch overflows
> - * into the next AG range which can occur if we have inodes
> - * in the last block of the AG and we are currently
> - * pointing to the last inode.
> + * Grab the inodes before we drop the lock. if we found
> + * nothing, nr == 0 and the loop will be skipped.
> */
> - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
> - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
> - done = 1;
> -
> - /* execute releases pag->pag_ici_lock */
> - error = execute(ip, pag, flags);
> - if (error == EAGAIN) {
> - skipped++;
> - continue;
> + for (i = 0; i < nr_found; i++) {
> + struct xfs_inode *ip = batch[i];
> +
> + if (done || grab(ip))
> + batch[i] = NULL;
> +
> + /*
> + * Update the index for the next lookup. Catch overflows
> + * into the next AG range which can occur if we have inodes
> + * in the last block of the AG and we are currently
> + * pointing to the last inode.
> + */
> + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
> + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
> + done = 1;
It sounds like you're going to re-work this, but
I'll mention this for you to consider anyway. I
don't know that the "done" flag here should be
needed. The gang lookup should never return
anything beyond the end of the AG. It seems
like you ought to be able to detect when you've
covered all the whole AG elsewhere, *not*
on every entry found in this inner loop and
also *not* while holding the lock.
> + }
> +
> + /* unlock now we've grabbed the inodes. */
> + read_unlock(&pag->pag_ici_lock);
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [PATCH 12/16] xfs: implement batched inode lookups for AG walking
2010-09-23 17:17 ` Alex Elder
@ 2010-09-24 9:15 ` Dave Chinner
2010-09-27 16:05 ` Alex Elder
2010-09-27 17:43 ` Alex Elder
0 siblings, 2 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-24 9:15 UTC (permalink / raw)
To: Alex Elder; +Cc: xfs
On Thu, Sep 23, 2010 at 12:17:05PM -0500, Alex Elder wrote:
> On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> >
> > With the reclaim code separated from the generic walking code, it is
> > simple to implement batched lookups for the generic walk code.
> > Separate out the inode validation from the execute operations and
> > modify the tree lookups to get a batch of inodes at a time.
>
> Two comments below. I noticed your discussion with Christoph
> so I'll look for the new version before I stamp it "reviewed."
>
> > Reclaim operations will be optimised separately.
> >
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> > fs/xfs/linux-2.6/xfs_sync.c | 104 +++++++++++++++++++++++-----------------
> > fs/xfs/linux-2.6/xfs_sync.h | 3 +-
> > fs/xfs/quota/xfs_qm_syscalls.c | 26 +++++-----
> > 3 files changed, 75 insertions(+), 58 deletions(-)
> >
> > diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
> > index 7737a13..227ecde 100644
> > --- a/fs/xfs/linux-2.6/xfs_sync.c
> > +++ b/fs/xfs/linux-2.6/xfs_sync.c
> > @@ -39,11 +39,19 @@
> > #include <linux/kthread.h>
> > #include <linux/freezer.h>
> >
> > +/*
> > + * The inode lookup is done in batches to keep the amount of lock traffic and
> > + * radix tree lookups to a minimum. The batch size is a trade off between
> > + * lookup reduction and stack usage. This is in the reclaim path, so we can't
> > + * be too greedy.
> > + */
> > +#define XFS_LOOKUP_BATCH 32
>
> Did you come up with 32 empirically? As the OS evolves might another
> value be better? And if a larger value would improve things, how
> would allocating the arrays rather than making them automatic (stack)
> affect things? (Just a discussion point, I think it's fine as-is.)
It's a trade off between stack space and efficiency. It uses 256
bytes of stack space to reduce lock traffic by a factor of 32. For
the read-side walks, stack space is not an issue because the call
paths are all shallow.
For the write side (reclaim) walks, we are in memory reclaim, so we
cannot rely on memory allocations, and we have relatively limited
stack space to work in. However we have much more contention on that
side, so the importance of large batches is higher.
Hence a batch size of 32 seems like a decent tradeoff between stack
usage and efficiency gains. We can't make it much larger because of
the reclaim path stack usage, we can't make it allocated because of
the reclaim path usage, and we can't make it much smaller otherwise
we don't get much improvement in efficiency....
> > + if (done || grab(ip))
> > + batch[i] = NULL;
> > +
> > + /*
> > + * Update the index for the next lookup. Catch overflows
> > + * into the next AG range which can occur if we have inodes
> > + * in the last block of the AG and we are currently
> > + * pointing to the last inode.
> > + */
> > + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
> > + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
> > + done = 1;
>
> It sounds like you're going to re-work this, but
> I'll mention this for you to consider anyway. I
> don't know that the "done" flag here should be
> needed.
This check was added because if we don't detect the special case of
the last valid inode _number_ in the AG, first_index will loop back
to 0 and we'll start searching the AG again. IOWs, we're not
looking for the last inode in the cache, we're looking for the last
valid inode number.
Hence the done flag is ensuring that:
a) we terminate the walk at the last valid inode
b) if there are inodes at indexes above the last valid inode
number, we do not grab them or continue walking them.
Yes, b) should never happen, but I've had bugs in development code
that have put inodes in strange places before...
> The gang lookup should never return
> anything beyond the end of the AG. It seems
> like you ought to be able to detect when you've
> covered all the whole AG elsewhere,
AFAICT, there are only two ways - the gang lookup returns nothing,
or we see the last valid inode number in the AG. If you can come up
with something that doesn't involve a tree or inode number lookup,
I'm all ears....
> *not*
> on every entry found in this inner loop and
> also *not* while holding the lock.
It has to be done while holding the lock because if we cannot grab
the inode then the only way we can safely dereference the inode is
by still holding the inode cache lock. Once we drop the lock, the
inodes we failed to grab can be removed from the cache and we cannot
safely dereference them to get the inode number from them.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 12/16] xfs: implement batched inode lookups for AG walking
2010-09-24 9:15 ` Dave Chinner
@ 2010-09-27 16:05 ` Alex Elder
2010-09-27 17:43 ` Alex Elder
1 sibling, 0 replies; 39+ messages in thread
From: Alex Elder @ 2010-09-27 16:05 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Fri, 2010-09-24 at 19:15 +1000, Dave Chinner wrote:
> On Thu, Sep 23, 2010 at 12:17:05PM -0500, Alex Elder wrote:
> > On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > >
> > > With the reclaim code separated from the generic walking code, it is
> > > simple to implement batched lookups for the generic walk code.
. . .
> > The gang lookup should never return
> > anything beyond the end of the AG. It seems
> > like you ought to be able to detect when you've
> > covered all the whole AG elsewhere,
>
> AFAICT, there are only two ways - the gang lookup returns nothing,
> or we see the last valid inode number in the AG. If you can come up
> with something that doesn't invlove a tree or inode number lookup,
> I'm all ears....
I didn't have anything specific in mind and was basing
this somewhat on gut feeling so you probably have the
best solution as-is. But I'll look at your new code
and if I come up with any bright ideas I'll let you
know.
-Alex
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 12/16] xfs: implement batched inode lookups for AG walking
2010-09-24 9:15 ` Dave Chinner
2010-09-27 16:05 ` Alex Elder
@ 2010-09-27 17:43 ` Alex Elder
1 sibling, 0 replies; 39+ messages in thread
From: Alex Elder @ 2010-09-27 17:43 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Fri, 2010-09-24 at 19:15 +1000, Dave Chinner wrote:
> On Thu, Sep 23, 2010 at 12:17:05PM -0500, Alex Elder wrote:
> > On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
. . .
> > It sounds like you're going to re-work this, but
> > I'll mention this for you to consider anyway. I
> > don't know that the "done" flag here should be
> > needed.
>
> This check was added because if we don't detect the special case of
> the last valid inode _number_ in the AG, first_index will loop back
> to 0 and we'll start searching the AG again. IOWs, we're not
> looking for the last inode in the cache, we're looking for the last
> valid inode number.
>
> Hence the done flag is ensuring that:
> a) we terminate the walk at the last valid inode
> b) if there are inodes at indexes above the last valid inode
> number, we do not grab them or continue walking them.
>
> Yes, b) should never happen, but I've had bugs in development code
> that have put inodes in stange places before...
>
> > The gang lookup should never return
> > anything beyond the end of the AG. It seems
> > like you ought to be able to detect when you've
> > covered all the whole AG elsewhere,
>
> AFAICT, there are only two ways - the gang lookup returns nothing,
> or we see the last valid inode number in the AG. If you can come up
> with something that doesn't invlove a tree or inode number lookup,
> I'm all ears....
>
> > *not*
> > on every entry found in this inner loop and
> > also *not* while holding the lock.
>
> It has to be done while holding the lock because if we cannot grab
> the inode then the only way we can safely derefence the inode is
> by still holding the inode cache lock. Once we drop the lock, the
> inodes we failed to grab can be removed from the cache and we cannot
> safely dereference them to get the inode number from them.
OK, I have a proposal below--it's not a diff, it's just a
modified version of xfs_inode_ag_walk() for you to consider.
It's not hugely better but it reduces the amount
of computation done inside the inner loop and while the
lock is held. I haven't done any testing with it.
How this differs from what you have, probably in order
of importance:
- Update first_index only on the last inode returned by
a gang lookup (not on every inode returned)
- Don't compare new value of first_index against the
old one when it's updated.
- Use first_index == 0 (rather than done != 0) as an
indication that we've exhausted the inodes in the AG.
- Tracks only grabbed inodes (rather than filling the
array with null pointers in slots for those not grabbed)
- Don't bother looking at "error" again if it's zero
(the normal case)
Anyway, do what you like with this (or do nothing at all).
I was just following up on my suggestion.
-Alex
STATIC int
xfs_inode_ag_walk(
struct xfs_mount *mp,
struct xfs_perag *pag,
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int
flags),
int flags)
{
uint32_t first_index;
int last_error = 0;
int skipped;
int nr_found;
restart:
skipped = 0;
first_index = 0;
do {
int error = 0;
int nr_grabbed = 0;
int i;
struct xfs_inode *batch[XFS_LOOKUP_BATCH];
struct xfs_inode *ip; /* = NULL if compiler complains */
read_lock(&pag->pag_ici_lock);
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **) batch, first_index,
XFS_LOOKUP_BATCH);
if (!nr_found) {
read_unlock(&pag->pag_ici_lock);
break;
}
/* Grab the inodes while we hold the lock. */
for (i = 0; i < nr_found; i++) {
ip = batch[i];
if (!xfs_inode_ag_walk_grab(ip)) {
if (i > nr_grabbed)
batch[nr_grabbed] = ip;
nr_grabbed++;
}
}
/*
* Update the index so the next lookup starts after
* the last inode found (whether or not we were able
* to grab it). If that inode was the highest one
* in the AG, this will evaluate to 0, which will
* cause the loop to terminate (below).
*/
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
/* Done looking at (ungrabbed) inodes; drop the lock */
read_unlock(&pag->pag_ici_lock);
for (i = 0; i < nr_grabbed; i++) {
ip = batch[i];
error = execute(ip, pag, flags);
IRELE(ip);
if (error) {
if (error == EAGAIN) {
skipped++;
else if (last_error != EFSCORRUPTED)
last_error = error;
}
}
/* bail out if the filesystem is corrupted. */
if (error == EFSCORRUPTED)
break;
} while (nr_found && first_index);
if (skipped) {
delay(1);
goto restart;
}
return last_error;
}
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* [PATCH 13/16] xfs: batch inode reclaim lookup
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (11 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 12/16] xfs: implement batched inode lookups for AG walking Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 17:34 ` Christoph Hellwig
2010-09-23 17:39 ` Alex Elder
2010-09-22 6:44 ` [PATCH 14/16] xfs: serialise inode reclaim within an AG Dave Chinner
` (3 subsequent siblings)
16 siblings, 2 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
Batch and optimise the per-ag inode lookup for reclaim to minimise
scanning overhead. This involves gang lookups on the radix trees to
get multiple inodes during each tree walk, and tighter validation of
what inodes can be reclaimed without blocking before we take any
locks.
This is based on ideas suggested in a proof-of-concept patch
posted by Nick Piggin.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/xfs/linux-2.6/xfs_sync.c | 110 ++++++++++++++++++++++++++++++-------------
1 files changed, 77 insertions(+), 33 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 227ecde..ea44b1d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -628,6 +628,43 @@ __xfs_inode_clear_reclaim_tag(
}
/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+ struct xfs_inode *ip,
+ int flags)
+{
+
+ /*
+ * do some unlocked checks first to avoid unnecceary lock traffic.
+ * The first is a flush lock check, the second is a already in reclaim
+ * check. Only do these checks if we are not going to block on locks.
+ */
+ if ((flags & SYNC_TRYLOCK) &&
+ (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+ return 1;
+ }
+
+ /*
+ * The radix tree lock here protects a thread in xfs_iget from racing
+ * with us starting reclaim on the inode. Once we have the
+ * XFS_IRECLAIM flag set it will not touch us.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ /* ignore as it is already under reclaim */
+ spin_unlock(&ip->i_flags_lock);
+ return 1;
+ }
+ __xfs_iflags_set(ip, XFS_IRECLAIM);
+ spin_unlock(&ip->i_flags_lock);
+ return 0;
+}
+
+/*
* Inodes in different states need to be treated differently, and the return
* value of xfs_iflush is not sufficient to get this right. The following table
* lists the inode states and the reclaim actions necessary for non-blocking
@@ -685,23 +722,6 @@ xfs_reclaim_inode(
{
int error = 0;
- /*
- * The radix tree lock here protects a thread in xfs_iget from racing
- * with us starting reclaim on the inode. Once we have the
- * XFS_IRECLAIM flag set it will not touch us.
- */
- spin_lock(&ip->i_flags_lock);
- ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
- if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
- /* ignore as it is already under reclaim */
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- return 0;
- }
- __xfs_iflags_set(ip, XFS_IRECLAIM);
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
-
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iflock_nowait(ip)) {
if (!(sync_mode & SYNC_WAIT))
@@ -816,16 +836,19 @@ xfs_reclaim_inodes_ag(
while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
unsigned long first_index = 0;
int done = 0;
+ int nr_found = 0;
ag = pag->pag_agno + 1;
do {
- struct xfs_inode *ip;
- int nr_found;
+ struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+ int i;
write_lock(&pag->pag_ici_lock);
- nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
- (void **)&ip, first_index, 1,
+ nr_found = radix_tree_gang_lookup_tag(
+ &pag->pag_ici_root,
+ (void **)batch, first_index,
+ XFS_LOOKUP_BATCH,
XFS_ICI_RECLAIM_TAG);
if (!nr_found) {
write_unlock(&pag->pag_ici_lock);
@@ -833,20 +856,41 @@ xfs_reclaim_inodes_ag(
}
/*
- * Update the index for the next lookup. Catch overflows
- * into the next AG range which can occur if we have inodes
- * in the last block of the AG and we are currently
- * pointing to the last inode.
+ * Grab the inodes before we drop the lock. if we found
+ * nothing, nr == 0 and the loop will be skipped.
*/
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
- if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- done = 1;
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = batch[i];
+
+ if (done || xfs_reclaim_inode_grab(ip, flags))
+ batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can
+ * occur if we have inodes in the last block of
+ * the AG and we are currently pointing to the
+ * last inode.
+ */
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = 1;
+ }
- error = xfs_reclaim_inode(ip, pag, flags);
- if (error && last_error != EFSCORRUPTED)
- last_error = error;
+ /* unlock now we've grabbed the inodes. */
+ write_unlock(&pag->pag_ici_lock);
+
+ for (i = 0; i < nr_found; i++) {
+ if (!batch[i])
+ continue;
+ error = xfs_reclaim_inode(batch[i], pag, flags);
+ if (error && last_error != EFSCORRUPTED)
+ last_error = error;
+ }
+
+ *nr_to_scan -= XFS_LOOKUP_BATCH;
- } while (!done && (*nr_to_scan)--);
+ } while (nr_found && !done && *nr_to_scan > 0);
xfs_perag_put(pag);
}
@@ -882,7 +926,7 @@ xfs_reclaim_inode_shrink(
if (!(gfp_mask & __GFP_FS))
return -1;
- xfs_reclaim_inodes_ag(mp, 0, &nr_to_scan);
+ xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
/* terminate if we don't exhaust the scan */
if (nr_to_scan > 0)
return -1;
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [PATCH 13/16] xfs: batch inode reclaim lookup
2010-09-22 6:44 ` [PATCH 13/16] xfs: batch inode reclaim lookup Dave Chinner
@ 2010-09-22 17:34 ` Christoph Hellwig
2010-09-23 0:43 ` Dave Chinner
2010-09-23 17:39 ` Alex Elder
1 sibling, 1 reply; 39+ messages in thread
From: Christoph Hellwig @ 2010-09-22 17:34 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Wed, Sep 22, 2010 at 04:44:26PM +1000, Dave Chinner wrote:
> This is based on ideas suggested in a proof-of-concept patch
> posted by Nick Piggin.
Hmm, I can't remember seeing that patch.
But the code itself looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 13/16] xfs: batch inode reclaim lookup
2010-09-22 6:44 ` [PATCH 13/16] xfs: batch inode reclaim lookup Dave Chinner
2010-09-22 17:34 ` Christoph Hellwig
@ 2010-09-23 17:39 ` Alex Elder
1 sibling, 0 replies; 39+ messages in thread
From: Alex Elder @ 2010-09-23 17:39 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> Batch and optimise the per-ag inode lookup for reclaim to minimise
> scanning overhead. This involves gang lookups on the radix trees to
> get multiple inodes during each tree walk, and tighter validation of
> what inodes can be reclaimed without blocking befor we take any
> locks.
>
> This is based on ideas suggested in a proof-of-concept patch
> posted by Nick Piggin.
Looks good.
Reviewed-by: Alex Elder <aelder@sgi.com>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* [PATCH 14/16] xfs: serialise inode reclaim within an AG
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (12 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 13/16] xfs: batch inode reclaim lookup Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-23 17:50 ` Alex Elder
2010-09-22 6:44 ` [PATCH 16/16] xfs; pack xfs_buf structure more tightly Dave Chinner
` (2 subsequent siblings)
16 siblings, 1 reply; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
Memory reclaim via shrinkers has a terrible habit of having N+M
concurrent shrinker executions (N = num CPUs, M = num kswapds) all
trying to shrink the same cache. When the cache they are all working
on is protected by a single spinlock, massive contention and
slowdowns occur.
Wrap the per-ag inode caches with a reclaim mutex to serialise
reclaim access to the AG. This will block concurrent reclaim in each
AG but still allow reclaim to scan multiple AGs concurrently. Allow
shrinkers to move on to the next AG if it can't get the lock, and if
we can't get any AG, then start blocking on locks.
To prevent reclaimers from continually scanning the same inodes in
each AG, add a cursor that tracks where the last reclaim got up to
and start from that point on the next reclaim. This should avoid
only ever scanning a small number of inodes at the start of each AG
and not making progress. If we have a non-shrinker based reclaim
pass, ignore the cursor and reset it to zero once we are done.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/xfs/linux-2.6/xfs_sync.c | 24 ++++++++++++++++++++++++
fs/xfs/xfs_ag.h | 2 ++
fs/xfs/xfs_mount.c | 1 +
3 files changed, 27 insertions(+), 0 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index ea44b1d..7b06399 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -831,7 +831,9 @@ xfs_reclaim_inodes_ag(
int error = 0;
int last_error = 0;
xfs_agnumber_t ag;
+ int trylock = !!(flags & SYNC_TRYLOCK);
+restart:
ag = 0;
while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
unsigned long first_index = 0;
@@ -840,6 +842,17 @@ xfs_reclaim_inodes_ag(
ag = pag->pag_agno + 1;
+ if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+ if (trylock) {
+ trylock++;
+ continue;
+ }
+ mutex_lock(&pag->pag_ici_reclaim_lock);
+ }
+
+ if (trylock)
+ first_index = pag->pag_ici_reclaim_cursor;
+
do {
struct xfs_inode *batch[XFS_LOOKUP_BATCH];
int i;
@@ -892,8 +905,19 @@ xfs_reclaim_inodes_ag(
} while (nr_found && !done && *nr_to_scan > 0);
+ pag->pag_ici_reclaim_cursor = (done || !trylock) ? 0 : first_index;
+ mutex_unlock(&pag->pag_ici_reclaim_lock);
xfs_perag_put(pag);
}
+
+ /*
+ * if we skipped any AG, and we still have scan count remaining, do
+ * another pass this time waiting on the reclaim locks.
+ */
+ if (trylock > 1 && *nr_to_scan) {
+ trylock = 0;
+ goto restart;
+ }
return XFS_ERROR(last_error);
}
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 51c42c2..baeec83 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,8 @@ typedef struct xfs_perag {
rwlock_t pag_ici_lock; /* incore inode lock */
struct radix_tree_root pag_ici_root; /* incore inode cache root */
int pag_ici_reclaimable; /* reclaimable inodes */
+ struct mutex pag_ici_reclaim_lock; /* serialisation point */
+ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
/* for rcu-safe freeing */
struct rcu_head rcu_head;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d66e87c..59859c3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -477,6 +477,7 @@ xfs_initialize_perag(
pag->pag_agno = index;
pag->pag_mount = mp;
rwlock_init(&pag->pag_ici_lock);
+ mutex_init(&pag->pag_ici_reclaim_lock);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
if (radix_tree_preload(GFP_NOFS))
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [PATCH 14/16] xfs: serialise inode reclaim within an AG
2010-09-22 6:44 ` [PATCH 14/16] xfs: serialise inode reclaim within an AG Dave Chinner
@ 2010-09-23 17:50 ` Alex Elder
0 siblings, 0 replies; 39+ messages in thread
From: Alex Elder @ 2010-09-23 17:50 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> Memory reclaim via shrinkers has a terrible habit of having N+M
> concurrent shrinker executions (N = num CPUs, M = num kswapds) all
> trying to shrink the same cache. When the cache they are all working
> on is protected by a single spinlock, massive contention an
> slowdowns occur.
>
> Wrap the per-ag inode caches with a reclaim mutex to serialise
> reclaim access to the AG. This will block concurrent reclaim in each
> AG but still allow reclaim to scan multiple AGs concurrently. Allow
> shrinkers to move on to the next AG if it can't get the lock, and if
> we can't get any AG, then start blocking on locks.
>
> To prevent reclaimers from continually scanning the same inodes in
> each AG, add a cursor that tracks where the last reclaim got up to
> and start from that point on the next reclaim. This should avoid
> only ever scanning a small number of inodes at the satart of each AG
> and not making progress. If we have a non-shrinker based reclaim
> pass, ignore the cursor and reset it to zero once we are done.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
One small comment about the implied meaning of "trylock"
below. But not a big deal, so...
Reviewed-by: Alex Elder <aelder@sgi.com>
> ---
> fs/xfs/linux-2.6/xfs_sync.c | 24 ++++++++++++++++++++++++
> fs/xfs/xfs_ag.h | 2 ++
> fs/xfs/xfs_mount.c | 1 +
> 3 files changed, 27 insertions(+), 0 deletions(-)
>
> diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
> index ea44b1d..7b06399 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.c
> +++ b/fs/xfs/linux-2.6/xfs_sync.c
. . .
> @@ -840,6 +842,17 @@ xfs_reclaim_inodes_ag(
>
> ag = pag->pag_agno + 1;
>
> + if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
> + if (trylock) {
> + trylock++;
> + continue;
> + }
> + mutex_lock(&pag->pag_ici_reclaim_lock);
> + }
> +
It isn't all that obvious here that "trylock" also
carries the meaning "called via the inode shrinker",
which is why we're using the cursor in this case.
> + if (trylock)
> + first_index = pag->pag_ici_reclaim_cursor;
> +
> do {
> struct xfs_inode *batch[XFS_LOOKUP_BATCH];
> int i;
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread
* [PATCH 16/16] xfs; pack xfs_buf structure more tightly
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (13 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 14/16] xfs: serialise inode reclaim within an AG Dave Chinner
@ 2010-09-22 6:44 ` Dave Chinner
2010-09-22 14:53 ` [PATCH 0/16] xfs: metadata scalability V2 Christoph Hellwig
2010-09-22 20:55 ` Alex Elder
16 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-22 6:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
pahole reports the struct xfs_buf has quite a few holes in it, so
packing the structure better will reduce the size of it by 16 bytes.
Also, move all the fields used in cache lookups into the first
cacheline.
Before on x86_64:
/* size: 320, cachelines: 5 */
/* sum members: 298, holes: 6, sum holes: 22 */
After on x86_64:
/* size: 304, cachelines: 5 */
/* padding: 6 */
/* last cacheline: 48 bytes */
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
fs/xfs/linux-2.6/xfs_buf.h | 30 +++++++++++++++++++-----------
1 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 4f3c845..c2e2b50 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -162,33 +162,41 @@ typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
#define XB_PAGES 2
typedef struct xfs_buf {
+ /*
+ * first cacheline holds all the fields needed for an uncontended cache
+ * hit to be fully processed. The semaphore straddles the cacheline
+ * boundary, but the counter and lock sits on the first cacheline,
+ * which is the only bit that is touched if we hit the semaphore
+ * fast-path on locking.
+ */
+ struct rb_node b_rbnode; /* rbtree node */
+ xfs_off_t b_file_offset; /* offset in file */
+ size_t b_buffer_length;/* size of buffer in bytes */
+ atomic_t b_hold; /* reference count */
+ xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
- unsigned long b_queuetime; /* time buffer was queued */
- atomic_t b_pin_count; /* pin count */
+
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
- xfs_buf_flags_t b_flags; /* status flags */
- struct rb_node b_rbnode; /* rbtree node */
struct xfs_perag *b_pag; /* contains rbtree root */
xfs_buftarg_t *b_target; /* buffer target (device) */
- atomic_t b_hold; /* reference count */
xfs_daddr_t b_bn; /* block number for I/O */
- xfs_off_t b_file_offset; /* offset in file */
- size_t b_buffer_length;/* size of buffer in bytes */
size_t b_count_desired;/* desired transfer size */
void *b_addr; /* virtual address of buffer */
struct work_struct b_iodone_work;
- atomic_t b_io_remaining; /* #outstanding I/O requests */
xfs_buf_iodone_t b_iodone; /* I/O completion function */
xfs_buf_relse_t b_relse; /* releasing function */
struct completion b_iowait; /* queue for I/O waiters */
void *b_fspriv;
void *b_fspriv2;
- unsigned short b_error; /* error code on I/O */
- unsigned int b_page_count; /* size of page array */
- unsigned int b_offset; /* page offset in first page */
struct page **b_pages; /* array of page pointers */
struct page *b_page_array[XB_PAGES]; /* inline pages */
+ unsigned long b_queuetime; /* time buffer was queued */
+ atomic_t b_pin_count; /* pin count */
+ atomic_t b_io_remaining; /* #outstanding I/O requests */
+ unsigned int b_page_count; /* size of page array */
+ unsigned int b_offset; /* page offset in first page */
+ unsigned short b_error; /* error code on I/O */
#ifdef XFS_BUF_LOCK_TRACKING
int b_last_holder;
#endif
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [PATCH 0/16] xfs: metadata scalability V2
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (14 preceding siblings ...)
2010-09-22 6:44 ` [PATCH 16/16] xfs: pack xfs_buf structure more tightly Dave Chinner
@ 2010-09-22 14:53 ` Christoph Hellwig
2010-09-22 20:55 ` Alex Elder
16 siblings, 0 replies; 39+ messages in thread
From: Christoph Hellwig @ 2010-09-22 14:53 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Wed, Sep 22, 2010 at 04:44:13PM +1000, Dave Chinner wrote:
> Note: The branch also contains Christoph's btree + dquot cleanups;
> they have been sitting in my tree being tested for quite some time
> now and have been tested all through this metadata scalability work.
These patches made it into the oss tree by now. What's still absent
there is the punch hole + zero patches from you. Alex, can you apply
these two?
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [PATCH 0/16] xfs: metadata scalability V2
2010-09-22 6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
` (15 preceding siblings ...)
2010-09-22 14:53 ` [PATCH 0/16] xfs: metadata scalability V2 Christoph Hellwig
@ 2010-09-22 20:55 ` Alex Elder
2010-09-23 0:46 ` [PATCH 15/16] xfs: convert buffer cache hash to rbtree Dave Chinner
16 siblings, 1 reply; 39+ messages in thread
From: Alex Elder @ 2010-09-22 20:55 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
> This patchset started out as a "convert the buffer cache to rbtrees"
> patch, and just grew from there as I peeled the onion from one
> bottleneck to another. The second version of this patch does not go
> as far as the first version - it drops the more radical changes as
> they are not ready for integration yet.
I saw patches 01-14 and 16, but no 15. Was that
intentionally excluded, or an oversight?
-Alex
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 39+ messages in thread* [PATCH 15/16] xfs: convert buffer cache hash to rbtree
2010-09-22 20:55 ` Alex Elder
@ 2010-09-23 0:46 ` Dave Chinner
0 siblings, 0 replies; 39+ messages in thread
From: Dave Chinner @ 2010-09-23 0:46 UTC (permalink / raw)
To: Alex Elder; +Cc: xfs
On Wed, Sep 22, 2010 at 03:55:32PM -0500, Alex Elder wrote:
> On Wed, 2010-09-22 at 16:44 +1000, Dave Chinner wrote:
> > This patchset started out as a "convert the buffer cache to rbtrees"
> > patch, and just grew from there as I peeled the onion from one
> > bottleneck to another. The second version of this patch does not go
> > as far as the first version - it drops the more radical changes as
> > they are not ready for integration yet.
>
> I saw patches 01-14 and 16, but no 15. Was that
> intentionally excluded, or an oversight?
No, it looks like it got eaten along the way - it was definitely
sent by the patchbomb script. It was the rbtree conversion. Patch
below.
--
Dave Chinner
david@fromorbit.com
commit 92473cd83181668e5785337db0441a0d90a626ec
Author: Dave Chinner <dchinner@redhat.com>
Date: Wed Sep 22 10:47:20 2010 +1000
xfs: convert buffer cache hash to rbtree
The buffer cache hash is showing typical hash scalability problems.
In large scale testing, the number of cached items grows far larger
than the hash can efficiently handle. Hence we need to move to a
self-scaling cache indexing mechanism.
I have selected rbtrees for indexing because they can have O(log n)
search scalability, and insert and remove cost is not excessive,
even on large trees. Hence we should be able to cache large numbers
of buffers without incurring the excessive cache miss search
penalties that the hash is imposing on us.
To ensure we still have parallel access to the cache, we need
multiple trees. Rather than hashing the buffers by disk address to
select a tree, it seems more sensible to separate trees by typical
access patterns. Most operations use buffers from within a single AG
at a time, so rather than searching lots of different lists,
separate the buffer indexes out into per-AG rbtrees. This means that
searches during metadata operation have a much higher chance of
hitting cache resident nodes, and that updates of the tree are less
likely to disturb trees being accessed on other CPUs doing
independent operations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 6de9513..7a2b860 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -188,7 +188,7 @@ _xfs_buf_initialize(
atomic_set(&bp->b_hold, 1);
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_list);
- INIT_LIST_HEAD(&bp->b_hash_list);
+ RB_CLEAR_NODE(&bp->b_rbnode);
init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
XB_SET_OWNER(bp);
bp->b_target = target;
@@ -262,8 +262,6 @@ xfs_buf_free(
{
trace_xfs_buf_free(bp, _RET_IP_);
- ASSERT(list_empty(&bp->b_hash_list));
-
if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
uint i;
@@ -422,8 +420,10 @@ _xfs_buf_find(
{
xfs_off_t range_base;
size_t range_length;
- xfs_bufhash_t *hash;
- xfs_buf_t *bp, *n;
+ struct xfs_perag *pag;
+ struct rb_node **rbp;
+ struct rb_node *parent;
+ xfs_buf_t *bp;
range_base = (ioff << BBSHIFT);
range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
ASSERT(!(range_length < (1 << btp->bt_sshift)));
ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
- hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
- spin_lock(&hash->bh_lock);
-
- list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
- ASSERT(btp == bp->b_target);
- if (bp->b_file_offset == range_base &&
- bp->b_buffer_length == range_length) {
+ /* get tree root */
+ pag = xfs_perag_get(btp->bt_mount,
+ xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+ /* walk tree */
+ spin_lock(&pag->pag_buf_lock);
+ rbp = &pag->pag_buf_tree.rb_node;
+ parent = NULL;
+ bp = NULL;
+ while (*rbp) {
+ parent = *rbp;
+ bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+ if (range_base < bp->b_file_offset)
+ rbp = &(*rbp)->rb_left;
+ else if (range_base > bp->b_file_offset)
+ rbp = &(*rbp)->rb_right;
+ else {
+ /*
+ * found a block offset match. If the range doesn't
+ * match, the only way this is allowed is if the buffer
+ * in the cache is stale and the transaction that made
+ * it stale has not yet committed. i.e. we are
+ * reallocating a busy extent. Skip this buffer and
+ * continue searching to the right for an exact match.
+ */
+ if (bp->b_buffer_length != range_length) {
+ ASSERT(bp->b_flags & XBF_STALE);
+ rbp = &(*rbp)->rb_right;
+ continue;
+ }
atomic_inc(&bp->b_hold);
goto found;
}
@@ -449,17 +472,21 @@ _xfs_buf_find(
if (new_bp) {
_xfs_buf_initialize(new_bp, btp, range_base,
range_length, flags);
- new_bp->b_hash = hash;
- list_add(&new_bp->b_hash_list, &hash->bh_list);
+ rb_link_node(&new_bp->b_rbnode, parent, rbp);
+ rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+ /* the buffer keeps the perag reference until it is freed */
+ new_bp->b_pag = pag;
+ spin_unlock(&pag->pag_buf_lock);
} else {
XFS_STATS_INC(xb_miss_locked);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
}
-
- spin_unlock(&hash->bh_lock);
return new_bp;
found:
- spin_unlock(&hash->bh_lock);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
/* Attempt to get the semaphore without sleeping,
* if this does not work then we need to drop the
@@ -809,27 +836,30 @@ void
xfs_buf_rele(
xfs_buf_t *bp)
{
- xfs_bufhash_t *hash = bp->b_hash;
+ struct xfs_perag *pag = bp->b_pag;
trace_xfs_buf_rele(bp, _RET_IP_);
- if (unlikely(!hash)) {
+ if (!pag) {
ASSERT(!bp->b_relse);
+ ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold))
xfs_buf_free(bp);
return;
}
+ ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
ASSERT(atomic_read(&bp->b_hold) > 0);
- if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+ if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
if (bp->b_relse) {
atomic_inc(&bp->b_hold);
- spin_unlock(&hash->bh_lock);
- (*(bp->b_relse)) (bp);
+ spin_unlock(&pag->pag_buf_lock);
+ bp->b_relse(bp);
} else {
ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
- list_del_init(&bp->b_hash_list);
- spin_unlock(&hash->bh_lock);
+ rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
xfs_buf_free(bp);
}
}
@@ -1429,56 +1459,24 @@ xfs_buf_iomove(
*/
void
xfs_wait_buftarg(
- xfs_buftarg_t *btp)
+ struct xfs_buftarg *btp)
{
- xfs_bufhash_t *hash;
- uint i;
+ struct xfs_perag *pag;
+ uint i;
- for (i = 0; i < (1 << btp->bt_hashshift); i++) {
- hash = &btp->bt_hash[i];
- spin_lock(&hash->bh_lock);
- while (!list_empty(&hash->bh_list)) {
- spin_unlock(&hash->bh_lock);
+ for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+ pag = xfs_perag_get(btp->bt_mount, i);
+ spin_lock(&pag->pag_buf_lock);
+ while (rb_first(&pag->pag_buf_tree)) {
+ spin_unlock(&pag->pag_buf_lock);
delay(100);
- spin_lock(&hash->bh_lock);
+ spin_lock(&pag->pag_buf_lock);
}
- spin_unlock(&hash->bh_lock);
- }
-}
-
-/*
- * Allocate buffer hash table for a given target.
- * For devices containing metadata (i.e. not the log/realtime devices)
- * we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
- xfs_buftarg_t *btp,
- int external)
-{
- unsigned int i;
-
- if (external) {
- btp->bt_hash = NULL;
- return;
- }
- btp->bt_hashshift = 12; /* 4096 buckets */
- btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
- sizeof(xfs_bufhash_t));
- for (i = 0; i < (1 << btp->bt_hashshift); i++) {
- spin_lock_init(&btp->bt_hash[i].bh_lock);
- INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
}
}
-STATIC void
-xfs_free_bufhash(
- xfs_buftarg_t *btp)
-{
- kmem_free_large(btp->bt_hash);
- btp->bt_hash = NULL;
-}
-
/*
* buftarg list for delwrite queue processing
*/
@@ -1511,7 +1509,6 @@ xfs_free_buftarg(
xfs_flush_buftarg(btp, 1);
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp);
- xfs_free_bufhash(btp);
iput(btp->bt_mapping->host);
/* Unregister the buftarg first so that we don't get a
@@ -1651,7 +1648,6 @@ xfs_alloc_buftarg(
goto error;
if (xfs_alloc_delwrite_queue(btp, fsname))
goto error;
- xfs_alloc_bufhash(btp, external);
return btp;
error:
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index d10a954..4f3c845 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -135,10 +135,6 @@ typedef struct xfs_buftarg {
unsigned int bt_sshift;
size_t bt_smask;
- /* per device buffer hash table */
- uint bt_hashshift;
- xfs_bufhash_t *bt_hash;
-
/* per device delwri queue */
struct task_struct *bt_task;
struct list_head bt_list;
@@ -172,8 +168,8 @@ typedef struct xfs_buf {
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
xfs_buf_flags_t b_flags; /* status flags */
- struct list_head b_hash_list; /* hash table list */
- xfs_bufhash_t *b_hash; /* hash table list start */
+ struct rb_node b_rbnode; /* rbtree node */
+ struct xfs_perag *b_pag; /* contains rbtree root */
xfs_buftarg_t *b_target; /* buffer target (device) */
atomic_t b_hold; /* reference count */
xfs_daddr_t b_bn; /* block number for I/O */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index baeec83..63c7a1a 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -233,6 +233,10 @@ typedef struct xfs_perag {
struct mutex pag_ici_reclaim_lock; /* serialisation point */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
+ /* buffer cache index */
+ spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
+ struct rb_root pag_buf_tree; /* ordered tree of active buffers */
+
/* for rcu-safe freeing */
struct rcu_head rcu_head;
#endif
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 59859c3..cfa2fb4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -479,6 +479,8 @@ xfs_initialize_perag(
rwlock_init(&pag->pag_ici_lock);
mutex_init(&pag->pag_ici_reclaim_lock);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+ spin_lock_init(&pag->pag_buf_lock);
+ pag->pag_buf_tree = RB_ROOT;
if (radix_tree_preload(GFP_NOFS))
goto out_unwind;
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 39+ messages in thread