public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
From: Dave Chinner <david@fromorbit.com>
To: xfs@oss.sgi.com
Subject: [PATCH 01/16] xfs: reduce the number of CIL lock round trips during commit
Date: Wed, 22 Sep 2010 16:44:14 +1000	[thread overview]
Message-ID: <1285137869-10310-2-git-send-email-david@fromorbit.com> (raw)
In-Reply-To: <1285137869-10310-1-git-send-email-david@fromorbit.com>

From: Dave Chinner <dchinner@redhat.com>

When commiting a transaction, we do a lock CIL state lock round trip
on every single log vector we insert into the CIL. This is resulting
in the lock being as hot as the inode and dcache locks on 8-way
create workloads. Rework the insertion loops to bring the number
of lock round trips to one per transaction for log vectors, and one
more do the busy extents.

Also change the allocation of the log vector buffer not to zero it
as we copy over the entire allocated buffer anyway.

This patch also includes a structural cleanup to the CIL item
insertion provided by Christoph Hellwig.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log_cil.c |  232 +++++++++++++++++++++++++++-----------------------
 1 files changed, 127 insertions(+), 105 deletions(-)

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ed575fb..3c78a17 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -146,102 +146,6 @@ xlog_cil_init_post_recovery(
 }
 
 /*
- * Insert the log item into the CIL and calculate the difference in space
- * consumed by the item. Add the space to the checkpoint ticket and calculate
- * if the change requires additional log metadata. If it does, take that space
- * as well. Remove the amount of space we addded to the checkpoint ticket from
- * the current transaction ticket so that the accounting works out correctly.
- *
- * If this is the first time the item is being placed into the CIL in this
- * context, pin it so it can't be written to disk until the CIL is flushed to
- * the iclog and the iclog written to disk.
- */
-static void
-xlog_cil_insert(
-	struct log		*log,
-	struct xlog_ticket	*ticket,
-	struct xfs_log_item	*item,
-	struct xfs_log_vec	*lv)
-{
-	struct xfs_cil		*cil = log->l_cilp;
-	struct xfs_log_vec	*old = lv->lv_item->li_lv;
-	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
-	int			len;
-	int			diff_iovecs;
-	int			iclog_space;
-
-	if (old) {
-		/* existing lv on log item, space used is a delta */
-		ASSERT(!list_empty(&item->li_cil));
-		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
-
-		len = lv->lv_buf_len - old->lv_buf_len;
-		diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
-		kmem_free(old->lv_buf);
-		kmem_free(old);
-	} else {
-		/* new lv, must pin the log item */
-		ASSERT(!lv->lv_item->li_lv);
-		ASSERT(list_empty(&item->li_cil));
-
-		len = lv->lv_buf_len;
-		diff_iovecs = lv->lv_niovecs;
-		IOP_PIN(lv->lv_item);
-
-	}
-	len += diff_iovecs * sizeof(xlog_op_header_t);
-
-	/* attach new log vector to log item */
-	lv->lv_item->li_lv = lv;
-
-	spin_lock(&cil->xc_cil_lock);
-	list_move_tail(&item->li_cil, &cil->xc_cil);
-	ctx->nvecs += diff_iovecs;
-
-	/*
-	 * If this is the first time the item is being committed to the CIL,
-	 * store the sequence number on the log item so we can tell
-	 * in future commits whether this is the first checkpoint the item is
-	 * being committed into.
-	 */
-	if (!item->li_seq)
-		item->li_seq = ctx->sequence;
-
-	/*
-	 * Now transfer enough transaction reservation to the context ticket
-	 * for the checkpoint. The context ticket is special - the unit
-	 * reservation has to grow as well as the current reservation as we
-	 * steal from tickets so we can correctly determine the space used
-	 * during the transaction commit.
-	 */
-	if (ctx->ticket->t_curr_res == 0) {
-		/* first commit in checkpoint, steal the header reservation */
-		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
-		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
-		ticket->t_curr_res -= ctx->ticket->t_unit_res;
-	}
-
-	/* do we need space for more log record headers? */
-	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
-	if (len > 0 && (ctx->space_used / iclog_space !=
-				(ctx->space_used + len) / iclog_space)) {
-		int hdrs;
-
-		hdrs = (len + iclog_space - 1) / iclog_space;
-		/* need to take into account split region headers, too */
-		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
-		ctx->ticket->t_unit_res += hdrs;
-		ctx->ticket->t_curr_res += hdrs;
-		ticket->t_curr_res -= hdrs;
-		ASSERT(ticket->t_curr_res >= len);
-	}
-	ticket->t_curr_res -= len;
-	ctx->space_used += len;
-
-	spin_unlock(&cil->xc_cil_lock);
-}
-
-/*
  * Format log item into a flat buffers
  *
  * For delayed logging, we need to hold a formatted buffer containing all the
@@ -286,7 +190,7 @@ xlog_cil_format_items(
 			len += lv->lv_iovecp[index].i_len;
 
 		lv->lv_buf_len = len;
-		lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+		lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
 		ptr = lv->lv_buf;
 
 		for (index = 0; index < lv->lv_niovecs; index++) {
@@ -300,21 +204,136 @@ xlog_cil_format_items(
 	}
 }
 
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+	struct log		*log,
+	struct xfs_log_vec	*lv,
+	int			*len,
+	int			*diff_iovecs)
+{
+	struct xfs_log_vec	*old = lv->lv_item->li_lv;
+
+	if (old) {
+		/* existing lv on log item, space used is a delta */
+		ASSERT(!list_empty(&lv->lv_item->li_cil));
+		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+		*len += lv->lv_buf_len - old->lv_buf_len;
+		*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
+		kmem_free(old->lv_buf);
+		kmem_free(old);
+	} else {
+		/* new lv, must pin the log item */
+		ASSERT(!lv->lv_item->li_lv);
+		ASSERT(list_empty(&lv->lv_item->li_cil));
+
+		*len += lv->lv_buf_len;
+		*diff_iovecs += lv->lv_niovecs;
+		IOP_PIN(lv->lv_item);
+
+	}
+
+	/* attach new log vector to log item */
+	lv->lv_item->li_lv = lv;
+
+	/*
+	 * If this is the first time the item is being committed to the
+	 * CIL, store the sequence number on the log item so we can
+	 * tell in future commits whether this is the first checkpoint
+	 * the item is being committed into.
+	 */
+	if (!lv->lv_item->li_seq)
+		lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+
+/*
+ * Insert the log items into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we addded to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ */
 static void
 xlog_cil_insert_items(
 	struct log		*log,
 	struct xfs_log_vec	*log_vector,
-	struct xlog_ticket	*ticket,
-	xfs_lsn_t		*start_lsn)
+	struct xlog_ticket	*ticket)
 {
-	struct xfs_log_vec *lv;
-
-	if (start_lsn)
-		*start_lsn = log->l_cilp->xc_ctx->sequence;
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
+	struct xfs_log_vec	*lv;
+	int			len = 0;
+	int			diff_iovecs = 0;
+	int			iclog_space;
 
 	ASSERT(log_vector);
+
+	/*
+	 * Do all the accounting aggregation and switching of log vectors
+	 * around in a separate loop to the insertion of items into the CIL.
+	 * Then we can do a separate loop to update the CIL within a single
+	 * lock/unlock pair. This reduces the number of round trips on the CIL
+	 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
+	 * hold time for the transaction commit.
+	 *
+	 * If this is the first time the item is being placed into the CIL in
+	 * this context, pin it so it can't be written to disk until the CIL is
+	 * flushed to the iclog and the iclog written to disk.
+	 *
+	 * We can do this safely because the context can't checkpoint until we
+	 * are done so it doesn't matter exactly how we update the CIL.
+	 */
+	for (lv = log_vector; lv; lv = lv->lv_next)
+		xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+
+	/* account for space used by new iovec headers  */
+	len += diff_iovecs * sizeof(xlog_op_header_t);
+
+	spin_lock(&cil->xc_cil_lock);
+
+	/* move the items to the tail of the CIL */
 	for (lv = log_vector; lv; lv = lv->lv_next)
-		xlog_cil_insert(log, ticket, lv->lv_item, lv);
+		list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+
+	ctx->nvecs += diff_iovecs;
+
+	/*
+	 * Now transfer enough transaction reservation to the context ticket
+	 * for the checkpoint. The context ticket is special - the unit
+	 * reservation has to grow as well as the current reservation as we
+	 * steal from tickets so we can correctly determine the space used
+	 * during the transaction commit.
+	 */
+	if (ctx->ticket->t_curr_res == 0) {
+		/* first commit in checkpoint, steal the header reservation */
+		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+		ticket->t_curr_res -= ctx->ticket->t_unit_res;
+	}
+
+	/* do we need space for more log record headers? */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	if (len > 0 && (ctx->space_used / iclog_space !=
+				(ctx->space_used + len) / iclog_space)) {
+		int hdrs;
+
+		hdrs = (len + iclog_space - 1) / iclog_space;
+		/* need to take into account split region headers, too */
+		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+		ctx->ticket->t_unit_res += hdrs;
+		ctx->ticket->t_curr_res += hdrs;
+		ticket->t_curr_res -= hdrs;
+		ASSERT(ticket->t_curr_res >= len);
+	}
+	ticket->t_curr_res -= len;
+	ctx->space_used += len;
+
+	spin_unlock(&cil->xc_cil_lock);
 }
 
 static void
@@ -632,7 +651,10 @@ xfs_log_commit_cil(
 
 	/* lock out background commit */
 	down_read(&log->l_cilp->xc_ctx_lock);
-	xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
+	if (commit_lsn)
+		*commit_lsn = log->l_cilp->xc_ctx->sequence;
+
+	xlog_cil_insert_items(log, log_vector, tp->t_ticket);
 
 	/* check we didn't blow the reservation */
 	if (tp->t_ticket->t_curr_res < 0)
-- 
1.7.1

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

  reply	other threads:[~2010-09-22  6:43 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-09-22  6:44 [PATCH 0/16] xfs: metadata scalability V2 Dave Chinner
2010-09-22  6:44 ` Dave Chinner [this message]
2010-09-22 16:51   ` [PATCH 01/16] xfs: reduce the number of CIL lock round trips during commit Christoph Hellwig
2010-09-22 19:57   ` Alex Elder
2010-09-22  6:44 ` [PATCH 02/16] xfs: remove debug assert for per-ag reference counting Dave Chinner
2010-09-22  6:44 ` [PATCH 03/16] xfs: lockless per-ag lookups Dave Chinner
2010-09-22  6:44 ` [PATCH 04/16] xfs: don't use vfs writeback for pure metadata modifications Dave Chinner
2010-09-22 17:24   ` Christoph Hellwig
2010-09-23  0:36     ` Dave Chinner
2010-09-23 16:19   ` Alex Elder
2010-09-22  6:44 ` [PATCH 05/16] xfs: rename xfs_buf_get_nodaddr to be more appropriate Dave Chinner
2010-09-22 17:25   ` Christoph Hellwig
2010-09-23  0:37     ` Dave Chinner
2010-09-23 16:22   ` Alex Elder
2010-09-22  6:44 ` [PATCH 06/16] xfs: introduced uncached buffer read primitve Dave Chinner
2010-09-22  6:44 ` [PATCH 07/16] xfs: store xfs_mount in the buftarg instead of in the xfs_buf Dave Chinner
2010-09-22  6:44 ` [PATCH 08/16] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
2010-09-22  6:44 ` [PATCH 09/16] xfs: use unhashed buffers for size checks Dave Chinner
2010-09-22  6:44 ` [PATCH 10/16] xfs: remove buftarg hash for external devices Dave Chinner
2010-09-22  6:44 ` [PATCH 11/16] xfs: split inode AG walking into separate code for reclaim Dave Chinner
2010-09-22 17:28   ` Christoph Hellwig
2010-09-23 16:45   ` Alex Elder
2010-09-22  6:44 ` [PATCH 12/16] xfs: implement batched inode lookups for AG walking Dave Chinner
2010-09-22 17:33   ` Christoph Hellwig
2010-09-23  0:40     ` Dave Chinner
2010-09-23 17:17   ` Alex Elder
2010-09-24  9:15     ` Dave Chinner
2010-09-27 16:05       ` Alex Elder
2010-09-27 17:43       ` Alex Elder
2010-09-22  6:44 ` [PATCH 13/16] xfs: batch inode reclaim lookup Dave Chinner
2010-09-22 17:34   ` Christoph Hellwig
2010-09-23  0:43     ` Dave Chinner
2010-09-23 17:39   ` Alex Elder
2010-09-22  6:44 ` [PATCH 14/16] xfs: serialise inode reclaim within an AG Dave Chinner
2010-09-23 17:50   ` Alex Elder
2010-09-22  6:44 ` [PATCH 16/16] xfs; pack xfs_buf structure more tightly Dave Chinner
2010-09-22 14:53 ` [PATCH 0/16] xfs: metadata scalability V2 Christoph Hellwig
2010-09-22 20:55 ` Alex Elder
2010-09-23  0:46   ` [PATCH 15/16] xfs: convert buffer cache hash to rbtree Dave Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1285137869-10310-2-git-send-email-david@fromorbit.com \
    --to=david@fromorbit.com \
    --cc=xfs@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox