All of lore.kernel.org
 help / color / mirror / Atom feed
From: scjody@sun.com
To: linux-ext4@vger.kernel.org, linux-raid@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, Andreas Dilger <adilger@sun.com>
Subject: [patch 3/4] [jbd] Add support for journal guided resync.
Date: Thu, 01 Oct 2009 18:39:32 -0400	[thread overview]
Message-ID: <20091001224018.740641147@sun.com> (raw)
In-Reply-To: 20091001223929.120106893@sun.com

[-- Attachment #1: jbd-journal-guided-resync-infra.patch --]
[-- Type: TEXT/PLAIN, Size: 34517 bytes --]

Adds support for declare blocks, used by ext3's journal guided resync
(declared mode).  A declare block is added to the journal to list the data
blocks to be written during the current transaction.  During journal replay,
we perform a RAID resync of only these blocks and skip the rest of the resync.

Index: linux-2.6.18-128.1.6/fs/jbd/checkpoint.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/checkpoint.c
+++ linux-2.6.18-128.1.6/fs/jbd/checkpoint.c
@@ -712,6 +712,8 @@ void __journal_drop_transaction(journal_
 
 	J_ASSERT(transaction->t_state == T_FINISHED);
 	J_ASSERT(transaction->t_buffers == NULL);
+	J_ASSERT(transaction->t_declare_root.rnode == NULL);
+	J_ASSERT(transaction->t_declare_done_root.rnode == NULL);
 	J_ASSERT(transaction->t_sync_datalist == NULL);
 	J_ASSERT(transaction->t_forget == NULL);
 	J_ASSERT(transaction->t_iobuf_list == NULL);
Index: linux-2.6.18-128.1.6/fs/jbd/commit.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/commit.c
+++ linux-2.6.18-128.1.6/fs/jbd/commit.c
@@ -373,6 +373,262 @@ static inline __u32 jbd_checksum_data(__
 	return checksum;
 }
 
+int wait_for_descriptors(journal_t *journal, transaction_t *trans) {
+	struct journal_head *jh;
+	struct buffer_head *bh;
+	int err = 0;
+	/* Drain trans->t_log_list; returns 0, or -EIO if a buffer failed. */
+wait_for_ctlbuf:
+	/* the list is re-read from here after every wait or reschedule */
+	while (trans->t_log_list != NULL) {
+
+		jh = trans->t_log_list->b_tprev;
+		bh = jh2bh(jh);
+		if (buffer_locked(bh)) {
+			wait_on_buffer(bh);
+			goto wait_for_ctlbuf;
+		}
+		if (cond_resched())
+			goto wait_for_ctlbuf;
+		/* unlocked but not up to date is reported as -EIO */
+		if (unlikely(!buffer_uptodate(bh)))
+			err = -EIO;
+		/* keep going after an error so that the list is still emptied */
+		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+		clear_buffer_jwrite(bh);
+		journal_unfile_buffer(journal, jh);
+		journal_put_journal_head(jh);
+		__brelse(bh);		/* One for getblk */
+	}
+
+	return err;
+}
+
+struct journal_head *get_descriptor(journal_t *journal, transaction_t *trans,
+				    int blocktype, char **tagp, int *space_left) {
+	struct journal_head *descriptor;
+	struct buffer_head *dbh;
+	journal_header_t *header;
+	/* Set up a journal block of type @blocktype; NULL if none was got. */
+	jbd_debug(4, "JBD: get descriptor\n");
+
+	descriptor = journal_get_descriptor_buffer(journal);
+	if (!descriptor)
+		return NULL;
+	/* fill in the journal header; the fields are stored big-endian */
+	dbh = jh2bh(descriptor);
+	jbd_debug(4, "JBD: got buffer %llu (%p)\n",
+	    (unsigned long long)dbh->b_blocknr, dbh->b_data);
+	header = (journal_header_t *)&dbh->b_data[0];
+	header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
+	header->h_blocktype = cpu_to_be32(blocktype);
+	header->h_sequence  = cpu_to_be32(trans->t_tid);
+	/* hand back the area after the header and the bytes left in it */
+	*tagp = &dbh->b_data[sizeof(journal_header_t)];
+	*space_left = dbh->b_size - sizeof(journal_header_t);
+	/* mark the buffer jwrite and dirty; it is not submitted here */
+	set_buffer_jwrite(dbh);
+	set_buffer_dirty(dbh);
+
+	/* Record it so that we can wait for it later */
+	BUFFER_TRACE(dbh, "ph3: file as descriptor");
+	journal_file_buffer(descriptor, trans, BJ_LogCtl);
+
+	return descriptor;
+}
+
+/*
+ * Write declare blocks listing up to t_declare_request of the data blocks
+ * that will be written out, then refile the declared buffers as BJ_SyncData.
+ */
+void write_declare_blocks(journal_t *journal, transaction_t *transaction,
+			  int committing)
+{
+	struct journal_head *jh, *descriptor = NULL;
+	struct buffer_head *bh;
+	int i, bufs = 0, err;
+	unsigned int n, count = 0, to_write;
+	unsigned long nextblock = 0;
+	char *tagp = NULL;
+	journal_block_tag_t *tag = NULL;
+	int space_left = 0, first_tag = 0, tag_flag;
+	struct radix_tree_root *root;
+
+	root = &transaction->t_declare_root;
+	/* claim the outstanding request; UINT_MAX stands for "all blocks" */
+	spin_lock(&journal->j_list_lock);
+	to_write = transaction->t_declare_request;
+	transaction->t_declare_request = 0;
+	spin_unlock(&journal->j_list_lock);
+
+	if (to_write == UINT_MAX)
+		jbd_debug (1, "jbd: tid %d write declare request for ALL "
+			   "blocks\n", transaction->t_tid);
+	else
+		jbd_debug (1, "jbd: tid %d write declare request for %u "
+			   "blocks\n", transaction->t_tid, to_write);
+write_declare:
+	cond_resched();
+	spin_lock(&journal->j_list_lock);
+
+	n = radix_tree_gang_lookup(root, journal->j_declare_jhs, nextblock, 1);
+	while (n) {
+		if (!descriptor) {
+			J_ASSERT(bufs == 0);
+
+			spin_unlock(&journal->j_list_lock);
+
+			descriptor = get_descriptor(journal, transaction,
+						    JFS_DECLARE_BLOCK,
+						    &tagp, &space_left);
+
+			if (!descriptor) {
+				journal_abort(journal, -EIO);
+				return;
+			}
+
+			first_tag = 1;
+			journal->j_declare_bhs[bufs++] = jh2bh(descriptor);
+
+			goto write_declare;
+		}
+
+		jh = (struct journal_head *)journal->j_declare_jhs[0];
+		bh = jh2bh(jh);
+
+		/* refile the buffer as having been declared */
+		if (!inverted_lock(journal, bh))
+			goto write_declare;
+		__journal_unfile_buffer(jh);
+		__journal_file_buffer(jh, transaction, BJ_DeclareDone);
+
+		jbd_unlock_bh_state(bh);
+
+		/* record the block's tag in the current descriptor buffer */
+		tag_flag = 0;
+		if (!first_tag)
+			tag_flag |= JFS_FLAG_SAME_UUID;
+
+		tag = (journal_block_tag_t *)tagp;
+		tag->t_blocknr = cpu_to_be32(bh->b_blocknr);
+		tag->t_flags = cpu_to_be32(tag_flag);
+		tagp += sizeof(journal_block_tag_t);
+		space_left -= sizeof(journal_block_tag_t);
+
+		if (first_tag) {
+			memcpy (tagp, journal->j_uuid, 16);
+			tagp += 16;
+			space_left -= 16;
+			first_tag = 0;
+		}
+
+		count++;
+
+		/* advance to the next journal head and buffer */
+		nextblock = bh->b_blocknr + 1;
+		n = radix_tree_gang_lookup(root, journal->j_declare_jhs,
+					   nextblock, 1);
+
+		/* If there's no more to do, or if the descriptor is full,
+		   let the IO rip! */
+
+		if (bufs == ARRAY_SIZE(journal->j_declare_bhs) || n == 0 ||
+		    count == to_write ||
+		    space_left < sizeof(journal_block_tag_t) + 16) {
+
+			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+
+			/* Write an end-of-descriptor marker before
+			 * submitting the IOs.  "tag" still points to
+			 * the last tag we set up.
+			 */
+
+			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
+
+			spin_unlock(&journal->j_list_lock);
+
+			for (i = 0; i < bufs; i++) {
+				struct buffer_head *wbh = journal->j_declare_bhs[i];
+				lock_buffer(wbh);
+				clear_buffer_dirty(wbh);
+				set_buffer_uptodate(wbh);
+				wbh->b_end_io = journal_end_buffer_io_sync;
+				submit_bh(WRITE, wbh);
+			}
+
+			cond_resched();
+			spin_lock(&journal->j_list_lock);
+
+			/* force a new descriptor to be generated next time */
+			descriptor = NULL;
+			bufs = 0;
+
+			/* need to redo tree lookup since we lost the lock,
+			   but that will happen after we get a new descriptor */
+		}
+
+		if (count == to_write) break;
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	jbd_debug(2, "jbd: tid %d wrote declarations for %u blocks\n",
+		  transaction->t_tid, count);
+	if (to_write == UINT_MAX)
+		J_ASSERT(transaction->t_declare_root.rnode == NULL);
+
+	/* wait for the declare blocks; a failed write aborts the journal */
+	err = wait_for_descriptors(journal, transaction);
+	if (err)
+		journal_abort(journal, err);
+	/* move the declared buffers to the sync data list */
+	root = &transaction->t_declare_done_root;
+	count = 0;
+	nextblock = 0;
+
+move_declare:
+	cond_resched();
+	spin_lock(&journal->j_list_lock);
+
+	while ((n = radix_tree_gang_lookup(root, journal->j_declare_jhs,
+					   nextblock,
+					   ARRAY_SIZE(journal->j_declare_jhs)))) {
+		/* loop and move the journal heads */
+		for (i = 0; i < n; i++) {
+			jh = journal->j_declare_jhs[i];
+			bh = jh2bh(jh);
+
+			if (!inverted_lock(journal, bh)) {
+				goto move_declare;
+			}
+			__journal_unfile_buffer(jh);
+
+			if (committing)
+				/* set buffer dirty for writing below */
+				set_buffer_dirty(bh);
+			else
+				/* set page dirty for virtual memory */
+				mark_buffer_dirty(bh);
+
+			__journal_file_buffer(jh, transaction, BJ_SyncData);
+
+			count++;
+
+			nextblock = bh->b_blocknr + 1;
+
+			jbd_unlock_bh_state(bh);
+
+			if (lock_need_resched(&journal->j_list_lock)) {
+				spin_unlock(&journal->j_list_lock);
+				goto move_declare;
+			}
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	jbd_debug(2, "jbd: tid %d moved %u declare blocks\n",
+		  transaction->t_tid, count);
+}
+
 /*
  * journal_commit_transaction
  *
@@ -390,7 +646,6 @@ void journal_commit_transaction(journal_
 	int err;
 	unsigned long blocknr;
 	char *tagp = NULL;
-	journal_header_t *header;
 	journal_block_tag_t *tag = NULL;
 	int space_left = 0;
 	int first_tag = 0;
@@ -517,6 +772,11 @@ void journal_commit_transaction(journal_
 
 	jbd_debug (3, "JBD: commit phase 2\n");
 
+	if (journal->j_flags & JFS_DECLARE) {
+		commit_transaction->t_declare_request = UINT_MAX;
+		write_declare_blocks(journal, commit_transaction, 1);
+	}
+
 	/*
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
@@ -545,9 +805,13 @@ void journal_commit_transaction(journal_
 	 * If we found any dirty or locked buffers, then we should have
 	 * looped back up to the write_out_data label.  If there weren't
 	 * any then journal_clean_data_list should have wiped the list
-	 * clean by now, so check that it is in fact empty.
+	 * clean by now, so check that it is in fact empty.  Also check
+	 * declared mode trees - write_declare_blocks() should have left
+	 * them empty.
 	 */
-	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+	J_ASSERT(commit_transaction->t_declare_root.rnode == NULL);
+	J_ASSERT(commit_transaction->t_declare_done_root.rnode == NULL);
 
 	jbd_debug (3, "JBD: commit phase 3\n");
 
@@ -596,38 +860,20 @@ void journal_commit_transaction(journal_
 		   record the metadata buffer. */
 
 		if (!descriptor) {
-			struct buffer_head *bh;
-
 			J_ASSERT (bufs == 0);
 
-			jbd_debug(4, "JBD: get descriptor\n");
+			descriptor = get_descriptor(journal,
+						    commit_transaction,
+						    JFS_DESCRIPTOR_BLOCK,
+						    &tagp, &space_left);
 
-			descriptor = journal_get_descriptor_buffer(journal);
 			if (!descriptor) {
 				journal_abort(journal, -EIO);
 				continue;
 			}
 
-			bh = jh2bh(descriptor);
-			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
-				(unsigned long long)bh->b_blocknr, bh->b_data);
-			header = (journal_header_t *)&bh->b_data[0];
-			header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
-			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
-			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
-
-			tagp = &bh->b_data[sizeof(journal_header_t)];
-			space_left = bh->b_size - sizeof(journal_header_t);
 			first_tag = 1;
-			set_buffer_jwrite(bh);
-			set_buffer_dirty(bh);
-			wbuf[bufs++] = bh;
-
-			/* Record it so that we can wait for IO
-                           completion later */
-			BUFFER_TRACE(bh, "ph3: file as descriptor");
-			journal_file_buffer(descriptor, commit_transaction,
-					BJ_LogCtl);
+			wbuf[bufs++] = jh2bh(descriptor);
 		}
 
 		/* Where is the buffer to be written? */
@@ -826,29 +1072,7 @@ wait_for_iobuf:
 	jbd_debug(3, "JBD: commit phase 5\n");
 
 	/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
-	while (commit_transaction->t_log_list != NULL) {
-		struct buffer_head *bh;
-
-		jh = commit_transaction->t_log_list->b_tprev;
-		bh = jh2bh(jh);
-		if (buffer_locked(bh)) {
-			wait_on_buffer(bh);
-			goto wait_for_ctlbuf;
-		}
-		if (cond_resched())
-			goto wait_for_ctlbuf;
-
-		if (unlikely(!buffer_uptodate(bh)))
-			err = -EIO;
-
-		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
-		clear_buffer_jwrite(bh);
-		journal_unfile_buffer(journal, jh);
-		journal_put_journal_head(jh);
-		__brelse(bh);		/* One for getblk */
-		/* AKPM: bforget here */
-	}
+	err = wait_for_descriptors(journal, commit_transaction);
 
 	if (err)
 		journal_abort(journal, err);
@@ -904,6 +1128,8 @@ wait_for_iobuf:
 	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
 	J_ASSERT(commit_transaction->t_log_list == NULL);
+	J_ASSERT(commit_transaction->t_declare_root.rnode == NULL);
+	J_ASSERT(commit_transaction->t_declare_done_root.rnode == NULL);
 
 restart_loop:
 	/*
Index: linux-2.6.18-128.1.6/fs/jbd/journal.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/journal.c
+++ linux-2.6.18-128.1.6/fs/jbd/journal.c
@@ -86,6 +86,10 @@ EXPORT_SYMBOL(journal_invalidatepage);
 EXPORT_SYMBOL(journal_try_to_free_buffers);
 EXPORT_SYMBOL(journal_bmap);
 EXPORT_SYMBOL(journal_force_commit);
+EXPORT_SYMBOL(journal_write_declare);
+
+extern void write_declare_blocks(journal_t *journal,
+			  transaction_t *commit_transaction, int committing);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
@@ -156,6 +160,16 @@ loop:
 		journal_commit_transaction(journal);
 		spin_lock(&journal->j_state_lock);
 		goto loop;
+	} else if (journal->j_flags & JFS_DECLARE &&
+		   (transaction = journal->j_running_transaction) &&
+		   transaction->t_declare_request) {
+		jbd_debug(2, "early declare\n");
+		spin_unlock(&journal->j_state_lock);
+		write_declare_blocks(journal, transaction, 0);
+		spin_lock(&journal->j_state_lock);
+
+		wake_up(&journal->j_wait_declare);
+		goto loop;
 	}
 
 	wake_up(&journal->j_wait_done_commit);
@@ -494,6 +508,38 @@ int journal_force_commit_nested(journal_
 }
 
 /*
+ * For ext3_fsync: start a request to declare the file's data and wait
+ * for the declarations to complete.
+ */
+int journal_write_declare(journal_t *journal)
+{
+	transaction_t *transaction = journal->j_running_transaction;
+	DEFINE_WAIT(wait);
+	/* NOTE(review): j_running_transaction is sampled without a lock. */
+	if (transaction == NULL)
+		return 0;
+
+	spin_lock(&journal->j_list_lock);
+	/* empty declare tree: no buffers are waiting to be declared */
+	if (transaction->t_declare_root.rnode == NULL) {
+		spin_unlock(&journal->j_list_lock);
+		return 0;
+	}
+	/* UINT_MAX asks the commit thread to declare every pending block */
+	transaction->t_declare_request = UINT_MAX;
+
+	jbd_debug(1, "waking commit thread for fsync declare\n");
+	wake_up(&journal->j_wait_commit);
+	/* uninterruptible: a pending signal must not cut the wait short */
+	prepare_to_wait(&journal->j_wait_declare, &wait, TASK_UNINTERRUPTIBLE);
+	spin_unlock(&journal->j_list_lock);
+	schedule();
+	finish_wait(&journal->j_wait_declare, &wait);
+	/* NOTE(review): any wake_up of j_wait_declare ends the wait - confirm */
+	return 0;
+}
+
+/*
  * Start a commit of the current running transaction (if any).  Returns true
  * if a transaction was started, and fills its tid in at *ptid
  */
@@ -959,6 +1005,7 @@ static journal_t * journal_init_common (
 	init_waitqueue_head(&journal->j_wait_checkpoint);
 	init_waitqueue_head(&journal->j_wait_commit);
 	init_waitqueue_head(&journal->j_wait_updates);
+	init_waitqueue_head(&journal->j_wait_declare);
 	mutex_init(&journal->j_barrier);
 	mutex_init(&journal->j_checkpoint_mutex);
 	spin_lock_init(&journal->j_revoke_lock);
@@ -1292,6 +1339,8 @@ static int journal_get_superblock(journa
 
 	J_ASSERT(bh != NULL);
 	if (!buffer_uptodate(bh)) {
+		/* TODO: resync the superblock */
+
 		ll_rw_block(READ, 1, &bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
Index: linux-2.6.18-128.1.6/fs/jbd/recovery.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/recovery.c
+++ linux-2.6.18-128.1.6/fs/jbd/recovery.c
@@ -22,6 +22,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/crc32.h>
+#include <linux/raid/md.h>
 #endif
 
 /*
@@ -36,6 +37,9 @@ struct recovery_info 
 	int		nr_replays;
 	int		nr_revokes;
 	int		nr_revoke_hits;
+	int		nr_declared;
+
+	int		resync_errors;
 };
 
 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
@@ -43,6 +47,7 @@ static int do_one_pass(journal_t *journa
 				struct recovery_info *info, enum passtype pass);
 static int scan_revoke_records(journal_t *, struct buffer_head *,
 				tid_t, struct recovery_info *);
+static int journal_syncraid(journal_t *, unsigned long);
 
 #ifdef __KERNEL__
 
@@ -53,6 +58,37 @@ void journal_brelse_array(struct buffer_
 		brelse (b[n]);
 }
 
+static int resync_range(journal_t *j, unsigned long start,
+			unsigned long end)
+{
+	int err;
+	struct inode *fake_inode = kzalloc(sizeof(*fake_inode), GFP_KERNEL);
+	mdu_range_t range;
+	sector_t sectors_per_block = j->j_blocksize >> 9;
+	mm_segment_t old_fs;
+	/* Issue RESYNC_RANGE on j_fs_dev for blocks start..end, inclusive. */
+	if (fake_inode == NULL) {
+		printk(KERN_ERR "JBD: Out of memory during recovery.\n");
+		return -ENOMEM;
+	}
+	/* only i_bdev is set; kzalloc keeps every other field zeroed */
+	fake_inode->i_bdev = j->j_fs_dev;
+	range.start = start * sectors_per_block;
+	range.end = end * sectors_per_block + sectors_per_block - 1;
+	/* KERNEL_DS: the ioctl argument points at kernel memory (&range) */
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	err = blkdev_driver_ioctl(fake_inode, NULL, j->j_fs_dev->bd_disk,
+				  RESYNC_RANGE, (unsigned long)&range);
+	set_fs(old_fs);
+
+	jbd_debug(3, "RESYNC_RANGE of sectors %llu - %llu returned %d\n",
+		  (unsigned long long)range.start,
+		  (unsigned long long)range.end, err);
+	kfree(fake_inode);
+
+	return err;
+}
 
 /*
  * When reading from the journal, we are going through the block device
@@ -67,7 +103,7 @@ void journal_brelse_array(struct buffer_
  */
 
 #define MAXBUF 8
-static int do_readahead(journal_t *journal, unsigned int start)
+static int do_readahead(journal_t *journal, unsigned int start, int raid_sync)
 {
 	int err;
 	unsigned int max, nbufs, next;
@@ -95,6 +131,14 @@ static int do_readahead(journal_t *journ
 			goto failed;
 		}
 
+		/* For declared mode: perform a raid synchronization for the
+		 * journal blocks; this will resync all of the journal blocks
+		 * read, which is more than strictly necessary.
+		 */
+
+		if (raid_sync)
+			resync_range(journal, blocknr, blocknr);
+
 		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
 		if (!bh) {
 			err = -ENOMEM;
@@ -103,6 +147,7 @@ static int do_readahead(journal_t *journ
 
 		if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
 			bufs[nbufs++] = bh;
+
 			if (nbufs == MAXBUF) {
 				ll_rw_block(READ, nbufs, bufs);
 				journal_brelse_array(bufs, nbufs);
@@ -130,7 +175,7 @@ failed:
  */
 
 static int jread(struct buffer_head **bhp, journal_t *journal, 
-		 unsigned int offset)
+		 unsigned int offset, int sync_raid)
 {
 	int err;
 	unsigned long blocknr;
@@ -159,7 +204,7 @@ static int jread(struct buffer_head **bh
 		/* If this is a brand new buffer, start readahead.
                    Otherwise, we assume we are already reading it.  */
 		if (!buffer_req(bh))
-			do_readahead(journal, offset);
+			do_readahead(journal, offset, sync_raid);
 		wait_on_buffer(bh);
 	}
 
@@ -257,6 +302,30 @@ int journal_recover(journal_t *journal)
 	jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", 
 		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
 
+	if (!err && !info.resync_errors && JFS_HAS_INCOMPAT_FEATURE(journal,
+					JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)) {
+		/* Successful declared mode resync: instruct the block device
+		 * to skip its resync */
+		struct inode *fake_inode;
+
+		jbd_debug(0, "JBD: Resynced %d declared blocks\n",
+			  info.nr_declared);
+
+		fake_inode = kmalloc(sizeof(*fake_inode), GFP_KERNEL);
+		if (fake_inode) {
+			fake_inode->i_bdev = journal->j_fs_dev;
+			jbd_debug(1, "Sending SKIP_RESYNC ioctl\n");
+
+			blkdev_driver_ioctl(fake_inode, NULL,
+					    journal->j_fs_dev->bd_disk,
+					    SKIP_RESYNC, 0);
+		}
+		kfree(fake_inode);
+	}
+
+	journal_clear_features(journal, 0, 0,
+			       JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS);
+
 	/* Restart the log at the next transaction ID, thus invalidating
 	 * any existing commit records in the log. */
 	journal->j_transaction_sequence = ++info.end_transaction;
@@ -329,7 +398,7 @@ static int calc_chksums(journal_t *journ
 	for (i = 0; i < num_blks; i++) {
 		io_block = (*next_log_block)++;
 		wrap(journal, *next_log_block);
-		err = jread(&obh, journal, io_block);
+		err = jread(&obh, journal, io_block, 0);
 		if (err) {
 			printk(KERN_ERR "JBD: IO error %d recovering block "
 				"%lu in log\n", err, io_block);
@@ -355,6 +424,7 @@ static int do_one_pass(journal_t *journa
 	unsigned int		sequence;
 	int			blocktype;
 	__u32			crc32_sum = ~0; /* Transactional Checksums */
+	int			raid_sync_journal = 0, raid_sync_data = 0;
 
 	/* Precompute the maximum metadata descriptors in a descriptor block */
 	int			MAX_BLOCKS_PER_DESC;
@@ -397,9 +467,30 @@ static int do_one_pass(journal_t *journa
 		 * check right now that we haven't gone past the end of
 		 * the log. */
 
-		if (pass != PASS_SCAN)
-			if (tid_geq(next_commit_ID, info->end_transaction))
-				break;
+		if (pass != PASS_SCAN) {
+			if (tid_geq(next_commit_ID, info->end_transaction)) {
+				/* For declared mode resync, move ahead past
+				 * the last committed transaction to deal with
+				 * raid sync for declare blocks and the head
+				 * of the journal.
+				 */
+				if (pass == PASS_REPLAY &&
+				    JFS_HAS_INCOMPAT_FEATURE(journal,
+					 JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)) {
+					if (journal->j_fs_dev == journal->j_dev)
+						raid_sync_journal = 1;
+					if (!raid_sync_data)
+						jbd_debug(1, "Declared mode was used; "
+							  "performing raid sync %s\n",
+							  raid_sync_journal ?
+							  "of journal and data" :
+							  "of data");
+					raid_sync_data = 1;
+				}
+				else
+					break;
+			}
+		}
 
 		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
 			  next_commit_ID, next_log_block, journal->j_last);
@@ -409,7 +500,7 @@ static int do_one_pass(journal_t *journa
 		 * record. */
 
 		jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
-		err = jread(&bh, journal, next_log_block);
+		err = jread(&bh, journal, next_log_block, raid_sync_journal);
 		if (err)
 			goto failed;
 
@@ -426,6 +517,12 @@ static int do_one_pass(journal_t *journa
 
 		if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
 			brelse(bh);
+
+			/* raid sync the head of the journal */
+			if (raid_sync_journal) {
+				if (journal_syncraid(journal, next_log_block))
+					info->resync_errors++;
+			}
 			break;
 		}
 
@@ -436,6 +533,12 @@ static int do_one_pass(journal_t *journa
 
 		if (sequence != next_commit_ID) {
 			brelse(bh);
+
+			/* raid sync the head of the journal */
+			if (raid_sync_journal) {
+				if (journal_syncraid(journal, next_log_block))
+					info->resync_errors++;
+			}
 			break;
 		}
 
@@ -485,7 +588,8 @@ static int do_one_pass(journal_t *journa
 
 				io_block = next_log_block++;
 				wrap(journal, next_log_block);
-				err = jread(&obh, journal, io_block);
+				err = jread(&obh, journal, io_block,
+					    raid_sync_journal);
 				if (err) {
 					/* Recover what we can, but
 					 * report failure at the end. */
@@ -668,6 +772,42 @@ static int do_one_pass(journal_t *journa
 				goto failed;
 			continue;
 
+		case JFS_DECLARE_BLOCK:
+			if (!raid_sync_data) {
+				brelse(bh);
+				continue;
+			}
+
+			/* this is a declare block for an uncommitted
+			 * transaction, so raid sync all of the blocks it
+			 * describes
+			 */
+
+			tagp = &bh->b_data[sizeof(journal_header_t)];
+			while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
+			       <= journal->j_blocksize) {
+
+				unsigned long blocknr;
+
+				tag = (journal_block_tag_t *) tagp;
+				flags = be32_to_cpu(tag->t_flags);
+				blocknr = be32_to_cpu(tag->t_blocknr);
+
+				if (resync_range(journal, blocknr, blocknr))
+					++info->resync_errors;
+				++info->nr_declared;
+
+				tagp += sizeof(journal_block_tag_t);
+				if (!(flags & JFS_FLAG_SAME_UUID))
+					tagp += 16;
+
+				if (flags & JFS_FLAG_LAST_TAG)
+					break;
+			}
+
+			brelse(bh);
+			continue;
+
 		default:
 			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
 				  blocktype);
@@ -705,6 +845,38 @@ static int do_one_pass(journal_t *journa
 	return err;
 }
 
+/* RAID sync one quarter (j_maxlen / 4 blocks) of the journal from
+ * next_log_block on, since that part of the journal was likely undergoing
+ * writes before the crash.  Returns 0 or the first mapping/resync error.
+ */
+static int
+journal_syncraid(journal_t *journal, unsigned long next_log_block)
+{
+	int i, err;
+	unsigned long blocknr;
+
+	jbd_debug(2, "RAID resync of 1/4 of the journal starting at %lu\n",
+		  next_log_block);
+
+	for (i = 0; i < journal->j_maxlen / 4; i++) {
+		err = journal_bmap(journal, next_log_block, &blocknr);
+		/* journal_bmap() failed for this log offset: give up */
+		if (err) {
+			printk(KERN_ERR "JBD: bad block at offset %lu\n",
+			       next_log_block);
+			return err;
+		}
+		/* resync exactly this one block; stop at the first failure */
+		err = resync_range(journal, blocknr, blocknr);
+		if (err)
+			return err;
+		/* next log block; wrap() presumably folds it back at the end */
+		next_log_block++;
+		wrap(journal, next_log_block);
+	}
+
+	return 0;
+}
 
 /* Scan a revoke record, marking all blocks mentioned as revoked. */
 
Index: linux-2.6.18-128.1.6/fs/jbd/transaction.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/transaction.c
+++ linux-2.6.18-128.1.6/fs/jbd/transaction.c
@@ -58,6 +58,10 @@ get_transaction(journal_t *journal, tran
 	journal->j_commit_timer.expires = transaction->t_expires;
 	add_timer(&journal->j_commit_timer);
 
+	/* Initialize the declare radix tree */
+	INIT_RADIX_TREE(&transaction->t_declare_root, GFP_ATOMIC);
+	INIT_RADIX_TREE(&transaction->t_declare_done_root, GFP_ATOMIC);
+
 	J_ASSERT(journal->j_running_transaction == NULL);
 	journal->j_running_transaction = transaction;
 	transaction->t_max_wait = 0;
@@ -956,6 +960,7 @@ int journal_dirty_data(handle_t *handle,
 	journal_t *journal = handle->h_transaction->t_journal;
 	int need_brelse = 0;
 	struct journal_head *jh;
+	int jdatalist;
 
 	if (is_handle_aborted(handle))
 		return 0;
@@ -999,6 +1004,8 @@ int journal_dirty_data(handle_t *handle,
 		goto no_journal;
 	}
 
+	jdatalist = journal->j_flags & JFS_DECLARE ? BJ_Declare : BJ_SyncData;
+
 	if (jh->b_transaction) {
 		JBUFFER_TRACE(jh, "has transaction");
 		if (jh->b_transaction != handle->h_transaction) {
@@ -1041,6 +1048,8 @@ int journal_dirty_data(handle_t *handle,
 			 */
 			if (jh->b_jlist != BJ_None &&
 					jh->b_jlist != BJ_SyncData &&
+					jh->b_jlist != BJ_Declare &&
+					jh->b_jlist != BJ_DeclareDone &&
 					jh->b_jlist != BJ_Locked) {
 				JBUFFER_TRACE(jh, "Not stealing");
 				goto no_journal;
@@ -1088,18 +1097,19 @@ int journal_dirty_data(handle_t *handle,
 		 * committing transaction, so might still be left on that
 		 * transaction's metadata lists.
 		 */
-		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
+		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Declare &&
+		    jh->b_jlist != BJ_DeclareDone && jh->b_jlist != BJ_Locked) {
 			JBUFFER_TRACE(jh, "not on correct data list: unfile");
 			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
 			__journal_temp_unlink_buffer(jh);
 			jh->b_transaction = handle->h_transaction;
 			JBUFFER_TRACE(jh, "file as data");
 			__journal_file_buffer(jh, handle->h_transaction,
-						BJ_SyncData);
+						jdatalist);
 		}
 	} else {
 		JBUFFER_TRACE(jh, "not on a transaction");
-		__journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
+		__journal_file_buffer(jh, handle->h_transaction, jdatalist);
 	}
 no_journal:
 	spin_unlock(&journal->j_list_lock);
@@ -1578,6 +1588,7 @@ void __journal_temp_unlink_buffer(struct
 	struct journal_head **list = NULL;
 	transaction_t *transaction;
 	struct buffer_head *bh = jh2bh(jh);
+	struct radix_tree_root *root = NULL;
 
 	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
 	transaction = jh->b_transaction;
@@ -1617,9 +1628,25 @@ void __journal_temp_unlink_buffer(struct
 	case BJ_Locked:
 		list = &transaction->t_locked_list;
 		break;
+	case BJ_Declare:
+		root = &transaction->t_declare_root;
+		transaction->t_declare_count--;
+		break;
+	case BJ_DeclareDone:
+		root = &transaction->t_declare_done_root;
+		break;
+	}
+
+	if (jh->b_jlist == BJ_Declare || jh->b_jlist == BJ_DeclareDone) {
+		if ((radix_tree_delete(root, bh->b_blocknr)) != jh) {
+			printk(KERN_ERR
+				"jbd: ERROR radix tree delete block %8llu\n",
+				(unsigned long long)bh->b_blocknr);
+		}
 	}
+	else
+		__blist_del_buffer(list, jh);
 
-	__blist_del_buffer(list, jh);
 	jh->b_jlist = BJ_None;
 	if (test_clear_buffer_jbddirty(bh))
 		mark_buffer_dirty(bh);	/* Expose it to the VM */
@@ -1660,7 +1687,8 @@ __journal_try_to_free_buffer(journal_t *
 
 	spin_lock(&journal->j_list_lock);
 	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
-		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
+		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Declare ||
+		    jh->b_jlist == BJ_DeclareDone || jh->b_jlist == BJ_Locked) {
 			/* A written-back ordered data buffer */
 			JBUFFER_TRACE(jh, "release data");
 			__journal_unfile_buffer(jh);
@@ -2072,6 +2100,8 @@ void __journal_file_buffer(struct journa
 	struct journal_head **list = NULL;
 	int was_dirty = 0;
 	struct buffer_head *bh = jh2bh(jh);
+	struct radix_tree_root *root = NULL;
+	int declare_per_block;
 
 	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
 	assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -2126,15 +2156,44 @@ void __journal_file_buffer(struct journa
 		list = &transaction->t_reserved_list;
 		break;
 	case BJ_Locked:
-		list =  &transaction->t_locked_list;
+		list = &transaction->t_locked_list;
+		break;
+	case BJ_Declare:
+		root = &transaction->t_declare_root;
+		transaction->t_declare_count++;
 		break;
+	case BJ_DeclareDone:
+		root = &transaction->t_declare_done_root;
+		break;
+	}
+
+	if (jlist == BJ_Declare || jlist == BJ_DeclareDone) {
+		if ((radix_tree_insert(root, bh->b_blocknr, jh)) != 0) {
+			printk(KERN_ERR
+				"jbd: ERROR radix tree insert block %8lu\n",
+				(long unsigned)bh->b_blocknr);
+		}
+	} else {
+		__blist_add_buffer(list, jh);
 	}
 
-	__blist_add_buffer(list, jh);
 	jh->b_jlist = jlist;
 
 	if (was_dirty)
 		set_buffer_jbddirty(bh);
+
+	declare_per_block = (bh->b_size - (sizeof(journal_header_t) + 32)) /
+		sizeof(journal_block_tag_t);
+
+	/* wake up the commit thread to perform early declarations */
+	assert_spin_locked(&transaction->t_journal->j_list_lock);
+	if (transaction->t_journal->j_flags & JFS_DECLARE &&
+			jlist == BJ_Declare &&
+			transaction->t_declare_count >= declare_per_block) {
+		transaction->t_declare_request = transaction->t_declare_count /
+			declare_per_block * declare_per_block;
+		wake_up(&transaction->t_journal->j_wait_commit);
+	}
 }
 
 void journal_file_buffer(struct journal_head *jh,
Index: linux-2.6.18-128.1.6/include/linux/jbd.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/jbd.h
+++ linux-2.6.18-128.1.6/include/linux/jbd.h
@@ -26,6 +26,7 @@
 #include <linux/types.h>
 #include <linux/buffer_head.h>
 #include <linux/journal-head.h>
+#include <linux/radix-tree.h>
 #include <linux/stddef.h>
 #include <linux/bit_spinlock.h>
 #include <linux/mutex.h>
@@ -137,6 +138,7 @@ typedef struct journal_s	journal_t;	/* J
 #define JFS_SUPERBLOCK_V1	3
 #define JFS_SUPERBLOCK_V2	4
 #define JFS_REVOKE_BLOCK	5
+#define JFS_DECLARE_BLOCK	6
 
 /*
  * Standard header for all descriptor blocks:
@@ -261,12 +263,14 @@ typedef struct journal_superblock_s
 
 #define JFS_FEATURE_INCOMPAT_REVOKE		0x00000001
 #define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT    	0x00000004
+#define JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS    	0x00000008
 
 /* Features known to this kernel version: */
 #define JFS_KNOWN_COMPAT_FEATURES	JFS_FEATURE_COMPAT_CHECKSUM
 #define JFS_KNOWN_ROCOMPAT_FEATURES	0
 #define JFS_KNOWN_INCOMPAT_FEATURES	(JFS_FEATURE_INCOMPAT_REVOKE | \
-					JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
+					JFS_FEATURE_INCOMPAT_ASYNC_COMMIT | \
+					JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)
 
 #ifdef __KERNEL__
 
@@ -559,6 +563,15 @@ struct transaction_s 
 	struct journal_head	*t_sync_datalist;
 
 	/*
+	 * Radix tree of all data buffers that must be declared before being
+	 * written, declare mode counters [j_list_lock]
+	 */
+	struct radix_tree_root	 t_declare_root;
+	struct radix_tree_root	 t_declare_done_root;
+	unsigned int		 t_declare_count;
+	unsigned int		 t_declare_request;
+
+	/*
 	 * Doubly-linked circular list of all forget buffers (superseded
 	 * buffers which we can un-checkpoint once this transaction commits)
 	 * [j_list_lock]
@@ -730,6 +743,7 @@ jbd_time_diff(unsigned int start, unsign
  * @j_wait_checkpoint:  Wait queue to trigger checkpointing
  * @j_wait_commit: Wait queue to trigger commit
  * @j_wait_updates: Wait queue to wait for updates to complete
+ * @j_wait_declare: Wait queue to wait for declarations to complete
  * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
  * @j_head: Journal head - identifies the first unused block in the journal
  * @j_tail: Journal tail - identifies the oldest still-used block in the
@@ -768,6 +782,8 @@ jbd_time_diff(unsigned int start, unsign
  * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
  *	number that will fit in j_blocksize
  * @j_last_sync_writer: most recent pid which did a synchronous write
+ * @j_declare_jhs: array of journal_heads for write_declare_blocks
+ * @j_declare_bhs: array of buffer_heads for write_declare_blocks
  * @j_private: An opaque pointer to fs-private information.
  */
 
@@ -841,6 +857,9 @@ struct journal_s
 	/* Wait queue to wait for updates to complete */
 	wait_queue_head_t	j_wait_updates;
 
+	/* Wait queue to wait for declarations to complete */
+	wait_queue_head_t	j_wait_declare;
+
 	/* Semaphore for locking against concurrent checkpoints */
 	struct mutex	 	j_checkpoint_mutex;
 
@@ -970,6 +989,13 @@ struct journal_s
 	struct transaction_stats_s j_stats;
 
 	/*
+	 * Arrays of jhs and bhs for write_declare_blocks, to avoid
+	 * having to allocate them each time.
+	 */
+	void			*j_declare_jhs[64];
+	struct buffer_head	*j_declare_bhs[64];
+
+	/*
 	 * An opaque pointer to fs-private information.  ext3 puts its
 	 * superblock pointer here
 	 */
@@ -985,6 +1011,7 @@ struct journal_s
 #define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
 #define JFS_LOADED	0x010	/* The journal superblock has been loaded */
 #define JFS_BARRIER	0x020	/* Use IDE barriers */
+#define JFS_DECLARE	0x040	/* Declare data blocks before writing */
 
 /* 
  * Function declarations for the journaling transaction and buffer
@@ -1100,6 +1127,7 @@ extern void	   journal_ack_err    (journ
 extern int	   journal_clear_err  (journal_t *);
 extern int	   journal_bmap(journal_t *, unsigned long, unsigned long *);
 extern int	   journal_force_commit(journal_t *);
+extern int	   journal_write_declare(journal_t *);
 
 /*
  * journal_head management
@@ -1244,7 +1272,9 @@ static inline int jbd_space_needed(journ
 #define BJ_LogCtl	6	/* Buffer contains log descriptors */
 #define BJ_Reserved	7	/* Buffer is reserved for access by journal */
 #define BJ_Locked	8	/* Locked for I/O during commit */
-#define BJ_Types	9
+#define BJ_Declare	9	/* Needs to be declared first */
+#define BJ_DeclareDone	10	/* Has been declared */
+#define BJ_Types	11
  
 extern int jbd_blocks_per_page(struct inode *inode);
 

-- 

  parent reply	other threads:[~2009-10-01 22:40 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-10-01 22:39 [patch 0/4] Journal guided resync and support scjody
2009-10-01 22:39 ` [patch 1/4] [md] Add SKIP_RESYNC ioctl scjody
2009-10-01 22:39 ` [patch 2/4] [md] Add RESYNC_RANGE ioctl scjody
2009-10-01 22:39 ` scjody [this message]
2009-10-01 23:39   ` [patch 3/4] [jbd] Add support for journal guided resync Andrew Morton
2009-10-01 22:39 ` [patch 4/4] [ext3] Add journal guided resync (data=declared mode) scjody
2009-10-02  1:51   ` Neil Brown
2009-10-02 15:53     ` Jody McIntyre
2009-10-02  0:36 ` [patch 0/4] Journal guided resync and support Andi Kleen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091001224018.740641147@sun.com \
    --to=scjody@sun.com \
    --cc=adilger@sun.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-raid@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.