linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] ext4: add support for multiple mount protection
@ 2011-04-12 18:04 Johann Lombardi
  2011-04-12 19:20 ` Bernd Schubert
  2011-04-12 20:39 ` Eric Sandeen
  0 siblings, 2 replies; 14+ messages in thread
From: Johann Lombardi @ 2011-04-12 18:04 UTC (permalink / raw)
  To: linux-ext4; +Cc: Johann Lombardi, Andreas Dilger

Prevent an ext4 filesystem from being mounted multiple times.
A sequence number is stored on disk and is periodically updated (every 5
seconds by default) by a mounted filesystem.
At mount time, we now wait for s_mmp_update_interval seconds to make sure
that the MMP sequence does not change.
In case of failure, the nodename, bdevname and the time at which the MMP
block was last updated is displayed.

Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Johann Lombardi <johann@whamcloud.com>
---
 fs/ext4/ext4.h  |   56 ++++++++-
 fs/ext4/super.c |  363 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 416 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b..f130ac7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1028,7 +1028,7 @@ struct ext4_super_block {
 	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1201,6 +1201,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1357,7 +1360,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1615,6 +1619,50 @@ struct ext4_features {
 };
 
 /*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+	__le32	mmp_magic;
+	__le32	mmp_seq;
+	__le64	mmp_time;
+	char	mmp_nodename[64];
+	char	mmp_bdevname[32];
+	__le16	mmp_check_interval;
+	__le16	mmp_pad1;
+	__le32	mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+	struct buffer_head *bh; /* bh from initial read_mmp_block() */
+	struct super_block *sb;  /* super block of the fs */
+};
+
+/*
+ * Default interval in seconds to update the MMP sequence number.
+ */
+#define EXT4_MMP_UPDATE_INTERVAL   1
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
+
+/*
  * Function prototypes
  */
 
@@ -1788,6 +1836,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 						       __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+						       __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 22546ad..b8b37c5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -39,6 +39,8 @@
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/utsname.h>
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -789,6 +791,8 @@ static void ext4_put_super(struct super_block *sb)
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
+	if (sbi->s_mmp_tsk)
+		kthread_stop(sbi->s_mmp_tsk);
 	sb->s_fs_info = NULL;
 	/*
 	 * Now that we are completely done shutting down the
@@ -1088,6 +1092,349 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	return 0;
 }
 
+
+/*
+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * faster.
+ */
+static int write_mmp_block(struct buffer_head *bh)
+{
+	mark_buffer_dirty(bh);
+	lock_buffer(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	get_bh(bh);
+	submit_bh(WRITE_SYNC, bh);
+	wait_on_buffer(bh);
+	if (unlikely(!buffer_uptodate(bh)))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+			  unsigned long mmp_block)
+{
+	struct mmp_struct *mmp;
+
+	if (*bh)
+		clear_buffer_uptodate(*bh);
+
+	/* This would be sb_bread(sb, mmp_block), except we need to be sure
+	 * that the MD RAID device cache has been bypassed, and that the read
+	 * is not blocked in the elevator. */
+	if (!*bh)
+		*bh = sb_getblk(sb, mmp_block);
+	if (*bh) {
+		get_bh(*bh);
+		lock_buffer(*bh);
+		(*bh)->b_end_io = end_buffer_read_sync;
+		submit_bh(READ_SYNC, *bh);
+		wait_on_buffer(*bh);
+		if (!buffer_uptodate(*bh)) {
+			brelse(*bh);
+			*bh = NULL;
+		}
+	}
+	if (!*bh) {
+		ext4_warning(sb, "Error while reading MMP block %lu",
+		             mmp_block);
+		return -EIO;
+	}
+
+	mmp = (struct mmp_struct *)((*bh)->b_data);
+	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+		    const char *function, unsigned int line, const char *msg)
+{
+	__ext4_warning(sb, function, line, msg);
+	__ext4_warning(sb, function, line,
+		       "MMP failure info: last update time: %llu, last update "
+		       "node: %s, last update device: %s\n",
+		       (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+		       mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+	struct super_block *sb = ((struct mmpd_data *) data)->sb;
+	struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct mmp_struct *mmp;
+	unsigned long mmp_block;
+	u32 seq = 0;
+	unsigned long failed_writes = 0;
+	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned mmp_check_interval;
+	unsigned long last_update_time;
+	unsigned long diff;
+	int retval;
+
+	mmp_block = le64_to_cpu(es->s_mmp_block);
+	mmp = (struct mmp_struct *)(bh->b_data);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+	/*
+	 * Start with the higher mmp_check_interval and reduce it if
+	 * the MMP block is being updated on time.
+	 */
+	mmp_check_interval = max(5UL * mmp_update_interval,
+				 EXT4_MMP_MIN_CHECK_INTERVAL);
+	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+	memcpy(mmp->mmp_nodename, init_utsname()->sysname,
+	       sizeof(mmp->mmp_nodename));
+
+	while (!kthread_should_stop()) {
+		if (++seq > EXT4_MMP_SEQ_MAX)
+			seq = 1;
+
+		mmp->mmp_seq = cpu_to_le32(seq);
+		mmp->mmp_time = cpu_to_le64(get_seconds());
+		last_update_time = jiffies;
+
+		retval = write_mmp_block(bh);
+		/*
+		 * Don't spew too many error messages. Print one every
+		 * (s_mmp_update_interval * 60) seconds.
+		 */
+		if (retval && (failed_writes % 60) == 0) {
+			ext4_error(sb, "Error writing to MMP block");
+			failed_writes++;
+		}
+
+		if (!(le32_to_cpu(es->s_feature_incompat) &
+		    EXT4_FEATURE_INCOMPAT_MMP)) {
+			ext4_warning(sb, "kmmpd being stopped since MMP feature"
+				     " has been disabled.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		if (sb->s_flags & MS_RDONLY) {
+			ext4_warning(sb, "kmmpd being stopped since filesystem "
+				     "has been remounted as readonly.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		diff = jiffies - last_update_time;
+		if (diff < mmp_update_interval * HZ)
+			schedule_timeout_interruptible(mmp_update_interval *
+						       HZ - diff);
+
+		/*
+		 * We need to make sure that more than mmp_check_interval
+		 * seconds have not passed since writing. If that has happened
+		 * we need to check if the MMP block is as we left it.
+		 */
+		diff = jiffies - last_update_time;
+		if (diff > mmp_check_interval * HZ) {
+			struct buffer_head *bh_check = NULL;
+			struct mmp_struct *mmp_check;
+
+			retval = read_mmp_block(sb, &bh_check, mmp_block);
+			if (retval) {
+				ext4_error(sb, "error reading MMP data: %d",
+					   retval);
+
+				EXT4_SB(sb)->s_mmp_tsk = NULL;
+				goto failed;
+			}
+
+			mmp_check = (struct mmp_struct *)(bh_check->b_data);
+			if (mmp->mmp_seq != mmp_check->mmp_seq ||
+			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+				   sizeof(mmp->mmp_nodename))) {
+				dump_mmp_msg(sb, mmp_check,
+					     "Error while updating MMP info. "
+					     "The filesystem seems to have been"
+					     " multiply mounted.");
+				ext4_error(sb, "abort");
+				goto failed;
+			}
+			put_bh(bh_check);
+		}
+
+		/*
+		 * Adjust the mmp_check_interval depending on how much time
+		 * it took for the MMP block to be written.
+		 */
+		mmp_check_interval = max(min(5 * diff / HZ,
+					     EXT4_MMP_MAX_CHECK_INTERVAL),
+					 EXT4_MMP_MIN_CHECK_INTERVAL);
+		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	}
+
+	/*
+	 * Unmount seems to be clean.
+	 */
+	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+
+	retval = write_mmp_block(bh);
+
+failed:
+	kfree(data);
+	brelse(bh);
+	return retval;
+}
+
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+	u32 new_seq;
+
+	do {
+		get_random_bytes(&new_seq, sizeof(u32));
+	} while (new_seq > EXT4_MMP_SEQ_MAX);
+
+	return new_seq;
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ */
+static int ext4_multi_mount_protect(struct super_block *sb,
+				    unsigned long mmp_block)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct buffer_head *bh = NULL;
+	struct mmp_struct *mmp = NULL;
+	struct mmpd_data *mmpd_data;
+	u32 seq;
+	unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned int wait_time = 0;
+	int retval;
+
+	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+	    mmp_block >= ext4_blocks_count(es)) {
+		ext4_warning(sb, "Invalid MMP block in superblock");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+
+	mmp = (struct mmp_struct *)(bh->b_data);
+
+	if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+		mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+	/*
+	 * If check_interval in MMP block is larger, use that instead of
+	 * update_interval from the superblock.
+	 */
+	if (mmp->mmp_check_interval > mmp_check_interval)
+		mmp_check_interval = mmp->mmp_check_interval;
+
+	seq = le32_to_cpu(mmp->mmp_seq);
+	if (seq == EXT4_MMP_SEQ_CLEAN)
+		goto skip;
+
+	if (seq == EXT4_MMP_SEQ_FSCK) {
+		dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+		goto failed;
+	}
+
+	wait_time = min(mmp_check_interval * 2 + 1,
+			mmp_check_interval + 60);
+
+	/* Print MMP interval if more than 20 secs. */
+	if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+		ext4_warning(sb, "MMP interval %u higher than expected, please"
+		             " wait.\n", wait_time * 2);
+
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+skip:
+	/*
+	 * write a new random sequence number.
+	 */
+	mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
+
+	retval = write_mmp_block(bh);
+	if (retval)
+		goto failed;
+
+	/*
+	 * wait for MMP interval and check mmp_seq.
+	 */
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+	mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+	if (!mmpd_data) {
+		ext4_warning(sb, "not enough memory for mmpd_data");
+		goto failed;
+	}
+	mmpd_data->sb = sb;
+	mmpd_data->bh = bh;
+
+	/*
+	 * Start a kernel thread to update the MMP block periodically.
+	 */
+	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+					     bdevname(bh->b_bdev,
+						      mmp->mmp_bdevname));
+	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+		EXT4_SB(sb)->s_mmp_tsk = NULL;
+		kfree(mmpd_data);
+		ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+			     sb->s_id);
+		goto failed;
+	}
+
+	return 0;
+
+failed:
+	brelse(bh);
+	return 1;
+}
+
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 					u64 ino, u32 generation)
 {
@@ -3432,6 +3779,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			  EXT4_HAS_INCOMPAT_FEATURE(sb,
 				    EXT4_FEATURE_INCOMPAT_RECOVER));
 
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+	    !(sb->s_flags & MS_RDONLY))
+		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+			goto failed_mount3;
+
 	/*
 	 * The first inode we look at is the journal inode.  Don't try
 	 * root first: it may be modified in the journal!
@@ -3682,6 +4034,8 @@ failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+	if (sbi->s_mmp_tsk)
+		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -4212,7 +4566,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int enable_quota = 0;
 	ext4_group_t g;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-	int err;
+	int err = 0;
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
@@ -4338,6 +4692,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				goto restore_opts;
 			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
+			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+						     EXT4_FEATURE_INCOMPAT_MMP))
+				if (ext4_multi_mount_protect(sb,
+						le64_to_cpu(es->s_mmp_block))) {
+					err = -EROFS;
+					goto restore_opts;
+				}
 			enable_quota = 1;
 		}
 	}
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] ext4: add support for multiple mount protection
  2011-04-12 18:04 Johann Lombardi
@ 2011-04-12 19:20 ` Bernd Schubert
  2011-05-02 13:43   ` Johann Lombardi
  2011-04-12 20:39 ` Eric Sandeen
  1 sibling, 1 reply; 14+ messages in thread
From: Bernd Schubert @ 2011-04-12 19:20 UTC (permalink / raw)
  To: Johann Lombardi; +Cc: linux-ext4, Andreas Dilger

Hello Johann,

> +/*
> + * Default interval in seconds to update the MMP sequence number.
> + */
> +#define EXT4_MMP_UPDATE_INTERVAL   1

this define is nowhere used (e2fsprogs already sets the update-interval 
in the superblock). And it is also too small, which may cause a huge 
performance issue (please remember my corresponding lustre bugzilla...).

Once you are going to post the e2fsprogs patch, could you please also 
take care of the 2TiB mmp-block read limitation bug (please see another 
Lustre bugzilla )?

It also would be nice to explain somewhere (and not hidden in the code) 
what is the difference between "update interval" and "check interval". 
For example something like this:

The 'mmp update interval' is the frequency how often the mmp is is 
supposed to be written. Values smaller than 5s may reduce performance.
The 'mmp check interval' is used to verify if the mmp block has been 
updated on the device. The value is updated based on the maximum time to 
write the mmp-block during an update-cycle. Its minimum is
5 * mmp-update-interval



Thanks,
Bernd




^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] ext4: add support for multiple mount protection
  2011-04-12 18:04 Johann Lombardi
  2011-04-12 19:20 ` Bernd Schubert
@ 2011-04-12 20:39 ` Eric Sandeen
  2011-04-12 21:08   ` Andreas Dilger
                     ` (2 more replies)
  1 sibling, 3 replies; 14+ messages in thread
From: Eric Sandeen @ 2011-04-12 20:39 UTC (permalink / raw)
  To: Johann Lombardi; +Cc: linux-ext4, Andreas Dilger

On 4/12/11 1:04 PM, Johann Lombardi wrote:
> Prevent an ext4 filesystem from being mounted multiple times.
> A sequence number is stored on disk and is periodically updated (every 5
> seconds by default) by a mounted filesystem.
> At mount time, we now wait for s_mmp_update_interval seconds to make sure
> that the MMP sequence does not change.
> In case of failure, the nodename, bdevname and the time at which the MMP
> block was last updated is displayed.
> 
> Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
> Signed-off-by: Johann Lombardi <johann@whamcloud.com>
> ---
>  fs/ext4/ext4.h  |   56 ++++++++-
>  fs/ext4/super.c |  363 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 416 insertions(+), 3 deletions(-)
> 

There was a lot of skepticism about this last time, and I imagine there still is...

400 new lines of kernel code for this, and if the other machine is hung up for 5 seconds and doesn't update, it can still be multiply-mounted anyway, right?

BUG: soft lockup - CPU#0 stuck for 10s! anyone?  :(

I don't see the value in it for upstream ext4, but then hey, ext4 rarely meets a feature it doesn't like ;)

-Eric

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] ext4: add support for multiple mount protection
  2011-04-12 20:39 ` Eric Sandeen
@ 2011-04-12 21:08   ` Andreas Dilger
  2011-04-12 21:11   ` Johann Lombardi
  2011-04-12 21:41   ` Bernd Schubert
  2 siblings, 0 replies; 14+ messages in thread
From: Andreas Dilger @ 2011-04-12 21:08 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Johann Lombardi, linux-ext4

On 2011-04-12, at 4:39 PM, Eric Sandeen wrote:
> On 4/12/11 1:04 PM, Johann Lombardi wrote:
>> Prevent an ext4 filesystem from being mounted multiple times.
>> A sequence number is stored on disk and is periodically updated (every 5
>> seconds by default) by a mounted filesystem.
>> At mount time, we now wait for s_mmp_update_interval seconds to make sure
>> that the MMP sequence does not change.
>> In case of failure, the nodename, bdevname and the time at which the MMP
>> block was last updated is displayed.
>> 
>> Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
>> Signed-off-by: Johann Lombardi <johann@whamcloud.com>
>> ---
>> fs/ext4/ext4.h  |   56 ++++++++-
>> fs/ext4/super.c |  363 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 2 files changed, 416 insertions(+), 3 deletions(-)
>> 
> 
> There was a lot of skepticism about this last time, and I imagine there still is...
> 
> 400 new lines of kernel code for this, and if the other machine is hung up for 5 seconds and doesn't update, it can still be multiply-mounted anyway, right?
> 
> BUG: soft lockup - CPU#0 stuck for 10s! anyone?  :(

No, that isn't true, or the whole patch would be completely useless...

If the owning node is blocked for longer than the update interval then the kmmpd thread will detect this (it tracks the last time the IO was completed) it will attempt to "re-acquire" the MMP block (read, wait, re-read, update-if-unchanged) or mark the filesystem in error (mounted with errors={remount-ro,panic} to block the node from accessing the filesystem again.

Note also that MMP isn't intended to be a primary HA failover manager (i.e. having all nodes trying to mount a shared filesystem and depending on MMP to decide on a winner), but rather as a failsafe for broken HA managers that may fail due to many reasons, as we have found in the past (STONITH failure, admin mounting on backup node, mounting fs while e2fsck is running, broken HA scripts, etc).

> I don't see the value in it for upstream ext4, but then hey, ext4 rarely meets a feature it doesn't like ;)

While it is true that this is 400 lines of code, the only change to the main codepath is a few lines at mount/remount/unmount to start/stop the kmmpd thread.  The risk of this causing any problem for someone not using MMP is virtually non-existent, IMHO.


Cheers, Andreas
--
Andreas Dilger 
Principal Engineer
Whamcloud, Inc.




^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] ext4: add support for multiple mount protection
  2011-04-12 20:39 ` Eric Sandeen
  2011-04-12 21:08   ` Andreas Dilger
@ 2011-04-12 21:11   ` Johann Lombardi
  2011-04-12 21:41   ` Bernd Schubert
  2 siblings, 0 replies; 14+ messages in thread
From: Johann Lombardi @ 2011-04-12 21:11 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: linux-ext4, Andreas Dilger

On Tue, Apr 12, 2011 at 03:39:33PM -0500, Eric Sandeen wrote:
> There was a lot of skepticism about this last time, and I imagine there still is...

MMP has absolutely no overhead if you don't turn it on. HA configurations (e.g. for NFS/CIFS servers) are very common out there.
Don't you think that MMP could be used in conjunction with Red Hat Cluster Suite to avoid corrupting the backend filesystem when things go wrong?

> 400 new lines of kernel code for this, and if the other machine is hung up for 5 seconds and doesn't update, it can still be multiply-mounted anyway, right?
> 
> BUG: soft lockup - CPU#0 stuck for 10s! anyone?  :(

If the kmmpd thread misses some MMP sequence updates, it reads the MMP block and calls ext3_error() if the sequence was updated behind its back.

Johann

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] ext4: add support for multiple mount protection
  2011-04-12 20:39 ` Eric Sandeen
  2011-04-12 21:08   ` Andreas Dilger
  2011-04-12 21:11   ` Johann Lombardi
@ 2011-04-12 21:41   ` Bernd Schubert
  2011-04-12 21:44     ` Eric Sandeen
  2 siblings, 1 reply; 14+ messages in thread
From: Bernd Schubert @ 2011-04-12 21:41 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Johann Lombardi, linux-ext4, Andreas Dilger

On 04/12/2011 10:39 PM, Eric Sandeen wrote:
> On 4/12/11 1:04 PM, Johann Lombardi wrote:
>> Prevent an ext4 filesystem from being mounted multiple times. A
>> sequence number is stored on disk and is periodically updated
>> (every 5 seconds by default) by a mounted filesystem. At mount
>> time, we now wait for s_mmp_update_interval seconds to make sure 
>> that the MMP sequence does not change. In case of failure, the
>> nodename, bdevname and the time at which the MMP block was last
>> updated is displayed.
>> 
>> Signed-off-by: Andreas Dilger <adilger@whamcloud.com> 
>> Signed-off-by: Johann Lombardi <johann@whamcloud.com> --- 
>> fs/ext4/ext4.h  |   56 ++++++++- fs/ext4/super.c |  363
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files
>> changed, 416 insertions(+), 3 deletions(-)
>> 
> 
> There was a lot of skepticism about this last time, and I imagine
> there still is...
> 
> 400 new lines of kernel code for this, and if the other machine is
> hung up for 5 seconds and doesn't update, it can still be
> multiply-mounted anyway, right?
> 
> BUG: soft lockup - CPU#0 stuck for 10s! anyone?  :(

Please see my other comment about the two different intervals. Yes,
there is a minimal chance of a race. But firstly, 5s are too small,
already for performance reasons (setting the update-interval to 5s will
increase the min check-interval to 25s). Secondly, the mount-wait time is

+	wait_time = min(mmp_check_interval * 2 + 1,
+			mmp_check_interval + 60);

So even with Johanns patch it is at least 12s.

Thirdly, the check-interval is automatically increased, if updating the
mmp block takes too long. This value will also be saved in the
mmp-block. Of course, it has a disadvantage - the mount time increases.

> 
> I don't see the value in it for upstream ext4, but then hey, ext4
> rarely meets a feature it doesn't like ;)

Is ext4 is only used on desktop systems? IMHO, every HA solution that
does not use scsi reservations or another way to check if a device is
already in use, needs a solution like this. I have seen so many problems
with heartbeat/pacemaker to not properly detect an already mounted
devices (*) and this MMP patch already protected so many HA Lustre
installations from data corruption due to double mounts....
So why shouldn't other HA solutions benefit from such a nice feature?

Usually, the heartbeat/pacemaker issues to detect if a device is mounted
or not are due to unreliable information if a device is mounted or not.
/etc/mtab is entirely unreliable and /proc/mounts does not always show
if a device is mounted or not.
However, even if that would work somehow perfectly, without the MMP
patch there is still zero protection from user-errors. It can easily
happen an admin forgets about a mounted device and runs e2fsck or
manually mounts the device on another machine again.

So please, let this patch go in.

Thanks,
Bernd

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] ext4: add support for multiple mount protection
  2011-04-12 21:41   ` Bernd Schubert
@ 2011-04-12 21:44     ` Eric Sandeen
  0 siblings, 0 replies; 14+ messages in thread
From: Eric Sandeen @ 2011-04-12 21:44 UTC (permalink / raw)
  To: Bernd Schubert; +Cc: Johann Lombardi, linux-ext4, Andreas Dilger

On 4/12/11 4:41 PM, Bernd Schubert wrote:

> So please, let this patch go in.

Well, it's not up to me :)

-Eric
 
> Thanks,
> Bernd


^ permalink raw reply	[flat|nested] 14+ messages in thread

* MMP update
@ 2011-05-02  8:11 Johann Lombardi
  2011-05-02  8:13 ` [PATCH] Add multi-mount protection support to libext2fs (INCOMPAT_MMP feature) Johann Lombardi
  2011-05-02  9:36 ` [PATCH] ext4: add support for multiple mount protection Johann Lombardi
  0 siblings, 2 replies; 14+ messages in thread
From: Johann Lombardi @ 2011-05-02  8:11 UTC (permalink / raw)
  To: linux-ext4

e2fsprogs patch:
- use blk64_t instead of blk_t
- bump update interval to 5s

kernel patch:
- add more comments to struct mmp_struct
- remove unused EXT4_MMP_UPDATE_INTERVAL
- change check interval multiplier from 5 to 2

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] Add multi-mount protection support to libext2fs (INCOMPAT_MMP feature).
  2011-05-02  8:11 MMP update Johann Lombardi
@ 2011-05-02  8:13 ` Johann Lombardi
  2011-05-02  9:36 ` [PATCH] ext4: add support for multiple mount protection Johann Lombardi
  1 sibling, 0 replies; 14+ messages in thread
From: Johann Lombardi @ 2011-05-02  8:13 UTC (permalink / raw)
  To: linux-ext4; +Cc: Johann Lombardi, Andreas Dilger

This allows mke2fs, e2fsck, and others to detect if the filesystem is
mounted on a remote node (on SAN disks) and avoid corrupting the
filesystem. For e2fsprogs this only means that it check the MMP block
to see if the filesystem is in use, and mark the filesystem busy while
e2fsck is running on the system.

There is no requirement that e2fsck updates the MMP block in any regular
interval, but e2fsck does this occasionally to provide additional
information to the sysadmin in case of conflict.

Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Johann Lombardi <johann@whamcloud.com>
---
 debugfs/debug_cmds.ct       |    3 +
 debugfs/debugfs.c           |   35 ++++
 debugfs/set_fields.c        |    2 +-
 e2fsck/e2fsck.c             |    4 +
 e2fsck/e2fsck.h             |    2 +
 e2fsck/journal.c            |    2 +
 e2fsck/pass1.c              |   13 ++
 e2fsck/pass1b.c             |    7 +
 e2fsck/problem.c            |   10 +
 e2fsck/problem.h            |    5 +
 e2fsck/unix.c               |   83 +++++++++-
 e2fsck/util.c               |   22 +++
 lib/e2p/feature.c           |    4 +-
 lib/e2p/ls.c                |    6 +
 lib/ext2fs/Makefile.in      |    4 +
 lib/ext2fs/closefs.c        |    5 +
 lib/ext2fs/ext2_err.et.in   |   21 +++
 lib/ext2fs/ext2_fs.h        |   39 +++--
 lib/ext2fs/ext2fs.h         |   36 ++++
 lib/ext2fs/freefs.c         |    5 +
 lib/ext2fs/mmp.c            |  420 +++++++++++++++++++++++++++++++++++++++++++
 lib/ext2fs/openfs.c         |   16 ++
 lib/ext2fs/swapfs.c         |   10 +
 lib/ext2fs/tst_super_size.c |    2 +-
 misc/mke2fs.c               |   15 ++-
 misc/tune2fs.8.in           |   13 ++
 misc/tune2fs.c              |  259 +++++++++++++++++++++-----
 misc/util.c                 |    8 +
 28 files changed, 982 insertions(+), 69 deletions(-)
 create mode 100644 lib/ext2fs/mmp.c

diff --git a/debugfs/debug_cmds.ct b/debugfs/debug_cmds.ct
index 9b6c985..af4b504 100644
--- a/debugfs/debug_cmds.ct
+++ b/debugfs/debug_cmds.ct
@@ -163,5 +163,8 @@ request do_set_current_time, "Set current time to use when setting filesystme fi
 request do_supported_features, "Print features supported by this version of e2fsprogs",
 	supported_features;
 
+request do_dump_mmp, "Dump MMP information",
+	dump_mmp;
+
 end;
 
diff --git a/debugfs/debugfs.c b/debugfs/debugfs.c
index e441bc5..c6160fc 100644
--- a/debugfs/debugfs.c
+++ b/debugfs/debugfs.c
@@ -78,6 +78,8 @@ static void open_filesystem(char *device, int open_flags, blk64_t superblock,
 			"opening read-only because of catastrophic mode");
 		open_flags &= ~EXT2_FLAG_RW;
 	}
+	if (catastrophic)
+		open_flags |= EXT2_FLAG_SKIP_MMP;
 
 	retval = ext2fs_open(device, open_flags, superblock, blocksize,
 			     unix_io_manager, &current_fs);
@@ -2133,6 +2135,39 @@ void do_punch(int argc, char *argv[])
 	}
 }
 
+void do_dump_mmp(int argc, char *argv[])
+{
+	struct mmp_struct *mmp_s;
+	errcode_t retval = 0;
+
+	if (current_fs->mmp_buf == NULL) {
+		retval = ext2fs_get_mem(current_fs->blocksize,
+					&current_fs->mmp_buf);
+		if (retval) {
+			com_err(argv[0], 0, "Could not allocate memory.\n");
+			return;
+	 	}
+	}
+
+	mmp_s = current_fs->mmp_buf;
+
+	retval = ext2fs_mmp_read(current_fs, current_fs->super->s_mmp_block,
+				 current_fs->mmp_buf);
+	if (retval) {
+		com_err(argv[0], retval, "Error reading MMP block.\n");
+		return;
+	}
+
+	fprintf(stdout, "MMP Block: %llu\n", current_fs->super->s_mmp_block);
+	fprintf(stdout, "MMP Update Interval: %d\n",
+		current_fs->super->s_mmp_update_interval);
+	fprintf(stdout, "MMP Check Interval: %d\n", mmp_s->mmp_check_interval);
+	fprintf(stdout, "MMP Sequence: %u\n", mmp_s->mmp_seq);
+	fprintf(stdout, "Last Update Time: %llu\n", mmp_s->mmp_time);
+	fprintf(stdout, "Node: %s\n", mmp_s->mmp_nodename);
+	fprintf(stdout, "Device: %s\n", mmp_s->mmp_bdevname);
+}
+
 static int source_file(const char *cmd_file, int sci_idx)
 {
 	FILE		*f;
diff --git a/debugfs/set_fields.c b/debugfs/set_fields.c
index ac6bc25..efb8d1b 100644
--- a/debugfs/set_fields.c
+++ b/debugfs/set_fields.c
@@ -130,7 +130,7 @@ static struct field_set_info super_fields[] = {
 	{ "flags", &set_sb.s_flags, 4, parse_uint },
 	{ "raid_stride", &set_sb.s_raid_stride, 2, parse_uint },
 	{ "min_extra_isize", &set_sb.s_min_extra_isize, 4, parse_uint },
-	{ "mmp_interval", &set_sb.s_mmp_interval, 2, parse_uint },
+	{ "mmp_update_interval", &set_sb.s_mmp_update_interval, 2, parse_uint },
 	{ "mmp_block", &set_sb.s_mmp_block, 8, parse_uint },
 	{ "raid_stripe_width", &set_sb.s_raid_stripe_width, 4, parse_uint },
 	{ "log_groups_per_flex", &set_sb.s_log_groups_per_flex, 1, parse_uint },
diff --git a/e2fsck/e2fsck.c b/e2fsck/e2fsck.c
index f1da97a..3fcc444 100644
--- a/e2fsck/e2fsck.c
+++ b/e2fsck/e2fsck.c
@@ -205,6 +205,7 @@ int e2fsck_run(e2fsck_t ctx)
 {
 	int	i;
 	pass_t	e2fsck_pass;
+	int error;
 
 #ifdef HAVE_SETJMP_H
 	if (setjmp(ctx->abort_loc)) {
@@ -217,6 +218,9 @@ int e2fsck_run(e2fsck_t ctx)
 	for (i=0; (e2fsck_pass = e2fsck_passes[i]); i++) {
 		if (ctx->flags & E2F_FLAG_RUN_RETURN)
 			break;
+		error = e2fsck_mmp_update(ctx->fs);
+		if (error)
+			fatal_error(ctx, 0);
 		e2fsck_pass(ctx);
 		if (ctx->progress)
 			(void) (ctx->progress)(ctx, 0, 0, 0);
diff --git a/e2fsck/e2fsck.h b/e2fsck/e2fsck.h
index 486a71b..34dcf73 100644
--- a/e2fsck/e2fsck.h
+++ b/e2fsck/e2fsck.h
@@ -521,6 +521,8 @@ extern blk_t get_backup_sb(e2fsck_t ctx, ext2_filsys fs,
 			   const char *name, io_manager manager);
 extern int ext2_file_type(unsigned int mode);
 extern int write_all(int fd, char *buf, size_t count);
+void dump_mmp_msg(struct mmp_struct *mmp, const char *msg);
+errcode_t e2fsck_mmp_update(ext2_filsys fs);
 
 /* unix.c */
 extern void e2fsck_clear_progbar(e2fsck_t ctx);
diff --git a/e2fsck/journal.c b/e2fsck/journal.c
index 93f685c..c43d57d 100644
--- a/e2fsck/journal.c
+++ b/e2fsck/journal.c
@@ -879,6 +879,8 @@ int e2fsck_run_ext3_journal(e2fsck_t ctx)
 		ctx->fs->io->manager->get_stats(ctx->fs->io, &stats);
 	if (stats && stats->bytes_written)
 		kbytes_written = stats->bytes_written >> 10;
+
+	ext2fs_mmp_stop(ctx->fs);
 	ext2fs_free(ctx->fs);
 	retval = ext2fs_open(ctx->filesystem_name, EXT2_FLAG_RW,
 			     ctx->superblock, blocksize, io_ptr,
diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c
index 67dd986..7e2f692 100644
--- a/e2fsck/pass1.c
+++ b/e2fsck/pass1.c
@@ -694,7 +694,20 @@ void e2fsck_pass1(e2fsck_t ctx)
 	    (fs->super->s_mtime < fs->super->s_inodes_count))
 		busted_fs_time = 1;
 
+	if ((fs->super->s_feature_incompat & EXT4_FEATURE_INCOMPAT_MMP) &&
+	    !(fs->super->s_mmp_block <= fs->super->s_first_data_block ||
+	      fs->super->s_mmp_block >= fs->super->s_blocks_count))
+		ext2fs_mark_block_bitmap2(ctx->block_found_map,
+					 fs->super->s_mmp_block);
+
 	while (1) {
+		if (ino % EXT2_MMP_INODE_INTERVAL == 0) {
+			errcode_t error;
+
+			error = e2fsck_mmp_update(fs);
+			if (error)
+				fatal_error(ctx, 0);
+		}
 		old_op = ehandler_operation(_("getting next inode from scan"));
 		pctx.errcode = ext2fs_get_next_inode_full(scan, &ino,
 							  inode, inode_size);
diff --git a/e2fsck/pass1b.c b/e2fsck/pass1b.c
index 155fcba..577087f 100644
--- a/e2fsck/pass1b.c
+++ b/e2fsck/pass1b.c
@@ -286,6 +286,13 @@ static void pass1b(e2fsck_t ctx, char *block_buf)
 	pb.pctx = &pctx;
 	pctx.str = "pass1b";
 	while (1) {
+		if (ino % EXT2_MMP_INODE_INTERVAL == 0) {
+			errcode_t error;
+
+			error = e2fsck_mmp_update(fs);
+			if (error)
+				fatal_error(ctx, 0);
+		}
 		pctx.errcode = ext2fs_get_next_inode(scan, &ino, &inode);
 		if (pctx.errcode == EXT2_ET_BAD_BLOCK_IN_INODE_TABLE)
 			continue;
diff --git a/e2fsck/problem.c b/e2fsck/problem.c
index 8f0b211..170d61a 100644
--- a/e2fsck/problem.c
+++ b/e2fsck/problem.c
@@ -933,6 +933,16 @@ static struct e2fsck_problem problem_table[] = {
 	  N_("Error adjusting refcount for @a @b %b (@i %i): %m\n"),
 	  PROMPT_NONE, 0 },
 
+	/* Superblock has invalid MMP block. */
+	{ PR_0_MMP_INVALID_BLK,
+	  N_("@S has invalid MMP block.  "),
+	  PROMPT_CLEAR, PR_PREEN_OK },
+
+	/* Superblock has invalid MMP magic. */
+	{ PR_0_MMP_INVALID_MAGIC,
+	  N_("@S has invalid MMP magic.  "),
+	  PROMPT_FIX, PR_PREEN_OK | PR_NO_OK},
+
 
 	/* Pass 1C: Scan directories for inodes with multiply-claimed blocks. */
 	{ PR_1C_PASS_HEADER,
diff --git a/e2fsck/problem.h b/e2fsck/problem.h
index 7c4c156..08ae76c 100644
--- a/e2fsck/problem.h
+++ b/e2fsck/problem.h
@@ -227,6 +227,11 @@ struct problem_context {
 /* Block group checksum (latch question) */
 #define PR_0_GDT_CSUM_LATCH			0x00003E
 
+/* Superblock has invalid MMP block. */
+#define PR_0_MMP_INVALID_BLK			0x000043
+
+/* Superblock has invalid MMP magic. */
+#define PR_0_MMP_INVALID_MAGIC			0x000044
 
 /*
  * Pass 1 errors
diff --git a/e2fsck/unix.c b/e2fsck/unix.c
index 73cc2cf..4fd108f 100644
--- a/e2fsck/unix.c
+++ b/e2fsck/unix.c
@@ -973,6 +973,70 @@ static errcode_t try_open_fs(e2fsck_t ctx, int flags, io_manager io_ptr,
 static const char *my_ver_string = E2FSPROGS_VERSION;
 static const char *my_ver_date = E2FSPROGS_DATE;
 
+int e2fsck_check_mmp(ext2_filsys fs, e2fsck_t ctx)
+{
+	struct mmp_struct *mmp_s;
+	unsigned int mmp_check_interval;
+	errcode_t retval = 0;
+	struct problem_context pctx;
+	unsigned int wait_time = 0;
+
+	clear_problem_context(&pctx);
+	if (fs->mmp_buf == NULL) {
+		retval = ext2fs_get_mem(fs->blocksize, &fs->mmp_buf);
+		if (retval)
+			goto check_error;
+	}
+
+	retval = ext2fs_mmp_read(fs, fs->super->s_mmp_block, fs->mmp_buf);
+	if (retval)
+		goto check_error;
+
+	mmp_s = fs->mmp_buf;
+
+	mmp_check_interval = fs->super->s_mmp_update_interval;
+	if (mmp_check_interval < EXT2_MMP_MIN_CHECK_INTERVAL)
+		mmp_check_interval = EXT2_MMP_MIN_CHECK_INTERVAL;
+
+	/*
+ 	 * If check_interval in MMP block is larger, use that instead of
+	 * check_interval from the superblock.
+	 */
+	if (mmp_s->mmp_check_interval > mmp_check_interval)
+		mmp_check_interval = mmp_s->mmp_check_interval;
+
+	wait_time = mmp_check_interval * 2 + 1;
+
+	/* Print warning if e2fck will wait for more than 20 secs. */
+	if (wait_time > EXT2_MMP_MIN_CHECK_INTERVAL * 4) {
+		printf("MMP interval is %u seconds and total wait time is %u "
+		       "seconds. Please wait...\n",
+			mmp_check_interval, wait_time * 2);
+	}
+
+	return 0;
+
+check_error:
+
+	if (retval == EXT2_ET_MMP_BAD_BLOCK) {
+		if (fix_problem(ctx, PR_0_MMP_INVALID_BLK, &pctx)) {
+			fs->super->s_mmp_block = 0;
+			ext2fs_mark_super_dirty(fs);
+		}
+	} else if (retval == EXT2_ET_MMP_FAILED) {
+		dump_mmp_msg(fs->mmp_buf, NULL);
+	} else if (retval == EXT2_ET_MMP_FSCK_ON) {
+		dump_mmp_msg(fs->mmp_buf,
+			     _("If you are sure that e2fsck "
+			       "is not running on any node then use "
+			       "'tune2fs -f -E clear_mmp {device}'\n"));
+	} else if (retval == EXT2_ET_MMP_MAGIC_INVALID) {
+		if (fix_problem(ctx, PR_0_MMP_INVALID_MAGIC, &pctx))
+			ext2fs_mmp_clear(fs);
+	}
+	return 1;
+}
+
 int main (int argc, char *argv[])
 {
 	errcode_t	retval = 0, retval2 = 0, orig_retval = 0;
@@ -1042,6 +1106,8 @@ int main (int argc, char *argv[])
 				    _("need terminal for interactive repairs"));
 	}
 	ctx->superblock = ctx->use_superblock;
+
+	flags = EXT2_FLAG_SKIP_MMP;
 restart:
 #ifdef CONFIG_TESTIO_DEBUG
 	if (getenv("TEST_IO_FLAGS") || getenv("TEST_IO_BLOCK")) {
@@ -1050,7 +1116,7 @@ restart:
 	} else
 #endif
 		io_ptr = unix_io_manager;
-	flags = EXT2_FLAG_NOFREE_ON_ERROR;
+	flags |= EXT2_FLAG_NOFREE_ON_ERROR;
 	profile_get_boolean(ctx->profile, "options", "old_bitmaps", 0, 0,
 			    &old_bitmaps);
 	if (!old_bitmaps)
@@ -1223,6 +1289,21 @@ failure:
 
 	ehandler_init(fs->io);
 
+	if ((fs->super->s_feature_incompat & EXT4_FEATURE_INCOMPAT_MMP) &&
+	    (flags & EXT2_FLAG_SKIP_MMP)) {
+		if (e2fsck_check_mmp(fs, ctx))
+			fatal_error(ctx, 0);
+	}
+
+	 /*
+	  * Restart in order to reopen fs but this time start mmp.
+	  */
+	if (flags & EXT2_FLAG_SKIP_MMP) {
+		ext2fs_close(fs);
+		flags &=~EXT2_FLAG_SKIP_MMP;
+		goto restart;
+	}
+
 	if ((ctx->mount_flags & EXT2_MF_MOUNTED) &&
 	    !(sb->s_feature_incompat & EXT3_FEATURE_INCOMPAT_RECOVER))
 		goto skip_journal;
diff --git a/e2fsck/util.c b/e2fsck/util.c
index fa156a1..86f4fc0 100644
--- a/e2fsck/util.c
+++ b/e2fsck/util.c
@@ -48,6 +48,7 @@ void fatal_error(e2fsck_t ctx, const char *msg)
 	if (msg)
 		fprintf (stderr, "e2fsck: %s\n", msg);
 	if (ctx->fs && ctx->fs->io) {
+		ext2fs_mmp_stop(ctx->fs);
 		if (ctx->fs->io->magic == EXT2_ET_MAGIC_IO_CHANNEL)
 			io_channel_flush(ctx->fs->io);
 		else
@@ -709,3 +710,24 @@ int write_all(int fd, char *buf, size_t count)
 	}
 	return c;
 }
+
+void dump_mmp_msg(struct mmp_struct *mmp, const char *msg)
+{
+	if (msg)
+		printf("MMP check failed: %s\n", msg);
+	printf("MMP failure info: last update time: %llu node: %s device: %s\n",
+	       (long long)mmp->mmp_time, mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+errcode_t e2fsck_mmp_update(ext2_filsys fs)
+{
+	errcode_t retval;
+
+	retval = ext2fs_mmp_update(fs);
+	if (retval == EXT2_ET_MMP_CHANGE_ABORT)
+		dump_mmp_msg(fs->mmp_cmp,
+			     _("UNEXPECTED INCONSISTENCY: the filesystem is "
+			       "being modified while fsck is running.\n"));
+
+	return retval;
+}
diff --git a/lib/e2p/feature.c b/lib/e2p/feature.c
index 16fba53..6482c80 100644
--- a/lib/e2p/feature.c
+++ b/lib/e2p/feature.c
@@ -77,7 +77,9 @@ static struct feature feature_list[] = {
 	{	E2P_FEATURE_INCOMPAT, EXT4_FEATURE_INCOMPAT_64BIT,
 			"64bit" },
 	{       E2P_FEATURE_INCOMPAT, EXT4_FEATURE_INCOMPAT_FLEX_BG,
-                        "flex_bg"},
+			"flex_bg"},
+	{       E2P_FEATURE_INCOMPAT, EXT4_FEATURE_INCOMPAT_MMP,
+			"mmp" },
 	{	0, 0, 0 },
 };
 
diff --git a/lib/e2p/ls.c b/lib/e2p/ls.c
index 5e560ed..9e8472b 100644
--- a/lib/e2p/ls.c
+++ b/lib/e2p/ls.c
@@ -413,6 +413,12 @@ void list_super2(struct ext2_super_block * sb, FILE *f)
 	if (sb->s_grp_quota_inum)
 		fprintf(f, "Group quota inode:        %u\n",
 			sb->s_grp_quota_inum);
+	if (sb->s_feature_incompat & EXT4_FEATURE_INCOMPAT_MMP) {
+		fprintf(f, "MMP block number:         %llu\n",
+			(long long)sb->s_mmp_block);
+		fprintf(f, "MMP update interval:      %u\n",
+			sb->s_mmp_update_interval);
+	}
 }
 
 void list_super (struct ext2_super_block * s)
diff --git a/lib/ext2fs/Makefile.in b/lib/ext2fs/Makefile.in
index 9c1c273..63845a6 100644
--- a/lib/ext2fs/Makefile.in
+++ b/lib/ext2fs/Makefile.in
@@ -64,6 +64,7 @@ OBJS= $(DEBUGFS_LIB_OBJS) $(RESIZE_LIB_OBJS) $(E2IMAGE_LIB_OBJS) \
 	lookup.o \
 	mkdir.o \
 	mkjournal.o \
+	mmp.o \
 	namei.o \
 	native.o \
 	newdir.o \
@@ -132,6 +133,7 @@ SRCS= ext2_err.c \
 	$(srcdir)/lookup.c \
 	$(srcdir)/mkdir.c \
 	$(srcdir)/mkjournal.c \
+	$(srcdir)/mmp.c	\
 	$(srcdir)/namei.c \
 	$(srcdir)/native.c \
 	$(srcdir)/newdir.c \
@@ -646,6 +648,8 @@ mkjournal.o: $(srcdir)/mkjournal.c $(srcdir)/ext2_fs.h \
  $(top_builddir)/lib/ext2fs/ext2_err.h $(srcdir)/ext2_ext_attr.h \
  $(srcdir)/bitops.h $(srcdir)/jfs_user.h $(srcdir)/kernel-jbd.h \
  $(srcdir)/jfs_compat.h $(srcdir)/kernel-list.h
+mmp.o: $(srcdir)/ext2_fs.h $(srcdir)/ext2fs.h \
+ $(top_builddir)/lib/ext2fs/ext2_err.h
 namei.o: $(srcdir)/namei.c $(srcdir)/ext2_fs.h \
  $(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2fs.h \
  $(srcdir)/ext2_fs.h $(srcdir)/ext3_extents.h $(top_srcdir)/lib/et/com_err.h \
diff --git a/lib/ext2fs/closefs.c b/lib/ext2fs/closefs.c
index 7a23e46..0da41e9 100644
--- a/lib/ext2fs/closefs.c
+++ b/lib/ext2fs/closefs.c
@@ -455,6 +455,11 @@ errcode_t ext2fs_close(ext2_filsys fs)
 		if (retval)
 			return retval;
 	}
+
+	retval = ext2fs_mmp_stop(fs);
+	if (retval)
+		return retval;
+
 	ext2fs_free(fs);
 	return 0;
 }
diff --git a/lib/ext2fs/ext2_err.et.in b/lib/ext2fs/ext2_err.et.in
index 995ddc3..e759b6f 100644
--- a/lib/ext2fs/ext2_err.et.in
+++ b/lib/ext2fs/ext2_err.et.in
@@ -422,4 +422,25 @@ ec	EXT2_NO_MTAB_FILE,
 ec	EXT2_ET_CANT_USE_LEGACY_BITMAPS,
 	"Filesystem too large to use legacy bitmaps"
 
+ec	EXT2_ET_MMP_MAGIC_INVALID,
+	"MMP: invalid magic number"
+
+ec	EXT2_ET_MMP_FAILED,
+	"MMP: device currently active"
+
+ec	EXT2_ET_MMP_FSCK_ON,
+	"MMP: fsck being run"
+
+ec	EXT2_ET_MMP_BAD_BLOCK,
+	"MMP: block number beyond filesystem range"
+
+ec	EXT2_ET_MMP_UNKNOWN_SEQ,
+	"MMP: undergoing an unknown operation"
+
+ec	EXT2_ET_MMP_CHANGE_ABORT,
+	"MMP: filesystem still in use"
+
+ec	EXT2_ET_MMP_OPEN_DIRECT,
+	"MMP: open with O_DIRECT failed"
+
 	end
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index a89e33b..ba36c3d 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -588,7 +588,7 @@ struct ext2_super_block {
 	__u16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__u32	s_flags;		/* Miscellaneous flags */
 	__u16   s_raid_stride;		/* RAID stride */
-	__u16   s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__u16   s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__u64   s_mmp_block;            /* Block for multi-mount protection */
 	__u32   s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;	/* FLEX_BG group size */
@@ -691,7 +691,8 @@ struct ext2_super_block {
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000
 
 #define EXT2_FEATURE_COMPAT_SUPP	0
-#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT2_FEATURE_INCOMPAT_FILETYPE)
+#define EXT2_FEATURE_INCOMPAT_SUPP    (EXT2_FEATURE_INCOMPAT_FILETYPE| \
+				       EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \
@@ -774,26 +775,34 @@ struct ext2_dir_entry_2 {
 /*
  * This structure will be used for multiple mount protection. It will be
  * written into the block number saved in the s_mmp_block field in the
- * superblock.
+ * superblock. Programs that check MMP should assume that if SEQ_FSCK
+ * (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
  */
-#define	EXT2_MMP_MAGIC    0x004D4D50 /* ASCII for MMP */
-#define	EXT2_MMP_CLEAN    0xFF4D4D50 /* Value of mmp_seq for clean unmount */
-#define	EXT2_MMP_FSCK_ON  0xE24D4D50 /* Value of mmp_seq when being fscked */
+#define EXT2_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT2_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT2_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT2_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
 
 struct mmp_struct {
-	__u32	mmp_magic;
-	__u32	mmp_seq;
-	__u64	mmp_time;
-	char	mmp_nodename[64];
-	char	mmp_bdevname[32];
-	__u16	mmp_interval;
+	__u32	mmp_magic;		/* Magic number for MMP */
+	__u32	mmp_seq;		/* Sequence no. updated periodically */
+	__u64	mmp_time;		/* Time last updated */
+	char	mmp_nodename[64];	/* Node which last updated MMP block */
+	char	mmp_bdevname[32];	/* Bdev which last updated MMP block */
+	__u16	mmp_check_interval;	/* Changed mmp_check_interval */
 	__u16	mmp_pad1;
-	__u32	mmp_pad2;
+	__u32	mmp_pad2[227];
 };
 
 /*
- * Interval in number of seconds to update the MMP sequence number.
+ * Default interval in seconds to update the MMP sequence number.
  */
-#define EXT2_MMP_DEF_INTERVAL	5
+#define EXT2_MMP_UPDATE_INTERVAL	5
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT2_MMP_MIN_CHECK_INTERVAL     5
 
 #endif	/* _LINUX_EXT2_FS_H */
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index d3eb31d..f38c608 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -184,6 +184,7 @@ typedef struct ext2_file *ext2_file_t;
 #define EXT2_FLAG_64BITS		0x20000
 #define EXT2_FLAG_PRINT_PROGRESS	0x40000
 #define EXT2_FLAG_DIRECT_IO		0x80000
+#define EXT2_FLAG_SKIP_MMP		0x100000
 
 /*
  * Special flag in the ext2 inode i_flag field that means that this is
@@ -200,6 +201,15 @@ typedef struct ext2_file *ext2_file_t;
 
 struct opaque_ext2_group_desc;
 
+/*
+ * The timestamp in the MMP structure will be updated by e2fsck at some
+ * arbitary intervals (start of passes, after every EXT2_MMP_INODE_INTERVAL
+ * inodes in pass1 and pass1b).  There is no guarantee that e2fsck is updating
+ * the MMP block in a timely manner, and the updates it does are purely for
+ * the convenience of the sysadmin and not for automatic validation.
+ */
+#define EXT2_MMP_INODE_INTERVAL 20000
+
 struct struct_ext2_filsys {
 	errcode_t			magic;
 	io_channel			io;
@@ -246,6 +256,19 @@ struct struct_ext2_filsys {
 	io_channel			image_io;
 
 	/*
+	 * Buffers for Multiple mount protection(MMP) block.
+	 */
+	void *mmp_buf;
+	void *mmp_unaligned_buf;
+	void *mmp_cmp;
+	int mmp_fd;
+
+	/*
+	 * Time at which e2fsck last updated the MMP block.
+	 */
+	long mmp_last_written;
+
+	/*
 	 * More callback functions
 	 */
 	errcode_t (*get_alloc_block)(ext2_filsys fs, blk64_t goal,
@@ -538,6 +561,7 @@ typedef struct ext2_icount *ext2_icount_t;
 					 EXT3_FEATURE_INCOMPAT_RECOVER|\
 					 EXT3_FEATURE_INCOMPAT_EXTENTS|\
 					 EXT4_FEATURE_INCOMPAT_FLEX_BG|\
+					 EXT4_FEATURE_INCOMPAT_MMP|\
 					 EXT4_FEATURE_INCOMPAT_64BIT)
 #else
 #define EXT2_LIB_FEATURE_INCOMPAT_SUPP	(EXT2_FEATURE_INCOMPAT_FILETYPE|\
@@ -546,6 +570,7 @@ typedef struct ext2_icount *ext2_icount_t;
 					 EXT3_FEATURE_INCOMPAT_RECOVER|\
 					 EXT3_FEATURE_INCOMPAT_EXTENTS|\
 					 EXT4_FEATURE_INCOMPAT_FLEX_BG|\
+					 EXT4_FEATURE_INCOMPAT_MMP|\
 					 EXT4_FEATURE_INCOMPAT_64BIT)
 #endif
 #define EXT2_LIB_FEATURE_RO_COMPAT_SUPP	(EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER|\
@@ -1280,6 +1305,16 @@ errcode_t ext2fs_link(ext2_filsys fs, ext2_ino_t dir, const char *name,
 errcode_t ext2fs_unlink(ext2_filsys fs, ext2_ino_t dir, const char *name,
 			ext2_ino_t ino, int flags);
 
+/* mmp.c */
+errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf);
+errcode_t ext2fs_mmp_write(ext2_filsys fs, blk64_t mmp_blk, void *buf);
+errcode_t ext2fs_mmp_clear(ext2_filsys fs);
+errcode_t ext2fs_mmp_init(ext2_filsys fs);
+errcode_t ext2fs_mmp_start(ext2_filsys fs);
+errcode_t ext2fs_mmp_update(ext2_filsys fs);
+errcode_t ext2fs_mmp_stop(ext2_filsys fs);
+unsigned ext2fs_mmp_new_seq();
+
 /* read_bb.c */
 extern errcode_t ext2fs_read_bb_inode(ext2_filsys fs,
 				      ext2_badblocks_list *bb_list);
@@ -1315,6 +1350,7 @@ extern void ext2fs_swap_inode_full(ext2_filsys fs, struct ext2_inode_large *t,
 				   int bufsize);
 extern void ext2fs_swap_inode(ext2_filsys fs,struct ext2_inode *t,
 			      struct ext2_inode *f, int hostorder);
+extern void ext2fs_swap_mmp(struct mmp_struct *mmp);
 
 /* valid_blk.c */
 extern int ext2fs_inode_has_valid_blocks(struct ext2_inode *inode);
diff --git a/lib/ext2fs/freefs.c b/lib/ext2fs/freefs.c
index 5c35bb6..5a151d1 100644
--- a/lib/ext2fs/freefs.c
+++ b/lib/ext2fs/freefs.c
@@ -53,6 +53,11 @@ void ext2fs_free(ext2_filsys fs)
 	if (fs->icache)
 		ext2fs_free_inode_cache(fs->icache);
 
+	if (fs->mmp_buf)
+		ext2fs_free_mem(&fs->mmp_buf);
+	if (fs->mmp_unaligned_buf)
+		ext2fs_free_mem(&fs->mmp_unaligned_buf);
+
 	fs->magic = 0;
 
 	ext2fs_free_mem(&fs);
diff --git a/lib/ext2fs/mmp.c b/lib/ext2fs/mmp.c
new file mode 100644
index 0000000..340fa31
--- /dev/null
+++ b/lib/ext2fs/mmp.c
@@ -0,0 +1,420 @@
+/*
+ * Helper functions for multiple mount protection(MMP).
+ *
+ * Copyright (C) 2006, 2007 by Kalpak Shah <kalpak@clusterfs.com>
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ * %End-Header%
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <sys/time.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "ext2fs/ext2_fs.h"
+#include "ext2fs/ext2fs.h"
+
+static int mmp_pagesize(void)
+{
+#ifdef _SC_PAGESIZE
+	int sysval = sysconf(_SC_PAGESIZE);
+	if (sysval > 0)
+		return sysval;
+#endif /* _SC_PAGESIZE */
+#ifdef HAVE_GETPAGESIZE
+	return getpagesize();
+#else
+	return 4096;
+#endif
+}
+
+#define ptr_align(ptr, size)	(void *)(((unsigned long)(ptr) + (size) - 1) & \
+					 ~((unsigned long)(size) - 1))
+#ifndef O_DIRECT
+#define O_DIRECT 0
+#endif
+
+errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
+{
+	struct mmp_struct *mmp_cmp;
+	errcode_t retval = 0;
+
+	if ((mmp_blk <= fs->super->s_first_data_block) ||
+	    (mmp_blk >= fs->super->s_blocks_count))
+		return EXT2_ET_MMP_BAD_BLOCK;
+
+	if (fs->mmp_cmp == NULL) {
+		/* O_DIRECT in linux 2.4: page aligned
+		 * O_DIRECT in linux 2.6: sector aligned
+		 * A filesystem cannot be created with blocksize < sector size,
+		 * or with blocksize > page_size. */
+		int bufsize = fs->blocksize;
+
+		if (bufsize < mmp_pagesize())
+			bufsize = mmp_pagesize();
+		retval = ext2fs_get_mem(bufsize * 2, &fs->mmp_unaligned_buf);
+		if (retval)
+			return retval;
+		fs->mmp_cmp = ptr_align(fs->mmp_unaligned_buf, bufsize);
+	}
+
+	/* ext2fs_open reserves fd0,1,2 to avoid stdio collision */
+	if (fs->mmp_fd <= 0) {
+		fs->mmp_fd = open(fs->device_name, O_RDWR | O_DIRECT);
+		if (fs->mmp_fd < 0) {
+			retval = EXT2_ET_MMP_OPEN_DIRECT;
+			goto out;
+		}
+	}
+
+	if (ext2fs_llseek(fs->mmp_fd, mmp_blk * fs->blocksize, SEEK_SET) !=
+	    mmp_blk * fs->blocksize) {
+		retval = EXT2_ET_LLSEEK_FAILED;
+		goto out;
+	}
+
+	if (read(fs->mmp_fd, fs->mmp_cmp, fs->blocksize) != fs->blocksize) {
+		retval = EXT2_ET_SHORT_READ;
+		goto out;
+	}
+
+	mmp_cmp = fs->mmp_cmp;
+#ifdef EXT2FS_ENABLE_SWAPFS
+	if (fs->flags & EXT2_FLAG_SWAP_BYTES)
+		ext2fs_swap_mmp(mmp_cmp);
+#endif
+
+	if (buf != NULL && buf != fs->mmp_cmp)
+		memcpy(buf, fs->mmp_cmp, fs->blocksize);
+
+	if (mmp_cmp->mmp_magic != EXT2_MMP_MAGIC) {
+		retval = EXT2_ET_MMP_MAGIC_INVALID;
+		goto out;
+	}
+
+out:
+	return retval;
+}
+
+errcode_t ext2fs_mmp_write(ext2_filsys fs, blk64_t mmp_blk, void *buf)
+{
+	struct mmp_struct *mmp_s = buf;
+	struct timeval tv;
+	errcode_t retval = 0;
+
+	gettimeofday(&tv, 0);
+	mmp_s->mmp_time = tv.tv_sec;
+	fs->mmp_last_written = tv.tv_sec;
+
+#ifdef EXT2FS_ENABLE_SWAPFS
+	if (fs->super->s_magic == ext2fs_swab16(EXT2_SUPER_MAGIC))
+		ext2fs_swap_mmp(mmp_s);
+#endif
+
+	/* I was tempted to make this use O_DIRECT and the mmp_fd, but
+	 * this caused no end of grief, while leaving it as-is works. */
+	retval = io_channel_write_blk(fs->io, mmp_blk, -fs->blocksize, buf);
+
+#ifdef EXT2FS_ENABLE_SWAPFS
+	if (fs->super->s_magic == ext2fs_swab16(EXT2_SUPER_MAGIC))
+		ext2fs_swap_mmp(mmp_s);
+#endif
+
+	/* Make sure the block gets to disk quickly */
+	io_channel_flush(fs->io);
+	return retval;
+}
+
+#ifdef HAVE_SRANDOM
+#define srand(x) 	srandom(x)
+#define rand() 		random()
+#endif
+
+unsigned ext2fs_mmp_new_seq()
+{
+	unsigned new_seq;
+	struct timeval tv;
+
+	gettimeofday(&tv, 0);
+	srand((getpid() << 16) ^ getuid() ^ tv.tv_sec ^ tv.tv_usec);
+
+	gettimeofday(&tv, 0);
+	/* Crank the random number generator a few times */
+	for (new_seq = (tv.tv_sec ^ tv.tv_usec) & 0x1F; new_seq > 0; new_seq--)
+		rand();
+
+	do {
+		new_seq = rand();
+	} while (new_seq > EXT2_MMP_SEQ_MAX);
+
+	return new_seq;
+}
+
+static errcode_t ext2fs_mmp_reset(ext2_filsys fs)
+{
+	struct mmp_struct *mmp_s = NULL;
+	errcode_t retval = 0;
+
+	if (fs->mmp_buf == NULL) {
+		retval = ext2fs_get_mem(fs->blocksize, &fs->mmp_buf);
+		if (retval)
+			goto out;
+	}
+
+	memset(fs->mmp_buf, 0, fs->blocksize);
+	mmp_s = fs->mmp_buf;
+
+	mmp_s->mmp_magic = EXT2_MMP_MAGIC;
+	mmp_s->mmp_seq = EXT2_MMP_SEQ_CLEAN;
+	mmp_s->mmp_time = 0;
+#if _BSD_SOURCE || _XOPEN_SOURCE >= 500
+	gethostname(mmp_s->mmp_nodename, sizeof(mmp_s->mmp_nodename));
+#else
+	mmp_s->mmp_nodename[0] = '\0';
+#endif
+	strncpy(mmp_s->mmp_bdevname, fs->device_name,
+		sizeof(mmp_s->mmp_bdevname));
+
+	mmp_s->mmp_check_interval = fs->super->s_mmp_update_interval;
+	if (mmp_s->mmp_check_interval < EXT2_MMP_MIN_CHECK_INTERVAL)
+		mmp_s->mmp_check_interval = EXT2_MMP_MIN_CHECK_INTERVAL;
+
+	retval = ext2fs_mmp_write(fs, fs->super->s_mmp_block, fs->mmp_buf);
+out:
+	return retval;
+}
+
+errcode_t ext2fs_mmp_clear(ext2_filsys fs)
+{
+	struct mmp_struct *mmp_s = NULL;
+	errcode_t retval = 0;
+
+	if (!(fs->super->s_feature_incompat & EXT4_FEATURE_INCOMPAT_MMP) ||
+	    !(fs->flags & EXT2_FLAG_RW))
+		return 0;
+
+	retval = ext2fs_mmp_reset(fs);
+
+	return retval;
+}
+
+errcode_t ext2fs_mmp_init(ext2_filsys fs)
+{
+	struct ext2_super_block *sb = fs->super;
+	struct mmp_struct *mmp_s = NULL;
+	blk64_t mmp_block;
+	errcode_t retval;
+
+	if (fs->mmp_buf == NULL) {
+		retval = ext2fs_get_mem(fs->blocksize, &fs->mmp_buf);
+		if (retval)
+			goto out;
+	}
+
+	retval = ext2fs_alloc_block(fs, 0, fs->mmp_buf, &mmp_block);
+	if (retval)
+		goto out;
+
+	sb->s_mmp_block = mmp_block;
+	sb->s_mmp_update_interval = EXT2_MMP_UPDATE_INTERVAL;
+
+	retval = ext2fs_mmp_reset(fs);
+	if (retval)
+		goto out;
+
+out:
+	return retval;
+}
+
+/*
+ * Make sure that the fs is not mounted or being fsck'ed while opening the fs.
+ */
+errcode_t ext2fs_mmp_start(ext2_filsys fs)
+{
+	struct mmp_struct *mmp_s;
+	unsigned seq;
+	unsigned int mmp_check_interval;
+	errcode_t retval = 0;
+
+	if (fs->mmp_buf == NULL) {
+		retval = ext2fs_get_mem(fs->blocksize, &fs->mmp_buf);
+		if (retval)
+			goto mmp_error;
+	}
+
+	retval = ext2fs_mmp_read(fs, fs->super->s_mmp_block, fs->mmp_buf);
+	if (retval)
+		goto mmp_error;
+
+	mmp_s = fs->mmp_buf;
+
+	mmp_check_interval = fs->super->s_mmp_update_interval;
+	if (mmp_check_interval < EXT2_MMP_MIN_CHECK_INTERVAL)
+		mmp_check_interval = EXT2_MMP_MIN_CHECK_INTERVAL;
+
+	seq = mmp_s->mmp_seq;
+	if (seq == EXT2_MMP_SEQ_CLEAN)
+		goto clean_seq;
+	if (seq == EXT2_MMP_SEQ_FSCK) {
+		retval = EXT2_ET_MMP_FSCK_ON;
+		goto mmp_error;
+	}
+
+	if (seq > EXT2_MMP_SEQ_FSCK) {
+		retval = EXT2_ET_MMP_UNKNOWN_SEQ;
+		goto mmp_error;
+	}
+
+	/*
+	 * If check_interval in MMP block is larger, use that instead of
+	 * check_interval from the superblock.
+	 */
+	if (mmp_s->mmp_check_interval > mmp_check_interval)
+		mmp_check_interval = mmp_s->mmp_check_interval;
+
+	sleep(2 * mmp_check_interval + 1);
+
+	retval = ext2fs_mmp_read(fs, fs->super->s_mmp_block, fs->mmp_buf);
+	if (retval)
+		goto mmp_error;
+
+	if (seq != mmp_s->mmp_seq) {
+		retval = EXT2_ET_MMP_FAILED;
+		goto mmp_error;
+	}
+
+clean_seq:
+	if (!(fs->flags & EXT2_FLAG_RW))
+		goto mmp_error;
+
+	mmp_s->mmp_seq = seq = ext2fs_mmp_new_seq();
+#if _BSD_SOURCE || _XOPEN_SOURCE >= 500
+	gethostname(mmp_s->mmp_nodename, sizeof(mmp_s->mmp_nodename));
+#else
+	strcpy(mmp_s->mmp_nodename, "unknown host");
+#endif
+	strncpy(mmp_s->mmp_bdevname, fs->device_name,
+		sizeof(mmp_s->mmp_bdevname));
+
+	retval = ext2fs_mmp_write(fs, fs->super->s_mmp_block, fs->mmp_buf);
+	if (retval)
+		goto mmp_error;
+
+	sleep(2 * mmp_check_interval + 1);
+
+	retval = ext2fs_mmp_read(fs, fs->super->s_mmp_block, fs->mmp_buf);
+	if (retval)
+		goto mmp_error;
+
+	if (seq != mmp_s->mmp_seq) {
+		retval = EXT2_ET_MMP_FAILED;
+		goto mmp_error;
+	}
+
+	mmp_s->mmp_seq = EXT2_MMP_SEQ_FSCK;
+	retval = ext2fs_mmp_write(fs, fs->super->s_mmp_block, fs->mmp_buf);
+	if (retval)
+		goto mmp_error;
+
+	return 0;
+
+mmp_error:
+	return retval;
+}
+
+/*
+ * Clear the MMP usage in the filesystem.  If this function returns an
+ * error EXT2_ET_MMP_CHANGE_ABORT it means the filesystem was modified
+ * by some other process while in use, and changes should be dropped, or
+ * risk filesystem corruption.
+ */
+errcode_t ext2fs_mmp_stop(ext2_filsys fs)
+{
+	struct mmp_struct *mmp, *mmp_cmp;
+	errcode_t retval = 0;
+
+	if (!(fs->super->s_feature_incompat & EXT4_FEATURE_INCOMPAT_MMP) ||
+	    !(fs->flags & EXT2_FLAG_RW) || (fs->flags & EXT2_FLAG_SKIP_MMP))
+		goto mmp_error;
+
+	retval = ext2fs_mmp_read(fs, fs->super->s_mmp_block, fs->mmp_buf);
+	if (retval)
+		goto mmp_error;
+
+	/* Check if the MMP block is not changed. */
+	mmp = fs->mmp_buf;
+	mmp_cmp = fs->mmp_cmp;
+	if (memcmp(mmp, mmp_cmp, sizeof(*mmp_cmp))) {
+		retval = EXT2_ET_MMP_CHANGE_ABORT;
+		goto mmp_error;
+	}
+
+check_skipped:
+	mmp_cmp->mmp_seq = EXT2_MMP_SEQ_CLEAN;
+	retval = ext2fs_mmp_write(fs, fs->super->s_mmp_block, fs->mmp_cmp);
+
+mmp_error:
+	if (fs->mmp_fd > 0) {
+		close(fs->mmp_fd);
+		fs->mmp_fd = -1;
+	}
+	if (fs->mmp_buf) {
+		ext2fs_free_mem(&fs->mmp_buf);
+		fs->mmp_buf = NULL;
+	}
+	if (fs->mmp_unaligned_buf) {
+		ext2fs_free_mem(&fs->mmp_unaligned_buf);
+		fs->mmp_unaligned_buf = NULL;
+		fs->mmp_cmp = NULL;
+	}
+
+	return retval;
+}
+
+#define EXT2_MIN_MMP_UPDATE_INTERVAL 60
+
+/*
+ * Update the on-disk mmp buffer, after checking that it hasn't been changed.
+ */
+errcode_t ext2fs_mmp_update(ext2_filsys fs)
+{
+	struct mmp_struct *mmp, *mmp_cmp;
+	struct timeval tv;
+	errcode_t retval = 0;
+
+	if (!(fs->super->s_feature_incompat & EXT4_FEATURE_INCOMPAT_MMP) ||
+	    !(fs->flags & EXT2_FLAG_RW) || (fs->flags & EXT2_FLAG_SKIP_MMP))
+		return 0;
+
+	gettimeofday(&tv, 0);
+	if (tv.tv_sec - fs->mmp_last_written < EXT2_MIN_MMP_UPDATE_INTERVAL)
+		return 0;
+
+	retval = ext2fs_mmp_read(fs, fs->super->s_mmp_block, NULL);
+	if (retval)
+		goto mmp_error;
+
+	mmp = fs->mmp_buf;
+	mmp_cmp = fs->mmp_cmp;
+
+	if (memcmp(mmp, mmp_cmp, sizeof(*mmp_cmp)))
+		return EXT2_ET_MMP_CHANGE_ABORT;
+
+	mmp->mmp_time = tv.tv_sec;
+	mmp->mmp_seq = EXT2_MMP_SEQ_FSCK;
+	retval = ext2fs_mmp_write(fs, fs->super->s_mmp_block, fs->mmp_buf);
+
+mmp_error:
+	return retval;
+}
diff --git a/lib/ext2fs/openfs.c b/lib/ext2fs/openfs.c
index 90abed1..aa1c8bd 100644
--- a/lib/ext2fs/openfs.c
+++ b/lib/ext2fs/openfs.c
@@ -22,6 +22,9 @@
 #if HAVE_SYS_TYPES_H
 #include <sys/types.h>
 #endif
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
 
 #include "ext2_fs.h"
 
@@ -82,6 +85,7 @@ errcode_t ext2fs_open(const char *name, int flags, int superblock,
  * 	EXT2_FLAG_FORCE - Open the filesystem even if some of the
  *				features aren't supported.
  *	EXT2_FLAG_JOURNAL_DEV_OK - Open an ext3 journal device
+ *	EXT2_FLAG_SKIP_MMP - Open without multi-mount protection check.
  */
 errcode_t ext2fs_open2(const char *name, const char *io_options,
 		       int flags, int superblock,
@@ -366,6 +370,18 @@ errcode_t ext2fs_open2(const char *name, const char *io_options,
 
 	fs->flags &= ~EXT2_FLAG_NOFREE_ON_ERROR;
 	*ret_fs = fs;
+
+	if ((fs->super->s_feature_incompat & EXT4_FEATURE_INCOMPAT_MMP) &&
+	    !(flags & EXT2_FLAG_SKIP_MMP)
+	    && (flags & (EXT2_FLAG_RW | EXT2_FLAG_EXCLUSIVE))) {
+		retval = ext2fs_mmp_start(fs);
+		if (retval) {
+			fs->flags |= EXT2_FLAG_SKIP_MMP; /* just do cleanup */
+			ext2fs_mmp_stop(fs);
+			goto cleanup;
+		}
+	}
+
 	return 0;
 cleanup:
 	if (flags & EXT2_FLAG_NOFREE_ON_ERROR)
diff --git a/lib/ext2fs/swapfs.c b/lib/ext2fs/swapfs.c
index 3a43c6c..90c1e8b 100644
--- a/lib/ext2fs/swapfs.c
+++ b/lib/ext2fs/swapfs.c
@@ -70,6 +70,8 @@ void ext2fs_swap_super(struct ext2_super_block * sb)
 	sb->s_min_extra_isize = ext2fs_swab16(sb->s_min_extra_isize);
 	sb->s_want_extra_isize = ext2fs_swab16(sb->s_want_extra_isize);
 	sb->s_flags = ext2fs_swab32(sb->s_flags);
+	sb->s_mmp_update_interval = ext2fs_swab16(sb->s_mmp_update_interval);
+	sb->s_mmp_block = ext2fs_swab64(sb->s_mmp_block);
 	sb->s_kbytes_written = ext2fs_swab64(sb->s_kbytes_written);
 	sb->s_snapshot_inum = ext2fs_swab32(sb->s_snapshot_inum);
 	sb->s_snapshot_id = ext2fs_swab32(sb->s_snapshot_id);
@@ -312,4 +314,12 @@ void ext2fs_swap_inode(ext2_filsys fs, struct ext2_inode *t,
 				sizeof(struct ext2_inode));
 }
 
+void ext2fs_swap_mmp(struct mmp_struct *mmp)
+{
+	mmp->mmp_magic = ext2fs_swab32(mmp->mmp_magic);
+	mmp->mmp_seq = ext2fs_swab32(mmp->mmp_seq);
+	mmp->mmp_time = ext2fs_swab64(mmp->mmp_time);
+	mmp->mmp_check_interval = ext2fs_swab16(mmp->mmp_check_interval);
+}
+
 #endif
diff --git a/lib/ext2fs/tst_super_size.c b/lib/ext2fs/tst_super_size.c
index 1e5a524..7a16995 100644
--- a/lib/ext2fs/tst_super_size.c
+++ b/lib/ext2fs/tst_super_size.c
@@ -100,7 +100,7 @@ void check_superblock_fields()
 	check_field(s_want_extra_isize);
 	check_field(s_flags);
 	check_field(s_raid_stride);
-	check_field(s_mmp_interval);
+	check_field(s_mmp_update_interval);
 	check_field(s_mmp_block);
 	check_field(s_raid_stripe_width);
 	check_field(s_log_groups_per_flex);
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index 9798b88..832e7ec 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -808,6 +808,7 @@ static __u32 ok_features[3] = {
 		EXT3_FEATURE_INCOMPAT_JOURNAL_DEV|
 		EXT2_FEATURE_INCOMPAT_META_BG|
 		EXT4_FEATURE_INCOMPAT_FLEX_BG|
+		EXT4_FEATURE_INCOMPAT_MMP |
 		EXT4_FEATURE_INCOMPAT_64BIT,
 	/* R/O compat */
 	EXT2_FEATURE_RO_COMPAT_LARGE_FILE|
@@ -2366,7 +2367,19 @@ int main (int argc, char *argv[])
 			printf(_("done\n"));
 	}
 no_journal:
-
+	if (!super_only) {
+		if (fs->super->s_feature_incompat & EXT4_FEATURE_INCOMPAT_MMP) {
+			retval = ext2fs_mmp_init(fs);
+			if (retval) {
+				fprintf(stderr, _("\nError while enabling "
+					"multiple mount protection feature."));
+				exit(1);
+			}
+			printf(_("Multiple mount protection has been enabled "
+				 "with update interval %d seconds.\n"),
+				 fs->super->s_mmp_update_interval);
+		}
+	}
 	if (!quiet)
 		printf(_("Writing superblocks and "
 		       "filesystem accounting information: "));
diff --git a/misc/tune2fs.8.in b/misc/tune2fs.8.in
index 2f9db81..0f5c48c 100644
--- a/misc/tune2fs.8.in
+++ b/misc/tune2fs.8.in
@@ -167,6 +167,11 @@ separated, and may take an argument using the equals ('=') sign.
 The following extended options are supported:
 .RS 1.2i
 .TP
+.B clear-mmp
+Reset the MMP block (if any) back to the clean state.  Use only if
+absolutely certain the device is not currently mounted or being
+fscked, or major filesystem corruption can result.  Needs '-f'.
+.TP
 .BI stride= stride-size
 Configure the filesystem for a RAID array with
 .I stride-size
@@ -519,6 +524,11 @@ future.
 .B Tune2fs 
 only supports clearing this filesystem feature.
 .TP
+.B mmp
+Enable or disable multiple mount protection(MMP) feature. MMP helps to protect
+the filesystem from being multiply mounted and is useful in shared storage
+environment.
+.TP
 .B sparse_super
 Limit the number of backup superblocks to save space on large filesystems.
 .TP
@@ -555,6 +565,9 @@ and
 .BR flex_bg
 features are only supported by the ext4 filesystem.
 .TP
+.BI \-p " mmp_check_interval"
+Set the desired MMP check interval in seconds. It is 5 seconds by default.
+.TP
 .BI \-r " reserved-blocks-count"
 Set the number of reserved filesystem blocks.
 .TP
diff --git a/misc/tune2fs.c b/misc/tune2fs.c
index bcada11..eea4e31 100644
--- a/misc/tune2fs.c
+++ b/misc/tune2fs.c
@@ -64,8 +64,9 @@ char *device_name;
 char *new_label, *new_last_mounted, *new_UUID;
 char *io_options;
 static int c_flag, C_flag, e_flag, f_flag, g_flag, i_flag, l_flag, L_flag;
-static int m_flag, M_flag, r_flag, s_flag = -1, u_flag, U_flag, T_flag;
+static int m_flag, M_flag, r_flag, s_flag = -1, u_flag, U_flag, T_flag, p_flag;
 static int I_flag;
+static int clear_mmp;
 static time_t last_check_time;
 static int print_label;
 static int max_mount_count, mount_count, mount_flags;
@@ -75,6 +76,7 @@ static double reserved_ratio;
 static unsigned long resgid, resuid;
 static unsigned short errors;
 static int open_flag;
+static unsigned int mmp_update_interval;
 static char *features_cmd;
 static char *mntopts_cmd;
 static int stride, stripe_width;
@@ -107,7 +109,7 @@ static void usage(void)
 		  "[-g group]\n"
 		  "\t[-i interval[d|m|w]] [-j] [-J journal_options] [-l]\n"
 		  "\t[-m reserved_blocks_percent] "
-		  "[-o [^]mount_options[,...]] \n"
+		  "[-o [^]mount_options[,...]] [-p mmp_update_interval]\n"
 		  "\t[-r reserved_blocks_count] [-u user] [-C mount_count] "
 		  "[-L volume_label]\n"
 		  "\t[-M last_mounted_dir] [-O [^]feature[,...]]\n"
@@ -123,7 +125,8 @@ static __u32 ok_features[3] = {
 	/* Incompat */
 	EXT2_FEATURE_INCOMPAT_FILETYPE |
 		EXT3_FEATURE_INCOMPAT_EXTENTS |
-		EXT4_FEATURE_INCOMPAT_FLEX_BG,
+		EXT4_FEATURE_INCOMPAT_FLEX_BG |
+		EXT4_FEATURE_INCOMPAT_MMP,
 	/* R/O compat */
 	EXT2_FEATURE_RO_COMPAT_LARGE_FILE |
 		EXT4_FEATURE_RO_COMPAT_HUGE_FILE|
@@ -140,7 +143,8 @@ static __u32 clear_ok_features[3] = {
 		EXT2_FEATURE_COMPAT_DIR_INDEX,
 	/* Incompat */
 	EXT2_FEATURE_INCOMPAT_FILETYPE |
-		EXT4_FEATURE_INCOMPAT_FLEX_BG,
+		EXT4_FEATURE_INCOMPAT_FLEX_BG |
+		EXT4_FEATURE_INCOMPAT_MMP,
 	/* R/O compat */
 	EXT2_FEATURE_RO_COMPAT_LARGE_FILE |
 		EXT4_FEATURE_RO_COMPAT_HUGE_FILE|
@@ -152,7 +156,7 @@ static __u32 clear_ok_features[3] = {
 /*
  * Remove an external journal from the filesystem
  */
-static void remove_journal_device(ext2_filsys fs)
+static int remove_journal_device(ext2_filsys fs)
 {
 	char		*journal_path;
 	ext2_filsys	jfs;
@@ -241,13 +245,15 @@ static void remove_journal_device(ext2_filsys fs)
 no_valid_journal:
 	if (commit_remove_journal == 0) {
 		fputs(_("Journal NOT removed\n"), stderr);
-		exit(1);
+		return 1;
 	}
 	fs->super->s_journal_dev = 0;
 	uuid_clear(fs->super->s_journal_uuid);
 	ext2fs_mark_super_dirty(fs);
 	fputs(_("Journal removed\n"), stdout);
 	free(journal_path);
+
+	return 0;
 }
 
 /* Helper function for remove_journal_inode */
@@ -272,7 +278,7 @@ static int release_blocks_proc(ext2_filsys fs, blk64_t *blocknr,
 /*
  * Remove the journal inode from the filesystem
  */
-static void remove_journal_inode(ext2_filsys fs)
+static errcode_t remove_journal_inode(ext2_filsys fs)
 {
 	struct ext2_inode	inode;
 	errcode_t		retval;
@@ -282,14 +288,14 @@ static void remove_journal_inode(ext2_filsys fs)
 	if (retval) {
 		com_err(program_name, retval,
 			_("while reading journal inode"));
-		exit(1);
+		return retval;
 	}
 	if (ino == EXT2_JOURNAL_INO) {
 		retval = ext2fs_read_bitmaps(fs);
 		if (retval) {
 			com_err(program_name, retval,
 				_("while reading bitmaps"));
-			exit(1);
+			return retval;
 		}
 		retval = ext2fs_block_iterate3(fs, ino,
 					       BLOCK_FLAG_READ_ONLY, NULL,
@@ -297,7 +303,7 @@ static void remove_journal_inode(ext2_filsys fs)
 		if (retval) {
 			com_err(program_name, retval,
 				_("while clearing journal inode"));
-			exit(1);
+			return retval;
 		}
 		memset(&inode, 0, sizeof(inode));
 		ext2fs_mark_bb_dirty(fs);
@@ -308,25 +314,29 @@ static void remove_journal_inode(ext2_filsys fs)
 	if (retval) {
 		com_err(program_name, retval,
 			_("while writing journal inode"));
-		exit(1);
+		return retval;
 	}
 	fs->super->s_journal_inum = 0;
 	ext2fs_mark_super_dirty(fs);
+
+	return 0;
 }
 
 /*
  * Update the default mount options
  */
-static void update_mntopts(ext2_filsys fs, char *mntopts)
+static int update_mntopts(ext2_filsys fs, char *mntopts)
 {
 	struct ext2_super_block *sb = fs->super;
 
 	if (e2p_edit_mntopts(mntopts, &sb->s_default_mount_opts, ~0)) {
 		fprintf(stderr, _("Invalid mount option set: %s\n"),
 			mntopts);
-		exit(1);
+		return 1;
 	}
 	ext2fs_mark_super_dirty(fs);
+
+	return 0;
 }
 
 static void request_fsck_afterwards(ext2_filsys fs)
@@ -344,7 +354,7 @@ static void request_fsck_afterwards(ext2_filsys fs)
 /*
  * Update the feature set as provided by the user.
  */
-static void update_feature_set(ext2_filsys fs, char *features)
+static int update_feature_set(ext2_filsys fs, char *features)
 {
 	struct ext2_super_block *sb = fs->super;
 	struct ext2_group_desc *gd;
@@ -381,7 +391,7 @@ static void update_feature_set(ext2_filsys fs, char *features)
 			fprintf(stderr, _("Setting filesystem feature '%s' "
 					  "not supported.\n"),
 				e2p_feature2string(type_err, mask_err));
-		exit(1);
+		return 1;
 	}
 
 	if (FEATURE_OFF(E2P_FEATURE_COMPAT, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
@@ -391,22 +401,89 @@ static void update_feature_set(ext2_filsys fs, char *features)
 				"cleared when the filesystem is\n"
 				"unmounted or mounted "
 				"read-only.\n"), stderr);
-			exit(1);
+			return 1;
 		}
 		if (sb->s_feature_incompat &
 		    EXT3_FEATURE_INCOMPAT_RECOVER) {
 			fputs(_("The needs_recovery flag is set.  "
 				"Please run e2fsck before clearing\n"
 				"the has_journal flag.\n"), stderr);
-			exit(1);
+			return 1;
 		}
 		if (sb->s_journal_inum) {
-			remove_journal_inode(fs);
+			if (remove_journal_inode(fs))
+				return 1;
 		}
 		if (sb->s_journal_dev) {
-			remove_journal_device(fs);
+			if (remove_journal_device(fs))
+				return 1;
 		}
 	}
+	if (FEATURE_ON(E2P_FEATURE_INCOMPAT, EXT4_FEATURE_INCOMPAT_MMP)) {
+		int error;
+
+		if ((mount_flags & EXT2_MF_MOUNTED) ||
+		    (mount_flags & EXT2_MF_READONLY)) {
+			fputs(_("The multiple mount protection feature can't \n"
+				"be set if the filesystem is mounted or \n"
+				"read-only.\n"), stderr);
+			return 1;
+		}
+
+		error = ext2fs_mmp_init(fs);
+		if (error) {
+			fputs(_("\nError while enabling multiple mount "
+				"protection feature."), stderr);
+			return 1;
+		}
+
+		/*
+		 * We want to update group desc with the new free blocks count
+		 */
+		fs->flags &= ~EXT2_FLAG_SUPER_ONLY;
+
+		printf(_("Multiple mount protection has been enabled "
+			 "with update interval %ds.\n"),
+		       sb->s_mmp_update_interval);
+	}
+
+	if (FEATURE_OFF(E2P_FEATURE_INCOMPAT, EXT4_FEATURE_INCOMPAT_MMP)) {
+		int error;
+
+		if (mount_flags & EXT2_MF_READONLY) {
+			fputs(_("The multiple mount protection feature cannot\n"
+				"be disabled if the filesystem is readonly.\n"),
+				stderr);
+			return 1;
+		}
+
+		error = ext2fs_read_bitmaps(fs);
+		if (error) {
+			fputs(_("Error while reading bitmaps\n"), stderr);
+			return 1;
+		}
+
+		error = ext2fs_mmp_read(fs, sb->s_mmp_block, NULL);
+		if (error) {
+			struct mmp_struct *mmp_cmp = fs->mmp_cmp;
+
+			if (error == EXT2_ET_MMP_MAGIC_INVALID)
+				printf(_("Magic number in MMP block does not "
+					 "match. expected: %x, actual: %x\n"),
+					 EXT2_MMP_MAGIC, mmp_cmp->mmp_magic);
+			else
+				com_err (program_name, error,
+					 _("while reading MMP block."));
+			goto mmp_error;
+		}
+
+		/* We need to force out the group descriptors as well */
+		fs->flags &= ~EXT2_FLAG_SUPER_ONLY;
+		ext2fs_block_alloc_stats(fs, sb->s_mmp_block, -1);
+mmp_error:
+		sb->s_mmp_block = 0;
+		sb->s_mmp_update_interval = 0;
+	}
 
 	if (FEATURE_ON(E2P_FEATURE_COMPAT, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
 		/*
@@ -498,12 +575,14 @@ static void update_feature_set(ext2_filsys fs, char *features)
 	    (old_features[E2P_FEATURE_INCOMPAT] != sb->s_feature_incompat) ||
 	    (old_features[E2P_FEATURE_RO_INCOMPAT] != sb->s_feature_ro_compat))
 		ext2fs_mark_super_dirty(fs);
+
+	return 0;
 }
 
 /*
  * Add a journal to the filesystem.
  */
-static void add_journal(ext2_filsys fs)
+static int add_journal(ext2_filsys fs)
 {
 	unsigned long journal_blocks;
 	errcode_t	retval;
@@ -558,7 +637,7 @@ static void add_journal(ext2_filsys fs)
 			fprintf(stderr, "\n");
 			com_err(program_name, retval,
 				_("\n\twhile trying to create journal file"));
-			exit(1);
+			return retval;
 		} else
 			fputs(_("done\n"), stdout);
 		/*
@@ -569,11 +648,11 @@ static void add_journal(ext2_filsys fs)
 			fs->flags &= ~EXT2_FLAG_SUPER_ONLY;
 	}
 	print_check_message(fs);
-	return;
+	return 0;
 
 err:
 	free(journal_device);
-	exit(1);
+	return 1;
 }
 
 
@@ -641,7 +720,7 @@ static void parse_tune2fs_options(int argc, char **argv)
 	open_flag = 0;
 
 	printf("tune2fs %s (%s)\n", E2FSPROGS_VERSION, E2FSPROGS_DATE);
-	while ((c = getopt(argc, argv, "c:e:fg:i:jlm:o:r:s:u:C:E:I:J:L:M:O:T:U:")) != EOF)
+	while ((c = getopt(argc, argv, "c:e:fg:i:jlm:o:p:r:s:u:C:E:I:J:L:M:O:T:U:")) != EOF)
 		switch (c) {
 		case 'c':
 			max_mount_count = strtol(optarg, &tmp, 0);
@@ -796,6 +875,25 @@ static void parse_tune2fs_options(int argc, char **argv)
 			features_cmd = optarg;
 			open_flag = EXT2_FLAG_RW;
 			break;
+		case 'p':
+			mmp_update_interval = strtol(optarg, &tmp, 0);
+			if (*tmp && mmp_update_interval < 0) {
+				com_err(program_name, 0,
+					_("invalid mmp update interval"));
+				usage();
+			}
+			if (mmp_update_interval == 0)
+				mmp_update_interval = EXT2_MMP_UPDATE_INTERVAL;
+			if (mmp_update_interval > EXT2_MMP_UPDATE_INTERVAL) {
+				com_err(program_name, 0,
+					_("MMP update interval of %s seconds "
+					  "may be dangerous under high load.  "
+					  "Consider decreasing it."),
+					optarg);
+			}
+			p_flag = 1;
+			open_flag = EXT2_FLAG_RW;
+			break;
 		case 'r':
 			reserved_blocks = strtoul(optarg, &tmp, 0);
 			if (*tmp) {
@@ -900,7 +998,7 @@ void do_findfs(int argc, char **argv)
 }
 #endif
 
-static void parse_extended_opts(ext2_filsys fs, const char *opts)
+static int parse_extended_opts(ext2_filsys fs, const char *opts)
 {
 	char	*buf, *token, *next, *p, *arg;
 	int	len, hash_alg;
@@ -911,7 +1009,7 @@ static void parse_extended_opts(ext2_filsys fs, const char *opts)
 	if (!buf) {
 		fprintf(stderr,
 			_("Couldn't allocate memory to parse options!\n"));
-		exit(1);
+		return 1;
 	}
 	strcpy(buf, opts);
 	for (token = buf; token && *token; token = next) {
@@ -934,6 +1032,9 @@ static void parse_extended_opts(ext2_filsys fs, const char *opts)
 			fs->super->s_flags &= ~EXT2_FLAGS_TEST_FILESYS;
 			printf("Clearing test filesystem flag\n");
 			ext2fs_mark_super_dirty(fs);
+		} else if (strcmp(token, "clear-mmp") == 0 ||
+		           strcmp(token, "clear_mmp") == 0) {
+			clear_mmp = 1;
 		} else if (strcmp(token, "stride") == 0) {
 			if (!arg) {
 				r_usage++;
@@ -1003,15 +1104,18 @@ static void parse_extended_opts(ext2_filsys fs, const char *opts)
 			"and may take an argument which\n"
 			"\tis set off by an equals ('=') sign.\n\n"
 			"Valid extended options are:\n"
+			"\tclear-mmp\n"
 			"\tstride=<RAID per-disk chunk size in blocks>\n"
 			"\tstripe_width=<RAID stride*data disks in blocks>\n"
 			"\thash_alg=<hash algorithm>\n"
 			"\ttest_fs\n"
 			"\t^test_fs\n"));
 		free(buf);
-		exit(1);
+		return 1;
 	}
 	free(buf);
+
+	return 0;
 }
 
 /*
@@ -1591,6 +1695,7 @@ int main(int argc, char **argv)
 	ext2_filsys fs;
 	struct ext2_super_block *sb;
 	io_manager io_ptr, io_ptr_orig = NULL;
+	int rc = 0;
 
 #ifdef ENABLE_NLS
 	setlocale(LC_MESSAGES, "");
@@ -1620,14 +1725,26 @@ int main(int argc, char **argv)
 		io_ptr = unix_io_manager;
 
 retry_open:
+	if ((open_flag & EXT2_FLAG_RW) == 0 || f_flag)
+		 open_flag |= EXT2_FLAG_SKIP_MMP;
+
 	retval = ext2fs_open2(device_name, io_options, open_flag,
 			      0, 0, io_ptr, &fs);
 	if (retval) {
-			com_err(program_name, retval,
-				_("while trying to open %s"),
+		com_err(program_name, retval,
+			_("while trying to open %s"),
 			device_name);
-		fprintf(stderr,
-			_("Couldn't find valid filesystem superblock.\n"));
+		if (retval == EXT2_ET_MMP_FSCK_ON)
+			fprintf(stderr,
+				_("If you are sure e2fsck is not running then "
+				  "use 'tune2fs -f -E clear_mmp {device}'\n"));
+		else if (retval == EXT2_ET_MMP_MAGIC_INVALID)
+			fprintf(stderr,
+				_("Magic for mmp is wrong. Try to fix it by "
+				  "using 'fsck {device}'\n"));
+		else if (retval != EXT2_ET_MMP_FAILED)
+			fprintf(stderr,
+			     _("Couldn't find valid filesystem superblock.\n"));
 		exit(1);
 	}
 
@@ -1640,12 +1757,14 @@ retry_open:
 		if (new_inode_size == EXT2_INODE_SIZE(fs->super)) {
 			fprintf(stderr, _("The inode size is already %lu\n"),
 				new_inode_size);
-			exit(1);
+			rc = 1;
+			goto closefs;
 		}
 		if (new_inode_size < EXT2_INODE_SIZE(fs->super)) {
 			fprintf(stderr, _("Shrinking the inode size is "
 					  "not supported\n"));
-			exit(1);
+			rc = 1;
+			goto closefs;
 		}
 
 		/*
@@ -1654,8 +1773,10 @@ retry_open:
 		 */
 		io_ptr_orig = io_ptr;
 		retval = tune2fs_setup_tdb(device_name, &io_ptr);
-		if (retval)
-			exit(1);
+		if (retval) {
+			rc = 1;
+			goto closefs;
+		}
 		if (io_ptr != io_ptr_orig) {
 			ext2fs_close(fs);
 			goto retry_open;
@@ -1670,7 +1791,7 @@ retry_open:
 		printf("%.*s\n", (int) sizeof(sb->s_volume_name),
 		       sb->s_volume_name);
 		remove_error_table(&et_ext2_error_table);
-		exit(0);
+		goto closefs;
 	}
 
 	retval = ext2fs_check_if_mounted(device_name, &mount_flags);
@@ -1678,7 +1799,8 @@ retry_open:
 		com_err("ext2fs_check_if_mount", retval,
 			_("while determining whether %s is mounted."),
 			device_name);
-		exit(1);
+		rc = 1;
+		goto closefs;
 	}
 	/* Normally we only need to write out the superblock */
 	fs->flags |= EXT2_FLAG_SUPER_ONLY;
@@ -1717,12 +1839,19 @@ retry_open:
 		printf (_("Setting reserved blocks percentage to %g%% (%llu blocks)\n"),
 			reserved_ratio, ext2fs_r_blocks_count(sb));
 	}
+	if (p_flag) {
+		sb->s_mmp_update_interval = mmp_update_interval;
+		ext2fs_mark_super_dirty(fs);
+		printf(_("Setting multiple mount protection update interval to "
+			  "%u seconds\n"), mmp_update_interval);
+	}
 	if (r_flag) {
 		if (reserved_blocks >= ext2fs_blocks_count(sb)/2) {
 			com_err(program_name, 0,
 				_("reserved blocks count is too big (%llu)"),
 				reserved_blocks);
-			exit(1);
+			rc = 1;
+			goto closefs;
 		}
 		ext2fs_r_blocks_count_set(sb, reserved_blocks);
 		ext2fs_mark_super_dirty(fs);
@@ -1746,7 +1875,8 @@ retry_open:
 	if (s_flag == 0) {
 		fputs(_("\nClearing the sparse superflag not supported.\n"),
 		      stderr);
-		exit(1);
+		rc = 1;
+		goto closefs;
 	}
 	if (T_flag) {
 		sb->s_lastcheck = last_check_time;
@@ -1774,14 +1904,36 @@ retry_open:
 			sizeof(sb->s_last_mounted));
 		ext2fs_mark_super_dirty(fs);
 	}
-	if (mntopts_cmd)
-		update_mntopts(fs, mntopts_cmd);
-	if (features_cmd)
-		update_feature_set(fs, features_cmd);
-	if (extended_cmd)
-		parse_extended_opts(fs, extended_cmd);
-	if (journal_size || journal_device)
-		add_journal(fs);
+	if (mntopts_cmd) {
+		rc = update_mntopts(fs, mntopts_cmd);
+		if (rc)
+			goto closefs;
+	}
+	if (features_cmd) {
+		rc = update_feature_set(fs, features_cmd);
+		if (rc)
+			goto closefs;
+	}
+	if (extended_cmd) {
+		rc = parse_extended_opts(fs, extended_cmd);
+		if (rc)
+			goto closefs;
+		if (clear_mmp && !f_flag) {
+			fputs(_("Error in using clear_mmp. "
+			        "It must be used with -f\n"),
+			      stderr);
+			goto closefs;
+		}
+	}
+	if (clear_mmp) {
+		rc = ext2fs_mmp_clear(fs);
+		goto closefs;
+	}
+	if (journal_size || journal_device) {
+		rc = add_journal(fs);
+		if (rc);
+			goto closefs;
+	}
 
 	if (U_flag) {
 		int set_csum = 0;
@@ -1809,7 +1961,8 @@ retry_open:
 			uuid_generate(sb->s_uuid);
 		} else if (uuid_parse(new_UUID, sb->s_uuid)) {
 			com_err(program_name, 0, _("Invalid UUID format\n"));
-			exit(1);
+			rc = 1;
+			goto closefs;
 		}
 		if (set_csum) {
 			for (i = 0; i < fs->group_desc_count; i++)
@@ -1823,7 +1976,8 @@ retry_open:
 			fputs(_("The inode size may only be "
 				"changed when the filesystem is "
 				"unmounted.\n"), stderr);
-			exit(1);
+			rc = 1;
+			goto closefs;
 		}
 		if (fs->super->s_feature_incompat &
 		    EXT4_FEATURE_INCOMPAT_FLEX_BG) {
@@ -1858,5 +2012,12 @@ retry_open:
 	}
 	free(device_name);
 	remove_error_table(&et_ext2_error_table);
+
+closefs:
+	if (rc) {
+		ext2fs_mmp_stop(fs);
+		exit(1);
+	}
+
 	return (ext2fs_close(fs) ? 1 : 0);
 }
diff --git a/misc/util.c b/misc/util.c
index 51bdb60..a22ba91 100644
--- a/misc/util.c
+++ b/misc/util.c
@@ -291,3 +291,11 @@ void print_check_message(ext2_filsys fs)
 	       fs->super->s_max_mnt_count,
 	       (double)fs->super->s_checkinterval / (3600 * 24));
 }
+
+void dump_mmp_msg(struct mmp_struct *mmp, const char *msg)
+{
+	if (msg)
+		printf("MMP check failed: %s\n", msg);
+	printf("MMP failure info: last update time: %llu node: %s device: %s\n",
+	       (long long)mmp->mmp_time, mmp->mmp_nodename, mmp->mmp_bdevname);
+}
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH] ext4: add support for multiple mount protection
  2011-05-02  8:11 MMP update Johann Lombardi
  2011-05-02  8:13 ` [PATCH] Add multi-mount protection support to libext2fs (INCOMPAT_MMP feature) Johann Lombardi
@ 2011-05-02  9:36 ` Johann Lombardi
  2011-05-23  2:19   ` Ted Ts'o
  1 sibling, 1 reply; 14+ messages in thread
From: Johann Lombardi @ 2011-05-02  9:36 UTC (permalink / raw)
  To: linux-ext4; +Cc: Johann Lombardi, Andreas Dilger

Prevent an ext4 filesystem from being mounted multiple times.
A sequence number is stored on disk and is periodically updated (every 5
seconds by default) by a mounted filesystem.
At mount time, we now wait for s_mmp_update_interval seconds to make sure
that the MMP sequence does not change.
In case of failure, the nodename, bdevname and the time at which the MMP
block was last updated is displayed.

Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Johann Lombardi <johann@whamcloud.com>
---
 fs/ext4/ext4.h  |   73 +++++++++++-
 fs/ext4/super.c |  363 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 433 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b..b5ad696 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1028,7 +1028,7 @@ struct ext4_super_block {
 	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1201,6 +1201,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1357,7 +1360,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1615,6 +1619,67 @@ struct ext4_features {
 };
 
 /*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+	__le32	mmp_magic;		/* Magic number for MMP */
+	__le32	mmp_seq;		/* Sequence no. updated periodically */
+
+	/*
+	 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+	 * purposes and do not affect the correctness of the algorithm
+	 */
+	__le64	mmp_time;		/* Time last updated */
+	char	mmp_nodename[64];	/* Node which last updated MMP block */
+	char	mmp_bdevname[32];	/* Bdev which last updated MMP block */
+
+	/*
+	 * mmp_check_interval is used to verify if the MMP block has been
+	 * updated on the block device. The value is updated based on the
+	 * maximum time to write the MMP block during an update cycle.
+	 */
+	__le16	mmp_check_interval;
+
+	__le16	mmp_pad1;
+	__le32	mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+	struct buffer_head *bh; /* bh from initial read_mmp_block() */
+	struct super_block *sb;  /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT		2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
+
+/*
  * Function prototypes
  */
 
@@ -1788,6 +1853,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 						       __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+						       __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 22546ad..44ede1e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -39,6 +39,8 @@
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/utsname.h>
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -789,6 +791,8 @@ static void ext4_put_super(struct super_block *sb)
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
+	if (sbi->s_mmp_tsk)
+		kthread_stop(sbi->s_mmp_tsk);
 	sb->s_fs_info = NULL;
 	/*
 	 * Now that we are completely done shutting down the
@@ -1088,6 +1092,349 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	return 0;
 }
 
+
+/*
+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * faster.
+ */
+static int write_mmp_block(struct buffer_head *bh)
+{
+	mark_buffer_dirty(bh);
+	lock_buffer(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	get_bh(bh);
+	submit_bh(WRITE_SYNC, bh);
+	wait_on_buffer(bh);
+	if (unlikely(!buffer_uptodate(bh)))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+			  ext4_fsblk_t mmp_block)
+{
+	struct mmp_struct *mmp;
+
+	if (*bh)
+		clear_buffer_uptodate(*bh);
+
+	/* This would be sb_bread(sb, mmp_block), except we need to be sure
+	 * that the MD RAID device cache has been bypassed, and that the read
+	 * is not blocked in the elevator. */
+	if (!*bh)
+		*bh = sb_getblk(sb, mmp_block);
+	if (*bh) {
+		get_bh(*bh);
+		lock_buffer(*bh);
+		(*bh)->b_end_io = end_buffer_read_sync;
+		submit_bh(READ_SYNC, *bh);
+		wait_on_buffer(*bh);
+		if (!buffer_uptodate(*bh)) {
+			brelse(*bh);
+			*bh = NULL;
+		}
+	}
+	if (!*bh) {
+		ext4_warning(sb, "Error while reading MMP block %llu",
+		             mmp_block);
+		return -EIO;
+	}
+
+	mmp = (struct mmp_struct *)((*bh)->b_data);
+	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+		    const char *function, unsigned int line, const char *msg)
+{
+	__ext4_warning(sb, function, line, msg);
+	__ext4_warning(sb, function, line,
+		       "MMP failure info: last update time: %llu, last update "
+		       "node: %s, last update device: %s\n",
+		       (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+		       mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+	struct super_block *sb = ((struct mmpd_data *) data)->sb;
+	struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct mmp_struct *mmp;
+	ext4_fsblk_t mmp_block;
+	u32 seq = 0;
+	unsigned long failed_writes = 0;
+	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned mmp_check_interval;
+	unsigned long last_update_time;
+	unsigned long diff;
+	int retval;
+
+	mmp_block = le64_to_cpu(es->s_mmp_block);
+	mmp = (struct mmp_struct *)(bh->b_data);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+	/*
+	 * Start with the higher mmp_check_interval and reduce it if
+	 * the MMP block is being updated on time.
+	 */
+	mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+				 EXT4_MMP_MIN_CHECK_INTERVAL);
+	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+	memcpy(mmp->mmp_nodename, init_utsname()->sysname,
+	       sizeof(mmp->mmp_nodename));
+
+	while (!kthread_should_stop()) {
+		if (++seq > EXT4_MMP_SEQ_MAX)
+			seq = 1;
+
+		mmp->mmp_seq = cpu_to_le32(seq);
+		mmp->mmp_time = cpu_to_le64(get_seconds());
+		last_update_time = jiffies;
+
+		retval = write_mmp_block(bh);
+		/*
+		 * Don't spew too many error messages. Print one every
+		 * (s_mmp_update_interval * 60) seconds.
+		 */
+		if (retval && (failed_writes % 60) == 0) {
+			ext4_error(sb, "Error writing to MMP block");
+			failed_writes++;
+		}
+
+		if (!(le32_to_cpu(es->s_feature_incompat) &
+		    EXT4_FEATURE_INCOMPAT_MMP)) {
+			ext4_warning(sb, "kmmpd being stopped since MMP feature"
+				     " has been disabled.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		if (sb->s_flags & MS_RDONLY) {
+			ext4_warning(sb, "kmmpd being stopped since filesystem "
+				     "has been remounted as readonly.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		diff = jiffies - last_update_time;
+		if (diff < mmp_update_interval * HZ)
+			schedule_timeout_interruptible(mmp_update_interval *
+						       HZ - diff);
+
+		/*
+		 * We need to make sure that more than mmp_check_interval
+		 * seconds have not passed since writing. If that has happened
+		 * we need to check if the MMP block is as we left it.
+		 */
+		diff = jiffies - last_update_time;
+		if (diff > mmp_check_interval * HZ) {
+			struct buffer_head *bh_check = NULL;
+			struct mmp_struct *mmp_check;
+
+			retval = read_mmp_block(sb, &bh_check, mmp_block);
+			if (retval) {
+				ext4_error(sb, "error reading MMP data: %d",
+					   retval);
+
+				EXT4_SB(sb)->s_mmp_tsk = NULL;
+				goto failed;
+			}
+
+			mmp_check = (struct mmp_struct *)(bh_check->b_data);
+			if (mmp->mmp_seq != mmp_check->mmp_seq ||
+			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+				   sizeof(mmp->mmp_nodename))) {
+				dump_mmp_msg(sb, mmp_check,
+					     "Error while updating MMP info. "
+					     "The filesystem seems to have been"
+					     " multiply mounted.");
+				ext4_error(sb, "abort");
+				goto failed;
+			}
+			put_bh(bh_check);
+		}
+
+		 /*
+		 * Adjust the mmp_check_interval depending on how much time
+		 * it took for the MMP block to be written.
+		 */
+		mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
+					     EXT4_MMP_MAX_CHECK_INTERVAL),
+					 EXT4_MMP_MIN_CHECK_INTERVAL);
+		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	}
+
+	/*
+	 * Unmount seems to be clean.
+	 */
+	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+
+	retval = write_mmp_block(bh);
+
+failed:
+	kfree(data);
+	brelse(bh);
+	return retval;
+}
+
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+	u32 new_seq;
+
+	do {
+		get_random_bytes(&new_seq, sizeof(u32));
+	} while (new_seq > EXT4_MMP_SEQ_MAX);
+
+	return new_seq;
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ */
+static int ext4_multi_mount_protect(struct super_block *sb,
+				    ext4_fsblk_t mmp_block)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct buffer_head *bh = NULL;
+	struct mmp_struct *mmp = NULL;
+	struct mmpd_data *mmpd_data;
+	u32 seq;
+	unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned int wait_time = 0;
+	int retval;
+
+	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+	    mmp_block >= ext4_blocks_count(es)) {
+		ext4_warning(sb, "Invalid MMP block in superblock");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+
+	mmp = (struct mmp_struct *)(bh->b_data);
+
+	if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+		mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+	/*
+	 * If check_interval in MMP block is larger, use that instead of
+	 * update_interval from the superblock.
+	 */
+	if (mmp->mmp_check_interval > mmp_check_interval)
+		mmp_check_interval = mmp->mmp_check_interval;
+
+	seq = le32_to_cpu(mmp->mmp_seq);
+	if (seq == EXT4_MMP_SEQ_CLEAN)
+		goto skip;
+
+	if (seq == EXT4_MMP_SEQ_FSCK) {
+		dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+		goto failed;
+	}
+
+	wait_time = min(mmp_check_interval * 2 + 1,
+			mmp_check_interval + 60);
+
+	/* Print MMP interval if more than 20 secs. */
+	if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+		ext4_warning(sb, "MMP interval %u higher than expected, please"
+		             " wait.\n", wait_time * 2);
+
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+skip:
+	/*
+	 * write a new random sequence number.
+	 */
+	mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
+
+	retval = write_mmp_block(bh);
+	if (retval)
+		goto failed;
+
+	/*
+	 * wait for MMP interval and check mmp_seq.
+	 */
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+	mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+	if (!mmpd_data) {
+		ext4_warning(sb, "not enough memory for mmpd_data");
+		goto failed;
+	}
+	mmpd_data->sb = sb;
+	mmpd_data->bh = bh;
+
+	/*
+	 * Start a kernel thread to update the MMP block periodically.
+	 */
+	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+					     bdevname(bh->b_bdev,
+						      mmp->mmp_bdevname));
+	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+		EXT4_SB(sb)->s_mmp_tsk = NULL;
+		kfree(mmpd_data);
+		ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+			     sb->s_id);
+		goto failed;
+	}
+
+	return 0;
+
+failed:
+	brelse(bh);
+	return 1;
+}
+
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 					u64 ino, u32 generation)
 {
@@ -3432,6 +3779,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			  EXT4_HAS_INCOMPAT_FEATURE(sb,
 				    EXT4_FEATURE_INCOMPAT_RECOVER));
 
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+	    !(sb->s_flags & MS_RDONLY))
+		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+			goto failed_mount3;
+
 	/*
 	 * The first inode we look at is the journal inode.  Don't try
 	 * root first: it may be modified in the journal!
@@ -3682,6 +4034,8 @@ failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+	if (sbi->s_mmp_tsk)
+		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -4212,7 +4566,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int enable_quota = 0;
 	ext4_group_t g;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-	int err;
+	int err = 0;
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
@@ -4338,6 +4692,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				goto restore_opts;
 			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
+			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+						     EXT4_FEATURE_INCOMPAT_MMP))
+				if (ext4_multi_mount_protect(sb,
+						le64_to_cpu(es->s_mmp_block))) {
+					err = -EROFS;
+					goto restore_opts;
+				}
 			enable_quota = 1;
 		}
 	}
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] ext4: add support for multiple mount protection
  2011-04-12 19:20 ` Bernd Schubert
@ 2011-05-02 13:43   ` Johann Lombardi
  0 siblings, 0 replies; 14+ messages in thread
From: Johann Lombardi @ 2011-05-02 13:43 UTC (permalink / raw)
  To: Bernd Schubert; +Cc: linux-ext4, Andreas Dilger

Hi Bernd,

On Tue, Apr 12, 2011 at 09:20:03PM +0200, Bernd Schubert wrote:
> >+#define EXT4_MMP_UPDATE_INTERVAL   1
> 
> this define is nowhere used (e2fsprogs already sets the
> update-interval in the superblock).

Indeed, i have removed it.

> And it is also too small, which may cause a huge performance issue
> (please remember my corresponding lustre bugzilla...).

The patches i have just sent increase the update interval to 5s and decrease the multiplier used to compute the check interval from 5 to 2.

> Once you are going to post the e2fsprogs patch, could you please
> also take care of the 2TiB mmp-block read limitation bug (please see
> another Lustre bugzilla )?

The updated e2fsprogs & kernel patches use blk64_t/ext4_fsblk_t, so this should work now. I will give this a try ASAP.

> It also would be nice to explain somewhere (and not hidden in the
> code) what is the difference between "update interval" and "check
> interval".

Done.

Thanks for the feedback.

Cheers,
Johann

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] ext4: add support for multiple mount protection
@ 2011-05-14  0:38 Johann Lombardi
  2011-05-24 21:47 ` Ted Ts'o
  0 siblings, 1 reply; 14+ messages in thread
From: Johann Lombardi @ 2011-05-14  0:38 UTC (permalink / raw)
  To: linux-ext4; +Cc: Johann Lombardi, Andreas Dilger

Prevent an ext4 filesystem from being mounted multiple times.
A sequence number is stored on disk and is periodically updated (every 5
seconds by default) by a mounted filesystem.
At mount time, we now wait for s_mmp_update_interval seconds to make sure
that the MMP sequence does not change.
In case of failure, the nodename, bdevname and the time at which the MMP
block was last updated is displayed.
Move all mmp code to a dedicated file (mmp.c).

Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Johann Lombardi <johann@whamcloud.com>
---
 fs/ext4/Makefile |    3 +-
 fs/ext4/ext4.h   |   76 ++++++++++++-
 fs/ext4/mmp.c    |  351 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/super.c  |   18 +++-
 4 files changed, 444 insertions(+), 4 deletions(-)
 create mode 100644 fs/ext4/mmp.c

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36..0410946 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b..5e1ce1d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1028,7 +1028,7 @@ struct ext4_super_block {
 	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1201,6 +1201,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1357,7 +1360,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1615,6 +1619,67 @@ struct ext4_features {
 };
 
 /*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+	__le32	mmp_magic;		/* Magic number for MMP */
+	__le32	mmp_seq;		/* Sequence no. updated periodically */
+
+	/*
+	 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+	 * purposes and do not affect the correctness of the algorithm
+	 */
+	__le64	mmp_time;		/* Time last updated */
+	char	mmp_nodename[64];	/* Node which last updated MMP block */
+	char	mmp_bdevname[32];	/* Bdev which last updated MMP block */
+
+	/*
+	 * mmp_check_interval is used to verify if the MMP block has been
+	 * updated on the block device. The value is updated based on the
+	 * maximum time to write the MMP block during an update cycle.
+	 */
+	__le16	mmp_check_interval;
+
+	__le16	mmp_pad1;
+	__le32	mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+	struct buffer_head *bh; /* bh from initial read_mmp_block() */
+	struct super_block *sb;  /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT		2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
+
+/*
  * Function prototypes
  */
 
@@ -1788,6 +1853,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 						       __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+						       __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
@@ -2092,6 +2161,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       int len,
 			       struct writeback_control *wbc);
 
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 0000000..0832fd3
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/utsname.h>
+#include <linux/kthread.h>
+
+#include "ext4.h"
+
+/*
+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * faster.
+ */
+static int write_mmp_block(struct buffer_head *bh)
+{
+	mark_buffer_dirty(bh);
+	lock_buffer(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	get_bh(bh);
+	submit_bh(WRITE_SYNC, bh);
+	wait_on_buffer(bh);
+	if (unlikely(!buffer_uptodate(bh)))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+			  ext4_fsblk_t mmp_block)
+{
+	struct mmp_struct *mmp;
+
+	if (*bh)
+		clear_buffer_uptodate(*bh);
+
+	/* This would be sb_bread(sb, mmp_block), except we need to be sure
+	 * that the MD RAID device cache has been bypassed, and that the read
+	 * is not blocked in the elevator. */
+	if (!*bh)
+		*bh = sb_getblk(sb, mmp_block);
+	if (*bh) {
+		get_bh(*bh);
+		lock_buffer(*bh);
+		(*bh)->b_end_io = end_buffer_read_sync;
+		submit_bh(READ_SYNC, *bh);
+		wait_on_buffer(*bh);
+		if (!buffer_uptodate(*bh)) {
+			brelse(*bh);
+			*bh = NULL;
+		}
+	}
+	if (!*bh) {
+		ext4_warning(sb, "Error while reading MMP block %llu",
+		             mmp_block);
+		return -EIO;
+	}
+
+	mmp = (struct mmp_struct *)((*bh)->b_data);
+	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+		    const char *function, unsigned int line, const char *msg)
+{
+	__ext4_warning(sb, function, line, msg);
+	__ext4_warning(sb, function, line,
+		       "MMP failure info: last update time: %llu, last update "
+		       "node: %s, last update device: %s\n",
+		       (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+		       mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+	struct super_block *sb = ((struct mmpd_data *) data)->sb;
+	struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct mmp_struct *mmp;
+	ext4_fsblk_t mmp_block;
+	u32 seq = 0;
+	unsigned long failed_writes = 0;
+	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned mmp_check_interval;
+	unsigned long last_update_time;
+	unsigned long diff;
+	int retval;
+
+	mmp_block = le64_to_cpu(es->s_mmp_block);
+	mmp = (struct mmp_struct *)(bh->b_data);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+	/*
+	 * Start with the higher mmp_check_interval and reduce it if
+	 * the MMP block is being updated on time.
+	 */
+	mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+				 EXT4_MMP_MIN_CHECK_INTERVAL);
+	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+	memcpy(mmp->mmp_nodename, init_utsname()->sysname,
+	       sizeof(mmp->mmp_nodename));
+
+	while (!kthread_should_stop()) {
+		if (++seq > EXT4_MMP_SEQ_MAX)
+			seq = 1;
+
+		mmp->mmp_seq = cpu_to_le32(seq);
+		mmp->mmp_time = cpu_to_le64(get_seconds());
+		last_update_time = jiffies;
+
+		retval = write_mmp_block(bh);
+		/*
+		 * Don't spew too many error messages. Print one every
+		 * (s_mmp_update_interval * 60) seconds.
+		 */
+		if (retval && (failed_writes % 60) == 0) {
+			ext4_error(sb, "Error writing to MMP block");
+			failed_writes++;
+		}
+
+		if (!(le32_to_cpu(es->s_feature_incompat) &
+		    EXT4_FEATURE_INCOMPAT_MMP)) {
+			ext4_warning(sb, "kmmpd being stopped since MMP feature"
+				     " has been disabled.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		if (sb->s_flags & MS_RDONLY) {
+			ext4_warning(sb, "kmmpd being stopped since filesystem "
+				     "has been remounted as readonly.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		diff = jiffies - last_update_time;
+		if (diff < mmp_update_interval * HZ)
+			schedule_timeout_interruptible(mmp_update_interval *
+						       HZ - diff);
+
+		/*
+		 * We need to make sure that more than mmp_check_interval
+		 * seconds have not passed since writing. If that has happened
+		 * we need to check if the MMP block is as we left it.
+		 */
+		diff = jiffies - last_update_time;
+		if (diff > mmp_check_interval * HZ) {
+			struct buffer_head *bh_check = NULL;
+			struct mmp_struct *mmp_check;
+
+			retval = read_mmp_block(sb, &bh_check, mmp_block);
+			if (retval) {
+				ext4_error(sb, "error reading MMP data: %d",
+					   retval);
+
+				EXT4_SB(sb)->s_mmp_tsk = NULL;
+				goto failed;
+			}
+
+			mmp_check = (struct mmp_struct *)(bh_check->b_data);
+			if (mmp->mmp_seq != mmp_check->mmp_seq ||
+			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+				   sizeof(mmp->mmp_nodename))) {
+				dump_mmp_msg(sb, mmp_check,
+					     "Error while updating MMP info. "
+					     "The filesystem seems to have been"
+					     " multiply mounted.");
+				ext4_error(sb, "abort");
+				goto failed;
+			}
+			put_bh(bh_check);
+		}
+
+		 /*
+		 * Adjust the mmp_check_interval depending on how much time
+		 * it took for the MMP block to be written.
+		 */
+		mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
+					     EXT4_MMP_MAX_CHECK_INTERVAL),
+					 EXT4_MMP_MIN_CHECK_INTERVAL);
+		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	}
+
+	/*
+	 * Unmount seems to be clean.
+	 */
+	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+
+	retval = write_mmp_block(bh);
+
+failed:
+	kfree(data);
+	brelse(bh);
+	return retval;
+}
+
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+	u32 new_seq;
+
+	do {
+		get_random_bytes(&new_seq, sizeof(u32));
+	} while (new_seq > EXT4_MMP_SEQ_MAX);
+
+	return new_seq;
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ */
+int ext4_multi_mount_protect(struct super_block *sb,
+				    ext4_fsblk_t mmp_block)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct buffer_head *bh = NULL;
+	struct mmp_struct *mmp = NULL;
+	struct mmpd_data *mmpd_data;
+	u32 seq;
+	unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned int wait_time = 0;
+	int retval;
+
+	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+	    mmp_block >= ext4_blocks_count(es)) {
+		ext4_warning(sb, "Invalid MMP block in superblock");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+
+	mmp = (struct mmp_struct *)(bh->b_data);
+
+	if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+		mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+	/*
+	 * If check_interval in MMP block is larger, use that instead of
+	 * update_interval from the superblock.
+	 */
+	if (mmp->mmp_check_interval > mmp_check_interval)
+		mmp_check_interval = mmp->mmp_check_interval;
+
+	seq = le32_to_cpu(mmp->mmp_seq);
+	if (seq == EXT4_MMP_SEQ_CLEAN)
+		goto skip;
+
+	if (seq == EXT4_MMP_SEQ_FSCK) {
+		dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+		goto failed;
+	}
+
+	wait_time = min(mmp_check_interval * 2 + 1,
+			mmp_check_interval + 60);
+
+	/* Print MMP interval if more than 20 secs. */
+	if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+		ext4_warning(sb, "MMP interval %u higher than expected, please"
+		             " wait.\n", wait_time * 2);
+
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+skip:
+	/*
+	 * write a new random sequence number.
+	 */
+	mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
+
+	retval = write_mmp_block(bh);
+	if (retval)
+		goto failed;
+
+	/*
+	 * wait for MMP interval and check mmp_seq.
+	 */
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+	mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+	if (!mmpd_data) {
+		ext4_warning(sb, "not enough memory for mmpd_data");
+		goto failed;
+	}
+	mmpd_data->sb = sb;
+	mmpd_data->bh = bh;
+
+	/*
+	 * Start a kernel thread to update the MMP block periodically.
+	 */
+	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+					     bdevname(bh->b_bdev,
+						      mmp->mmp_bdevname));
+	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+		EXT4_SB(sb)->s_mmp_tsk = NULL;
+		kfree(mmpd_data);
+		ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+			     sb->s_id);
+		goto failed;
+	}
+
+	return 0;
+
+failed:
+	brelse(bh);
+	return 1;
+}
+
+
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8553dfb..c23d328 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -806,6 +806,8 @@ static void ext4_put_super(struct super_block *sb)
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
+	if (sbi->s_mmp_tsk)
+		kthread_stop(sbi->s_mmp_tsk);
 	sb->s_fs_info = NULL;
 	/*
 	 * Now that we are completely done shutting down the
@@ -3459,6 +3461,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			  EXT4_HAS_INCOMPAT_FEATURE(sb,
 				    EXT4_FEATURE_INCOMPAT_RECOVER));
 
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+	    !(sb->s_flags & MS_RDONLY))
+		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+			goto failed_mount3;
+
 	/*
 	 * The first inode we look at is the journal inode.  Don't try
 	 * root first: it may be modified in the journal!
@@ -3707,6 +3714,8 @@ failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+	if (sbi->s_mmp_tsk)
+		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4251,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int enable_quota = 0;
 	ext4_group_t g;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-	int err;
+	int err = 0;
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
@@ -4368,6 +4377,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				goto restore_opts;
 			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
+			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+						     EXT4_FEATURE_INCOMPAT_MMP))
+				if (ext4_multi_mount_protect(sb,
+						le64_to_cpu(es->s_mmp_block))) {
+					err = -EROFS;
+					goto restore_opts;
+				}
 			enable_quota = 1;
 		}
 	}
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] ext4: add support for multiple mount protection
  2011-05-02  9:36 ` [PATCH] ext4: add support for multiple mount protection Johann Lombardi
@ 2011-05-23  2:19   ` Ted Ts'o
  0 siblings, 0 replies; 14+ messages in thread
From: Ted Ts'o @ 2011-05-23  2:19 UTC (permalink / raw)
  To: Johann Lombardi; +Cc: linux-ext4, Andreas Dilger

On Mon, May 02, 2011 at 11:36:36AM +0200, Johann Lombardi wrote:
> Prevent an ext4 filesystem from being mounted multiple times.
> A sequence number is stored on disk and is periodically updated (every 5
> seconds by default) by a mounted filesystem.
> At mount time, we now wait for s_mmp_update_interval seconds to make sure
> that the MMP sequence does not change.
> In case of failure, the nodename, bdevname and the time at which the MMP
> block was last updated is displayed.
> 
> Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
> Signed-off-by: Johann Lombardi <johann@whamcloud.com>

Added to the ext4 tree, thanks.

					- Ted

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] ext4: add support for multiple mount protection
  2011-05-14  0:38 Johann Lombardi
@ 2011-05-24 21:47 ` Ted Ts'o
  0 siblings, 0 replies; 14+ messages in thread
From: Ted Ts'o @ 2011-05-24 21:47 UTC (permalink / raw)
  To: Johann Lombardi; +Cc: linux-ext4, Andreas Dilger

On Fri, May 13, 2011 at 05:38:05PM -0700, Johann Lombardi wrote:
> Prevent an ext4 filesystem from being mounted multiple times.
> A sequence number is stored on disk and is periodically updated (every 5
> seconds by default) by a mounted filesystem.
> At mount time, we now wait for s_mmp_update_interval seconds to make sure
> that the MMP sequence does not change.
> In case of failure, the nodename, bdevname and the time at which the MMP
> block was last updated is displayed.
> Move all mmp code to a dedicated file (mmp.c).
> 
> Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
> Signed-off-by: Johann Lombardi <johann@whamcloud.com>

Oops, I see I pulled the wrong (older) version of this patch into my
dev tree.  I've corrected this and pulled in this version.  Thanks!!

    	   		       	   	       - Ted

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2011-05-24 21:47 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-05-02  8:11 MMP update Johann Lombardi
2011-05-02  8:13 ` [PATCH] Add multi-mount protection support to libext2fs (INCOMPAT_MMP feature) Johann Lombardi
2011-05-02  9:36 ` [PATCH] ext4: add support for multiple mount protection Johann Lombardi
2011-05-23  2:19   ` Ted Ts'o
  -- strict thread matches above, loose matches on Subject: below --
2011-05-14  0:38 Johann Lombardi
2011-05-24 21:47 ` Ted Ts'o
2011-04-12 18:04 Johann Lombardi
2011-04-12 19:20 ` Bernd Schubert
2011-05-02 13:43   ` Johann Lombardi
2011-04-12 20:39 ` Eric Sandeen
2011-04-12 21:08   ` Andreas Dilger
2011-04-12 21:11   ` Johann Lombardi
2011-04-12 21:41   ` Bernd Schubert
2011-04-12 21:44     ` Eric Sandeen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).