linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush
@ 2021-05-18 15:13 Leah Rumancik
  2021-05-18 15:13 ` [PATCH v5 2/3] ext4: add ioctl EXT4_IOC_CHECKPOINT Leah Rumancik
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Leah Rumancik @ 2021-05-18 15:13 UTC (permalink / raw)
  To: linux-ext4; +Cc: tytso, Leah Rumancik

Add a flags argument to jbd2_journal_flush to enable discarding or
zero-filling the journal blocks while flushing the journal.

Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>

Changes in v4:
- restructured code division between patches
- changed jbd2_journal_flush flags arg from bool to unsigned long long

Changes in v5:
- changed jbd2_journal_flush flags to unsigned int
- changed name of jbd2_journal_flush flags from JBD2_ERASE* to
JBD2_JOURNAL_FLUSH*
- cleaned up loop in jbd2_journal_erase which finds contiguous regions
- updated flag checking
---
 fs/ext4/inode.c      |   4 +-
 fs/ext4/ioctl.c      |   6 +--
 fs/ext4/super.c      |   6 +--
 fs/jbd2/journal.c    | 119 +++++++++++++++++++++++++++++++++++++++++--
 fs/ocfs2/alloc.c     |   2 +-
 fs/ocfs2/journal.c   |   8 +--
 include/linux/jbd2.h |   6 ++-
 7 files changed, 134 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fe6045a46599..f44800361a38 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3223,7 +3223,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
 		journal = EXT4_JOURNAL(inode);
 		jbd2_journal_lock_updates(journal);
-		err = jbd2_journal_flush(journal);
+		err = jbd2_journal_flush(journal, 0);
 		jbd2_journal_unlock_updates(journal);
 
 		if (err)
@@ -6005,7 +6005,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	if (val)
 		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	else {
-		err = jbd2_journal_flush(journal);
+		err = jbd2_journal_flush(journal, 0);
 		if (err < 0) {
 			jbd2_journal_unlock_updates(journal);
 			percpu_up_write(&sbi->s_writepages_rwsem);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0796bfa72829..d5512e17a13f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -762,7 +762,7 @@ static long ext4_ioctl_group_add(struct file *file,
 	err = ext4_group_add(sb, input);
 	if (EXT4_SB(sb)->s_journal) {
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	}
 	if (err == 0)
@@ -945,7 +945,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
 		if (EXT4_SB(sb)->s_journal) {
 			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
 			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 		}
 		if (err == 0)
@@ -1088,7 +1088,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (EXT4_SB(sb)->s_journal) {
 			ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE);
 			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
 			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 		}
 		if (err == 0)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 886e0d518668..fe3ae750d83a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5639,7 +5639,7 @@ static int ext4_mark_recovery_complete(struct super_block *sb,
 		return 0;
 	}
 	jbd2_journal_lock_updates(journal);
-	err = jbd2_journal_flush(journal);
+	err = jbd2_journal_flush(journal, 0);
 	if (err < 0)
 		goto out;
 
@@ -5781,7 +5781,7 @@ static int ext4_freeze(struct super_block *sb)
 		 * Don't clear the needs_recovery flag if we failed to
 		 * flush the journal.
 		 */
-		error = jbd2_journal_flush(journal);
+		error = jbd2_journal_flush(journal, 0);
 		if (error < 0)
 			goto out;
 
@@ -6379,7 +6379,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		 * otherwise be livelocked...
 		 */
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 		if (err)
 			return err;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2dc944442802..521ce41c242c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1686,6 +1686,110 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
 	write_unlock(&journal->j_state_lock);
 }
 
+/**
+ * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock)
+ * @journal: The journal to erase.
+ * @flags: A discard/zeroout request is sent for each physically contigous
+ *	region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or
+ *	JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation
+ *	to perform.
+ *
+ * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes
+ * will be explicitly written if no hardware offload is available, see
+ * blkdev_issue_zeroout for more details.
+ */
+static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
+{
+	int err = 0;
+	unsigned long block, log_offset; /* logical */
+	unsigned long long phys_block, block_start, block_stop; /* physical */
+	loff_t byte_start, byte_stop, byte_count;
+	struct request_queue *q = bdev_get_queue(journal->j_dev);
+
+	/* flags must be set to either discard or zeroout */
+	if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags ||
+			((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+			(flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
+		return -EINVAL;
+
+	if (!q)
+		return -ENXIO;
+
+	if (JBD2_JOURNAL_FLUSH_DISCARD & !blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	/*
+	 * lookup block mapping and issue discard/zeroout for each
+	 * contiguous region
+	 */
+	log_offset = be32_to_cpu(journal->j_superblock->s_first);
+	block_start =  ~0ULL;
+	for (block = log_offset; block < journal->j_total_len; block++) {
+		err = jbd2_journal_bmap(journal, block, &phys_block);
+		if (err) {
+			pr_err("JBD2: bad block at offset %lu", block);
+			return err;
+		}
+
+		if (block_start == ~0ULL) {
+			block_start = phys_block;
+			block_stop = block_start - 1;
+		}
+
+		/*
+		 * last block not contiguous with current block,
+		 * process last contiguous region and return to this block on
+		 * next loop
+		 */
+		if (phys_block != block_stop + 1) {
+			block--;
+		} else {
+			block_stop++;
+			/*
+			 * if this isn't the last block of journal,
+			 * no need to process now because next block may also
+			 * be part of this contiguous region
+			 */
+			if (block != journal->j_total_len - 1)
+				continue;
+		}
+
+		/*
+		 * end of contiguous region or this is last block of journal,
+		 * take care of the region
+		 */
+		byte_start = block_start * journal->j_blocksize;
+		byte_stop = block_stop * journal->j_blocksize;
+		byte_count = (block_stop - block_start + 1) *
+				journal->j_blocksize;
+
+		truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
+				byte_start, byte_stop);
+
+		if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
+			err = blkdev_issue_discard(journal->j_dev,
+					byte_start >> SECTOR_SHIFT,
+					byte_count >> SECTOR_SHIFT,
+					GFP_NOFS, 0);
+		} else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) {
+			err = blkdev_issue_zeroout(journal->j_dev,
+					byte_start >> SECTOR_SHIFT,
+					byte_count >> SECTOR_SHIFT,
+					GFP_NOFS, 0);
+		}
+
+		if (unlikely(err != 0)) {
+			pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
+					err, block_start, block_stop);
+			return err;
+		}
+
+		/* reset start and stop after processing a region */
+		block_start = ~0ULL;
+	}
+
+	return blkdev_issue_flush(journal->j_dev);
+}
 
 /**
  * jbd2_journal_update_sb_errno() - Update error in the journal.
@@ -2246,13 +2350,18 @@ EXPORT_SYMBOL(jbd2_journal_clear_features);
 /**
  * jbd2_journal_flush() - Flush journal
  * @journal: Journal to act on.
+ * @flags: optional operation on the journal blocks after the flush (see below)
  *
  * Flush all data for a given journal to disk and empty the journal.
  * Filesystems can use this when remounting readonly to ensure that
- * recovery does not need to happen on remount.
+ * recovery does not need to happen on remount. Optionally, a discard or zeroout
+ * can be issued on the journal blocks after flushing.
+ *
+ * flags:
+ *	JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks
+ *	JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks
  */
-
-int jbd2_journal_flush(journal_t *journal)
+int jbd2_journal_flush(journal_t *journal, unsigned int flags)
 {
 	int err = 0;
 	transaction_t *transaction = NULL;
@@ -2306,6 +2415,10 @@ int jbd2_journal_flush(journal_t *journal)
 	 * commits of data to the journal will restore the current
 	 * s_start value. */
 	jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+
+	if (flags)
+		err = __jbd2_journal_erase(journal, flags);
+
 	mutex_unlock(&journal->j_checkpoint_mutex);
 	write_lock(&journal->j_state_lock);
 	J_ASSERT(!journal->j_running_transaction);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 78710788c237..1b41bf9f4a7e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6020,7 +6020,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 	 * Then truncate log will be replayed resulting in cluster double free.
 	 */
 	jbd2_journal_lock_updates(journal->j_journal);
-	status = jbd2_journal_flush(journal->j_journal);
+	status = jbd2_journal_flush(journal->j_journal, 0);
 	jbd2_journal_unlock_updates(journal->j_journal);
 	if (status < 0) {
 		mlog_errno(status);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index db52e843002a..a1438548747e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -310,7 +310,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 	}
 
 	jbd2_journal_lock_updates(journal->j_journal);
-	status = jbd2_journal_flush(journal->j_journal);
+	status = jbd2_journal_flush(journal->j_journal, 0);
 	jbd2_journal_unlock_updates(journal->j_journal);
 	if (status < 0) {
 		up_write(&journal->j_trans_barrier);
@@ -1002,7 +1002,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
 	if (ocfs2_mount_local(osb)) {
 		jbd2_journal_lock_updates(journal->j_journal);
-		status = jbd2_journal_flush(journal->j_journal);
+		status = jbd2_journal_flush(journal->j_journal, 0);
 		jbd2_journal_unlock_updates(journal->j_journal);
 		if (status < 0)
 			mlog_errno(status);
@@ -1072,7 +1072,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 
 	if (replayed) {
 		jbd2_journal_lock_updates(journal->j_journal);
-		status = jbd2_journal_flush(journal->j_journal);
+		status = jbd2_journal_flush(journal->j_journal, 0);
 		jbd2_journal_unlock_updates(journal->j_journal);
 		if (status < 0)
 			mlog_errno(status);
@@ -1668,7 +1668,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 
 	/* wipe the journal */
 	jbd2_journal_lock_updates(journal);
-	status = jbd2_journal_flush(journal);
+	status = jbd2_journal_flush(journal, 0);
 	jbd2_journal_unlock_updates(journal);
 	if (status < 0)
 		mlog_errno(status);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index db0e1920cb12..8543233b0388 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1370,6 +1370,10 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
 						 * mode */
 #define JBD2_FAST_COMMIT_ONGOING	0x100	/* Fast commit is ongoing */
 #define JBD2_FULL_COMMIT_ONGOING	0x200	/* Full commit is ongoing */
+#define JBD2_JOURNAL_FLUSH_DISCARD	0x0001
+#define JBD2_JOURNAL_FLUSH_ZEROOUT	0x0002
+#define JBD2_JOURNAL_FLUSH_VALID	(JBD2_JOURNAL_FLUSH_DISCARD | \
+					JBD2_JOURNAL_FLUSH_ZEROOUT)
 
 /*
  * Function declarations for the journaling transaction and buffer
@@ -1500,7 +1504,7 @@ extern int	 jbd2_journal_invalidatepage(journal_t *,
 				struct page *, unsigned int, unsigned int);
 extern int	 jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
 extern int	 jbd2_journal_stop(handle_t *);
-extern int	 jbd2_journal_flush (journal_t *);
+extern int	 jbd2_journal_flush(journal_t *journal, unsigned int flags);
 extern void	 jbd2_journal_lock_updates (journal_t *);
 extern void	 jbd2_journal_unlock_updates (journal_t *);
 
-- 
2.31.1.751.gd2f1c929bd-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v5 2/3] ext4: add ioctl EXT4_IOC_CHECKPOINT
  2021-05-18 15:13 [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush Leah Rumancik
@ 2021-05-18 15:13 ` Leah Rumancik
  2021-06-23  1:38   ` Theodore Ts'o
  2021-05-18 15:13 ` [PATCH v5 3/3] ext4: update journal documentation Leah Rumancik
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 6+ messages in thread
From: Leah Rumancik @ 2021-05-18 15:13 UTC (permalink / raw)
  To: linux-ext4; +Cc: tytso, Leah Rumancik

ioctl EXT4_IOC_CHECKPOINT checkpoints and flushes the journal. This
includes forcing all the transactions to the log, checkpointing the
transactions, and flushing the log to disk. This ioctl takes u32 "flags"
as an argument. Three flags are supported. EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN
can be used to verify input to the ioctl. It returns error if there is any
invalid input, otherwise it returns success without performing
any checkpointing. The other two flags, EXT4_IOC_CHECKPOINT_FLAG_DISCARD
and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT, can be used to issue requests to
discard or zeroout the journal logs blocks, respectively. At this
point, EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT is primarily added to enable
testing of this codepath on devices that don't support discard.
EXT4_IOC_CHECKPOINT_FLAG_DISCARD and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT
cannot both be set.

Systems that wish to achieve content deletion SLO can set up a daemon
that calls this ioctl at a regular interval such that it matches with the
SLO requirement. Thus, with this patch, the ext4_dir_entry2 wipeout
patch[1], and the Ext4 "-o discard" mount option set, Ext4 can now
guarantee that all file contents, file metatdata, and filenames will not
be accessible through the filesystem and will have had discard or
zeroout requests issued for corresponding device blocks.

The __jbd2_journal_erase function could also be used to discard or
zero-fill the journal during journal load after recovery. This would
provide a potential solution to a journal replay bug reported earlier this
year[2]. After a successful journal recovery, e2fsck can call this ioctl to
discard the journal as well.

[1] https://lore.kernel.org/linux-ext4/YIHknqxngB1sUdie@mit.edu/
[2] https://lore.kernel.org/linux-ext4/YDZoaacIYStFQT8g@mit.edu/

Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>

Changes in v4:
- update commit description
- update error codes
- update code formatting
- add flag EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN
- add flag EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT

Changes in v5:
- update error checking
- make DRY_RUN include checks on input
- added info about DRY_RUN in commit
- added explicit conversion from ioctl flags to jbd2 flags
---
 fs/ext4/ext4.h  |  9 +++++++++
 fs/ext4/ioctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a29660db86ac..5aa203534997 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -729,6 +729,7 @@ enum {
 #define EXT4_IOC_CLEAR_ES_CACHE		_IO('f', 40)
 #define EXT4_IOC_GETSTATE		_IOW('f', 41, __u32)
 #define EXT4_IOC_GET_ES_CACHE		_IOWR('f', 42, struct fiemap)
+#define EXT4_IOC_CHECKPOINT		_IOW('f', 43, __u32)
 
 #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
 
@@ -750,6 +751,14 @@ enum {
 #define EXT4_STATE_FLAG_NEWENTRY	0x00000004
 #define EXT4_STATE_FLAG_DA_ALLOC_CLOSE	0x00000008
 
+/* flags for ioctl EXT4_IOC_CHECKPOINT */
+#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD	0x1
+#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT	0x2
+#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN	0x4
+#define EXT4_IOC_CHECKPOINT_FLAG_VALID		(EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \
+						EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \
+						EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d5512e17a13f..d25eaec1afdc 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -818,6 +818,47 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
 	return error;
 }
 
+static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
+{
+	int err = 0;
+	__u32 flags = 0;
+	unsigned int flush_flags = 0;
+	struct super_block *sb = file_inode(filp)->i_sb;
+
+	if (copy_from_user(&flags, (__u32 __user *)arg,
+				sizeof(__u32)))
+		return -EFAULT;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* check for invalid bits set */
+	if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) ||
+				((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+				(flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
+		return -EINVAL;
+
+	if (!EXT4_SB(sb)->s_journal)
+		return -ENODEV;
+
+	if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+		return 0;
+
+	if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD)
+		flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD;
+
+	if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) {
+		flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT;
+		pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow");
+	}
+
+	jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+	err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags);
+	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+
+	return err;
+}
+
 static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -1325,6 +1366,9 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return fsverity_ioctl_read_metadata(filp,
 						    (const void __user *)arg);
 
+	case EXT4_IOC_CHECKPOINT:
+		return ext4_ioctl_checkpoint(filp, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -1413,6 +1457,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_GET_ES_CACHE:
 	case FS_IOC_FSGETXATTR:
 	case FS_IOC_FSSETXATTR:
+	case EXT4_IOC_CHECKPOINT:
 		break;
 	default:
 		return -ENOIOCTLCMD;
-- 
2.31.1.751.gd2f1c929bd-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v5 3/3] ext4: update journal documentation
  2021-05-18 15:13 [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush Leah Rumancik
  2021-05-18 15:13 ` [PATCH v5 2/3] ext4: add ioctl EXT4_IOC_CHECKPOINT Leah Rumancik
@ 2021-05-18 15:13 ` Leah Rumancik
  2021-06-03 18:00 ` [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush Theodore Ts'o
  2021-06-23  1:36 ` Theodore Ts'o
  3 siblings, 0 replies; 6+ messages in thread
From: Leah Rumancik @ 2021-05-18 15:13 UTC (permalink / raw)
  To: linux-ext4; +Cc: tytso, Leah Rumancik

Add a section about journal checkpointing, including information about
the ioctl EXT4_IOC_CHECKPOINT which can be used to trigger a journal
checkpoint from userspace.

Also, update the journal allocation information to reflect that up to
10240000 blocks are used for the journal and that the journal is not
necessarily contiguous.

Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>

Changes in v5:
- clarify behavior of DRY_RUN flag
---
 Documentation/filesystems/ext4/journal.rst | 39 +++++++++++++++++-----
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/Documentation/filesystems/ext4/journal.rst b/Documentation/filesystems/ext4/journal.rst
index cdbfec473167..5fad38860f17 100644
--- a/Documentation/filesystems/ext4/journal.rst
+++ b/Documentation/filesystems/ext4/journal.rst
@@ -4,14 +4,14 @@ Journal (jbd2)
 --------------
 
 Introduced in ext3, the ext4 filesystem employs a journal to protect the
-filesystem against corruption in the case of a system crash. A small
-continuous region of disk (default 128MiB) is reserved inside the
-filesystem as a place to land “important” data writes on-disk as quickly
-as possible. Once the important data transaction is fully written to the
-disk and flushed from the disk write cache, a record of the data being
-committed is also written to the journal. At some later point in time,
-the journal code writes the transactions to their final locations on
-disk (this could involve a lot of seeking or a lot of small
+filesystem against metadata inconsistencies in the case of a system crash. Up
+to 10,240,000 file system blocks (see man mke2fs(8) for more details on journal
+size limits) can be reserved inside the filesystem as a place to land
+“important” data writes on-disk as quickly as possible. Once the important
+data transaction is fully written to the disk and flushed from the disk write
+cache, a record of the data being committed is also written to the journal. At
+some later point in time, the journal code writes the transactions to their
+final locations on disk (this could involve a lot of seeking or a lot of small
 read-write-erases) before erasing the commit record. Should the system
 crash during the second slow write, the journal can be replayed all the
 way to the latest commit record, guaranteeing the atomicity of whatever
@@ -731,3 +731,26 @@ point, the refcount for inode 11 is not reliable, but that gets fixed by the
 replay of last inode 11 tag. Thus, by converting a non-idempotent procedure
 into a series of idempotent outcomes, fast commits ensured idempotence during
 the replay.
+
+Journal Checkpoint
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Checkpointing the journal ensures all transactions and their associated buffers
+are submitted to the disk. In-progress transactions are waited upon and included
+in the checkpoint. Checkpointing is used internally during critical updates to
+the filesystem including journal recovery, filesystem resizing, and freeing of
+the journal_t structure.
+
+A journal checkpoint can be triggered from userspace via the ioctl
+EXT4_IOC_CHECKPOINT. This ioctl takes a single, u64 argument for flags.
+Currently, three flags are supported. First, EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN
+can be used to verify input to the ioctl. It returns error if there is any
+invalid input, otherwise it returns success without performing
+any checkpointing. This can be used to check whether the ioctl exists on a
+system and to verify there are no issues with arguments or flags. The
+other two flags are EXT4_IOC_CHECKPOINT_FLAG_DISCARD and
+EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT. These flags cause the journal blocks to be
+discarded or zero-filled, respectively, after the journal checkpoint is
+complete. EXT4_IOC_CHECKPOINT_FLAG_DISCARD and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT
+cannot both be set. The ioctl may be useful when snapshotting a system or for
+complying with content deletion SLOs.
-- 
2.31.1.751.gd2f1c929bd-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush
  2021-05-18 15:13 [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush Leah Rumancik
  2021-05-18 15:13 ` [PATCH v5 2/3] ext4: add ioctl EXT4_IOC_CHECKPOINT Leah Rumancik
  2021-05-18 15:13 ` [PATCH v5 3/3] ext4: update journal documentation Leah Rumancik
@ 2021-06-03 18:00 ` Theodore Ts'o
  2021-06-23  1:36 ` Theodore Ts'o
  3 siblings, 0 replies; 6+ messages in thread
From: Theodore Ts'o @ 2021-06-03 18:00 UTC (permalink / raw)
  To: Leah Rumancik; +Cc: linux-ext4

On Tue, May 18, 2021 at 03:13:25PM +0000, Leah Rumancik wrote:
> Add a flags argument to jbd2_journal_flush to enable discarding or
> zero-filling the journal blocks while flushing the journal.
> 
> Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>

Thanks, I've applied this series.

					- Ted

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush
  2021-05-18 15:13 [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush Leah Rumancik
                   ` (2 preceding siblings ...)
  2021-06-03 18:00 ` [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush Theodore Ts'o
@ 2021-06-23  1:36 ` Theodore Ts'o
  3 siblings, 0 replies; 6+ messages in thread
From: Theodore Ts'o @ 2021-06-23  1:36 UTC (permalink / raw)
  To: Leah Rumancik; +Cc: linux-ext4

On Tue, May 18, 2021 at 03:13:25PM +0000, Leah Rumancik wrote:
> Add a flags argument to jbd2_journal_flush to enable discarding or
> zero-filling the journal blocks while flushing the journal.
> 
> Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
> 
> Changes in v4:
> - restructured code division between patches
> - changed jbd2_journal_flush flags arg from bool to unsigned long long
> 
> Changes in v5:
> - changed jbd2_journal_flush flags to unsigned int
> - changed name of jbd2_journal_flush flags from JBD2_ERASE* to
> JBD2_JOURNAL_FLUSH*
> - cleaned up loop in jbd2_journal_erase which finds contiguous regions
> - updated flag checking
> ---

I noticed a minor issue in this commit, and so I've made the following
change to this commit in the ext4 tree.

	       	       	      	     	  - Ted

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 521ce41c242c..3a2ed60ea8b7 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1715,7 +1715,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
 	if (!q)
 		return -ENXIO;
 
-	if (JBD2_JOURNAL_FLUSH_DISCARD & !blk_queue_discard(q))
+	if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
 		return -EOPNOTSUPP;
 
 	/*

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v5 2/3] ext4: add ioctl EXT4_IOC_CHECKPOINT
  2021-05-18 15:13 ` [PATCH v5 2/3] ext4: add ioctl EXT4_IOC_CHECKPOINT Leah Rumancik
@ 2021-06-23  1:38   ` Theodore Ts'o
  0 siblings, 0 replies; 6+ messages in thread
From: Theodore Ts'o @ 2021-06-23  1:38 UTC (permalink / raw)
  To: Leah Rumancik; +Cc: linux-ext4

On Tue, May 18, 2021 at 03:13:26PM +0000, Leah Rumancik wrote:
> ioctl EXT4_IOC_CHECKPOINT checkpoints and flushes the journal. This
> includes forcing all the transactions to the log, checkpointing the
> transactions, and flushing the log to disk. This ioctl takes u32 "flags"
> as an argument. Three flags are supported. EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN
> can be used to verify input to the ioctl. It returns error if there is any
> invalid input, otherwise it returns success without performing
> any checkpointing. The other two flags, EXT4_IOC_CHECKPOINT_FLAG_DISCARD
> and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT, can be used to issue requests to
> discard or zeroout the journal logs blocks, respectively. At this
> point, EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT is primarily added to enable
> testing of this codepath on devices that don't support discard.
> EXT4_IOC_CHECKPOINT_FLAG_DISCARD and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT
> cannot both be set.
> 
> Systems that wish to achieve content deletion SLO can set up a daemon
> that calls this ioctl at a regular interval such that it matches with the
> SLO requirement. Thus, with this patch, the ext4_dir_entry2 wipeout
> patch[1], and the Ext4 "-o discard" mount option set, Ext4 can now
> guarantee that all file contents, file metatdata, and filenames will not
> be accessible through the filesystem and will have had discard or
> zeroout requests issued for corresponding device blocks.
> 
> The __jbd2_journal_erase function could also be used to discard or
> zero-fill the journal during journal load after recovery. This would
> provide a potential solution to a journal replay bug reported earlier this
> year[2]. After a successful journal recovery, e2fsck can call this ioctl to
> discard the journal as well.
> 
> [1] https://lore.kernel.org/linux-ext4/YIHknqxngB1sUdie@mit.edu/
> [2] https://lore.kernel.org/linux-ext4/YDZoaacIYStFQT8g@mit.edu/
> 
> Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>

FYI, I've made the following change to this commit in the ext4 tree,
in order to fix a test failure of ext4/050 when running on a block
device that don't support discard --- for example, when running the
ext4/dax test config.

Cheers,

					- Ted

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1290fbda1399..5730aeca563c 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -805,6 +805,7 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
 	__u32 flags = 0;
 	unsigned int flush_flags = 0;
 	struct super_block *sb = file_inode(filp)->i_sb;
+	struct request_queue *q;
 
 	if (copy_from_user(&flags, (__u32 __user *)arg,
 				sizeof(__u32)))
@@ -822,6 +823,15 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
 	if (!EXT4_SB(sb)->s_journal)
 		return -ENODEV;
 
+	if (flags & ~JBD2_JOURNAL_FLUSH_VALID)
+		return -EINVAL;
+
+	q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev);
+	if (!q)
+		return -ENXIO;
+	if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
 	if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
 		return 0;
 

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-06-23  1:38 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-05-18 15:13 [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush Leah Rumancik
2021-05-18 15:13 ` [PATCH v5 2/3] ext4: add ioctl EXT4_IOC_CHECKPOINT Leah Rumancik
2021-06-23  1:38   ` Theodore Ts'o
2021-05-18 15:13 ` [PATCH v5 3/3] ext4: update journal documentation Leah Rumancik
2021-06-03 18:00 ` [PATCH v5 1/3] ext4: add discard/zeroout flags to journal flush Theodore Ts'o
2021-06-23  1:36 ` Theodore Ts'o

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).