* [PATCH-v6 1/3] vfs: add support for a lazytime mount option
2014-12-02 0:29 [PATCH-v6 0/3] add support for a lazytime mount option Theodore Ts'o
@ 2014-12-02 0:29 ` Theodore Ts'o
2014-12-02 0:29 ` [PATCH-v6 2/3] vfs: add find_inode_nowait() function Theodore Ts'o
2014-12-02 0:29 ` [PATCH-v6 3/3] ext4: add optimization for the lazytime mount option Theodore Ts'o
2 siblings, 0 replies; 4+ messages in thread
From: Theodore Ts'o @ 2014-12-02 0:29 UTC (permalink / raw)
To: Linux Filesystem Development List; +Cc: Ext4 Developers List, Theodore Ts'o
Add a new mount option which enables a new "lazytime" mode. This mode
causes atime, mtime, and ctime updates to only be made to the
in-memory version of the inode. The on-disk times will only get
updated when (a) if the inode needs to be updated for some non-time
related change, (b) if userspace calls fsync(), syncfs() or sync(), or
(c) just before an undeleted inode is evicted from memory.
This is OK according to POSIX because there are no guarantees after a
crash unless userspace explicitly requests via a fsync(2) call.
For workloads which feature a large number of random write to a
preallocated file, the lazytime mount option significantly reduces
writes to the inode table. The repeated 4k writes to a single block
will result in undesirable stress on flash devices and SMR disk
drives. Even on conventional HDD's, the repeated writes to the inode
table block will trigger Adjacent Track Interference (ATI) remediation
latencies, which very negatively impact long tail latencies --- which
is a very big deal for web serving tiers (for example).
Google-Bug-Id: 18297052
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
fs/fs-writeback.c | 52 +++++++++++++++++++++++++++++++--------
fs/gfs2/file.c | 4 +--
fs/inode.c | 34 +++++++++++++++++++-------
fs/jfs/file.c | 2 +-
fs/libfs.c | 2 +-
fs/proc_namespace.c | 1 +
fs/sync.c | 8 ++++++
include/linux/backing-dev.h | 1 +
include/linux/fs.h | 4 +++
include/trace/events/writeback.h | 53 +++++++++++++++++++++++++++++++++++++++-
include/uapi/linux/fs.h | 4 ++-
mm/backing-dev.c | 10 ++++++--
12 files changed, 148 insertions(+), 27 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ef9bef1..29a5dfe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -247,14 +247,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
return ret;
}
+#define EXPIRE_DIRTY_ATIME 0x0001
+
/*
* Move expired (dirtied before work->older_than_this) dirty inodes from
* @delaying_queue to @dispatch_queue.
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
+ int flags,
struct wb_writeback_work *work)
{
+ unsigned long *older_than_this = NULL;
+ unsigned long expire_time;
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
@@ -262,13 +267,21 @@ static int move_expired_inodes(struct list_head *delaying_queue,
int do_sb_sort = 0;
int moved = 0;
+ if ((flags & EXPIRE_DIRTY_ATIME) == 0)
+ older_than_this = work->older_than_this;
+ else if ((work->reason == WB_REASON_SYNC) == 0) {
+ expire_time = jiffies - (HZ * 86400);
+ older_than_this = &expire_time;
+ }
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
- if (work->older_than_this &&
- inode_dirtied_after(inode, *work->older_than_this))
+ if (older_than_this &&
+ inode_dirtied_after(inode, *older_than_this))
break;
list_move(&inode->i_wb_list, &tmp);
moved++;
+ if (flags & EXPIRE_DIRTY_ATIME)
+ set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
if (sb && sb != inode->i_sb)
@@ -309,9 +322,12 @@ out:
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
int moved;
+
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
- moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
+ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
+ moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
+ EXPIRE_DIRTY_ATIME, work);
trace_writeback_queue_io(wb, work, moved);
}
@@ -435,6 +451,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* updates after data IO completion.
*/
redirty_tail(inode, wb);
+ } else if (inode->i_state & I_DIRTY_TIME) {
+ list_move(&inode->i_wb_list, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
list_del_init(&inode->i_wb_list);
@@ -482,11 +500,18 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/* Clear I_DIRTY_PAGES if we've written out all dirty pages */
if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state &= ~I_DIRTY_PAGES;
- dirty = inode->i_state & I_DIRTY;
- inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ dirty = inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ if ((dirty && (inode->i_state & I_DIRTY_TIME)) ||
+ (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
+ dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
+ trace_writeback_lazytime(inode);
+ }
+ inode->i_state &= ~dirty;
spin_unlock(&inode->i_lock);
+ if (dirty & I_DIRTY_TIME)
+ mark_inode_dirty_sync(inode);
/* Don't write the inode if only I_DIRTY_PAGES was set */
- if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ if (dirty) {
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
@@ -534,7 +559,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* make sure inode is on some writeback list and leave it there unless
* we have completely cleaned the inode.
*/
- if (!(inode->i_state & I_DIRTY) &&
+ if (!(inode->i_state & I_DIRTY_ALL) &&
(wbc->sync_mode != WB_SYNC_ALL ||
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
@@ -549,7 +574,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* If inode is clean, remove it from writeback lists. Otherwise don't
* touch it. See comment above for explanation.
*/
- if (!(inode->i_state & I_DIRTY))
+ if (!(inode->i_state & I_DIRTY_ALL))
list_del_init(&inode->i_wb_list);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
@@ -691,7 +716,7 @@ static long writeback_sb_inodes(struct super_block *sb,
wrote += write_chunk - wbc.nr_to_write;
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_DIRTY))
+ if (!(inode->i_state & I_DIRTY_ALL))
wrote++;
requeue_inode(inode, wb, &wbc);
inode_sync_complete(inode);
@@ -1134,6 +1159,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
struct super_block *sb = inode->i_sb;
struct backing_dev_info *bdi = NULL;
+ trace_writeback_mark_inode_dirty(inode, flags);
+
/*
* Don't do this for I_DIRTY_PAGES - that doesn't actually
* dirty the inode itself
@@ -1164,6 +1191,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
+ if ((flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
+ (inode->i_state & I_DIRTY_TIME))
+ inode->i_state &= ~I_DIRTY_TIME;
+
inode->i_state |= flags;
/*
@@ -1210,7 +1241,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}
inode->dirtied_when = jiffies;
- list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+ list_move(&inode->i_wb_list, (flags & I_DIRTY_TIME) ?
+ &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
spin_unlock(&bdi->wb.list_lock);
if (wakeup_bdi)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 80dd44d..e584bf9 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -654,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- int sync_state = inode->i_state & I_DIRTY;
+ int sync_state = inode->i_state & I_DIRTY_ALL;
struct gfs2_inode *ip = GFS2_I(inode);
int ret = 0, ret1 = 0;
@@ -667,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
if (!gfs2_is_jdata(ip))
sync_state &= ~I_DIRTY_PAGES;
if (datasync)
- sync_state &= ~I_DIRTY_SYNC;
+ sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
if (sync_state) {
ret = sync_inode_metadata(inode, 1);
diff --git a/fs/inode.c b/fs/inode.c
index 26753ba..ceba9c5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
+#include <trace/events/writeback.h>
#include "internal.h"
/*
@@ -30,7 +31,7 @@
* inode_sb_list_lock protects:
* sb->s_inodes, inode->i_sb_list
* bdi->wb.list_lock protects:
- * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
+ * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
* inode_hash_lock protects:
* inode_hashtable, inode->i_hash
*
@@ -414,7 +415,8 @@ static void inode_lru_list_add(struct inode *inode)
*/
void inode_add_lru(struct inode *inode)
{
- if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
+ if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
+ I_FREEING | I_WILL_FREE)) &&
!atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
inode_lru_list_add(inode);
}
@@ -645,7 +647,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
spin_unlock(&inode->i_lock);
continue;
}
- if (inode->i_state & I_DIRTY && !kill_dirty) {
+ if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
spin_unlock(&inode->i_lock);
busy = 1;
continue;
@@ -1430,11 +1432,20 @@ static void iput_final(struct inode *inode)
*/
void iput(struct inode *inode)
{
- if (inode) {
- BUG_ON(inode->i_state & I_CLEAR);
-
- if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
- iput_final(inode);
+ if (!inode)
+ return;
+ BUG_ON(inode->i_state & I_CLEAR);
+retry:
+ if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
+ if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
+ atomic_inc(&inode->i_count);
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ trace_writeback_lazytime_iput(inode);
+ mark_inode_dirty_sync(inode);
+ goto retry;
+ }
+ iput_final(inode);
}
}
EXPORT_SYMBOL(iput);
@@ -1510,7 +1521,12 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
inode->i_ctime = *time;
if (flags & S_MTIME)
inode->i_mtime = *time;
- mark_inode_dirty_sync(inode);
+
+ if ((inode->i_sb->s_flags & MS_LAZYTIME) && !(flags & S_VERSION) &&
+ !(inode->i_state & I_DIRTY))
+ __mark_inode_dirty(inode, I_DIRTY_TIME);
+ else
+ __mark_inode_dirty(inode, I_DIRTY_SYNC);
return 0;
}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 33aa0cc..10815f8 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return rc;
mutex_lock(&inode->i_mutex);
- if (!(inode->i_state & I_DIRTY) ||
+ if (!(inode->i_state & I_DIRTY_ALL) ||
(datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
/* Make sure committed changes hit the disk */
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
diff --git a/fs/libfs.c b/fs/libfs.c
index 171d284..7cb9cef 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
mutex_lock(&inode->i_mutex);
ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY))
+ if (!(inode->i_state & I_DIRTY_ALL))
goto out;
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
goto out;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 73ca174..f98234a 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{ MS_SYNCHRONOUS, ",sync" },
{ MS_DIRSYNC, ",dirsync" },
{ MS_MANDLOCK, ",mand" },
+ { MS_LAZYTIME, ",lazytime" },
{ 0, NULL }
};
const struct proc_fs_info *fs_infop;
diff --git a/fs/sync.c b/fs/sync.c
index bdc729d..6ac7bf0 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
*/
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
+ struct inode *inode = file->f_mapping->host;
+
if (!file->f_op->fsync)
return -EINVAL;
+ if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
+ spin_lock(&inode->i_lock);
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ mark_inode_dirty_sync(inode);
+ }
return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5da6012..4cdf733 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -55,6 +55,7 @@ struct bdi_writeback {
struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */
+ struct list_head b_dirty_time; /* time stamps are dirty */
spinlock_t list_lock; /* protects the b_* lists */
};
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ab779e..bb4e878 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1720,8 +1720,12 @@ struct super_operations {
#define __I_DIO_WAKEUP 9
#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP)
#define I_LINKABLE (1 << 10)
+#define I_DIRTY_TIME (1 << 11)
+#define __I_DIRTY_TIME_EXPIRED 12
+#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED)
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
extern void __mark_inode_dirty(struct inode *, int);
static inline void mark_inode_dirty(struct inode *inode)
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index cee02d6..1a90d28 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -18,6 +18,8 @@
{I_FREEING, "I_FREEING"}, \
{I_CLEAR, "I_CLEAR"}, \
{I_SYNC, "I_SYNC"}, \
+ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \
+ {I_DIRTY_TIME_EXPIRED, "I_DIRTY_TIME_EXPIRED"}, \
{I_REFERENCED, "I_REFERENCED"} \
)
@@ -68,6 +70,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
TP_STRUCT__entry (
__array(char, name, 32)
__field(unsigned long, ino)
+ __field(unsigned long, state)
__field(unsigned long, flags)
),
@@ -78,16 +81,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
strncpy(__entry->name,
bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
__entry->ino = inode->i_ino;
+ __entry->state = inode->i_state;
__entry->flags = flags;
),
- TP_printk("bdi %s: ino=%lu flags=%s",
+ TP_printk("bdi %s: ino=%lu state=%s flags=%s",
__entry->name,
__entry->ino,
+ show_inode_state(__entry->state),
show_inode_state(__entry->flags)
)
);
+DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,
+
+ TP_PROTO(struct inode *inode, int flags),
+
+ TP_ARGS(inode, flags)
+);
+
DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,
TP_PROTO(struct inode *inode, int flags),
@@ -598,6 +610,45 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
TP_ARGS(inode, wbc, nr_to_write)
);
+DECLARE_EVENT_CLASS(writeback_lazytime_template,
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field(unsigned long, ino )
+ __field(unsigned long, state )
+ __field( __u16, mode )
+ __field(unsigned long, dirtied_when )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->state = inode->i_state;
+ __entry->mode = inode->i_mode;
+ __entry->dirtied_when = inode->dirtied_when;
+ ),
+
+ TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino, __entry->dirtied_when,
+ show_inode_state(__entry->state), __entry->mode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime,
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput,
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode)
+);
+
#endif /* _TRACE_WRITEBACK_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0..9b964a5 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -90,6 +90,7 @@ struct inodes_stat_t {
#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
#define MS_I_VERSION (1<<23) /* Update inode I_version field */
#define MS_STRICTATIME (1<<24) /* Always perform atime updates */
+#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
/* These sb flags are internal to the kernel */
#define MS_NOSEC (1<<28)
@@ -100,7 +101,8 @@ struct inodes_stat_t {
/*
* Superblock flags that can be altered by MS_REMOUNT
*/
-#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION)
+#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
+ MS_LAZYTIME)
/*
* Old magic mount flag and mask
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df5..915feea 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long nr_dirty, nr_io, nr_more_io;
+ unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
struct inode *inode;
- nr_dirty = nr_io = nr_more_io = 0;
+ nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
@@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
+ list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
+ if (inode->i_state & I_DIRTY_TIME)
+ nr_dirty_time++;
spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"b_dirty: %10lu\n"
"b_io: %10lu\n"
"b_more_io: %10lu\n"
+ "b_dirty_time: %10lu\n"
"bdi_list: %10u\n"
"state: %10lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_dirty,
nr_io,
nr_more_io,
+ nr_dirty_time,
!list_empty(&bdi->bdi_list), bdi->state);
#undef K
@@ -418,6 +423,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
+ INIT_LIST_HEAD(&wb->b_dirty_time);
spin_lock_init(&wb->list_lock);
INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
}
--
2.1.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH-v6 3/3] ext4: add optimization for the lazytime mount option
2014-12-02 0:29 [PATCH-v6 0/3] add support for a lazytime mount option Theodore Ts'o
2014-12-02 0:29 ` [PATCH-v6 1/3] vfs: " Theodore Ts'o
2014-12-02 0:29 ` [PATCH-v6 2/3] vfs: add find_inode_nowait() function Theodore Ts'o
@ 2014-12-02 0:29 ` Theodore Ts'o
2 siblings, 0 replies; 4+ messages in thread
From: Theodore Ts'o @ 2014-12-02 0:29 UTC (permalink / raw)
To: Linux Filesystem Development List; +Cc: Ext4 Developers List, Theodore Ts'o
Add an optimization for the MS_LAZYTIME mount option so that we will
opportunistically write out any inodes with the I_DIRTY_TIME flag set
in a particular inode table block when we need to update some inode in
that inode table block anyway.
Also add some temporary code so that we can set the lazytime mount
option without needing a modified /sbin/mount program which can set
MS_LAZYTIME. We can eventually make this go away once util-linux has
added support.
Google-Bug-Id: 18297052
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
fs/ext4/inode.c | 65 ++++++++++++++++++++++++++++++++++++++++++---
fs/ext4/super.c | 10 +++++++
include/trace/events/ext4.h | 30 +++++++++++++++++++++
3 files changed, 102 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa4..6327b28 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4139,6 +4139,65 @@ static int ext4_inode_blocks_set(handle_t *handle,
return 0;
}
+struct other_inode {
+ unsigned long orig_ino;
+ struct ext4_inode *raw_inode;
+};
+
+static int other_inode_match(struct inode * inode, unsigned long ino,
+ void *data)
+{
+ struct other_inode *oi = (struct other_inode *) data;
+
+ if ((inode->i_ino != ino) ||
+ (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+ I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ ((inode->i_state & I_DIRTY_TIME) == 0))
+ return 0;
+ spin_lock(&inode->i_lock);
+ if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+ I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
+ (inode->i_state & I_DIRTY_TIME)) {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+
+ spin_lock(&ei->i_raw_lock);
+ EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
+ EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
+ EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
+ ext4_inode_csum_set(inode, oi->raw_inode, ei);
+ spin_unlock(&ei->i_raw_lock);
+ trace_ext4_other_inode_update_time(inode, oi->orig_ino);
+ return -1;
+ }
+ spin_unlock(&inode->i_lock);
+ return -1;
+}
+
+/*
+ * Opportunistically update the other time fields for other inodes in
+ * the same inode table block.
+ */
+static void ext4_update_other_inodes_time(struct super_block *sb,
+ unsigned long orig_ino, char *buf)
+{
+ struct other_inode oi;
+ unsigned long ino;
+ int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int inode_size = EXT4_INODE_SIZE(sb);
+
+ oi.orig_ino = orig_ino;
+ ino = orig_ino & ~(inodes_per_block - 1);
+ for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
+ if (ino == orig_ino)
+ continue;
+ oi.raw_inode = (struct ext4_inode *) buf;
+ (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+ }
+}
+
/*
* Post the struct inode info into an on-disk inode location in the
* buffer-cache. This gobbles the caller's reference to the
@@ -4237,7 +4296,6 @@ static int ext4_do_update_inode(handle_t *handle,
for (block = 0; block < EXT4_N_BLOCKS; block++)
raw_inode->i_block[block] = ei->i_data[block];
}
-
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
if (ei->i_extra_isize) {
@@ -4248,10 +4306,11 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le16(ei->i_extra_isize);
}
}
-
ext4_inode_csum_set(inode, raw_inode, ei);
-
spin_unlock(&ei->i_raw_lock);
+ if (inode->i_sb->s_flags & MS_LAZYTIME)
+ ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
+ bh->b_data);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 58859bc..5310ec9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1132,6 +1132,7 @@ enum {
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+ Opt_lazytime, Opt_nolazytime,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
@@ -1195,6 +1196,8 @@ static const match_table_t tokens = {
{Opt_i_version, "i_version"},
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
+ {Opt_lazytime, "lazytime"},
+ {Opt_nolazytime, "nolazytime"},
{Opt_nodelalloc, "nodelalloc"},
{Opt_removed, "mblk_io_submit"},
{Opt_removed, "nomblk_io_submit"},
@@ -1452,6 +1455,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
case Opt_i_version:
sb->s_flags |= MS_I_VERSION;
return 1;
+ case Opt_lazytime:
+ sb->s_flags |= MS_LAZYTIME;
+ return 1;
+ case Opt_nolazytime:
+ sb->s_flags &= ~MS_LAZYTIME;
+ return 1;
}
for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -5013,6 +5022,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
#endif
+ *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
kfree(orig_data);
return 0;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 6cfb841..6e5abd6 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -73,6 +73,36 @@ struct extent_status;
{ FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"})
+TRACE_EVENT(ext4_other_inode_update_time,
+ TP_PROTO(struct inode *inode, ino_t orig_ino),
+
+ TP_ARGS(inode, orig_ino),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( ino_t, orig_ino )
+ __field( uid_t, uid )
+ __field( gid_t, gid )
+ __field( __u16, mode )
+ ),
+
+ TP_fast_assign(
+ __entry->orig_ino = orig_ino;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->uid = i_uid_read(inode);
+ __entry->gid = i_gid_read(inode);
+ __entry->mode = inode->i_mode;
+ ),
+
+ TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->orig_ino,
+ (unsigned long) __entry->ino, __entry->mode,
+ __entry->uid, __entry->gid)
+);
+
TRACE_EVENT(ext4_free_inode,
TP_PROTO(struct inode *inode),
--
2.1.0
^ permalink raw reply related [flat|nested] 4+ messages in thread