* [PATCH-v5 1/5] vfs: add support for a lazytime mount option
2014-11-28 6:00 [PATCH-v5 0/5] add support for a lazytime mount option Theodore Ts'o
@ 2014-11-28 6:00 ` Theodore Ts'o
2014-11-28 17:23 ` Jan Kara
2014-11-28 6:00 ` [PATCH-v5 2/5] vfs: don't let the dirty time inodes get more than a day stale Theodore Ts'o
` (5 subsequent siblings)
6 siblings, 1 reply; 18+ messages in thread
From: Theodore Ts'o @ 2014-11-28 6:00 UTC (permalink / raw)
To: Ext4 Developers List; +Cc: Linux Filesystem Development List, Theodore Ts'o
Add a new mount option which enables a new "lazytime" mode. This mode
causes atime, mtime, and ctime updates to only be made to the
in-memory version of the inode. The on-disk times will only get
updated when (a) if the inode needs to be updated for some non-time
related change, (b) if userspace calls fsync(), syncfs() or sync(), or
(c) just before an undeleted inode is evicted from memory.
This is OK according to POSIX because there are no guarantees after a
crash unless userspace explicitly requests via a fsync(2) call.
For workloads which feature a large number of random write to a
preallocated file, the lazytime mount option significantly reduces
writes to the inode table. The repeated 4k writes to a single block
will result in undesirable stress on flash devices and SMR disk
drives. Even on conventional HDD's, the repeated writes to the inode
table block will trigger Adjacent Track Interference (ATI) remediation
latencies, which very negatively impact 99.9 percentile latencies ---
which is a very big deal for web serving tiers (for example).
Google-Bug-Id: 18297052
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
fs/fs-writeback.c | 64 ++++++++++++++++++++++++++++++++++++++++-----
fs/inode.c | 41 ++++++++++++++++++++++++-----
fs/libfs.c | 2 +-
fs/logfs/readwrite.c | 2 +-
fs/nfsd/vfs.c | 2 +-
fs/pipe.c | 2 +-
fs/proc_namespace.c | 1 +
fs/sync.c | 8 ++++++
fs/ufs/truncate.c | 2 +-
include/linux/backing-dev.h | 1 +
include/linux/fs.h | 11 ++++++--
include/uapi/linux/fs.h | 1 +
mm/backing-dev.c | 10 +++++--
13 files changed, 126 insertions(+), 21 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ef9bef1..518f3bb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -397,7 +397,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* shot. If still dirty, it will be redirty_tail()'ed below. Update
* the dirty time to prevent enqueue and sync it again.
*/
- if ((inode->i_state & I_DIRTY) &&
+ if ((inode->i_state & I_DIRTY_WB) &&
(wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
inode->dirtied_when = jiffies;
@@ -428,13 +428,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
*/
redirty_tail(inode, wb);
}
- } else if (inode->i_state & I_DIRTY) {
+ } else if (inode->i_state & I_DIRTY_WB) {
/*
* Filesystems can dirty the inode during writeback operations,
* such as delayed allocation during submission or metadata
* updates after data IO completion.
*/
redirty_tail(inode, wb);
+ } else if (inode->i_state & I_DIRTY_TIME) {
+ list_move(&inode->i_wb_list, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
list_del_init(&inode->i_wb_list);
@@ -482,11 +484,15 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/* Clear I_DIRTY_PAGES if we've written out all dirty pages */
if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state &= ~I_DIRTY_PAGES;
- dirty = inode->i_state & I_DIRTY;
- inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ dirty = inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ if (dirty && (inode->i_state & I_DIRTY_TIME))
+ dirty |= I_DIRTY_TIME;
+ inode->i_state &= ~dirty;
spin_unlock(&inode->i_lock);
+ if (dirty & I_DIRTY_TIME)
+ mark_inode_dirty_sync(inode);
/* Don't write the inode if only I_DIRTY_PAGES was set */
- if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ if (dirty) {
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
@@ -1162,7 +1168,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
spin_lock(&inode->i_lock);
if ((inode->i_state & flags) != flags) {
- const int was_dirty = inode->i_state & I_DIRTY;
+ const int was_dirty = inode->i_state & I_DIRTY_WB;
+
+ if ((flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
+ (inode->i_state & I_DIRTY_TIME))
+ inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
@@ -1224,6 +1234,24 @@ out_unlock_inode:
}
EXPORT_SYMBOL(__mark_inode_dirty);
+void inode_requeue_dirtytime(struct inode *inode)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+ spin_lock(&bdi->wb.list_lock);
+ spin_lock(&inode->i_lock);
+ if ((inode->i_state & I_DIRTY_WB) == 0) {
+ if (inode->i_state & I_DIRTY_TIME)
+ list_move(&inode->i_wb_list, &bdi->wb.b_dirty_time);
+ else
+ list_del_init(&inode->i_wb_list);
+ }
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&bdi->wb.list_lock);
+
+}
+EXPORT_SYMBOL(inode_requeue_dirtytime);
+
static void wait_sb_inodes(struct super_block *sb)
{
struct inode *inode, *old_inode = NULL;
@@ -1277,6 +1305,29 @@ static void wait_sb_inodes(struct super_block *sb)
iput(old_inode);
}
+/*
+ * Take all of the indoes on the dirty_time list, and mark them as
+ * dirty, so they will be written out.
+ */
+static void flush_sb_dirty_time(struct super_block *sb)
+{
+ struct bdi_writeback *wb = &sb->s_bdi->wb;
+ LIST_HEAD(tmp);
+
+ spin_lock(&wb->list_lock);
+ list_cut_position(&tmp, &wb->b_dirty_time, wb->b_dirty_time.prev);
+ while (!list_empty(&tmp)) {
+ struct inode *inode = wb_inode(tmp.prev);
+
+ list_del_init(&inode->i_wb_list);
+ spin_unlock(&wb->list_lock);
+ if (inode->i_state & I_DIRTY_TIME)
+ mark_inode_dirty_sync(inode);
+ spin_lock(&wb->list_lock);
+ }
+ spin_unlock(&wb->list_lock);
+}
+
/**
* writeback_inodes_sb_nr - writeback dirty inodes from given super_block
* @sb: the superblock
@@ -1388,6 +1439,7 @@ void sync_inodes_sb(struct super_block *sb)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
+ flush_sb_dirty_time(sb);
bdi_queue_work(sb->s_bdi, &work);
wait_for_completion(&done);
diff --git a/fs/inode.c b/fs/inode.c
index 26753ba..1ec0629 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -30,7 +30,7 @@
* inode_sb_list_lock protects:
* sb->s_inodes, inode->i_sb_list
* bdi->wb.list_lock protects:
- * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
+ * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
* inode_hash_lock protects:
* inode_hashtable, inode->i_hash
*
@@ -1430,11 +1430,19 @@ static void iput_final(struct inode *inode)
*/
void iput(struct inode *inode)
{
- if (inode) {
- BUG_ON(inode->i_state & I_CLEAR);
-
- if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
- iput_final(inode);
+ if (!inode)
+ return;
+ BUG_ON(inode->i_state & I_CLEAR);
+retry:
+ if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
+ if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
+ atomic_inc(&inode->i_count);
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ mark_inode_dirty_sync(inode);
+ goto retry;
+ }
+ iput_final(inode);
}
}
EXPORT_SYMBOL(iput);
@@ -1510,6 +1518,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
inode->i_ctime = *time;
if (flags & S_MTIME)
inode->i_mtime = *time;
+
+ if ((inode->i_sb->s_flags & MS_LAZYTIME) &&
+ !(flags & S_VERSION) &&
+ !(inode->i_state & I_DIRTY_WB)) {
+ if (inode->i_state & I_DIRTY_TIME)
+ return 0;
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & I_DIRTY_WB) {
+ spin_unlock(&inode->i_lock);
+ goto force_dirty;
+ }
+ if (inode->i_state & I_DIRTY_TIME) {
+ spin_unlock(&inode->i_lock);
+ return 0;
+ }
+ inode->i_state |= I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ inode_requeue_dirtytime(inode);
+ return 0;
+ }
+force_dirty:
mark_inode_dirty_sync(inode);
return 0;
}
diff --git a/fs/libfs.c b/fs/libfs.c
index 171d284..b9923b2 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1066,7 +1066,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
* list because mark_inode_dirty() will think
* that it already _is_ on the dirty list.
*/
- inode->i_state = I_DIRTY;
+ inode->i_state = I_DIRTY_WB;
inode->i_mode = S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 380d86e..5521842 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2187,7 +2187,7 @@ void logfs_evict_inode(struct inode *inode)
* aliases, which are moved back. No write to the medium happens.
*/
/* Only deleted files may be dirty at this point */
- BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
+ BUG_ON(inode->i_state & I_DIRTY_WB && inode->i_nlink);
if (!block)
return;
if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 989129e..818c6fa 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -915,7 +915,7 @@ static int wait_for_concurrent_writes(struct file *file)
dprintk("nfsd: write resume %d\n", task_pid_nr(current));
}
- if (inode->i_state & I_DIRTY) {
+ if (inode->i_state & I_DIRTY_WB) {
dprintk("nfsd: write sync %d\n", task_pid_nr(current));
err = vfs_fsync(file, 0);
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 21981e5..fc9b923 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -660,7 +660,7 @@ static struct inode * get_pipe_inode(void)
* list because "mark_inode_dirty()" will think
* that it already _is_ on the dirty list.
*/
- inode->i_state = I_DIRTY;
+ inode->i_state = I_DIRTY_WB;
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 73ca174..f98234a 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{ MS_SYNCHRONOUS, ",sync" },
{ MS_DIRSYNC, ",dirsync" },
{ MS_MANDLOCK, ",mand" },
+ { MS_LAZYTIME, ",lazytime" },
{ 0, NULL }
};
const struct proc_fs_info *fs_infop;
diff --git a/fs/sync.c b/fs/sync.c
index bdc729d..6ac7bf0 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
*/
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
+ struct inode *inode = file->f_mapping->host;
+
if (!file->f_op->fsync)
return -EINVAL;
+ if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
+ spin_lock(&inode->i_lock);
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ mark_inode_dirty_sync(inode);
+ }
return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index f04f89f..1d00a09 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -477,7 +477,7 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
retry |= ufs_trunc_tindirect (inode);
if (!retry)
break;
- if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
+ if (IS_SYNC(inode) && (inode->i_state & I_DIRTY_WB))
ufs_sync_inode (inode);
yield();
}
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5da6012..4cdf733 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -55,6 +55,7 @@ struct bdi_writeback {
struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */
+ struct list_head b_dirty_time; /* time stamps are dirty */
spinlock_t list_lock; /* protects the b_* lists */
};
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ab779e..7932482 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1720,19 +1720,26 @@ struct super_operations {
#define __I_DIO_WAKEUP 9
#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP)
#define I_LINKABLE (1 << 10)
+#define I_DIRTY_TIME (1 << 11)
-#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+/* Inode should be on the b_dirty/b_io/b_more_io lists */
+#define I_DIRTY_WB (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+/* Inode should be on the b_dirty/b_io/b_more_io/b_dirty_time lists */
+#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES | I_DIRTY_TIME)
+/* The inode itself is dirty */
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)
extern void __mark_inode_dirty(struct inode *, int);
static inline void mark_inode_dirty(struct inode *inode)
{
- __mark_inode_dirty(inode, I_DIRTY);
+ __mark_inode_dirty(inode, I_DIRTY_WB);
}
static inline void mark_inode_dirty_sync(struct inode *inode)
{
__mark_inode_dirty(inode, I_DIRTY_SYNC);
}
+extern void inode_requeue_dirtytime(struct inode *);
extern void inc_nlink(struct inode *inode);
extern void drop_nlink(struct inode *inode);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0..cc9713a 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -90,6 +90,7 @@ struct inodes_stat_t {
#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
#define MS_I_VERSION (1<<23) /* Update inode I_version field */
#define MS_STRICTATIME (1<<24) /* Always perform atime updates */
+#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
/* These sb flags are internal to the kernel */
#define MS_NOSEC (1<<28)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df5..915feea 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long nr_dirty, nr_io, nr_more_io;
+ unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
struct inode *inode;
- nr_dirty = nr_io = nr_more_io = 0;
+ nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
@@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
+ list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
+ if (inode->i_state & I_DIRTY_TIME)
+ nr_dirty_time++;
spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"b_dirty: %10lu\n"
"b_io: %10lu\n"
"b_more_io: %10lu\n"
+ "b_dirty_time: %10lu\n"
"bdi_list: %10u\n"
"state: %10lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_dirty,
nr_io,
nr_more_io,
+ nr_dirty_time,
!list_empty(&bdi->bdi_list), bdi->state);
#undef K
@@ -418,6 +423,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
+ INIT_LIST_HEAD(&wb->b_dirty_time);
spin_lock_init(&wb->list_lock);
INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
}
--
2.1.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH-v5 1/5] vfs: add support for a lazytime mount option
2014-11-28 6:00 ` [PATCH-v5 1/5] vfs: " Theodore Ts'o
@ 2014-11-28 17:23 ` Jan Kara
[not found] ` <20141128181421.GA19461@google.com>
0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2014-11-28 17:23 UTC (permalink / raw)
To: Theodore Ts'o; +Cc: Ext4 Developers List, Linux Filesystem Development List
On Fri 28-11-14 01:00:06, Ted Tso wrote:
> Add a new mount option which enables a new "lazytime" mode. This mode
> causes atime, mtime, and ctime updates to only be made to the
> in-memory version of the inode. The on-disk times will only get
> updated when (a) if the inode needs to be updated for some non-time
> related change, (b) if userspace calls fsync(), syncfs() or sync(), or
> (c) just before an undeleted inode is evicted from memory.
>
> This is OK according to POSIX because there are no guarantees after a
> crash unless userspace explicitly requests via a fsync(2) call.
>
> For workloads which feature a large number of random write to a
> preallocated file, the lazytime mount option significantly reduces
> writes to the inode table. The repeated 4k writes to a single block
> will result in undesirable stress on flash devices and SMR disk
> drives. Even on conventional HDD's, the repeated writes to the inode
> table block will trigger Adjacent Track Interference (ATI) remediation
> latencies, which very negatively impact 99.9 percentile latencies ---
> which is a very big deal for web serving tiers (for example).
>
> Google-Bug-Id: 18297052
>
> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> ---
> fs/fs-writeback.c | 64 ++++++++++++++++++++++++++++++++++++++++-----
> fs/inode.c | 41 ++++++++++++++++++++++++-----
> fs/libfs.c | 2 +-
> fs/logfs/readwrite.c | 2 +-
> fs/nfsd/vfs.c | 2 +-
> fs/pipe.c | 2 +-
> fs/proc_namespace.c | 1 +
> fs/sync.c | 8 ++++++
> fs/ufs/truncate.c | 2 +-
> include/linux/backing-dev.h | 1 +
> include/linux/fs.h | 11 ++++++--
> include/uapi/linux/fs.h | 1 +
> mm/backing-dev.c | 10 +++++--
> 13 files changed, 126 insertions(+), 21 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index ef9bef1..518f3bb 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -397,7 +397,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
> * shot. If still dirty, it will be redirty_tail()'ed below. Update
> * the dirty time to prevent enqueue and sync it again.
> */
> - if ((inode->i_state & I_DIRTY) &&
> + if ((inode->i_state & I_DIRTY_WB) &&
> (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
> inode->dirtied_when = jiffies;
>
> @@ -428,13 +428,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
> */
> redirty_tail(inode, wb);
> }
> - } else if (inode->i_state & I_DIRTY) {
> + } else if (inode->i_state & I_DIRTY_WB) {
> /*
> * Filesystems can dirty the inode during writeback operations,
> * such as delayed allocation during submission or metadata
> * updates after data IO completion.
> */
> redirty_tail(inode, wb);
> + } else if (inode->i_state & I_DIRTY_TIME) {
> + list_move(&inode->i_wb_list, &wb->b_dirty_time);
> } else {
> /* The inode is clean. Remove from writeback lists. */
> list_del_init(&inode->i_wb_list);
> @@ -482,11 +484,15 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
> /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
> if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
> inode->i_state &= ~I_DIRTY_PAGES;
> - dirty = inode->i_state & I_DIRTY;
> - inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
> + dirty = inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC);
> + if (dirty && (inode->i_state & I_DIRTY_TIME))
> + dirty |= I_DIRTY_TIME;
> + inode->i_state &= ~dirty;
> spin_unlock(&inode->i_lock);
> + if (dirty & I_DIRTY_TIME)
> + mark_inode_dirty_sync(inode);
Hum, when someone calls fsync() for an inode, you likely want to sync
timestamps to disk even if everything else is clean. I think that doing
what you did in last version:
dirty = inode->i_state & I_DIRTY_INODE;
inode->i_state &= ~I_DIRTY_INODE;
spin_unlock(&inode->i_lock);
if (dirty & I_DIRTY_TIME)
mark_inode_dirty_sync(inode);
looks better to me. IMO when someone calls __writeback_single_inode() we
should write whatever we have...
> /* Don't write the inode if only I_DIRTY_PAGES was set */
> - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
> + if (dirty) {
> int err = write_inode(inode, wbc);
> if (ret == 0)
> ret = err;
> @@ -1162,7 +1168,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>
> spin_lock(&inode->i_lock);
> if ((inode->i_state & flags) != flags) {
> - const int was_dirty = inode->i_state & I_DIRTY;
> + const int was_dirty = inode->i_state & I_DIRTY_WB;
> +
> + if ((flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
> + (inode->i_state & I_DIRTY_TIME))
> + inode->i_state &= ~I_DIRTY_TIME;
>
> inode->i_state |= flags;
>
> @@ -1224,6 +1234,24 @@ out_unlock_inode:
> }
> EXPORT_SYMBOL(__mark_inode_dirty);
>
> +void inode_requeue_dirtytime(struct inode *inode)
> +{
> + struct backing_dev_info *bdi = inode_to_bdi(inode);
> +
> + spin_lock(&bdi->wb.list_lock);
> + spin_lock(&inode->i_lock);
> + if ((inode->i_state & I_DIRTY_WB) == 0) {
> + if (inode->i_state & I_DIRTY_TIME)
> + list_move(&inode->i_wb_list, &bdi->wb.b_dirty_time);
> + else
> + list_del_init(&inode->i_wb_list);
> + }
> + spin_unlock(&inode->i_lock);
> + spin_unlock(&bdi->wb.list_lock);
> +
> +}
> +EXPORT_SYMBOL(inode_requeue_dirtytime);
> +
This function has a single call site - update_time(). I'd prefer to
handle this as a special case of __mark_inode_dirty() to have all the dirty
queueing in one place...
> static void wait_sb_inodes(struct super_block *sb)
> {
> struct inode *inode, *old_inode = NULL;
> @@ -1277,6 +1305,29 @@ static void wait_sb_inodes(struct super_block *sb)
> iput(old_inode);
> }
>
> +/*
> + * Take all of the indoes on the dirty_time list, and mark them as
> + * dirty, so they will be written out.
> + */
> +static void flush_sb_dirty_time(struct super_block *sb)
> +{
> + struct bdi_writeback *wb = &sb->s_bdi->wb;
> + LIST_HEAD(tmp);
> +
> + spin_lock(&wb->list_lock);
> + list_cut_position(&tmp, &wb->b_dirty_time, wb->b_dirty_time.prev);
> + while (!list_empty(&tmp)) {
> + struct inode *inode = wb_inode(tmp.prev);
> +
> + list_del_init(&inode->i_wb_list);
> + spin_unlock(&wb->list_lock);
> + if (inode->i_state & I_DIRTY_TIME)
> + mark_inode_dirty_sync(inode);
> + spin_lock(&wb->list_lock);
> + }
> + spin_unlock(&wb->list_lock);
> +}
> +
This shouldn't be necessary when you somewhat tweak what you do in
queue_io().
> /**
> * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
> * @sb: the superblock
> @@ -1388,6 +1439,7 @@ void sync_inodes_sb(struct super_block *sb)
> return;
> WARN_ON(!rwsem_is_locked(&sb->s_umount));
>
> + flush_sb_dirty_time(sb);
> bdi_queue_work(sb->s_bdi, &work);
> wait_for_completion(&done);
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 26753ba..1ec0629 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -30,7 +30,7 @@
> * inode_sb_list_lock protects:
> * sb->s_inodes, inode->i_sb_list
> * bdi->wb.list_lock protects:
> - * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
> + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
> * inode_hash_lock protects:
> * inode_hashtable, inode->i_hash
> *
> @@ -1430,11 +1430,19 @@ static void iput_final(struct inode *inode)
> */
> void iput(struct inode *inode)
> {
> - if (inode) {
> - BUG_ON(inode->i_state & I_CLEAR);
> -
> - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
> - iput_final(inode);
> + if (!inode)
> + return;
> + BUG_ON(inode->i_state & I_CLEAR);
> +retry:
> + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
> + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
> + atomic_inc(&inode->i_count);
> + inode->i_state &= ~I_DIRTY_TIME;
> + spin_unlock(&inode->i_lock);
> + mark_inode_dirty_sync(inode);
> + goto retry;
> + }
> + iput_final(inode);
How about my suggestion from previous version to avoid the retry loop by
checking I_DIRTY_TIME before atomic_dec_and_lock()?
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH-v5 2/5] vfs: don't let the dirty time inodes get more than a day stale
2014-11-28 6:00 [PATCH-v5 0/5] add support for a lazytime mount option Theodore Ts'o
2014-11-28 6:00 ` [PATCH-v5 1/5] vfs: " Theodore Ts'o
@ 2014-11-28 6:00 ` Theodore Ts'o
2014-11-28 16:43 ` Jan Kara
2014-11-28 6:00 ` [PATCH 2/5] vfs: use writeback lists to provide lazytime one day timeout Theodore Ts'o
` (4 subsequent siblings)
6 siblings, 1 reply; 18+ messages in thread
From: Theodore Ts'o @ 2014-11-28 6:00 UTC (permalink / raw)
To: Ext4 Developers List; +Cc: Linux Filesystem Development List, Theodore Ts'o
Guarantee that the on-disk timestamps will be no more than 24 hours
stale.
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
fs/fs-writeback.c | 1 +
fs/inode.c | 20 ++++++++++++++++++++
include/linux/fs.h | 1 +
3 files changed, 22 insertions(+)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 518f3bb..15dec84 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1147,6 +1147,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
trace_writeback_dirty_inode_start(inode, flags);
+ inode->i_ts_dirty_day = 0;
if (sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode, flags);
diff --git a/fs/inode.c b/fs/inode.c
index 1ec0629..84a5a3d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1507,6 +1507,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
*/
static int update_time(struct inode *inode, struct timespec *time, int flags)
{
+ struct timespec uptime;
+ unsigned short days_since_boot;
+
if (inode->i_op->update_time)
return inode->i_op->update_time(inode, time, flags);
@@ -1524,6 +1527,22 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
!(inode->i_state & I_DIRTY_WB)) {
if (inode->i_state & I_DIRTY_TIME)
return 0;
+ get_monotonic_boottime(&uptime);
+ days_since_boot = div_u64(uptime.tv_sec, 86400);
+ /*
+ * If i_ts_dirty_day is zero, then either we have not
+ * deferred timestamp updates, or the system has been
+ * up for less than a day (so days_since_boot is
+ * zero), so we can defer timestamp updates in that
+ * case. If a day or more has passed, then
+ * i_ts_dirty_day will be different from
+ * days_since_boot, and then we should update the
+ * on-disk inode and then we can clear i_ts_dirty_day.
+ */
+ if (inode->i_ts_dirty_day &&
+ (inode->i_ts_dirty_day != days_since_boot))
+ goto force_dirty;
+
spin_lock(&inode->i_lock);
if (inode->i_state & I_DIRTY_WB) {
spin_unlock(&inode->i_lock);
@@ -1534,6 +1553,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
return 0;
}
inode->i_state |= I_DIRTY_TIME;
+ inode->i_ts_dirty_day = days_since_boot;
spin_unlock(&inode->i_lock);
inode_requeue_dirtytime(inode);
return 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7932482..2b86b5d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -575,6 +575,7 @@ struct inode {
struct timespec i_ctime;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
+ unsigned short i_ts_dirty_day;
unsigned int i_blkbits;
blkcnt_t i_blocks;
--
2.1.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH-v5 2/5] vfs: don't let the dirty time inodes get more than a day stale
2014-11-28 6:00 ` [PATCH-v5 2/5] vfs: don't let the dirty time inodes get more than a day stale Theodore Ts'o
@ 2014-11-28 16:43 ` Jan Kara
0 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2014-11-28 16:43 UTC (permalink / raw)
To: Theodore Ts'o; +Cc: Ext4 Developers List, Linux Filesystem Development List
On Fri 28-11-14 01:00:07, Ted Tso wrote:
> Guarantee that the on-disk timestamps will be no more than 24 hours
> stale.
Why is this still necessary after what you do in patch 1/5?
Honza
>
> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> ---
> fs/fs-writeback.c | 1 +
> fs/inode.c | 20 ++++++++++++++++++++
> include/linux/fs.h | 1 +
> 3 files changed, 22 insertions(+)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 518f3bb..15dec84 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -1147,6 +1147,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
> if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
> trace_writeback_dirty_inode_start(inode, flags);
>
> + inode->i_ts_dirty_day = 0;
> if (sb->s_op->dirty_inode)
> sb->s_op->dirty_inode(inode, flags);
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 1ec0629..84a5a3d 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -1507,6 +1507,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
> */
> static int update_time(struct inode *inode, struct timespec *time, int flags)
> {
> + struct timespec uptime;
> + unsigned short days_since_boot;
> +
> if (inode->i_op->update_time)
> return inode->i_op->update_time(inode, time, flags);
>
> @@ -1524,6 +1527,22 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
> !(inode->i_state & I_DIRTY_WB)) {
> if (inode->i_state & I_DIRTY_TIME)
> return 0;
> + get_monotonic_boottime(&uptime);
> + days_since_boot = div_u64(uptime.tv_sec, 86400);
> + /*
> + * If i_ts_dirty_day is zero, then either we have not
> + * deferred timestamp updates, or the system has been
> + * up for less than a day (so days_since_boot is
> + * zero), so we can defer timestamp updates in that
> + * case. If a day or more has passed, then
> + * i_ts_dirty_day will be different from
> + * days_since_boot, and then we should update the
> + * on-disk inode and then we can clear i_ts_dirty_day.
> + */
> + if (inode->i_ts_dirty_day &&
> + (inode->i_ts_dirty_day != days_since_boot))
> + goto force_dirty;
> +
> spin_lock(&inode->i_lock);
> if (inode->i_state & I_DIRTY_WB) {
> spin_unlock(&inode->i_lock);
> @@ -1534,6 +1553,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
> return 0;
> }
> inode->i_state |= I_DIRTY_TIME;
> + inode->i_ts_dirty_day = days_since_boot;
> spin_unlock(&inode->i_lock);
> inode_requeue_dirtytime(inode);
> return 0;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 7932482..2b86b5d 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -575,6 +575,7 @@ struct inode {
> struct timespec i_ctime;
> spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
> unsigned short i_bytes;
> + unsigned short i_ts_dirty_day;
> unsigned int i_blkbits;
> blkcnt_t i_blocks;
>
> --
> 2.1.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 2/5] vfs: use writeback lists to provide lazytime one day timeout
2014-11-28 6:00 [PATCH-v5 0/5] add support for a lazytime mount option Theodore Ts'o
2014-11-28 6:00 ` [PATCH-v5 1/5] vfs: " Theodore Ts'o
2014-11-28 6:00 ` [PATCH-v5 2/5] vfs: don't let the dirty time inodes get more than a day stale Theodore Ts'o
@ 2014-11-28 6:00 ` Theodore Ts'o
2014-11-28 17:20 ` Jan Kara
2014-11-28 6:00 ` [PATCH-v5 3/5] vfs: add lazytime tracepoints for better debugging Theodore Ts'o
` (3 subsequent siblings)
6 siblings, 1 reply; 18+ messages in thread
From: Theodore Ts'o @ 2014-11-28 6:00 UTC (permalink / raw)
To: Ext4 Developers List; +Cc: Linux Filesystem Development List, Theodore Ts'o
Queue inodes with dirty timestamps for writeback 24 hours after they
were initially dirtied.
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
fs/fs-writeback.c | 29 +++++++++++++++++++++++------
1 file changed, 23 insertions(+), 6 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d562e2..bd76590 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -253,7 +253,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
- struct wb_writeback_work *work)
+ unsigned long *older_than_this)
{
LIST_HEAD(tmp);
struct list_head *pos, *node;
@@ -264,8 +264,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
- if (work->older_than_this &&
- inode_dirtied_after(inode, *work->older_than_this))
+ if (older_than_this &&
+ inode_dirtied_after(inode, *older_than_this))
break;
list_move(&inode->i_wb_list, &tmp);
moved++;
@@ -309,9 +309,14 @@ out:
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
int moved;
+ unsigned long one_day_later = jiffies + (HZ * 86400);
+
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
- moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
+ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io,
+ work->older_than_this);
+ moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
+ &one_day_later);
trace_writeback_queue_io(wb, work, moved);
}
@@ -679,6 +684,17 @@ static long writeback_sb_inodes(struct super_block *sb,
inode->i_state |= I_SYNC;
spin_unlock(&inode->i_lock);
+ /*
+ * If the inode is marked dirty time but is not dirty,
+ * then at last for ext3 and ext4 we need to call
+ * mark_inode_dirty_sync in order to get the inode
+ * timestamp transferred to the on disk inode, since
+ * write_inode is a no-op for those file systems.
+ */
+ if ((inode->i_state & I_DIRTY_TIME) &&
+ ((inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0))
+ mark_inode_dirty_sync(inode);
+
write_chunk = writeback_chunk_size(wb->bdi, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
@@ -1237,9 +1253,10 @@ void inode_requeue_dirtytime(struct inode *inode)
spin_lock(&bdi->wb.list_lock);
spin_lock(&inode->i_lock);
if ((inode->i_state & I_DIRTY_WB) == 0) {
- if (inode->i_state & I_DIRTY_TIME)
+ if (inode->i_state & I_DIRTY_TIME) {
+ inode->dirtied_when = jiffies;
list_move(&inode->i_wb_list, &bdi->wb.b_dirty_time);
- else
+ } else
list_del_init(&inode->i_wb_list);
}
spin_unlock(&inode->i_lock);
--
2.1.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 2/5] vfs: use writeback lists to provide lazytime one day timeout
2014-11-28 6:00 ` [PATCH 2/5] vfs: use writeback lists to provide lazytime one day timeout Theodore Ts'o
@ 2014-11-28 17:20 ` Jan Kara
0 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2014-11-28 17:20 UTC (permalink / raw)
To: Theodore Ts'o; +Cc: Ext4 Developers List, Linux Filesystem Development List
On Fri 28-11-14 01:00:08, Ted Tso wrote:
> Queue inodes with dirty timestamps for writeback 24 hours after they
> were initially dirtied.
Oh I see, this patch should probably replace the other 2/5 patch.
> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> ---
> fs/fs-writeback.c | 29 +++++++++++++++++++++++------
> 1 file changed, 23 insertions(+), 6 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 3d562e2..bd76590 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -253,7 +253,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
> */
> static int move_expired_inodes(struct list_head *delaying_queue,
> struct list_head *dispatch_queue,
> - struct wb_writeback_work *work)
> + unsigned long *older_than_this)
> {
> LIST_HEAD(tmp);
> struct list_head *pos, *node;
> @@ -264,8 +264,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
>
> while (!list_empty(delaying_queue)) {
> inode = wb_inode(delaying_queue->prev);
> - if (work->older_than_this &&
> - inode_dirtied_after(inode, *work->older_than_this))
> + if (older_than_this &&
> + inode_dirtied_after(inode, *older_than_this))
> break;
> list_move(&inode->i_wb_list, &tmp);
> moved++;
> @@ -309,9 +309,14 @@ out:
> static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
> {
> int moved;
> + unsigned long one_day_later = jiffies + (HZ * 86400);
> +
> assert_spin_locked(&wb->list_lock);
> list_splice_init(&wb->b_more_io, &wb->b_io);
> - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
> + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io,
> + work->older_than_this);
> + moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
> + &one_day_later);
I'd change this to:
if (work->sync_mode == WB_SYNC_ALL) {
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
work->older_than_this);
} else {
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
&one_day_later);
}
That way you can get rid of special sync handling. Also just fold this
patch into 1/5...
> trace_writeback_queue_io(wb, work, moved);
> }
>
> @@ -679,6 +684,17 @@ static long writeback_sb_inodes(struct super_block *sb,
> inode->i_state |= I_SYNC;
> spin_unlock(&inode->i_lock);
>
> + /*
> + * If the inode is marked dirty time but is not dirty,
> + * then at last for ext3 and ext4 we need to call
> + * mark_inode_dirty_sync in order to get the inode
> + * timestamp transferred to the on disk inode, since
> + * write_inode is a no-op for those file systems.
> + */
> + if ((inode->i_state & I_DIRTY_TIME) &&
> + ((inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0))
> + mark_inode_dirty_sync(inode);
> +
This isn't necessary - you already handle this in
__writeback_single_inode(). Or am I missing something?
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH-v5 3/5] vfs: add lazytime tracepoints for better debugging
2014-11-28 6:00 [PATCH-v5 0/5] add support for a lazytime mount option Theodore Ts'o
` (2 preceding siblings ...)
2014-11-28 6:00 ` [PATCH 2/5] vfs: use writeback lists to provide lazytime one day timeout Theodore Ts'o
@ 2014-11-28 6:00 ` Theodore Ts'o
2014-11-28 6:00 ` [PATCH-v5 4/5] vfs: add find_inode_nowait() function Theodore Ts'o
` (2 subsequent siblings)
6 siblings, 0 replies; 18+ messages in thread
From: Theodore Ts'o @ 2014-11-28 6:00 UTC (permalink / raw)
To: Ext4 Developers List; +Cc: Linux Filesystem Development List, Theodore Ts'o
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
fs/fs-writeback.c | 1 +
fs/inode.c | 5 +++++
include/trace/events/fs.h | 56 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 62 insertions(+)
create mode 100644 include/trace/events/fs.h
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 15dec84..9facc18 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -27,6 +27,7 @@
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
+#include <trace/events/fs.h>
#include "internal.h"
/*
diff --git a/fs/inode.c b/fs/inode.c
index 84a5a3d..e062640 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,6 +20,9 @@
#include <linux/list_lru.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/fs.h>
+
/*
* Inode locking rules:
*
@@ -1440,6 +1443,7 @@ retry:
inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
mark_inode_dirty_sync(inode);
+ trace_fs_lazytime_iput(inode);
goto retry;
}
iput_final(inode);
@@ -1556,6 +1560,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
inode->i_ts_dirty_day = days_since_boot;
spin_unlock(&inode->i_lock);
inode_requeue_dirtytime(inode);
+ trace_fs_lazytime_defer(inode);
return 0;
}
force_dirty:
diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h
new file mode 100644
index 0000000..fa32295
--- /dev/null
+++ b/include/trace/events/fs.h
@@ -0,0 +1,56 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM fs
+
+#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FS_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(fs__inode,
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( uid_t, uid )
+ __field( gid_t, gid )
+ __field( __u16, mode )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->uid = i_uid_read(inode);
+ __entry->gid = i_gid_read(inode);
+ __entry->mode = inode->i_mode;
+ ),
+
+ TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino, __entry->mode,
+ __entry->uid, __entry->gid)
+);
+
+DEFINE_EVENT(fs__inode, fs_lazytime_defer,
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode)
+);
+
+DEFINE_EVENT(fs__inode, fs_lazytime_iput,
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode)
+);
+
+DEFINE_EVENT(fs__inode, fs_lazytime_flush,
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode)
+);
+#endif /* _TRACE_FS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--
2.1.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH-v5 4/5] vfs: add find_inode_nowait() function
2014-11-28 6:00 [PATCH-v5 0/5] add support for a lazytime mount option Theodore Ts'o
` (3 preceding siblings ...)
2014-11-28 6:00 ` [PATCH-v5 3/5] vfs: add lazytime tracepoints for better debugging Theodore Ts'o
@ 2014-11-28 6:00 ` Theodore Ts'o
2014-11-28 6:00 ` [PATCH-v5 5/5] ext4: add optimization for the lazytime mount option Theodore Ts'o
2014-11-28 8:55 ` [PATCH-v5 0/5] add support for a " Sedat Dilek
6 siblings, 0 replies; 18+ messages in thread
From: Theodore Ts'o @ 2014-11-28 6:00 UTC (permalink / raw)
To: Ext4 Developers List; +Cc: Linux Filesystem Development List, Theodore Ts'o
Add a new function find_inode_nowait() which is an even more general
version of ilookup5_nowait(). It is designed for callers which need
very fine grained control over when the function is allowed to block
or increment the inode's reference count.
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
fs/inode.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 5 +++++
2 files changed, 55 insertions(+)
diff --git a/fs/inode.c b/fs/inode.c
index e062640..240e777 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1283,6 +1283,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
}
EXPORT_SYMBOL(ilookup);
+/**
+ * find_inode_nowait - find an inode in the inode cache
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @match: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @match
+ *
+ * Search for the inode specified by @hashval and @data in the inode
+ * cache, where the helper function @match will return 0 if the inode
+ * does not match, 1 if the inode does match, and -1 if the search
+ * should be stopped. The @match function must be responsible for
+ * taking the i_lock spin_lock and checking i_state for an inode being
+ * freed or being initialized, and incrementing the reference count
+ * before returning 1. It also must not sleep, since it is called with
+ * the inode_hash_lock spinlock held.
+ *
+ * This is a even more generalized version of ilookup5() when the
+ * function must never block --- find_inode() can block in
+ * __wait_on_freeing_inode() --- or when the caller can not increment
+ * the reference count because the resulting iput() might cause an
+ * inode eviction(). The tradeoff is that the @match funtion must be
+ * very carefully implemented.
+ */
+struct inode *find_inode_nowait(struct super_block *sb,
+ unsigned long hashval,
+ int (*match)(struct inode *, unsigned long,
+ void *),
+ void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode, *ret_inode = NULL;
+ int mval;
+
+ spin_lock(&inode_hash_lock);
+ hlist_for_each_entry(inode, head, i_hash) {
+ if (inode->i_sb != sb)
+ continue;
+ mval = match(inode, hashval, data);
+ if (mval == 0)
+ continue;
+ if (mval == 1)
+ ret_inode = inode;
+ goto out;
+ }
+out:
+ spin_unlock(&inode_hash_lock);
+ return ret_inode;
+}
+EXPORT_SYMBOL(find_inode_nowait);
+
int insert_inode_locked(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2b86b5d..ae9a3ea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2418,6 +2418,11 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
extern struct inode * iget_locked(struct super_block *, unsigned long);
+extern struct inode *find_inode_nowait(struct super_block *,
+ unsigned long,
+ int (*match)(struct inode *,
+ unsigned long, void *),
+ void *data);
extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
extern int insert_inode_locked(struct inode *);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
--
2.1.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH-v5 5/5] ext4: add optimization for the lazytime mount option
2014-11-28 6:00 [PATCH-v5 0/5] add support for a lazytime mount option Theodore Ts'o
` (4 preceding siblings ...)
2014-11-28 6:00 ` [PATCH-v5 4/5] vfs: add find_inode_nowait() function Theodore Ts'o
@ 2014-11-28 6:00 ` Theodore Ts'o
2014-11-28 8:55 ` [PATCH-v5 0/5] add support for a " Sedat Dilek
6 siblings, 0 replies; 18+ messages in thread
From: Theodore Ts'o @ 2014-11-28 6:00 UTC (permalink / raw)
To: Ext4 Developers List; +Cc: Linux Filesystem Development List, Theodore Ts'o
Add an optimization for the MS_LAZYTIME mount option so that we will
opportunistically write out any inodes with the I_DIRTY_TIME flag set
in a particular inode table block when we need to update some inode in
that inode table block anyway.
Also add some temporary code so that we can set the lazytime mount
option without needing a modified /sbin/mount program which can set
MS_LAZYTIME. We can eventually make this go away once util-linux has
added support.
Google-Bug-Id: 18297052
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
fs/ext4/inode.c | 66 ++++++++++++++++++++++++++++++++++++++++++---
fs/ext4/super.c | 9 +++++++
include/trace/events/ext4.h | 30 +++++++++++++++++++++
3 files changed, 102 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa4..0e60d90 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4139,6 +4139,66 @@ static int ext4_inode_blocks_set(handle_t *handle,
return 0;
}
+struct other_inode {
+ unsigned long orig_ino;
+ struct ext4_inode *raw_inode;
+};
+
+static int other_inode_match(struct inode * inode, unsigned long ino,
+ void *data)
+{
+ struct other_inode *oi = (struct other_inode *) data;
+
+ if ((inode->i_ino != ino) ||
+ (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+ I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ ((inode->i_state & I_DIRTY_TIME) == 0))
+ return 0;
+ spin_lock(&inode->i_lock);
+ if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+ I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
+ (inode->i_state & I_DIRTY_TIME)) {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ inode->i_state &= ~I_DIRTY_TIME;
+ inode->i_ts_dirty_day = 0;
+ spin_unlock(&inode->i_lock);
+
+ spin_lock(&ei->i_raw_lock);
+ EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
+ EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
+ EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
+ ext4_inode_csum_set(inode, oi->raw_inode, ei);
+ spin_unlock(&ei->i_raw_lock);
+ trace_ext4_other_inode_update_time(inode, oi->orig_ino);
+ return -1;
+ }
+ spin_unlock(&inode->i_lock);
+ return -1;
+}
+
+/*
+ * Opportunistically update the other time fields for other inodes in
+ * the same inode table block.
+ */
+static void ext4_update_other_inodes_time(struct super_block *sb,
+ unsigned long orig_ino, char *buf)
+{
+ struct other_inode oi;
+ unsigned long ino;
+ int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int inode_size = EXT4_INODE_SIZE(sb);
+
+ oi.orig_ino = orig_ino;
+ ino = orig_ino & ~(inodes_per_block - 1);
+ for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
+ if (ino == orig_ino)
+ continue;
+ oi.raw_inode = (struct ext4_inode *) buf;
+ (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+ }
+}
+
/*
* Post the struct inode info into an on-disk inode location in the
* buffer-cache. This gobbles the caller's reference to the
@@ -4237,7 +4297,6 @@ static int ext4_do_update_inode(handle_t *handle,
for (block = 0; block < EXT4_N_BLOCKS; block++)
raw_inode->i_block[block] = ei->i_data[block];
}
-
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
if (ei->i_extra_isize) {
@@ -4248,10 +4307,11 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le16(ei->i_extra_isize);
}
}
-
ext4_inode_csum_set(inode, raw_inode, ei);
-
spin_unlock(&ei->i_raw_lock);
+ if (inode->i_sb->s_flags & MS_LAZYTIME)
+ ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
+ bh->b_data);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 58859bc..93a2b7a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1132,6 +1132,7 @@ enum {
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+ Opt_lazytime, Opt_nolazytime,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
@@ -1195,6 +1196,8 @@ static const match_table_t tokens = {
{Opt_i_version, "i_version"},
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
+ {Opt_lazytime, "lazytime"},
+ {Opt_nolazytime, "nolazytime"},
{Opt_nodelalloc, "nodelalloc"},
{Opt_removed, "mblk_io_submit"},
{Opt_removed, "nomblk_io_submit"},
@@ -1452,6 +1455,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
case Opt_i_version:
sb->s_flags |= MS_I_VERSION;
return 1;
+ case Opt_lazytime:
+ sb->s_flags |= MS_LAZYTIME;
+ return 1;
+ case Opt_nolazytime:
+ sb->s_flags &= ~MS_LAZYTIME;
+ return 1;
}
for (m = ext4_mount_opts; m->token != Opt_err; m++)
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 6cfb841..6e5abd6 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -73,6 +73,36 @@ struct extent_status;
{ FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"})
+TRACE_EVENT(ext4_other_inode_update_time,
+ TP_PROTO(struct inode *inode, ino_t orig_ino),
+
+ TP_ARGS(inode, orig_ino),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( ino_t, orig_ino )
+ __field( uid_t, uid )
+ __field( gid_t, gid )
+ __field( __u16, mode )
+ ),
+
+ TP_fast_assign(
+ __entry->orig_ino = orig_ino;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->uid = i_uid_read(inode);
+ __entry->gid = i_gid_read(inode);
+ __entry->mode = inode->i_mode;
+ ),
+
+ TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->orig_ino,
+ (unsigned long) __entry->ino, __entry->mode,
+ __entry->uid, __entry->gid)
+);
+
TRACE_EVENT(ext4_free_inode,
TP_PROTO(struct inode *inode),
--
2.1.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH-v5 0/5] add support for a lazytime mount option
2014-11-28 6:00 [PATCH-v5 0/5] add support for a lazytime mount option Theodore Ts'o
` (5 preceding siblings ...)
2014-11-28 6:00 ` [PATCH-v5 5/5] ext4: add optimization for the lazytime mount option Theodore Ts'o
@ 2014-11-28 8:55 ` Sedat Dilek
2014-11-28 15:07 ` Theodore Ts'o
2014-11-28 16:32 ` Jan Kara
6 siblings, 2 replies; 18+ messages in thread
From: Sedat Dilek @ 2014-11-28 8:55 UTC (permalink / raw)
To: Theodore Ts'o; +Cc: Ext4 Developers List, Linux Filesystem Development List
On Fri, Nov 28, 2014 at 7:00 AM, Theodore Ts'o <tytso@mit.edu> wrote:
> This is an updated version of what had originally been an
> ext4-specific patch which significantly improves performance by lazily
> writing timestamp updates (and in particular, mtime updates) to disk.
> The in-memory timestamps are always correct, but they are only written
> to disk when required for correctness.
>
> This provides a huge performance boost for ext4 due to how it handles
> journalling, but it's valuable for all file systems running on flash
> storage or drive-managed SMR disks by reducing the metadata write
> load. So upon request, I've moved the functionality to the VFS layer.
> Once the /sbin/mount program adds support for MS_LAZYTIME, all file
> systems should be able to benefit from this optimization.
>
> There is still an ext4-specific optimization, which may be applicable
> for other file systems which store more than one inode in a block, but
> it will require file system specific code. It is purely optional,
> however.
>
> Please note the changes to update_time() and the new write_time() inode
> operations functions, which impact btrfs and xfs. The changes are
> fairly simple, but I would appreciate confirmation from the btrfs and
> xfs teams that I got things right. Thanks!!
>
Some questions... on how to test this...
[ Base ]
Is this patchset on top of ext4-next (ext4.git#dev)? Might someone
test on top of Linux v3.18-rc6 with pulled in ext4.git#dev2?
[ Userland ]
Do I need an updated userland (/sbin/mount)? IOW, adding "lazytime" to
my ext4-line(s) in /etc/fstab is enough?
[ Benchmarks ]
Do you have numbers - how big/fast is the benefit? On a desktop machine?
Thanks in advance.
- Sedat -
> Changes since -v4:
> - Fix ext4 optimization so it does not need to increment (and more
> problematically, decrement) the inode reference count
> - Per Christoph's suggestion, drop support for btrfs and xfs for now,
> issues with how btrfs and xfs handle dirty inode tracking. We can add
> btrfs and xfs support back later or at the end of this series if we
> want to revisit this decision.
> - Miscellaneous cleanups
>
> Changes since -v3:
> - inodes with I_DIRTY_TIME set are placed on a new bdi list,
> b_dirty_time. This allows filesystem-level syncs to more
> easily iterate over those inodes that need to have their
> timestamps written to disk.
> - dirty timestamps will be written out asynchronously on the final
> iput, instead of when the inode gets evicted.
> - separate the definition of the new function
> find_active_inode_nowait() to a separate patch
> - create separate flag masks: I_DIRTY_WB and I_DIRTY_INODE, which
> indicate whether the inode needs to be on the write back lists,
> or whether the inode itself is dirty, while I_DIRTY means any one
> of the inode dirty flags are set. This simplifies the fs
> writeback logic which needs to test for different combinations of
> the inode dirty flags in different places.
>
> Changes since -v2:
> - If update_time() updates i_version, it will not use lazytime (i..e,
> the inode will be marked dirty so the change will be persisted on to
> disk sooner rather than later). Yes, this eliminates the
> benefits of lazytime if the user is experting the file system via
> NFSv4. Sad, but NFS's requirements seem to mandate this.
> - Fix time wrapping bug 49 days after the system boots (on a system
> with a 32-bit jiffies). Use get_monotonic_boottime() instead.
> - Clean up type warning in include/tracing/ext4.h
> - Added explicit parenthesis for stylistic reasons
> - Added an is_readonly() inode operations method so btrfs doesn't
> have to duplicate code in update_time().
>
> Changes since -v1:
> - Added explanatory comments in update_time() regarding i_ts_dirty_days
> - Fix type used for days_since_boot
> - Improve SMP scalability in update_time and ext4_update_other_inodes_time
> - Added tracepoints to help test and characterize how often and under
> what circumstances inodes have their timestamps lazily updated
>
> Theodore Ts'o (5):
> vfs: add support for a lazytime mount option
> vfs: don't let the dirty time inodes get more than a day stale
> vfs: add lazytime tracepoints for better debugging
> vfs: add find_inode_nowait() function
> ext4: add optimization for the lazytime mount option
>
> fs/ext4/inode.c | 66 +++++++++++++++++++++++--
> fs/ext4/super.c | 9 ++++
> fs/fs-writeback.c | 66 ++++++++++++++++++++++---
> fs/inode.c | 116 +++++++++++++++++++++++++++++++++++++++++---
> fs/libfs.c | 2 +-
> fs/logfs/readwrite.c | 2 +-
> fs/nfsd/vfs.c | 2 +-
> fs/pipe.c | 2 +-
> fs/proc_namespace.c | 1 +
> fs/sync.c | 8 +++
> fs/ufs/truncate.c | 2 +-
> include/linux/backing-dev.h | 1 +
> include/linux/fs.h | 17 ++++++-
> include/trace/events/ext4.h | 30 ++++++++++++
> include/trace/events/fs.h | 56 +++++++++++++++++++++
> include/uapi/linux/fs.h | 1 +
> mm/backing-dev.c | 10 +++-
> 17 files changed, 367 insertions(+), 24 deletions(-)
> create mode 100644 include/trace/events/fs.h
>
> --
> 2.1.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH-v5 0/5] add support for a lazytime mount option
2014-11-28 8:55 ` [PATCH-v5 0/5] add support for a " Sedat Dilek
@ 2014-11-28 15:07 ` Theodore Ts'o
2014-11-28 16:32 ` Jan Kara
1 sibling, 0 replies; 18+ messages in thread
From: Theodore Ts'o @ 2014-11-28 15:07 UTC (permalink / raw)
To: Sedat Dilek; +Cc: Ext4 Developers List, Linux Filesystem Development List
On Fri, Nov 28, 2014 at 09:55:19AM +0100, Sedat Dilek wrote:
> Some questions... on how to test this...
>
> [ Base ]
> Is this patchset on top of ext4-next (ext4.git#dev)? Might someone
> test on top of Linux v3.18-rc6 with pulled in ext4.git#dev2?
Yes and yes. I've been doing xfstest runs using the dev2 branch, and
my laptop has been running mainline with the ext4.git#dev plus
lazytime patches (i.e., ext4.git#dev2) pulled in.
> [ Userland ]
> Do I need an updated userland (/sbin/mount)? IOW, adding "lazytime" to
> my ext4-line(s) in /etc/fstab is enough?
There is some temporary infrastructure for ext4 so that adding
"lazytime" to the ext4 line should be enough.
> [ Benchmarks ]
> Do you have numbers - how big/fast is the benefit? On a desktop machine?
For a typical desktop machine workload, I doubt you will notice a
difference. The tracepoints allow you to see how much work can get
delayed. What I've been doing on my laptop:
for i in / /u1 /build ; do mount -o remount,lazytime $i; done
cd /sys/kernel/debug/tracing
echo fs:fs_lazytime_defer > set_event
echo fs:fs_lazytime_iput > set_event
echo fs:fs_lazytime_flush > set_event
echo ext4:ext4_other_inode_update_time >> set_event
cat trace_pipe
The place where this will show a big benefit is where you are doing
non-allocating writes to a file --- for example, if you have a large
database (or possibly a VM image file) where you are sending random
writes to the file. Without this patch, every five seconds the
inode's mtime field will get updated on every write (if you are using
256 byte inodes, and thus have fine-grained timestamps) or once a
second (if you are using 128 byte inodes). These inode timestamp
updates will be sent to disk once every five seconds, either via
ext4's journaling mechanisms, or if you are running in no-journal
mode, via the dirty inode writeback.
How does this manifest in numbers? Well, if you have journalling
enabled, the metadata updates are journalled, so if you are running a
sufficiently aggressive workload on a large SMP system, you can see
the journal lock contention. If you were to set up a benchmarking
configuration with a ramdisk on a 16+ core machine, and used a fio job
writing to a large file on said ramdisk from all the cores in
parallel, I'm sure you would see the effects.
For people who care about long tail latency, interference effects from
journal commits (especially if you have other file system traffic
happening at the same time), or from the journal checkpoint would also
be measurable. Dmitry has measured this and had been looking at this
as a performance bug using some kind of an AIO workload, but I don't
have more details about this.
If you aren't using the journal, and you care about long tail latency
very much, it turns out that HDD's do have an effect similar to the
SSD write disturb effect, where random writes to the same block will
eventually cause the HDD to need to rewrite a number of tracks around
a particular area of disk which is constantly getting hammered. This
is around 100ms, and is very easily measurable if you have a set up
that is looking for long-tail latency effects and are trying to
control them. (This is what originally inspired this patch set.)
>From my usage on my desktop, looking at the tracepoint data, you will
also see fs_lazytime_defer traces from relatime updates (where the
atime is older than a day, or when atime <= mtime), and from compile
runs where the linker writes the ELF header after writing every thing
else --- it turns out the libbfd linker has a terrible random write
pattern, good for stress testing NFS servers and delayed allocation
algorithms. :-)
In the latter case, upon closer inspection the mtime in memory is a
millisecond or so newer than the time on disk (since if you do an
allocating write which changes i_size or i_blocks, the contents of the
inode will be sent to disk, including the mtime at the time).
However, whether you look at the on-disk or in-memory mtime for said
generated foo.o file, either will be newer than the source foo.c file,
so even if we crash without the slightly stale mtime not getting saved
to disk, it shouldn't cause any problem in actual practice. (The
safety question has come up in some internal discussions, so I've
looked at this question fairly carefully.)
[ I can imagine an ext4 optimization where as part of the commit
process, we check all of the inode tables about to be commited, and
capture the updated inode times from any lazily updated timestamps,
which would avoid this issue. This would be an even more aggressive
optimization than what is currently in the patch set which does this
whenever an adjacent inode in the same inode table block is updated.
I'm not sure trying to do the commit-time optimization is worth it,
however. ]
Anyway, the deferred relatime updates are thus the main real benefit
you would see on a desktop workload (since the mtime updates from the
final linker write would typically end up in the same commit, so it
really doesn't avoid a HDD write op). And while these random writes
from the relatime updates aren't _nice_ for a SSD in terms of
performance or write endurance, for most desktop workloads the effect
isn't going to be great enough to be measurable. Things might be
different on a drive-managed SMR disk, or on eMMC flash or SD cards
with a really crappy flash translation layer, but I haven't had a
chance to look at the effect on those types of storage media to date.
Cheers,
- Ted
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH-v5 0/5] add support for a lazytime mount option
2014-11-28 8:55 ` [PATCH-v5 0/5] add support for a " Sedat Dilek
2014-11-28 15:07 ` Theodore Ts'o
@ 2014-11-28 16:32 ` Jan Kara
1 sibling, 0 replies; 18+ messages in thread
From: Jan Kara @ 2014-11-28 16:32 UTC (permalink / raw)
To: Sedat Dilek
Cc: Theodore Ts'o, Ext4 Developers List,
Linux Filesystem Development List
On Fri 28-11-14 09:55:19, Sedat Dilek wrote:
> On Fri, Nov 28, 2014 at 7:00 AM, Theodore Ts'o <tytso@mit.edu> wrote:
> > This is an updated version of what had originally been an
> > ext4-specific patch which significantly improves performance by lazily
> > writing timestamp updates (and in particular, mtime updates) to disk.
> > The in-memory timestamps are always correct, but they are only written
> > to disk when required for correctness.
> >
> > This provides a huge performance boost for ext4 due to how it handles
> > journalling, but it's valuable for all file systems running on flash
> > storage or drive-managed SMR disks by reducing the metadata write
> > load. So upon request, I've moved the functionality to the VFS layer.
> > Once the /sbin/mount program adds support for MS_LAZYTIME, all file
> > systems should be able to benefit from this optimization.
> >
> > There is still an ext4-specific optimization, which may be applicable
> > for other file systems which store more than one inode in a block, but
> > it will require file system specific code. It is purely optional,
> > however.
> >
> > Please note the changes to update_time() and the new write_time() inode
> > operations functions, which impact btrfs and xfs. The changes are
> > fairly simple, but I would appreciate confirmation from the btrfs and
> > xfs teams that I got things right. Thanks!!
> >
>
> Some questions... on how to test this...
>
> [ Base ]
> Is this patchset on top of ext4-next (ext4.git#dev)? Might someone
> test on top of Linux v3.18-rc6 with pulled in ext4.git#dev2?
>
> [ Userland ]
> Do I need an updated userland (/sbin/mount)? IOW, adding "lazytime" to
> my ext4-line(s) in /etc/fstab is enough?
>
> [ Benchmarks ]
> Do you have numbers - how big/fast is the benefit? On a desktop machine?
Actually a benefit you may notice on a laptop machine is that disk will
wake up less often. When I was looking for reasons of disk wakeup on a
desktop machine, some of these were mtime updates of unix socket inodes.
This patches will make them go away.
Honza
>
> Thanks in advance.
>
> - Sedat -
>
> > Changes since -v4:
> > - Fix ext4 optimization so it does not need to increment (and more
> > problematically, decrement) the inode reference count
> > - Per Christoph's suggestion, drop support for btrfs and xfs for now,
> > issues with how btrfs and xfs handle dirty inode tracking. We can add
> > btrfs and xfs support back later or at the end of this series if we
> > want to revisit this decision.
> > - Miscellaneous cleanups
> >
> > Changes since -v3:
> > - inodes with I_DIRTY_TIME set are placed on a new bdi list,
> > b_dirty_time. This allows filesystem-level syncs to more
> > easily iterate over those inodes that need to have their
> > timestamps written to disk.
> > - dirty timestamps will be written out asynchronously on the final
> > iput, instead of when the inode gets evicted.
> > - separate the definition of the new function
> > find_active_inode_nowait() to a separate patch
> > - create separate flag masks: I_DIRTY_WB and I_DIRTY_INODE, which
> > indicate whether the inode needs to be on the write back lists,
> > or whether the inode itself is dirty, while I_DIRTY means any one
> > of the inode dirty flags are set. This simplifies the fs
> > writeback logic which needs to test for different combinations of
> > the inode dirty flags in different places.
> >
> > Changes since -v2:
> > - If update_time() updates i_version, it will not use lazytime (i..e,
> > the inode will be marked dirty so the change will be persisted on to
> > disk sooner rather than later). Yes, this eliminates the
> > benefits of lazytime if the user is experting the file system via
> > NFSv4. Sad, but NFS's requirements seem to mandate this.
> > - Fix time wrapping bug 49 days after the system boots (on a system
> > with a 32-bit jiffies). Use get_monotonic_boottime() instead.
> > - Clean up type warning in include/tracing/ext4.h
> > - Added explicit parenthesis for stylistic reasons
> > - Added an is_readonly() inode operations method so btrfs doesn't
> > have to duplicate code in update_time().
> >
> > Changes since -v1:
> > - Added explanatory comments in update_time() regarding i_ts_dirty_days
> > - Fix type used for days_since_boot
> > - Improve SMP scalability in update_time and ext4_update_other_inodes_time
> > - Added tracepoints to help test and characterize how often and under
> > what circumstances inodes have their timestamps lazily updated
> >
> > Theodore Ts'o (5):
> > vfs: add support for a lazytime mount option
> > vfs: don't let the dirty time inodes get more than a day stale
> > vfs: add lazytime tracepoints for better debugging
> > vfs: add find_inode_nowait() function
> > ext4: add optimization for the lazytime mount option
> >
> > fs/ext4/inode.c | 66 +++++++++++++++++++++++--
> > fs/ext4/super.c | 9 ++++
> > fs/fs-writeback.c | 66 ++++++++++++++++++++++---
> > fs/inode.c | 116 +++++++++++++++++++++++++++++++++++++++++---
> > fs/libfs.c | 2 +-
> > fs/logfs/readwrite.c | 2 +-
> > fs/nfsd/vfs.c | 2 +-
> > fs/pipe.c | 2 +-
> > fs/proc_namespace.c | 1 +
> > fs/sync.c | 8 +++
> > fs/ufs/truncate.c | 2 +-
> > include/linux/backing-dev.h | 1 +
> > include/linux/fs.h | 17 ++++++-
> > include/trace/events/ext4.h | 30 ++++++++++++
> > include/trace/events/fs.h | 56 +++++++++++++++++++++
> > include/uapi/linux/fs.h | 1 +
> > mm/backing-dev.c | 10 +++-
> > 17 files changed, 367 insertions(+), 24 deletions(-)
> > create mode 100644 include/trace/events/fs.h
> >
> > --
> > 2.1.0
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 18+ messages in thread