From: Fengguang Wu <wfg@mail.ustc.edu.cn>
To: Andrew Morton <akpm@osdl.org>
Cc: Ken Chen <kenchen@google.com>, Mike Waychison <mikew@google.com>,
Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Subject: [PATCH 5/6] check dirty inode list
Date: Sun, 19 Aug 2007 14:53:54 +0800 [thread overview]
Message-ID: <387507685.60942@ustc.edu.cn> (raw)
Message-ID: <20070819071445.006427549@mail.ustc.edu.cn> (raw)
In-Reply-To: 20070819065349.160284305@mail.ustc.edu.cn
[-- Attachment #1: check_dirty_inode_list.patch --]
[-- Type: text/plain, Size: 6004 bytes --]
From: Andrew Morton <akpm@linux-foundation.org>
The per-superblock dirty-inode list super_block.s_dirty is supposed to be
sorted in reverse order of each inode's time-of-first-dirtying. This is so
that the kupdate function can avoid having to walk all the dirty inodes on the
list: it terminates the search as soon as it finds an inode which was dirtied
less than 30 seconds ago (dirty_expire_centisecs).
We have a bunch of several-year-old bugs which cause that list to not be in
the correct reverse-time-order. The result of this is that under certain
obscure circumstances, inodes get stuck and basically never get written back.
It has been reported a couple of times, but nobody really cared much because
most people use ordered-mode journalling filesystems, which take care of the
writeback independently. Plus we will _eventually_ get onto these inodes even
when the list is out of order, and a /bin/sync will still work OK.
However this is a pretty important data-integrity issue for filesystems such
as ext2.
As preparation for fixing these bugs, this patch adds a pile of fantastically
expensive debugging code which checks the sanity of the s_dirty list all over
the place, so we find out as soon as it goes bad.
The debugging code is controlled by /proc/sys/fs/inode_debug, which defaults
to off. The debugging will disable itself whenever it detects a misordering,
to avoid log spew.
We can remove all this code later.
Cc: Mike Waychison <mikew@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/fs-writeback.c | 77 ++++++++++++++++++++++++++++++++++++
include/linux/writeback.h | 1
kernel/sysctl.c | 8 +++
3 files changed, 86 insertions(+)
--- linux-2.6.23-rc2-mm2.orig/fs/fs-writeback.c
+++ linux-2.6.23-rc2-mm2/fs/fs-writeback.c
@@ -24,6 +24,75 @@
#include <linux/buffer_head.h>
#include "internal.h"
+int sysctl_inode_debug __read_mostly; /* nonzero enables the list checks below; wired to the "inode_debug" entry in fs_table */
+ /* __check - walk a list tail-first; either dump every entry or return 1 on a dirtied_when misordering. */
+static int __check(struct list_head *head, int print_stuff)
+{
+ struct list_head *cursor = head;
+ unsigned long dirtied_when = 0; /* stamp of the previously visited entry; 0 doubles as "none yet" */
+
+ while ((cursor = cursor->prev) != head) { /* follow ->prev until we are back at head */
+ struct inode *inode = list_entry(cursor, struct inode, i_list);
+ if (print_stuff) { /* dump mode: print only, never compare */
+ printk("%p:%lu\n", inode, inode->dirtied_when);
+ } else {
+ if (dirtied_when && /* NOTE: a previous stamp of exactly 0 skips this comparison */
+ time_before(inode->dirtied_when, dirtied_when))
+ return 1; /* this entry's stamp is earlier than the one visited just before it */
+ dirtied_when = inode->dirtied_when;
+ }
+ }
+ return 0; /* no misordering seen; always the result in dump mode */
+}
+ /* Check sb's s_dirty, s_io and s_more_io; on a misordering clear the debug switch, report, and dump that list. */
+static void __check_dirty_inode_list(struct super_block *sb,
+ struct inode *inode, const char *file, int line) /* inode may be NULL; file/line identify the call site */
+{
+ if (!sysctl_inode_debug)
+ return;
+
+ if (__check(&sb->s_dirty, 0)) {
+ sysctl_inode_debug = 0; /* later calls return early; the lists below are still checked in this call */
+ if (inode)
+ printk("%s:%d: s_dirty got screwed up. inode=%p:%lu\n",
+ file, line, inode, inode->dirtied_when);
+ else
+ printk("%s:%d: s_dirty got screwed up\n", file, line);
+ __check(&sb->s_dirty, 1); /* second pass in dump mode: print the whole list */
+ }
+ if (__check(&sb->s_io, 0)) { /* same check/report/dump sequence for s_io */
+ sysctl_inode_debug = 0;
+ if (inode)
+ printk("%s:%d: s_io got screwed up. inode=%p:%lu\n",
+ file, line, inode, inode->dirtied_when);
+ else
+ printk("%s:%d: s_io got screwed up\n", file, line);
+ __check(&sb->s_io, 1);
+ }
+ if (__check(&sb->s_more_io, 0)) { /* same check/report/dump sequence for s_more_io */
+ sysctl_inode_debug = 0;
+ if (inode)
+ printk("%s:%d: s_more_io got screwed up. inode=%p:%lu\n",
+ file, line, inode, inode->dirtied_when);
+ else
+ printk("%s:%d: s_more_io got screwed up\n", file, line);
+ __check(&sb->s_more_io, 1);
+ }
+}
+ /* Check the lists of @sb with no inode to name in the report; does nothing while sysctl_inode_debug is 0. */
+#define check_dirty_inode_list(sb) \
+ do { \
+ if (unlikely(sysctl_inode_debug)) \
+ __check_dirty_inode_list(sb, NULL, __FILE__, __LINE__); \
+ } while (0)
+ /* Check the lists of @inode's superblock, naming @inode in any report; expands @inode twice, so pass a side-effect-free expression. */
+#define check_dirty_inode(inode) \
+ do { \
+ if (unlikely(sysctl_inode_debug)) \
+ __check_dirty_inode_list((inode)->i_sb, (inode), \
+ __FILE__, __LINE__); \
+ } while (0)
+
/**
* __mark_inode_dirty - internal function
* @inode: inode to mark
@@ -122,8 +191,10 @@ void __mark_inode_dirty(struct inode *in
* reposition it (that would break s_dirty time-ordering).
*/
if (!was_dirty) {
+ check_dirty_inode(inode);
inode->dirtied_when = jiffies;
list_move(&inode->i_list, &sb->s_dirty);
+ check_dirty_inode(inode);
}
}
out:
@@ -152,6 +223,7 @@ static void redirty_tail(struct inode *i
{
struct super_block *sb = inode->i_sb;
+ check_dirty_inode(inode);
if (!list_empty(&sb->s_dirty)) {
struct inode *tail_inode;
@@ -161,6 +233,7 @@ static void redirty_tail(struct inode *i
inode->dirtied_when = jiffies;
}
list_move(&inode->i_list, &sb->s_dirty);
+ check_dirty_inode(inode);
}
/*
@@ -168,7 +241,9 @@ static void redirty_tail(struct inode *i
*/
static void requeue_io(struct inode *inode)
{
+ check_dirty_inode(inode);
list_move(&inode->i_list, &inode->i_sb->s_more_io);
+ check_dirty_inode(inode);
}
static void inode_sync_complete(struct inode *inode)
@@ -463,8 +538,10 @@ int generic_sync_sb_inodes(struct super_
if (!ret)
ret = err;
if (wbc->sync_mode == WB_SYNC_HOLD) {
+ check_dirty_inode(inode);
inode->dirtied_when = jiffies;
list_move(&inode->i_list, &sb->s_dirty);
+ check_dirty_inode(inode);
}
if (current_is_pdflush())
writeback_release(bdi);
--- linux-2.6.23-rc2-mm2.orig/include/linux/writeback.h
+++ linux-2.6.23-rc2-mm2/include/linux/writeback.h
@@ -140,5 +140,6 @@ void writeback_set_ratelimit(void);
extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
read-only. */
+extern int sysctl_inode_debug;
#endif /* WRITEBACK_H */
--- linux-2.6.23-rc2-mm2.orig/kernel/sysctl.c
+++ linux-2.6.23-rc2-mm2/kernel/sysctl.c
@@ -1238,6 +1238,14 @@ static struct ctl_table fs_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "inode_debug",
+ .data = &sysctl_inode_debug,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
{
.ctl_name = CTL_UNNUMBERED,
--
next prev parent reply other threads:[~2007-08-19 7:16 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-08-19 6:53 [PATCH 0/6] dirty inode lists time delay/ordering fixes Fengguang Wu
2007-08-19 6:53 ` Fengguang Wu
2007-08-19 6:53 ` [PATCH 1/6] writeback: fix time ordering of the per superblock inode lists 8 Fengguang Wu
2007-08-19 6:53 ` Fengguang Wu
2007-08-19 6:53 ` [PATCH 2/6] writeback: fix ntfs with sb_has_dirty_inodes() Fengguang Wu
2007-08-19 6:53 ` Fengguang Wu
2007-08-19 6:53 ` [PATCH 3/6] writeback: remove pages_skipped accounting in __block_write_full_page() Fengguang Wu
2007-08-19 6:53 ` Fengguang Wu
2007-08-19 6:53 ` [PATCH 4/6] writeback: introduce writeback_control.more_io to indicate more io Fengguang Wu
2007-08-19 6:53 ` Fengguang Wu
2007-08-19 6:53 ` Fengguang Wu [this message]
2007-08-19 6:53 ` [PATCH 5/6] check dirty inode list Fengguang Wu
2007-08-19 6:53 ` [PATCH 6/6] prevent time-ordering warnings Fengguang Wu
2007-08-19 6:53 ` Fengguang Wu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=387507685.60942@ustc.edu.cn \
--to=wfg@mail.ustc.edu.cn \
--cc=akpm@linux-foundation.org \
--cc=akpm@osdl.org \
--cc=kenchen@google.com \
--cc=mikew@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.