public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] adding per sb inode list to make invalidate_inodes() faster
@ 2004-09-09 15:39 Kirill Korotaev
  2004-09-09 15:51 ` Linus Torvalds
  0 siblings, 1 reply; 14+ messages in thread
From: Kirill Korotaev @ 2004-09-09 15:39 UTC (permalink / raw)
  To: akpm, torvalds, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 596 bytes --]

I sent this patch once before for 2.4 kernels, here is the version for 
2.6.8.1.

This patch fixes the problem that huge inode cache
can make invalidate_inodes() calls very long. Meanwhile
lock_kernel() and inode_lock are being held and the system
can freeze for seconds.
I detected this problem on kernel with 4gb split. When inode cache
was 2.5GB (1,000,000 inodes in cache) it took seconds to umount
or turn quota off.

So this patch introduces a per-super-block inode list which makes
it possible to scan only the sb's own inodes when doing umount.

Signed-Off-By: Kirill Korotaev <dev@sw.ru>

Kirill


[-- Attachment #2: diff-inode-invalidate-20040908 --]
[-- Type: text/plain, Size: 6022 bytes --]

--- ./include/linux/fs.h.invl	2004-08-14 14:55:09.000000000 +0400
+++ ./include/linux/fs.h	2004-09-08 18:47:46.989249792 +0400
@@ -419,6 +419,7 @@ static inline int mapping_writably_mappe
 struct inode {
 	struct hlist_node	i_hash;
 	struct list_head	i_list;
+	struct list_head	i_sb_list;
 	struct list_head	i_dentry;
 	unsigned long		i_ino;
 	atomic_t		i_count;
@@ -750,6 +751,7 @@ struct super_block {
 	atomic_t		s_active;
 	void                    *s_security;
 
+	struct list_head	s_inodes;	/* all inodes */
 	struct list_head	s_dirty;	/* dirty inodes */
 	struct list_head	s_io;		/* parked for writeback */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
--- ./fs/hugetlbfs/inode.c.invl	2004-08-14 14:56:14.000000000 +0400
+++ ./fs/hugetlbfs/inode.c	2004-09-08 19:17:05.787871760 +0400
@@ -198,6 +198,7 @@ static void hugetlbfs_delete_inode(struc
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
 
 	hlist_del_init(&inode->i_hash);
+	list_del(&inode->i_sb_list);
 	list_del_init(&inode->i_list);
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
@@ -240,6 +241,7 @@ static void hugetlbfs_forget_inode(struc
 	inodes_stat.nr_unused--;
 	hlist_del_init(&inode->i_hash);
 out_truncate:
+	list_del(&inode->i_sb_list);
 	list_del_init(&inode->i_list);
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
--- ./fs/inode.c.invl	2004-09-08 18:35:48.082540224 +0400
+++ ./fs/inode.c	2004-09-08 18:57:45.873205592 +0400
@@ -295,7 +295,7 @@ static void dispose_list(struct list_hea
 /*
  * Invalidate all inodes for a device.
  */
-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
+static int invalidate_list(struct list_head *head, struct list_head * dispose)
 {
 	struct list_head *next;
 	int busy = 0, count = 0;
@@ -308,12 +308,11 @@ static int invalidate_list(struct list_h
 		next = next->next;
 		if (tmp == head)
 			break;
-		inode = list_entry(tmp, struct inode, i_list);
-		if (inode->i_sb != sb)
-			continue;
+		inode = list_entry(tmp, struct inode, i_sb_list);
 		invalidate_inode_buffers(inode);
 		if (!atomic_read(&inode->i_count)) {
 			hlist_del_init(&inode->i_hash);
+			list_del(&inode->i_sb_list);
 			list_move(&inode->i_list, dispose);
 			inode->i_state |= I_FREEING;
 			count++;
@@ -349,10 +348,7 @@ int invalidate_inodes(struct super_block
 
 	down(&iprune_sem);
 	spin_lock(&inode_lock);
-	busy = invalidate_list(&inode_in_use, sb, &throw_away);
-	busy |= invalidate_list(&inode_unused, sb, &throw_away);
-	busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
-	busy |= invalidate_list(&sb->s_io, sb, &throw_away);
+	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
 
 	dispose_list(&throw_away);
@@ -452,6 +448,7 @@ static void prune_icache(int nr_to_scan)
 				continue;
 		}
 		hlist_del_init(&inode->i_hash);
+		list_del(&inode->i_sb_list);
 		list_move(&inode->i_list, &freeable);
 		inode->i_state |= I_FREEING;
 		nr_pruned++;
@@ -561,6 +558,7 @@ struct inode *new_inode(struct super_blo
 	if (inode) {
 		spin_lock(&inode_lock);
 		inodes_stat.nr_inodes++;
+		list_add(&inode->i_sb_list, &sb->s_inodes);
 		list_add(&inode->i_list, &inode_in_use);
 		inode->i_ino = ++last_ino;
 		inode->i_state = 0;
@@ -609,6 +607,7 @@ static struct inode * get_new_inode(stru
 				goto set_failed;
 
 			inodes_stat.nr_inodes++;
+			list_add(&inode->i_sb_list, &sb->s_inodes);
 			list_add(&inode->i_list, &inode_in_use);
 			hlist_add_head(&inode->i_hash, head);
 			inode->i_state = I_LOCK|I_NEW;
@@ -657,6 +656,7 @@ static struct inode * get_new_inode_fast
 		if (!old) {
 			inode->i_ino = ino;
 			inodes_stat.nr_inodes++;
+			list_add(&inode->i_sb_list, &sb->s_inodes);
 			list_add(&inode->i_list, &inode_in_use);
 			hlist_add_head(&inode->i_hash, head);
 			inode->i_state = I_LOCK|I_NEW;
@@ -993,6 +993,7 @@ void generic_delete_inode(struct inode *
 {
 	struct super_operations *op = inode->i_sb->s_op;
 
+	list_del(&inode->i_sb_list);
 	list_del_init(&inode->i_list);
 	inode->i_state|=I_FREEING;
 	inodes_stat.nr_inodes--;
@@ -1038,6 +1039,7 @@ static void generic_forget_inode(struct 
 		inodes_stat.nr_unused--;
 		hlist_del_init(&inode->i_hash);
 	}
+	list_del(&inode->i_sb_list);
 	list_del_init(&inode->i_list);
 	inode->i_state|=I_FREEING;
 	inodes_stat.nr_inodes--;
@@ -1230,33 +1232,15 @@ int remove_inode_dquot_ref(struct inode 
 void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree_head)
 {
 	struct inode *inode;
-	struct list_head *act_head;
 
 	if (!sb->dq_op)
 		return;	/* nothing to do */
-	spin_lock(&inode_lock);	/* This lock is for inodes code */
 
+	spin_lock(&inode_lock);	/* This lock is for inodes code */
 	/* We hold dqptr_sem so we are safe against the quota code */
-	list_for_each(act_head, &inode_in_use) {
-		inode = list_entry(act_head, struct inode, i_list);
-		if (inode->i_sb == sb && !IS_NOQUOTA(inode))
-			remove_inode_dquot_ref(inode, type, tofree_head);
-	}
-	list_for_each(act_head, &inode_unused) {
-		inode = list_entry(act_head, struct inode, i_list);
-		if (inode->i_sb == sb && !IS_NOQUOTA(inode))
-			remove_inode_dquot_ref(inode, type, tofree_head);
-	}
-	list_for_each(act_head, &sb->s_dirty) {
-		inode = list_entry(act_head, struct inode, i_list);
-		if (!IS_NOQUOTA(inode))
-			remove_inode_dquot_ref(inode, type, tofree_head);
-	}
-	list_for_each(act_head, &sb->s_io) {
-		inode = list_entry(act_head, struct inode, i_list);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
 		if (!IS_NOQUOTA(inode))
 			remove_inode_dquot_ref(inode, type, tofree_head);
-	}
 	spin_unlock(&inode_lock);
 }
 
--- ./fs/super.c.invl	2004-08-14 14:55:22.000000000 +0400
+++ ./fs/super.c	2004-09-08 18:46:47.227334984 +0400
@@ -65,6 +65,7 @@ static struct super_block *alloc_super(v
 		}
 		INIT_LIST_HEAD(&s->s_dirty);
 		INIT_LIST_HEAD(&s->s_io);
+		INIT_LIST_HEAD(&s->s_inodes);
 		INIT_LIST_HEAD(&s->s_files);
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-09 15:39 [PATCH] adding per sb inode list to make invalidate_inodes() faster Kirill Korotaev
@ 2004-09-09 15:51 ` Linus Torvalds
  2004-09-09 17:19   ` William Lee Irwin III
  2004-09-10  8:32   ` Kirill Korotaev
  0 siblings, 2 replies; 14+ messages in thread
From: Linus Torvalds @ 2004-09-09 15:51 UTC (permalink / raw)
  To: Kirill Korotaev; +Cc: akpm, linux-kernel



On Thu, 9 Sep 2004, Kirill Korotaev wrote:
> 
> This patch fixes the problem that huge inode cache
> can make invalidate_inodes() calls very long. Meanwhile
> lock_kernel() and inode_lock are being held and the system
> can freeze for seconds.

Hmm.. I don't mind the approach per se, but I get very nervous about the 
fact that I don't see any initialization of "inode->i_sb_list".

Yes, you do a

	list_add(&inode->i_sb_list, &sb->s_inodes);

in new_inode(), but there are a ton of users that allocate inodes other 
ways, and more importantly, even if this was the only allocation function, 
you do various "list_del(&inode->i_sb_list)" things which leaves the inode 
around but with an invalid superblock list.

So at the very _least_, you should document why all of this is safe very 
carefully (I get nervous about fundamental FS infrastructure changes), and 
it should be left to simmer in -mm for a longish time to make sure it 
really works..

Call me chicken.

		Linus "tweet tweet" Torvalds

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-09 15:51 ` Linus Torvalds
@ 2004-09-09 17:19   ` William Lee Irwin III
  2004-09-09 18:06     ` Andrew Morton
  2004-09-10  8:32   ` Kirill Korotaev
  1 sibling, 1 reply; 14+ messages in thread
From: William Lee Irwin III @ 2004-09-09 17:19 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Kirill Korotaev, akpm, linux-kernel

On Thu, Sep 09, 2004 at 08:51:45AM -0700, Linus Torvalds wrote:
> Hmm.. I don't mind the approach per se, but I get very nervous about the 
> fact that I don't see any initialization of "inode->i_sb_list".
> Yes, you do a
> 	list_add(&inode->i_sb_list, &sb->s_inodes);
> in new_inode(), but there are a ton of users that allocate inodes other 
> ways, and more importantly, even if this was the only allocation function, 
> you do various "list_del(&inode->i_sb_list)" things which leaves the inode 
> around but with an invalid superblock list.
> So at the very _least_, you should document why all of this is safe very 
> carefully (I get nervous about fundamental FS infrastructure changes), and 
> it should be left to simmer in -mm for a longish time to make sure it 
> really works..
> Call me chicken.

Some version of this patch has been in 2.6.x-mm for a long while. I've
not reviewed this version of the patch for differences with the -mm
code. It would probably be best to look at the -mm bits as they've had
sustained exposure for quite some time.


-- wli

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-09 17:19   ` William Lee Irwin III
@ 2004-09-09 18:06     ` Andrew Morton
  2004-09-09 18:18       ` William Lee Irwin III
  0 siblings, 1 reply; 14+ messages in thread
From: Andrew Morton @ 2004-09-09 18:06 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: torvalds, dev, linux-kernel

William Lee Irwin III <wli@holomorphy.com> wrote:
>
> On Thu, Sep 09, 2004 at 08:51:45AM -0700, Linus Torvalds wrote:
>  > Hmm.. I don't mind the approach per se, but I get very nervous about the 
>  > fact that I don't see any initialization of "inode->i_sb_list".
>  > Yes, you do a
>  > 	list_add(&inode->i_sb_list, &sb->s_inodes);
>  > in new_inode(), but there are a ton of users that allocate inodes other 
>  > ways, and more importantly, even if this was the only allocation function, 
>  > you do various "list_del(&inode->i_sb_list)" things which leaves the inode 
>  > around but with an invalid superblock list.
>  > So at the very _least_, you should document why all of this is safe very 
>  > carefully (I get nervous about fundamental FS infrastructure changes), and 
>  > it should be left to simmer in -mm for a longish time to make sure it 
>  > really works..
>  > Call me chicken.
> 
>  Some version of this patch has been in 2.6.x-mm for a long while.

One year.

> I've
>  not reviewed this version of the patch for differences with the -mm
>  code. It would probably be best to look at the -mm bits as they've had
>  sustained exposure for quite some time.

Yes.

I have not merged it up because it seems rather dopey to add eight bytes to
the inode to speed up something as rare as umount.

Is there a convincing reason for proceeding with the change?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-09 18:06     ` Andrew Morton
@ 2004-09-09 18:18       ` William Lee Irwin III
  2004-09-09 19:08         ` Andrew Morton
  0 siblings, 1 reply; 14+ messages in thread
From: William Lee Irwin III @ 2004-09-09 18:18 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, dev, linux-kernel

William Lee Irwin III <wli@holomorphy.com> wrote:
>> I've not reviewed this version of the patch for differences with the -mm
>> code. It would probably be best to look at the -mm bits as they've had
>> sustained exposure for quite some time.

On Thu, Sep 09, 2004 at 11:06:22AM -0700, Andrew Morton wrote:
> Yes.
> I have not merged it up because it seems rather dopey to add eight bytes to
> the inode to speed up something as rare as umount.
> Is there a convincing reason for proceeding with the change?

The only motive I'm aware of is for latency in the presence of things
such as autofs. It's also worth noting that in the presence of things
such as removable media umount is also much more common. I personally
find this sufficiently compelling. Kirill may have additional ammunition.

Also, the additional sizeof(struct list_head) is only a requirement
while the global inode LRU is maintained. I believed it would have
been beneficial to have localized the LRU to the sb also, which would
have maintained sizeof(struct inode) at parity with current mainline.


-- wli

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-09 18:18       ` William Lee Irwin III
@ 2004-09-09 19:08         ` Andrew Morton
  2004-09-09 19:35           ` William Lee Irwin III
  2004-09-10  8:54           ` Kirill Korotaev
  0 siblings, 2 replies; 14+ messages in thread
From: Andrew Morton @ 2004-09-09 19:08 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: torvalds, dev, linux-kernel

William Lee Irwin III <wli@holomorphy.com> wrote:
>
> On Thu, Sep 09, 2004 at 11:06:22AM -0700, Andrew Morton wrote:
>  > Yes.
>  > I have not merged it up because it seems rather dopey to add eight bytes to
>  > the inode to speed up something as rare as umount.
>  > Is there a convincing reason for proceeding with the change?
> 
>  The only motive I'm aware of is for latency in the presence of things
>  such as autofs. It's also worth noting that in the presence of things
>  such as removable media umount is also much more common. I personally
>  find this sufficiently compelling. Kirill may have additional ammunition.

Well.  That's why I'm keeping the patch alive-but-unmerged.  Waiting to see
who wants it.

There are people who have large machines which are automounting hundreds of
different NFS servers.  I'd certainly expect such a machine to experience
ongoing umount glitches.  But no reports have yet been sighted by this
little black duck.

>  Also, the additional sizeof(struct list_head) is only a requirement
>  while the global inode LRU is maintained. I believed it would have
>  been beneficial to have localized the LRU to the sb also, which would
>  have maintained sizeof(struct inode) at parity with current mainline.

Could be.  We would give each superblock its own shrinker callback and
everything should balance out nicely (hah).

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-09 19:08         ` Andrew Morton
@ 2004-09-09 19:35           ` William Lee Irwin III
  2004-09-10  8:54           ` Kirill Korotaev
  1 sibling, 0 replies; 14+ messages in thread
From: William Lee Irwin III @ 2004-09-09 19:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, dev, linux-kernel

William Lee Irwin III <wli@holomorphy.com> wrote:
>>  The only motive I'm aware of is for latency in the presence of things
>>  such as autofs. It's also worth noting that in the presence of things
>>  such as removable media umount is also much more common. I personally
>>  find this sufficiently compelling. Kirill may have additional ammunition.

On Thu, Sep 09, 2004 at 12:08:18PM -0700, Andrew Morton wrote:
> Well.  That's why I'm keeping the patch alive-but-unmerged.  Waiting to see
> who wants it.
> There are people who have large machines which are automounting hundreds of
> different NFS servers.  I'd certainly expect such a machine to experience
> ongoing umount glitches.  But no reports have yet been sighted by this
> little black duck.

Unfortunately all the scenarios I'm aware of where this is an ongoing
issue involve extremely downrev and vendor-centric kernel versions along
with the usual ultraconservative system administration, so this specific
concern is somewhat downplayed as other ongoing functional concerns
(e.g. direct IO on nfs breaking, deadlocks, getting zillions of fs's to
mount at all, etc.) have far higher priority than performance concerns.


William Lee Irwin III <wli@holomorphy.com> wrote:
>>  Also, the additional sizeof(struct list_head) is only a requirement
>>  while the global inode LRU is maintained. I believed it would have
>>  been beneficial to have localized the LRU to the sb also, which would
>>  have maintained sizeof(struct inode) at parity with current mainline.

On Thu, Sep 09, 2004 at 12:08:18PM -0700, Andrew Morton wrote:
> Could be.  We would give each superblock its own shrinker callback and
> everything should balance out nicely (hah).

Hmm. My first leaning and implementation was to hierarchically LRU
inodes within sb's, but I suppose that's plausible. Let me know if you
want the shrinker callbacks or something else in particular done.


-- wli

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-09 15:51 ` Linus Torvalds
  2004-09-09 17:19   ` William Lee Irwin III
@ 2004-09-10  8:32   ` Kirill Korotaev
  2004-09-10 14:22     ` Linus Torvalds
  1 sibling, 1 reply; 14+ messages in thread
From: Kirill Korotaev @ 2004-09-10  8:32 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: akpm, linux-kernel

Linus Torvalds wrote:
> 
> On Thu, 9 Sep 2004, Kirill Korotaev wrote:
> 
>>This patch fixes the problem that huge inode cache
>>can make invalidate_inodes() calls very long. Meanwhile
>>lock_kernel() and inode_lock are being held and the system
>>can freeze for seconds.
> 
> 
> Hmm.. I don't mind the approach per se, but I get very nervous about the 
> fact that I don't see any initialization of "inode->i_sb_list".
inode->i_sb_list is a list link, not a real list head (the real list head
is sb->s_inodes, and it is initialized), i.e. it doesn't require
initialization.

all the operations I perform on i_sb_list are
- list_add(&inode->i_sb_list, ...);
- list_del(&inode->i_sb_list);

So it's all safe.

> Yes, you do a
> 	list_add(&inode->i_sb_list, &sb->s_inodes);
> 
> in new_inode(), but there are a ton of users that allocate inodes other 
> ways, and more importantly, even if this was the only allocation function, 
> you do various "list_del(&inode->i_sb_list)" things which leaves the inode 
> around but with an invalid superblock list.
1. struct inode is allocated only in one place!
it's alloc_inode(). Next alloc_inode() is static and is called from 3 
places:
new_inode(), get_new_inode() and get_new_inode_fast().

All 3 above functions do list_add(&inode->i_sb_list, &sb->s_inodes);
i.e. newly allocated inodes are always in super block list.

2. list_del(&inode->i_sb_list) doesn't leave super block list invalid!

I state that I remove inodes from sb list only and only when usual 
inode->i_list is removed and inode can't be found after that moment 
neither in my per sb list nor in any other list (unused_inodes, 
inodes_in_use, sb->s_io, etc.)

See the details below.

> So at the very _least_, you should document why all of this is safe very 
> carefully (I get nervous about fundamental FS infrastructure changes), and 
> it should be left to simmer in -mm for a longish time to make sure it 
> really works..
Ok. This patch is safe because the use of new inode->i_sb_list list is 
fully symmetric to the use of inode->i_list. i.e.

- when inode is created it's added by inode->i_list to one of std lists 
(inodes_in_use, unused_inodes, sb->s_io). It lives in one of this lists 
during whole lifetime. So in places where inode is created I've just 
added list_add(&inode->i_sb_list, &sb->s_inodes). There are 3 such 
places: new_inode(), get_new_inode() and get_new_inode_fast()

- when inode is about to be destroyed it's usually removed from std 
lists (and sometimes is moved to 'freeable' list). It's the places where 
inode is removed from the hash as well. In such places I've just 
inserted list_del(&inode->i_sb_list). These places are in 
generic_forget_inode(), generic_delete_inode(), invalidate_list(), 
prune_icache(), hugetlbfs_delete_inode(), hugetlbfs_forget_inode().

So as you can see from the description the lifetime of inode in 
sb->s_inodes list is the same as in hash and other std lists.
And these new per-sb list is protected by the same inode_lock.

To be sure that there are no other places where i_list field is used 
somehow in other ways I've just grepped it.

Kirill


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-09 19:08         ` Andrew Morton
  2004-09-09 19:35           ` William Lee Irwin III
@ 2004-09-10  8:54           ` Kirill Korotaev
  2004-09-10  9:05             ` Andrew Morton
  2004-09-10 20:14             ` Denis Vlasenko
  1 sibling, 2 replies; 14+ messages in thread
From: Kirill Korotaev @ 2004-09-10  8:54 UTC (permalink / raw)
  To: Andrew Morton; +Cc: William Lee Irwin III, torvalds, linux-kernel

Andrew Morton wrote:

> William Lee Irwin III <wli@holomorphy.com> wrote:
>>On Thu, Sep 09, 2004 at 11:06:22AM -0700, Andrew Morton wrote:
>> > Yes.
>> > I have not merged it up because it seems rather dopey to add eight bytes to
>> > the inode to speed up something as rare as umount.
>> > Is there a convincing reason for proceeding with the change?
>>
>> The only motive I'm aware of is for latency in the presence of things
>> such as autofs. It's also worth noting that in the presence of things
>> such as removable media umount is also much more common. I personally
>> find this sufficiently compelling. Kirill may have additional ammunition.

> Well.  That's why I'm keeping the patch alive-but-unmerged.  Waiting to see
> who wants it.

> There are people who have large machines which are automounting hundreds of
> different NFS servers.  I'd certainly expect such a machine to experience
> ongoing umount glitches.  But no reports have yet been sighted by this
> little black duck.
I think it's not always evident where the problem is. For many people
waiting 2 seconds is OK, and they pay little attention to these small
hangs.

1. I saw the bug in bugzilla from NFS people you pointed to me last time 
yourself where the same problem was detected.

2. We come across this problem accidentally. After we started using 4GB 
split in production systems we faced small hangs of umount and quota 
on/off. We have hundreds of super blocks in the systems and do 
mount/umount, quota on/off quite often, so these were quite noticeable
hangs for us, though not fatal.

We extracted the problem in the test case and resolved the issue. The 
patch I posted here a year ago (for 2.4 kernels) was used by us about 
for a year in ALL our production systems.

Well for sure this bug can be triggered only on really big servers with
a huge amount of memory and cache size.
It's up to you whether to apply it or not. I understand your position 
about 8 bytes, but probably it's just a question of using kernel, 
whether it's a user or server system.
Probably we can introduce some config option which would trigger 
features such as this one for enterprise systems.

>> Also, the additional sizeof(struct list_head) is only a requirement
>> while the global inode LRU is maintained. I believed it would have
>> been beneficial to have localized the LRU to the sb also, which would
>> have maintained sizeof(struct inode) at parity with current mainline.
> 
> Could be.  We would give each superblock its own shrinker callback and
> everything should balance out nicely (hah).
heh, and how do you plan to make per-sb LRU to be fair?

Kirill


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-10  8:54           ` Kirill Korotaev
@ 2004-09-10  9:05             ` Andrew Morton
  2004-09-10 20:14             ` Denis Vlasenko
  1 sibling, 0 replies; 14+ messages in thread
From: Andrew Morton @ 2004-09-10  9:05 UTC (permalink / raw)
  To: Kirill Korotaev; +Cc: wli, torvalds, linux-kernel

Kirill Korotaev <dev@sw.ru> wrote:
>
> Well for sure this bug can be triggered only on really big servers with
>  a huge amount of memory and cache size.
>  It's up to you whether to apply it or not. I understand your position 
>  about 8 bytes, but probably it's just a question of using kernel, 
>  whether it's a user or server system.
>  Probably we can introduce some config option which would trigger 
>  features such as this one for enterprise systems.

I am paralysed by indecision!

It would be nice if we had evidence that more than one site in the world
was affected by this :(

I can't see a less space-consuming alternative here (apart from a per-sb LRU)

>  >> Also, the additional sizeof(struct list_head) is only a requirement
>  >> while the global inode LRU is maintained. I believed it would have
>  >> been beneficial to have localized the LRU to the sb also, which would
>  >> have maintained sizeof(struct inode) at parity with current mainline.
>  > 
>  > Could be.  We would give each superblock its own shrinker callback and
>  > everything should balance out nicely (hah).
>
>  heh, and how do you plan to make per-sb LRU to be fair?

Good point.  

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-10  8:32   ` Kirill Korotaev
@ 2004-09-10 14:22     ` Linus Torvalds
  2004-09-10 16:56       ` Kirill Korotaev
  0 siblings, 1 reply; 14+ messages in thread
From: Linus Torvalds @ 2004-09-10 14:22 UTC (permalink / raw)
  To: Kirill Korotaev; +Cc: akpm, linux-kernel



On Fri, 10 Sep 2004, Kirill Korotaev wrote:
> > 
> > Hmm.. I don't mind the approach per se, but I get very nervous about
> > the fact that I don't see any initialization of "inode->i_sb_list".
>
> inode->i_sb_list is a link list_head, not real list head (real list head
> is sb->s_inodes and it's initialized). i.e. it doesn't require
> initialization.

It _does_ require initialization. And no, there is no difference between a 
"real" head and a entry "link" in the list. They both need to be 
initialized.

> all the operations I perform on i_sb_list are
> - list_add(&inode->i_sb_list, ...);

This one is ok without initializing the entry, since it will do so itself.

> - list_del(&inode->i_sb_list);

This one is NOT ok. If list_del() is ever done on a link entry that hasn't 
been initialized, you crash. If "list_del()" is ever done twice on an 
entry, you will crash and/or corrupt memory elsewhere. 

> So it's all safe.

It's not "all safe". You had better explain _why_ it's safe. You do so 
later:

> 1. struct inode is allocated only in one place!
> it's alloc_inode(). Next alloc_inode() is static and is called from 3 
> places:
> new_inode(), get_new_inode() and get_new_inode_fast().
> 
> All 3 above functions do list_add(&inode->i_sb_list, &sb->s_inodes);
> i.e. newly allocated inodes are always in super block list.

Good. _This_ is what I was after.

> 2. list_del(&inode->i_sb_list) doesn't leave super block list invalid!

No, but it leaves _itself_ invalid. There had better not be anything that 
touches it ever after without an initialization. That wasn't obvious...

		Linus

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-10 14:22     ` Linus Torvalds
@ 2004-09-10 16:56       ` Kirill Korotaev
  0 siblings, 0 replies; 14+ messages in thread
From: Kirill Korotaev @ 2004-09-10 16:56 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: akpm, linux-kernel

Linus Torvalds wrote:
 > On Fri, 10 Sep 2004, Kirill Korotaev wrote:

 >>>Hmm.. I don't mind the approach per se, but I get very nervous about
 >>>the fact that I don't see any initialization of "inode->i_sb_list".
 >>
 >>inode->i_sb_list is a link list_head, not real list head (real list head
 >>is sb->s_inodes and it's initialized). i.e. it doesn't require
 >>initialization.
 >
 > It _does_ require initialization. And no, there is no difference 
between a
 > "real" head and a entry "link" in the list. They both need to be
 > initialized.
 >
 >>all the operations I perform on i_sb_list are
 >>- list_add(&inode->i_sb_list, ...);
 >
 > This one is ok without initialzing the entry, since it will do so itself.
 >
 >>- list_del(&inode->i_sb_list);
 >
 > This one is NOT ok. If list_del() is ever done on a link entry that 
hasn't
 > been initialized, you crash. If "list_del()" is ever done twice on an
 > entry, you will crash and/or corrupt memory elsewhere.
We never do list_del twice, nor do we do list_del on uninitialized inodes!

 >>1. struct inode is allocated only in one place!
 >>it's alloc_inode(). Next alloc_inode() is static and is called from 3
 >>places:
 >>new_inode(), get_new_inode() and get_new_inode_fast().
 >>
 >>All 3 above functions do list_add(&inode->i_sb_list, &sb->s_inodes);
 >>i.e. newly allocated inodes are always in super block list.

 > Good. _This_ is what I was after.
 >
 >
 >>2. list_del(&inode->i_sb_list) doesn't leave super block list invalid!
 >
 > No, but it leaves _itself_ invalid. There had better not be anything 
that
 > touches it ever after without an initialization. That wasn't obvious...
ok. But now I hope I proved that it's ok?
no one does list_del() twice and no one does list_del() on uninitialized
inodes.

If you want i_sb_list to really ALWAYS be initialized, then we can 
replace list_del with list_del_init and insert INIT_LIST_HEAD(i_sb_list) 
in inode_init_once().
But I don't think it's a good idea.
Moreover, I think that list_del_init() and other initialization 
functions of link list_heads in such places usually only hide the 
problems, not solve them.

Kirill


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-10  8:54           ` Kirill Korotaev
  2004-09-10  9:05             ` Andrew Morton
@ 2004-09-10 20:14             ` Denis Vlasenko
  2004-09-11  9:15               ` Re[2]: " Kirill Korotaev
  1 sibling, 1 reply; 14+ messages in thread
From: Denis Vlasenko @ 2004-09-10 20:14 UTC (permalink / raw)
  To: Kirill Korotaev, Andrew Morton
  Cc: William Lee Irwin III, torvalds, linux-kernel

> >> The only motive I'm aware of is for latency in the presence of things
> >> such as autofs. It's also worth noting that in the presence of things
> >> such as removable media umount is also much more common. I personally
> >> find this sufficiently compelling. Kirill may have additional
> >> ammunition.
> >
> > Well.  That's why I'm keeping the patch alive-but-unmerged.  Waiting to
> > see who wants it.
> >
> > There are people who have large machines which are automounting hundreds
> > of different NFS servers.  I'd certainly expect such a machine to
> > experience ongoing umount glitches.  But no reports have yet been sighted
> > by this little black duck.
>
> I think It's not always evident where the problem is. For many people
> waiting 2 seconds is ok and they pay no much attention to this small
> little hangs.
>
> 1. I saw the bug in bugzilla from NFS people you pointed to me last time
> yourself where the same problem was detected.

What bug? Are you talking about umount being not so fast or something else?
--
vda


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re[2]: [PATCH] adding per sb inode list to make invalidate_inodes() faster
  2004-09-10 20:14             ` Denis Vlasenko
@ 2004-09-11  9:15               ` Kirill Korotaev
  0 siblings, 0 replies; 14+ messages in thread
From: Kirill Korotaev @ 2004-09-11  9:15 UTC (permalink / raw)
  To: Denis Vlasenko
  Cc: Andrew Morton, William Lee Irwin III, torvalds, linux-kernel

Hello Denis,

Saturday, September 11, 2004, 12:14:24 AM, you wrote:

>> >> The only motive I'm aware of is for latency in the presence of things
>> >> such as autofs. It's also worth noting that in the presence of things
>> >> such as removable media umount is also much more common. I personally
>> >> find this sufficiently compelling. Kirill may have additional
>> >> ammunition.
>> >
>> > Well.  That's why I'm keeping the patch alive-but-unmerged.  Waiting to
>> > see who wants it.
>> >
>> > There are people who have large machines which are automounting hundreds
>> > of different NFS servers.  I'd certainly expect such a machine to
>> > experience ongoing umount glitches.  But no reports have yet been sighted
>> > by this little black duck.
>>
>> I think It's not always evident where the problem is. For many people
>> waiting 2 seconds is ok and they pay no much attention to this small
>> little hangs.
>>
>> 1. I saw the bug in bugzilla from NFS people you pointed to me last time
>> yourself where the same problem was detected.

DV> What bug? Are you talking about umount being not so fast or something else?
yup. umount requires traversing ALL inodes in the system (even
inodes from other super blocks). So umount and quota off can last a very,
very LONG time.

Have you faced this bug before?

-- 
Best regards,
 Kirill                            mailto:dev@sw.ru


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2004-09-11  9:11 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-09-09 15:39 [PATCH] adding per sb inode list to make invalidate_inodes() faster Kirill Korotaev
2004-09-09 15:51 ` Linus Torvalds
2004-09-09 17:19   ` William Lee Irwin III
2004-09-09 18:06     ` Andrew Morton
2004-09-09 18:18       ` William Lee Irwin III
2004-09-09 19:08         ` Andrew Morton
2004-09-09 19:35           ` William Lee Irwin III
2004-09-10  8:54           ` Kirill Korotaev
2004-09-10  9:05             ` Andrew Morton
2004-09-10 20:14             ` Denis Vlasenko
2004-09-11  9:15               ` Re[2]: " Kirill Korotaev
2004-09-10  8:32   ` Kirill Korotaev
2004-09-10 14:22     ` Linus Torvalds
2004-09-10 16:56       ` Kirill Korotaev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox