linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH] ensure i_ino uniqueness in filesystems without permanent inode numbers (via idr)
@ 2006-12-01 14:48 Jeff Layton
  2006-12-01 16:52 ` Randy Dunlap
  0 siblings, 1 reply; 18+ messages in thread
From: Jeff Layton @ 2006-12-01 14:48 UTC (permalink / raw)
  To: linux-fsdevel, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2232 bytes --]

This patch is a proof of concept. It works, but still needs a bit of
polish before it's ready for submission. First, the problems:

1) on filesystems w/o permanent inode numbers, i_ino values can be
larger than 32 bits, which can cause problems for some 32 bit userspace
programs on a 64 bit kernel.

2) many filesystems call new_inode and assume that the i_ino values they
are given are unique. They are not guaranteed to be so, since the static
counter can wrap.

3) after allocating a new inode, some filesystems call iunique to try to
get a unique i_ino value, but they don't actually add their inodes to
the hashtable, so they're still not guaranteed to be unique.

This patch is a first step at correcting these problems. This adds 2 new
functions, an idr_register and idr_unregister. Filesystems can call
idr_register at inode creation time, and then at deletion time, we'll
automatically unregister them.

This patch also adds a new s_generation counter to the superblock.
Because i_ino's can be reused so quickly, we don't want NFS getting
confused when it happens. When iunique_register is called, we'll assign
the s_generation value to the i_generation, and then increment it to
help ensure that we get different filehandles.

There are some things that need to be cleaned up, of course:

- error handling for the idr calls

- recheck all the possible places where the inode should be unhashed

- don't attempt to remove inodes with values <100

- convert other filesystems

- remove the static counter from new_inode and (maybe) eliminate iunique

The patch also converts pipefs to use the new scheme as an example. Al
Viro had expressed some concern with an earlier patch that this might
slow down pipe creation. I've done some testing and I think the impact
will be minimal. Timing a small program that creates and closes 100
million pipes in a loop:

patched:
-------------
real    8m8.623s
user    0m37.418s
sys     7m31.196s

unpatched:
--------------
real    8m7.150s
user    0m40.943s
sys     7m26.204s

As the number of pipes grows on the system, this time may grow somewhat
but it doesn't seem like it would be terrible.

Comments and suggestions appreciated.

Signed-off-by: Jeff Layton <jlayton@redhat.com>



[-- Attachment #2: idr_register.patch --]
[-- Type: text/x-patch, Size: 3643 bytes --]

diff --git a/fs/inode.c b/fs/inode.c
index 26cdb11..841e2fc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -288,6 +288,8 @@ static void dispose_list(struct list_hea
 		list_del_init(&inode->i_sb_list);
 		spin_unlock(&inode_lock);
 
+		iunique_unregister(inode);
+
 		wake_up_inode(inode);
 		destroy_inode(inode);
 		nr_disposed++;
@@ -706,6 +708,34 @@ retry:
 
 EXPORT_SYMBOL(iunique);
 
+int iunique_register(struct inode *inode, int max_reserved)
+{
+	int rv;
+
+	rv = idr_pre_get(&inode->i_sb->s_inode_ids, GFP_KERNEL);
+	if (! rv)
+		return -ENOMEM;
+
+	spin_lock(&inode->i_sb->s_inode_ids_lock);
+	rv = idr_get_new_above(&inode->i_sb->s_inode_ids, inode,
+		max_reserved+1, (int *) &inode->i_ino);
+	inode->i_generation = inode->i_sb->s_generation++;
+	spin_unlock(&inode->i_sb->s_inode_ids_lock);
+	return rv;
+}
+
+EXPORT_SYMBOL(iunique_register);
+
+void iunique_unregister(struct inode *inode)
+{
+	spin_lock(&inode->i_sb->s_inode_ids_lock);
+	if (idr_find(&inode->i_sb->s_inode_ids, (int) inode->i_ino))
+		idr_remove(&inode->i_sb->s_inode_ids, (int) inode->i_ino);
+	spin_unlock(&inode->i_sb->s_inode_ids_lock);
+}
+
+EXPORT_SYMBOL(iunique_unregister);
+
 struct inode *igrab(struct inode *inode)
 {
 	spin_lock(&inode_lock);
@@ -1025,6 +1055,7 @@ void generic_delete_inode(struct inode *
 	spin_lock(&inode_lock);
 	hlist_del_init(&inode->i_hash);
 	spin_unlock(&inode_lock);
+	iunique_unregister(inode);
 	wake_up_inode(inode);
 	BUG_ON(inode->i_state != I_CLEAR);
 	destroy_inode(inode);
@@ -1057,6 +1088,7 @@ static void generic_forget_inode(struct 
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
+	iunique_unregister(inode);
 	if (inode->i_data.nrpages)
 		truncate_inode_pages(&inode->i_data, 0);
 	clear_inode(inode);
diff --git a/fs/pipe.c b/fs/pipe.c
index b1626f2..d74ae65 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -845,6 +845,9 @@ static struct inode * get_pipe_inode(voi
 	if (!inode)
 		goto fail_inode;
 
+	if (iunique_register(inode, 0))
+		goto fail_iput;
+
 	pipe = alloc_pipe_info(inode);
 	if (!pipe)
 		goto fail_iput;
diff --git a/fs/super.c b/fs/super.c
index 47e554c..d2dbdec 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -93,6 +93,8 @@ static struct super_block *alloc_super(s
 		s->s_qcop = sb_quotactl_ops;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
+		idr_init(&s->s_inode_ids);
+		spin_lock_init(&s->s_inode_ids_lock);
 	}
 out:
 	return s;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2fe6e3f..3ad12a6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -278,6 +278,7 @@ #include <linux/prio_tree.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
+#include <linux/idr.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -961,6 +962,12 @@ #endif
 	/* Granularity of c/m/atime in ns.
 	   Cannot be worse than a second */
 	u32		   s_time_gran;
+
+	/* for fs's with dynamic i_ino values, track them with idr, and increment
+	   the generation every time we register a new inode */
+	__u32			s_generation;
+	struct idr		s_inode_ids;
+	spinlock_t		s_inode_ids_lock;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
@@ -1681,6 +1688,8 @@ extern void inode_init_once(struct inode
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);
 extern ino_t iunique(struct super_block *, ino_t);
+extern int iunique_register(struct inode *, int);
+extern void iunique_unregister(struct inode *);
 extern int inode_needs_sync(struct inode *inode);
 extern void generic_delete_inode(struct inode *inode);
 extern void generic_drop_inode(struct inode *inode);


^ permalink raw reply related	[flat|nested] 18+ messages in thread
* [RFC][PATCH] ensure i_ino uniqueness in filesystems without permanent inode numbers (via idr)
@ 2006-11-14 20:22 Jeff Layton
  2006-11-14 20:26 ` Al Viro
  2006-11-15 17:27 ` Matthew Wilcox
  0 siblings, 2 replies; 18+ messages in thread
From: Jeff Layton @ 2006-11-14 20:22 UTC (permalink / raw)
  To: linux-fsdevel

This patch is a proof of concept. It works, but still needs a bit of
polish before it's ready for submission. First, the problems:

1) on filesystems w/o permanent inode numbers, i_ino values can be
larger than 32 bits, which can cause problems for some 32 bit userspace
programs on a 64 bit kernel.

2) many filesystems call new_inode and assume that the i_ino values they
are given are unique. They are not guaranteed to be so, since the static
counter can wrap.

3) after allocating a new inode, some filesystems call iunique to try to
get a unique i_ino value, but they don't actually add their inodes to
the hashtable, and so they're still not guaranteed to be unique.

This patch is a first step at correcting these problems. It declares a
new file_system_type flag (FS_I_INO_DYNAMIC). If this is set, then when
new_inode is called, we'll use the IDR functions to generate a unique
31-bit value, leaving the first 100 inode numbers for statically
allocated stuff like root inodes, etc. At inode deletion time, we
idr_remove it so the i_ino value can be reused.

The patch also converts pipefs to use the new scheme (though the
conversion just consists of setting the fs_flags value for its
file_system_type).

There are some things that need to be cleaned up, of course:

- error handling for the idr calls

- recheck all the possible places where the inode should be unhashed

- don't attempt to remove inodes with values <100

- convert other filesystems

- remove the static counter from new_inode and (maybe) eliminate iunique

It seems to basically work, but there is one oddity. idr seems to prefer
reusing values when it can. So if a program, for instance, makes a pipe,
then closes it, and makes a new one, the second one is likely to get the
same st_ino value as the first one. The value is still unique, but I
have to wonder if this may cause some userspace programs to get
confused...

I'm posting this to get some feedback. If this approach seems
acceptable, then I'll clean up the patch and start working on converting
the filesystems individually.

Comments and suggestions appreciated...

-- Jeff

diff --git a/fs/inode.c b/fs/inode.c
index 26cdb11..52779f4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -93,6 +93,12 @@ DEFINE_SPINLOCK(inode_lock);
 static DEFINE_MUTEX(iprune_mutex);
 
 /*
+ * for filesystems that dynamically allocate inode numbers, we reserve the
+ * first 100 for statically assigned values (root inodes and such)
+ */
+#define DYNAMIC_I_INO_RESERVED 100
+
+/*
  * Statistics gathering..
  */
 struct inodes_stat_t inodes_stat;
@@ -116,6 +122,7 @@ static struct inode *alloc_inode(struct 
 
 		inode->i_sb = sb;
 		inode->i_blkbits = sb->s_blocksize_bits;
+		inode->i_ino = 0;
 		inode->i_flags = 0;
 		atomic_set(&inode->i_count, 1);
 		inode->i_op = &empty_iops;
@@ -286,6 +293,8 @@ static void dispose_list(struct list_hea
 		spin_lock(&inode_lock);
 		hlist_del_init(&inode->i_hash);
 		list_del_init(&inode->i_sb_list);
+		if (inode->i_sb->s_type->fs_flags & FS_I_INO_DYNAMIC)
+			idr_remove(&inode->i_sb->s_inode_ids, inode->i_ino);
 		spin_unlock(&inode_lock);
 
 		wake_up_inode(inode);
@@ -531,11 +540,17 @@ struct inode *new_inode(struct super_blo
 	
 	inode = alloc_inode(sb);
 	if (inode) {
+		if (sb->s_type->fs_flags & FS_I_INO_DYNAMIC)
+			idr_pre_get(&sb->s_inode_ids, GFP_KERNEL);
 		spin_lock(&inode_lock);
 		inodes_stat.nr_inodes++;
 		list_add(&inode->i_list, &inode_in_use);
 		list_add(&inode->i_sb_list, &sb->s_inodes);
-		inode->i_ino = ++last_ino;
+		if (sb->s_type->fs_flags & FS_I_INO_DYNAMIC)
+			idr_get_new_above(&sb->s_inode_ids, inode,
+				DYNAMIC_I_INO_RESERVED, (int *) &inode->i_ino);
+		else
+			inode->i_ino = ++last_ino;
 		inode->i_state = 0;
 		spin_unlock(&inode_lock);
 	}
@@ -1024,6 +1039,8 @@ void generic_delete_inode(struct inode *
 	}
 	spin_lock(&inode_lock);
 	hlist_del_init(&inode->i_hash);
+	if (inode->i_sb->s_type->fs_flags & FS_I_INO_DYNAMIC)
+		idr_remove(&inode->i_sb->s_inode_ids, inode->i_ino);
 	spin_unlock(&inode_lock);
 	wake_up_inode(inode);
 	BUG_ON(inode->i_state != I_CLEAR);
@@ -1052,6 +1069,8 @@ static void generic_forget_inode(struct 
 		inodes_stat.nr_unused--;
 		hlist_del_init(&inode->i_hash);
 	}
+	if (inode->i_sb->s_type->fs_flags & FS_I_INO_DYNAMIC)
+		idr_remove(&sb->s_inode_ids, inode->i_ino);
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
 	inode->i_state |= I_FREEING;
diff --git a/fs/pipe.c b/fs/pipe.c
index b1626f2..ea50833 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1005,6 +1005,7 @@ static struct file_system_type pipe_fs_t
 	.name		= "pipefs",
 	.get_sb		= pipefs_get_sb,
 	.kill_sb	= kill_anon_super,
+	.fs_flags	= FS_I_INO_DYNAMIC,
 };
 
 static int __init init_pipe_fs(void)
diff --git a/fs/super.c b/fs/super.c
index 47e554c..23b662b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -93,6 +93,7 @@ static struct super_block *alloc_super(s
 		s->s_qcop = sb_quotactl_ops;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
+		idr_init(&s->s_inode_ids);
 	}
 out:
 	return s;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2fe6e3f..2c1a215 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -91,6 +91,7 @@ #define SEL_EX		4
 /* public flags for file_system_type */
 #define FS_REQUIRES_DEV 1 
 #define FS_BINARY_MOUNTDATA 2
+#define FS_I_INO_DYNAMIC 4	/* i_ino values are not permanent */
 #define FS_REVAL_DOT	16384	/* Check the paths ".", ".." for staleness */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move()
 					 * during rename() internally.
@@ -278,6 +279,7 @@ #include <linux/prio_tree.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
+#include <linux/idr.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -961,6 +963,9 @@ #endif
 	/* Granularity of c/m/atime in ns.
 	   Cannot be worse than a second */
 	u32		   s_time_gran;
+
+	/* for fs's with dynamic i_ino values, track them with idr */
+	struct idr		s_inode_ids;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);



^ permalink raw reply related	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2006-12-03 12:49 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-12-01 14:48 [RFC][PATCH] ensure i_ino uniqueness in filesystems without permanent inode numbers (via idr) Jeff Layton
2006-12-01 16:52 ` Randy Dunlap
2006-12-01 17:21   ` Jeff Layton
2006-12-01 17:42     ` Randy Dunlap
2006-12-02  5:30     ` Brad Boyer
2006-12-03  2:56       ` Jeff Layton
2006-12-02 12:58         ` Brad Boyer
2006-12-03 11:52           ` Al Boldi
2006-12-03 12:49           ` Jeff Layton
  -- strict thread matches above, loose matches on Subject: below --
2006-11-14 20:22 Jeff Layton
2006-11-14 20:26 ` Al Viro
2006-11-15 16:42   ` Jeff Layton
2006-11-15 16:44     ` Jeff Layton
2006-11-16 14:06     ` Al Viro
2006-11-16 14:34       ` Jeff Layton
2006-11-15 17:27 ` Matthew Wilcox
2006-11-15 17:56   ` Jörn Engel
2006-11-15 20:36     ` Jeff Layton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).