public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* PATCH - raise max_anon limit
@ 2004-02-06 22:15 Tim Hockin
  2004-02-07  8:55 ` Andrew Morton
  0 siblings, 1 reply; 17+ messages in thread
From: Tim Hockin @ 2004-02-06 22:15 UTC (permalink / raw)
  To: Linux Kernel mailing list, torvalds, akpm, viro

[-- Attachment #1: Type: text/plain, Size: 326 bytes --]

Attached is a patch to raise the limit of anonymous block devices.  The
sysctl allows the admin to set the order of pages allocated for the unnamed
bitmap from 1 page to the full MINORBITS limit.

what think?

Tim

-- 
Tim Hockin
Sun Microsystems, Linux Software Engineering
thockin@sun.com
All opinions are my own, not Sun's

[-- Attachment #2: max_anon_sysctl-2.6.2-4.diff --]
[-- Type: text/plain, Size: 7245 bytes --]

===== fs/namespace.c 1.52 vs edited =====
--- 1.52/fs/namespace.c	Tue Feb  3 21:37:02 2004
+++ edited/fs/namespace.c	Thu Feb  5 17:20:55 2004
@@ -25,6 +25,7 @@
 
 extern int __init init_rootfs(void);
 extern int __init sysfs_init(void);
+extern void __init max_anon_init(void);
 
 /* spinlock for vfsmount related operations, inplace of dcache_lock */
 spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
@@ -1171,6 +1172,7 @@
 		d++;
 		i--;
 	} while (i);
+	max_anon_init();
 	sysfs_init();
 	init_rootfs();
 	init_mount_tree();
===== fs/super.c 1.110 vs edited =====
--- 1.110/fs/super.c	Sun Oct  5 01:07:55 2003
+++ edited/fs/super.c	Fri Feb  6 12:56:58 2004
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
+#include <linux/init.h>
 #include <linux/acct.h>
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
@@ -34,6 +35,7 @@
 #include <linux/vfs.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
 #include <asm/uaccess.h>
+#include <asm/semaphore.h>
 
 
 void get_filesystem(struct file_system_type *fs);
@@ -535,16 +537,101 @@
  * filesystems which don't use real block-devices.  -- jrs
  */
 
-enum {Max_anon = 256};
-static unsigned long unnamed_dev_in_use[Max_anon/(8*sizeof(unsigned long))];
+int max_anon_order; /* = 0 */
+static int max_anon;
+static unsigned long *unnamed_dev_in_use;
 static spinlock_t unnamed_dev_lock = SPIN_LOCK_UNLOCKED;/* protects the above */
 
+static int set_max_anon_order(int old_order, int new_order)
+{
+	unsigned long *new_map;
+	unsigned long *old_map;
+	int new_bytes;
+	int old_bytes;
+
+	/*
+	 * you can only raise the order, or we have to handle the case of
+	 * having used bits that will not exist after lowering the order
+	 */
+	if (new_order < old_order) {
+		/* the sysctl proc_handler has already stored the value */
+		max_anon_order = old_order;
+		return -EINVAL;
+	}
+
+	/*
+	 * writing too high a value clamps to the highest value
+	 * max order is : 2^MINORBITS / 8 (bits per byte) / 2^PAGE_SHIFT
+	 */
+	if (new_order > (MINORBITS - PAGE_SHIFT - 3))
+		new_order = MINORBITS - PAGE_SHIFT - 3;
+
+	if (new_order == old_order)
+		return 0;
+
+	new_map = (unsigned long *)__get_free_pages(GFP_KERNEL, new_order);
+	if (!new_map) {
+		printk(KERN_ERR "Could not allocate new anonymous device map");
+		max_anon_order = old_order;
+		return -ENOMEM;
+	}
+	new_bytes = (1U << new_order) * PAGE_SIZE;
+
+	old_bytes = (1U << old_order) * PAGE_SIZE;
+
+	/* zero and copy old bit array, save the state */
+	memset(new_map, 0, new_bytes);
+	spin_lock(&unnamed_dev_lock);
+	old_map = unnamed_dev_in_use;
+	memcpy(new_map, old_map, old_bytes);
+	unnamed_dev_in_use = new_map;
+	max_anon = new_bytes * 8;
+	spin_unlock(&unnamed_dev_lock);
+	free_pages((unsigned long)old_map, old_order);
+	max_anon_order = new_order;
+
+	return 0;
+}
+
+int max_anon_sysctl_handler(ctl_table *table, int write, struct file *filp,
+    void __user *buffer, size_t *lenp)
+{
+	int ret;
+	int old_order;
+	static DECLARE_MUTEX(max_anon_sem);
+
+	down(&max_anon_sem);
+	old_order = max_anon_order;
+	ret = proc_dointvec(table, write, filp, buffer, lenp);
+	if (ret)
+		goto out;
+	if (write)
+		ret = set_max_anon_order(old_order, max_anon_order);
+out:
+	up(&max_anon_sem);
+	return ret;
+}
+
+void __init max_anon_init(void)
+{
+	int new_bytes;
+
+	unnamed_dev_in_use = (unsigned long *)__get_free_pages(GFP_ATOMIC,
+	    max_anon_order);
+	if (!unnamed_dev_in_use) {
+		panic("Could not initialize anonymous device map");
+	}
+	new_bytes = (1U << max_anon_order) * PAGE_SIZE;
+	memset(unnamed_dev_in_use, 0, new_bytes);
+	max_anon = new_bytes * 8;
+}
+
 int set_anon_super(struct super_block *s, void *data)
 {
 	int dev;
 	spin_lock(&unnamed_dev_lock);
-	dev = find_first_zero_bit(unnamed_dev_in_use, Max_anon);
-	if (dev == Max_anon) {
+	dev = find_first_zero_bit(unnamed_dev_in_use, max_anon);
+	if (dev == max_anon) {
 		spin_unlock(&unnamed_dev_lock);
 		return -EMFILE;
 	}
===== include/linux/fs.h 1.283 vs edited =====
--- 1.283/include/linux/fs.h	Mon Jan 19 15:38:10 2004
+++ edited/include/linux/fs.h	Thu Feb  5 17:20:55 2004
@@ -19,6 +19,7 @@
 #include <linux/cache.h>
 #include <linux/radix-tree.h>
 #include <linux/kobject.h>
+#include <linux/sysctl.h>
 #include <asm/atomic.h>
 
 struct iovec;
@@ -1045,6 +1046,8 @@
 			void *data);
 struct super_block *get_sb_pseudo(struct file_system_type *, char *,
 			struct super_operations *ops, unsigned long);
+int max_anon_sysctl_handler(ctl_table *table, int write, struct file *filp,
+			void __user *buffer, size_t *lenp);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
 #define fops_get(fops) \
===== include/linux/sysctl.h 1.59 vs edited =====
--- 1.59/include/linux/sysctl.h	Sun Feb  1 11:17:41 2004
+++ edited/include/linux/sysctl.h	Thu Feb  5 17:20:55 2004
@@ -615,6 +615,7 @@
 	FS_LEASE_TIME=15,	/* int: maximum time to wait for a lease break */
 	FS_DQSTATS=16,	/* disc quota usage statistics */
 	FS_XFS=17,	/* struct: control xfs parameters */
+	FS_MAX_ANON_ORDER=18, /* int: max anonymous blockdevs oreder */
 };
 
 /* /proc/sys/fs/quota/ */
===== kernel/sysctl.c 1.59 vs edited =====
--- 1.59/kernel/sysctl.c	Tue Feb  3 21:30:55 2004
+++ edited/kernel/sysctl.c	Thu Feb  5 17:20:55 2004
@@ -38,6 +38,7 @@
 #include <linux/security.h>
 #include <linux/initrd.h>
 #include <linux/times.h>
+#include <linux/fs.h>
 #include <asm/uaccess.h>
 
 #ifdef CONFIG_ROOT_NFS
@@ -63,6 +64,7 @@
 extern int min_free_kbytes;
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
+extern int max_anon_order;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -813,6 +815,14 @@
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= FS_MAX_ANON_ORDER,
+		.procname	= "max-anon-order",
+		.data		= &max_anon_order,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &max_anon_sysctl_handler,
 	},
 	{ .ctl_name = 0 }
 };
===== Documentation/sysctl/fs.txt 1.2 vs edited =====
--- 1.2/Documentation/sysctl/fs.txt	Mon Dec 30 04:29:09 2002
+++ edited/Documentation/sysctl/fs.txt	Thu Feb  5 17:31:17 2004
@@ -23,6 +23,7 @@
 - inode-max
 - inode-nr
 - inode-state
+- max-anon-order
 - overflowuid
 - overflowgid
 - super-max
@@ -116,6 +117,19 @@
 preshrink is nonzero when the nr_inodes > inode-max and the
 system needs to prune the inode list instead of allocating
 more.
+
+==============================================================
+
+max-anon-order
+
+Unnamed block devices are dummy devices used by virtual filesystems which
+don't use real block-devices, such as NFS.  The maximum number of unnamed
+block devices is controlled by this sysctl value.  The value represents a
+power-of-two page size.  For example, setting the default order, 0, results
+in 2^0 = 1 page being allocated for the anonymous device bitmap.  Setting
+the order to 2 results in 2^2 = 4 pages being allocated for the bitmap.
+Once increased, this value can not be decreased.  There is a limit of
+2^MINORBITS bits available at maximum.
 
 ==============================================================
 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-06 22:15 PATCH - raise max_anon limit Tim Hockin
@ 2004-02-07  8:55 ` Andrew Morton
  2004-02-07  9:48   ` viro
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2004-02-07  8:55 UTC (permalink / raw)
  To: thockin; +Cc: linux-kernel, torvalds, viro

Tim Hockin <thockin@sun.com> wrote:
>
> Attached is a patch to raise the limit of anonymous block devices.  The
>  sysctl allows the admin to set the order of pages allocated for the unnamed
>  bitmap from 1 page to the full MINORBITS limit.

It would be better to lose the sysctl and do it all dynamically.

Options are:

a) realloc the bitmap when it fills up

   Simple, a bit crufty, doesn't release memory.

b) lib/radix-tree.c

   Each entry in the radix tree can be a bitmap (radix-tree.c should
   have been defined to store unsigned longs, not void*'s.  Oh well), so
   you get good space utilisation, but finding a new entry will take ten or
   so lines of code.

c) lib/idr.c

   Worst space utilisation, but simplest code.


I'd go with c), personally.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-07  8:55 ` Andrew Morton
@ 2004-02-07  9:48   ` viro
  2004-02-11 20:33     ` Tim Hockin
  0 siblings, 1 reply; 17+ messages in thread
From: viro @ 2004-02-07  9:48 UTC (permalink / raw)
  To: Andrew Morton; +Cc: thockin, linux-kernel, torvalds, viro

On Sat, Feb 07, 2004 at 12:55:05AM -0800, Andrew Morton wrote:
> Tim Hockin <thockin@sun.com> wrote:
> >
> > Attached is a patch to raise the limit of anonymous block devices.  The
> >  sysctl allows the admin to set the order of pages allocated for the unnamed
> >  bitmap from 1 page to the full MINORBITS limit.
> 
> It would be better to lose the sysctl and do it all dynamically.
> 
> Options are:
> 
> a) realloc the bitmap when it fills up
> 
>    Simple, a bit crufty, doesn't release memory.
> 
> b) lib/radix-tree.c
> 
>    Each entry in the radix tree can be a bitmap (radix-tree.c should
>    have been defined to store unsigned longs, not void*'s.  Oh well), so
>    you get good space utilisation, but finding a new entry will take ten or
>    so lines of code.
> 
> c) lib/idr.c
> 
>    Worst space utilisation, but simplest code.

d) grab a couple of pages and be done with that.  That gives us 64Kbits.

e) grab max(1/8000 of entire memory, 128Kb).  That will guarantee that
we run out of memory or minors before we fill the bitmap.

PS: psu.edu address is still valid, but I rarely read that mailbox...

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-07  9:48   ` viro
@ 2004-02-11 20:33     ` Tim Hockin
  2004-02-11 20:38       ` Linus Torvalds
  0 siblings, 1 reply; 17+ messages in thread
From: Tim Hockin @ 2004-02-11 20:33 UTC (permalink / raw)
  To: viro; +Cc: Andrew Morton, linux-kernel, torvalds, viro

On Sat, Feb 07, 2004 at 09:48:47AM +0000, viro@parcelfarce.linux.theplanet.co.uk wrote:
> > It would be better to lose the sysctl and do it all dynamically.
> > 
> > Options are:
> > 
> > a) realloc the bitmap when it fills up
> > 
> >    Simple, a bit crufty, doesn't release memory.

This can work if it's OK to allocate pages during set_max_anon() (which
includes changing the spinlock to a sema or always allocating before the
lock).

> d) grab a couple of pages and be done with that.  That gives us 64Kbits.

Maybe that is just the simplest answer?  It can be a simple constant that is
changeable at compile time, and leave it at that

What's most likely to cause the least argument?

> PS: psu.edu address is still valid, but I rarely read that mailbox...

Sorry - it's what was listed in MAINTAINERS, so I used it.

Tim
-- 
Tim Hockin
Sun Microsystems, Linux Software Engineering
thockin@sun.com
All opinions are my own, not Sun's

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-11 20:33     ` Tim Hockin
@ 2004-02-11 20:38       ` Linus Torvalds
  2004-02-11 21:09         ` Tim Hockin
  0 siblings, 1 reply; 17+ messages in thread
From: Linus Torvalds @ 2004-02-11 20:38 UTC (permalink / raw)
  To: Tim Hockin; +Cc: viro, Andrew Morton, linux-kernel, viro



On Wed, 11 Feb 2004, Tim Hockin wrote:

> > d) grab a couple of pages and be done with that.  That gives us 64Kbits.
> 
> Maybe that is just the simplest answer?  It can be a simple constant that is
> changeable at compile time, and leave it at that
> 
> What's most likely to cause the least argument?

I'd suggest just raising it to 64k or so, that's likely to be acceptable, 
and it's a static 8kB array. That's likely not much more than the code 
needed to worry about dynamic entries, yet I'd assume that changing it 
from 256 to 64k is going to make most people say "enough".

		Linus

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-11 20:38       ` Linus Torvalds
@ 2004-02-11 21:09         ` Tim Hockin
  2004-02-11 21:53           ` Andrew Morton
  0 siblings, 1 reply; 17+ messages in thread
From: Tim Hockin @ 2004-02-11 21:09 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: viro, Andrew Morton, linux-kernel, viro

[-- Attachment #1: Type: text/plain, Size: 704 bytes --]

On Wed, Feb 11, 2004 at 12:38:11PM -0800, Linus Torvalds wrote:
> > Maybe that is just the simplest answer?  It can be a simple constant that is
> > changeable at compile time, and leave it at that
> > 
> > What's most likely to cause the least argument?
> 
> I'd suggest just raising it to 64k or so, that's likely to be acceptable, 
> and it's a static 8kB array. That's likely not much more than the code 
> needed to worry about dynamic entries, yet I'd assume that changing it 
> from 256 to 64k is going to make most people say "enough".

How's this then?  It doesn't get any simpler..

-- 
Tim Hockin
Sun Microsystems, Linux Software Engineering
thockin@sun.com
All opinions are my own, not Sun's

[-- Attachment #2: max_anon_raise-2.6.2-1.diff --]
[-- Type: text/plain, Size: 504 bytes --]

===== fs/super.c 1.110 vs edited =====
--- 1.110/fs/super.c	Sun Oct  5 01:07:55 2003
+++ edited/fs/super.c	Wed Feb 11 11:56:02 2004
@@ -535,7 +535,8 @@
  * filesystems which don't use real block-devices.  -- jrs
  */
 
-enum {Max_anon = 256};
+/* you can raise this as high as 2^MINORBITS if you REALLY need more */
+enum {Max_anon = 65536};
 static unsigned long unnamed_dev_in_use[Max_anon/(8*sizeof(unsigned long))];
 static spinlock_t unnamed_dev_lock = SPIN_LOCK_UNLOCKED;/* protects the above */
 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-11 21:09         ` Tim Hockin
@ 2004-02-11 21:53           ` Andrew Morton
  2004-02-11 22:28             ` Tim Hockin
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2004-02-11 21:53 UTC (permalink / raw)
  To: thockin; +Cc: torvalds, viro, linux-kernel

Tim Hockin <thockin@sun.com> wrote:
>
> > I'd suggest just raising it to 64k or so, that's likely to be acceptable, 
> > and it's a static 8kB array. That's likely not much more than the code 
> > needed to worry about dynamic entries, yet I'd assume that changing it 
> > from 256 to 64k is going to make most people say "enough".
> 
> How's this then?  It doesn't get any simpler..

Well it is lazy, wastes 0.4% of a 2M machine's memory and still has a
hard-wired limit.

Wanna test this?

 25-akpm/fs/super.c |   38 +++++++++++++++++++++++---------------
 1 files changed, 23 insertions(+), 15 deletions(-)

diff -puN fs/super.c~max_anon-use-idr fs/super.c
--- 25/fs/super.c~max_anon-use-idr	Wed Feb 11 13:41:57 2004
+++ 25-akpm/fs/super.c	Wed Feb 11 13:50:16 2004
@@ -23,6 +23,8 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/init.h>
+#include <asm/semaphore.h>
 #include <linux/smp_lock.h>
 #include <linux/acct.h>
 #include <linux/blkdev.h>
@@ -33,6 +35,7 @@
 #include <linux/security.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
+#include <linux/idr.h>
 #include <asm/uaccess.h>
 
 
@@ -536,38 +539,43 @@ void emergency_remount(void)
  * filesystems which don't use real block-devices.  -- jrs
  */
 
-enum {Max_anon = 256};
-static unsigned long unnamed_dev_in_use[Max_anon/(8*sizeof(unsigned long))];
-static spinlock_t unnamed_dev_lock = SPIN_LOCK_UNLOCKED;/* protects the above */
+static struct idr unnamed_dev_idr;
+static DECLARE_MUTEX(unnamed_dev_sem);
 
 int set_anon_super(struct super_block *s, void *data)
 {
 	int dev;
-	spin_lock(&unnamed_dev_lock);
-	dev = find_first_zero_bit(unnamed_dev_in_use, Max_anon);
-	if (dev == Max_anon) {
-		spin_unlock(&unnamed_dev_lock);
-		return -EMFILE;
+
+	down(&unnamed_dev_sem);
+	if (idr_pre_get(&unnamed_dev_idr) == 0) {
+		up(&unnamed_dev_sem);
+		return -ENOMEM;
 	}
-	set_bit(dev, unnamed_dev_in_use);
-	spin_unlock(&unnamed_dev_lock);
+	dev = idr_get_new(&unnamed_dev_idr, NULL);
+	up(&unnamed_dev_sem);
 	s->s_dev = MKDEV(0, dev);
 	return 0;
 }
-
 EXPORT_SYMBOL(set_anon_super);
 
 void kill_anon_super(struct super_block *sb)
 {
 	int slot = MINOR(sb->s_dev);
+
 	generic_shutdown_super(sb);
-	spin_lock(&unnamed_dev_lock);
-	clear_bit(slot, unnamed_dev_in_use);
-	spin_unlock(&unnamed_dev_lock);
+	down(&unnamed_dev_sem);
+	idr_remove(&unnamed_dev_idr, slot);
+	up(&unnamed_dev_sem);
 }
-
 EXPORT_SYMBOL(kill_anon_super);
 
+static int __init unnamed_dev_idr_init(void)
+{
+	idr_init(&unnamed_dev_idr);
+	return 0;
+}
+core_initcall(unnamed_dev_idr_init);
+
 void kill_litter_super(struct super_block *sb)
 {
 	if (sb->s_root)

_


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-11 21:53           ` Andrew Morton
@ 2004-02-11 22:28             ` Tim Hockin
  2004-02-11 22:48               ` Andrew Morton
  0 siblings, 1 reply; 17+ messages in thread
From: Tim Hockin @ 2004-02-11 22:28 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, viro, linux-kernel

On Wed, Feb 11, 2004 at 01:53:25PM -0800, Andrew Morton wrote:
> > How's this then?  It doesn't get any simpler..
> 
> Well it is lazy, wastes 0.4% of a 2M machine's memory and still has a
> hard-wired limit.
> 
> Wanna test this?

sure:

-------------------
sysfs: could not mount!
Unable to handle kernel paging request at virtual address ffffff01
 printing eip:
c014d288
*pde = 00000000
Oops: 0000 [#1]
CPU:    0
EIP:    0060:[<c014d288>]    Not tainted
EFLAGS: 00010086
EIP is at kmem_cache_alloc+0x2d/0x76
eax: 00000000   ebx: 00000246   ecx: c0450f4c   edx: ffffff01
esi: 000000d0   edi: 00000000   ebp: c04b5ef4   esp: c04b5ee0
ds: 007b   es: 007b   ss: 0068
Process swapper (pid: 0, threadinfo=c04b4000 task=c044a7a0)
Stack: c03f6606 00000740 c0450f4c f7fe9e00 c0539e68 c04b5f14 c025f671 00000000
       000000d0 00000000 c0450f4c f7fe9e00 c0454220 c04b5f28 c0170c77 c0539e68
       00000077 00000000 c04b5f48 c016fecc f7fe9e00 00000000 f7fe9e00 f7fe74c0
Call Trace:
 [<c025f671>] idr_pre_get+0x39/0xc5
 [<c0170c77>] set_anon_super+0x3d/0x89
 [<c016fecc>] sget+0xf2/0x22b
 [<c0170f66>] get_sb_nodev+0x32/0x8e
 [<c0170c3a>] set_anon_super+0x0/0x89
 [<c01710d8>] do_kern_mount+0x56/0xc5
 [<c01d67b8>] ramfs_fill_super+0x0/0x75
 [<c0105000>] rest_init+0x0/0x93
 [<c04c90ac>] init_mount_tree+0x2d/0x191
 [<c0105000>] rest_init+0x0/0x93
 [<c04c8eb9>] vfs_caches_init+0xa4/0xc8
 [<c016910c>] filp_ctor+0x0/0xb4
 [<c01691c0>] filp_dtor+0x0/0xac
 [<c04b6818>] start_kernel+0x15d/0x1e5
 [<c04b6427>] unknown_bootoption+0x0/0xfa

Code: 8b 02 85 c0 74 1f 83 e8 01 c7 42 0c 01 00 00 00 89 02 8b 44
 <0>Kernel panic: Attempted to kill the idle task!
In idle task - not syncing
-------------------


This was a lab machine, and I'm not at the lab.  I'll try to get someone to
reboot it for me. :P

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-11 22:28             ` Tim Hockin
@ 2004-02-11 22:48               ` Andrew Morton
       [not found]                 ` <20040211233852.GN9155@sun.com>
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2004-02-11 22:48 UTC (permalink / raw)
  To: thockin; +Cc: torvalds, viro, linux-kernel

Tim Hockin <thockin@sun.com> wrote:
>
> > Wanna test this?
> 
> sure:
> 
> -------------------
> sysfs: could not mount!
> Unable to handle kernel paging request at virtual address ffffff01

Bah, ordering problem.


 25-akpm/fs/super.c         |   36 +++++++++++++++++++++---------------
 25-akpm/include/linux/fs.h |    1 +
 25-akpm/init/main.c        |    1 +
 3 files changed, 23 insertions(+), 15 deletions(-)

diff -puN fs/super.c~max_anon-use-idr fs/super.c
--- 25/fs/super.c~max_anon-use-idr	Wed Feb 11 14:42:15 2004
+++ 25-akpm/fs/super.c	Wed Feb 11 14:47:16 2004
@@ -23,6 +23,8 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/init.h>
+#include <asm/semaphore.h>
 #include <linux/smp_lock.h>
 #include <linux/acct.h>
 #include <linux/blkdev.h>
@@ -33,6 +35,7 @@
 #include <linux/security.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
+#include <linux/idr.h>
 #include <asm/uaccess.h>
 
 
@@ -536,38 +539,41 @@ void emergency_remount(void)
  * filesystems which don't use real block-devices.  -- jrs
  */
 
-enum {Max_anon = 256};
-static unsigned long unnamed_dev_in_use[Max_anon/(8*sizeof(unsigned long))];
-static spinlock_t unnamed_dev_lock = SPIN_LOCK_UNLOCKED;/* protects the above */
+static struct idr unnamed_dev_idr;
+static DECLARE_MUTEX(unnamed_dev_sem);
 
 int set_anon_super(struct super_block *s, void *data)
 {
 	int dev;
-	spin_lock(&unnamed_dev_lock);
-	dev = find_first_zero_bit(unnamed_dev_in_use, Max_anon);
-	if (dev == Max_anon) {
-		spin_unlock(&unnamed_dev_lock);
-		return -EMFILE;
+
+	down(&unnamed_dev_sem);
+	if (idr_pre_get(&unnamed_dev_idr) == 0) {
+		up(&unnamed_dev_sem);
+		return -ENOMEM;
 	}
-	set_bit(dev, unnamed_dev_in_use);
-	spin_unlock(&unnamed_dev_lock);
+	dev = idr_get_new(&unnamed_dev_idr, NULL);
+	up(&unnamed_dev_sem);
 	s->s_dev = MKDEV(0, dev);
 	return 0;
 }
-
 EXPORT_SYMBOL(set_anon_super);
 
 void kill_anon_super(struct super_block *sb)
 {
 	int slot = MINOR(sb->s_dev);
+
 	generic_shutdown_super(sb);
-	spin_lock(&unnamed_dev_lock);
-	clear_bit(slot, unnamed_dev_in_use);
-	spin_unlock(&unnamed_dev_lock);
+	down(&unnamed_dev_sem);
+	idr_remove(&unnamed_dev_idr, slot);
+	up(&unnamed_dev_sem);
 }
-
 EXPORT_SYMBOL(kill_anon_super);
 
+void __init unnamed_dev_init(void)
+{
+	idr_init(&unnamed_dev_idr);
+}
+
 void kill_litter_super(struct super_block *sb)
 {
 	if (sb->s_root)
diff -puN include/linux/fs.h~max_anon-use-idr include/linux/fs.h
--- 25/include/linux/fs.h~max_anon-use-idr	Wed Feb 11 14:45:52 2004
+++ 25-akpm/include/linux/fs.h	Wed Feb 11 14:46:06 2004
@@ -1051,6 +1051,7 @@ struct super_block *sget(struct file_sys
 			void *data);
 struct super_block *get_sb_pseudo(struct file_system_type *, char *,
 			struct super_operations *ops, unsigned long);
+void unnamed_dev_init(void);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
 #define fops_get(fops) \
diff -puN init/main.c~max_anon-use-idr init/main.c
--- 25/init/main.c~max_anon-use-idr	Wed Feb 11 14:48:12 2004
+++ 25-akpm/init/main.c	Wed Feb 11 14:48:15 2004
@@ -473,6 +473,7 @@ asmlinkage void __init start_kernel(void
 	fork_init(num_physpages);
 	proc_caches_init();
 	buffer_init();
+	unnamed_dev_init();
 	security_scaffolding_startup();
 	vfs_caches_init(num_physpages);
 	radix_tree_init();

_


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
       [not found]                       ` <20040211164233.5f233595.akpm@osdl.org>
@ 2004-02-12  1:08                         ` Tim Hockin
  2004-02-12  1:20                           ` Andrew Morton
  0 siblings, 1 reply; 17+ messages in thread
From: Tim Hockin @ 2004-02-12  1:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, viro, Linux Kernel mailing list

On Wed, Feb 11, 2004 at 04:42:33PM -0800, Andrew Morton wrote:
On Wed, Feb 11, 2004 at 04:42:33PM -0800, Andrew Morton wrote:
> > Indeed.  MKDEV() already masks off the high order stuff, so that is OK.
>_
> That means that we've lost the original idr key and can no longer remove
> the thing, doesn't it?

No, it doesn't store the counter with the id.  They expect you to do that.
My best understanding is that thi sis to prevent re-use of the same key.
I'm not sure I grok why it is useful.  If you release a key, it should be
safe to reuse.  Period.  I assume there was some use case that brought about
this "feature" but if so, I don't know what it is.  The big comment about it
is just confusing me.

> > On idr_get_new(), we can just check for
> > 	dev & ((1<<MINORBITS)-1) == (1<<MINORBITS)-1)
> > and return -EMFILE.
> >_
> > That combined with a gfp mask to idr and the assumption that idr's
> > counter
> > won't ever grow beyond (sizeof(int)*8 - MINORBITS) (12) bits
> >_
> > Shall I whip that up and test it?  Do you prefer a gfp mask to idr_init
> > that
> > sticks around for all allocations or a GFP mask to idr_pre_get?

Offer repeated. :)

-- 
Tim Hockin
Sun Microsystems, Linux Software Engineering
thockin@sun.com
All opinions are my own, not Sun's

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-12  1:08                         ` Tim Hockin
@ 2004-02-12  1:20                           ` Andrew Morton
  2004-02-12  2:22                             ` Tim Hockin
  2004-02-12 17:26                             ` Jim Houston
  0 siblings, 2 replies; 17+ messages in thread
From: Andrew Morton @ 2004-02-12  1:20 UTC (permalink / raw)
  To: thockin; +Cc: torvalds, viro, linux-kernel, jim.houston

Tim Hockin <thockin@sun.com> wrote:
>
> On Wed, Feb 11, 2004 at 04:42:33PM -0800, Andrew Morton wrote:
> On Wed, Feb 11, 2004 at 04:42:33PM -0800, Andrew Morton wrote:
> > > Indeed.  MKDEV() already masks off the high order stuff, so that is OK.
> >_
> > That means that we've lost the original idr key and can no longer remove
> > the thing, doesn't it?
> 
> No, it doesn't store the counter with the id.  They expect you to do that.
> My best understanding is that thi sis to prevent re-use of the same key.
> I'm not sure I grok why it is useful.  If you release a key, it should be
> safe to reuse.  Period.  I assume there was some use case that brought about
> this "feature" but if so, I don't know what it is.  The big comment about it
> is just confusing me.

Maybe Jim can tell us why it's there.  Certainly, the idr interface would
be more useful if it just returned id's which start from zero.


> > > On idr_get_new(), we can just check for
> > > 	dev & ((1<<MINORBITS)-1) == (1<<MINORBITS)-1)
> > > and return -EMFILE.
> > >_
> > > That combined with a gfp mask to idr and the assumption that idr's
> > > counter
> > > won't ever grow beyond (sizeof(int)*8 - MINORBITS) (12) bits
> > >_
> > > Shall I whip that up and test it?  Do you prefer a gfp mask to idr_init
> > > that
> > > sticks around for all allocations or a GFP mask to idr_pre_get?
> 
> Offer repeated. :)

Please.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-12  1:20                           ` Andrew Morton
@ 2004-02-12  2:22                             ` Tim Hockin
  2004-02-12 17:26                             ` Jim Houston
  1 sibling, 0 replies; 17+ messages in thread
From: Tim Hockin @ 2004-02-12  2:22 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, viro, linux-kernel, jim.houston

[-- Attachment #1: Type: text/plain, Size: 1542 bytes --]

On Wed, Feb 11, 2004 at 05:20:46PM -0800, Andrew Morton wrote:
> > Offer repeated. :)
> 
> Please.

Patch attached.  I somehow blew up my box when I overran the 800 port limit
in nfs, now.  More fun.

never happened before, but I don't know how itr could be this patch's
fault..

RPC: can't bind to reserved port.
NFS: cannot retrieve file system info.
nfs_read_super: get root inode failed
------------[ cut here ]------------
kernel BUG at fs/inode.c:1090!
invalid operand: 0000 [#2]
CPU:    1
EIP:    0060:[<c0187fc4>]    Not tainted
EFLAGS: 00010246
EIP is at iput+0x72/0x7c
eax: c0455b20   ebx: f4537690   ecx: c01f6fbc   edx: 00000001
esi: f4a24000   edi: f4537690   ebp: f4a25e2c   esp: f4a25e20
ds: 007b   es: 007b   ss: 0068
Process mount (pid: 3029, threadinfo=f4a24000 task=f729b940)
Stack: f4654480 f4a24000 f4654480 f4a25e4c c01835a2 f4537690 f4537690 f4654488
       f4654480 f4a99200 c0455b20 f4a25e6c c016fbb6 f4654480 c03c6398 f4eab000
       f4a24000 0000019c f4a24000 f4a25e84 c0170da7 f4a99200 f46844ea f44aad80
Call Trace:
 [<c01835a2>] dput+0x183/0x3ea
 [<c016fbb6>] generic_shutdown_super+0x3f/0x26b
 [<c03c6398>] xprt_shutdown+0x48/0x7a
 [<c0170da7>] kill_anon_super+0x1d/0xad
 [<c01fc6a5>] nfs_kill_super+0x1a/0x28
 [<c016f892>] deactivate_super+0xa4/0x15a
 [<c01fc61a>] nfs_get_sb+0x1e7/0x258
 [<c01711db>] do_kern_mount+0x56/0xc5
 [<c018b980>] do_add_mount+0x8a/0x19a
 [<c018bcec>] do_mount+0x151/0x194

-- 
Tim Hockin
Sun Microsystems, Linux Software Engineering
thockin@sun.com
All opinions are my own, not Sun's

[-- Attachment #2: max_anon_raise-2.6.2-2.diff --]
[-- Type: text/plain, Size: 4854 bytes --]

===== include/linux/idr.h 1.3 vs edited =====
--- 1.3/include/linux/idr.h	Thu Mar 27 21:13:36 2003
+++ edited/include/linux/idr.h	Wed Feb 11 16:25:20 2004
@@ -58,7 +58,7 @@
  */
 
 void *idr_find(struct idr *idp, int id);
-int idr_pre_get(struct idr *idp);
+int idr_pre_get(struct idr *idp, unsigned gfp_mask);
 int idr_get_new(struct idr *idp, void *ptr);
 void idr_remove(struct idr *idp, int id);
 void idr_init(struct idr *idp);
===== lib/idr.c 1.3 vs edited =====
--- 1.3/lib/idr.c	Thu Mar 27 21:13:36 2003
+++ edited/lib/idr.c	Wed Feb 11 16:24:57 2004
@@ -62,13 +62,13 @@
  *   to the rest of the functions.  The structure is defined in the
  *   header.
 
- * int idr_pre_get(struct idr *idp)
+ * int idr_pre_get(struct idr *idp, unsigned gfp_mask)
 
  *   This function should be called prior to locking and calling the
  *   following function.  It pre allocates enough memory to satisfy the
- *   worst possible allocation.  It can sleep, so must not be called
- *   with any spinlocks held.  If the system is REALLY out of memory
- *   this function returns 0, other wise 1.
+ *   worst possible allocation.  Unless gfp_mask is GFP_ATOMIC, it can
+ *   sleep, so must not be called with any spinlocks held.  If the system is
+ *   REALLY out of memory this function returns 0, other wise 1.
 
  * int idr_get_new(struct idr *idp, void *ptr);
  
@@ -135,11 +135,11 @@
 	spin_unlock(&idp->lock);
 }
 
-int idr_pre_get(struct idr *idp)
+int idr_pre_get(struct idr *idp, unsigned gfp_mask)
 {
 	while (idp->id_free_cnt < idp->layers + 1) {
 		struct idr_layer *new;
-		new = kmem_cache_alloc(idr_layer_cache, GFP_KERNEL);
+		new = kmem_cache_alloc(idr_layer_cache, gfp_mask);
 		if(new == NULL)
 			return (0);
 		free_layer(idp, new);
===== fs/super.c 1.110 vs edited =====
--- 1.110/fs/super.c	Sun Oct  5 01:07:55 2003
+++ edited/fs/super.c	Wed Feb 11 17:08:07 2004
@@ -23,6 +23,7 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/init.h>
 #include <linux/smp_lock.h>
 #include <linux/acct.h>
 #include <linux/blkdev.h>
@@ -33,6 +34,7 @@
 #include <linux/security.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
+#include <linux/idr.h>
 #include <asm/uaccess.h>
 
 
@@ -535,22 +537,26 @@
  * filesystems which don't use real block-devices.  -- jrs
  */
 
-enum {Max_anon = 256};
-static unsigned long unnamed_dev_in_use[Max_anon/(8*sizeof(unsigned long))];
+static struct idr unnamed_dev_idr;
 static spinlock_t unnamed_dev_lock = SPIN_LOCK_UNLOCKED;/* protects the above */
 
 int set_anon_super(struct super_block *s, void *data)
 {
 	int dev;
+
 	spin_lock(&unnamed_dev_lock);
-	dev = find_first_zero_bit(unnamed_dev_in_use, Max_anon);
-	if (dev == Max_anon) {
+	if (idr_pre_get(&unnamed_dev_idr, GFP_ATOMIC) == 0) {
 		spin_unlock(&unnamed_dev_lock);
-		return -EMFILE;
+		return -ENOMEM;
 	}
-	set_bit(dev, unnamed_dev_in_use);
+	dev = idr_get_new(&unnamed_dev_idr, NULL);
 	spin_unlock(&unnamed_dev_lock);
-	s->s_dev = MKDEV(0, dev);
+
+	if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
+		idr_remove(&unnamed_dev_idr, dev);
+		return -EMFILE;
+	}
+	s->s_dev = MKDEV(0, dev & MINORMASK);
 	return 0;
 }
 
@@ -559,13 +565,19 @@
 void kill_anon_super(struct super_block *sb)
 {
 	int slot = MINOR(sb->s_dev);
+
 	generic_shutdown_super(sb);
 	spin_lock(&unnamed_dev_lock);
-	clear_bit(slot, unnamed_dev_in_use);
+	idr_remove(&unnamed_dev_idr, slot);
 	spin_unlock(&unnamed_dev_lock);
 }
 
 EXPORT_SYMBOL(kill_anon_super);
+
+void __init unnamed_dev_init(void)
+{
+	idr_init(&unnamed_dev_idr);
+}
 
 void kill_litter_super(struct super_block *sb)
 {
===== include/linux/fs.h 1.283 vs edited =====
--- 1.283/include/linux/fs.h	Mon Jan 19 15:38:10 2004
+++ edited/include/linux/fs.h	Wed Feb 11 13:43:31 2004
@@ -1045,6 +1045,7 @@
 			void *data);
 struct super_block *get_sb_pseudo(struct file_system_type *, char *,
 			struct super_operations *ops, unsigned long);
+void unnamed_dev_init(void);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
 #define fops_get(fops) \
===== init/main.c 1.119 vs edited =====
--- 1.119/init/main.c	Tue Feb  3 21:28:11 2004
+++ edited/init/main.c	Wed Feb 11 13:43:31 2004
@@ -450,6 +450,7 @@
 	fork_init(num_physpages);
 	proc_caches_init();
 	buffer_init();
+	unnamed_dev_init();
 	security_scaffolding_startup();
 	vfs_caches_init(num_physpages);
 	radix_tree_init();
===== kernel/posix-timers.c 1.26 vs edited =====
--- 1.26/kernel/posix-timers.c	Tue Feb  3 21:35:50 2004
+++ edited/kernel/posix-timers.c	Wed Feb 11 16:26:41 2004
@@ -426,7 +426,7 @@
 
 	spin_lock_init(&new_timer->it_lock);
 	do {
-		if (unlikely(!idr_pre_get(&posix_timers_id))) {
+		if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) {
 			error = -EAGAIN;
 			new_timer->it_id = (timer_t)-1;
 			goto out;

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-12  1:20                           ` Andrew Morton
  2004-02-12  2:22                             ` Tim Hockin
@ 2004-02-12 17:26                             ` Jim Houston
  2004-02-12 18:49                               ` Tim Hockin
  2004-02-12 22:03                               ` Andrew Morton
  1 sibling, 2 replies; 17+ messages in thread
From: Jim Houston @ 2004-02-12 17:26 UTC (permalink / raw)
  To: Andrew Morton; +Cc: thockin, torvalds, viro, linux-kernel, george

On Wed, 2004-02-11 at 20:20, Andrew Morton wrote:
> Tim Hockin <thockin@sun.com> wrote:
> > No, it doesn't store the counter with the id.  They expect you to do that.
> > My best understanding is that thi sis to prevent re-use of the same key.
> > I'm not sure I grok why it is useful.  If you release a key, it should be
> > safe to reuse.  Period.  I assume there was some use case that brought about
> > this "feature" but if so, I don't know what it is.  The big comment about it
> > is just confusing me.
> 
> Maybe Jim can tell us why it's there.  Certainly, the idr interface would
> be more useful if it just returned id's which start from zero.

Hi Andrew, Everyone,

If this new use of idr.c as a sparse bitmap catches on, it might deserve
a new flavor which would not waste the space for the pointer array
at the lowest layer.

When I wrote the original code, I was thinking of allocating process
id values where there is a tradition of allocating sequential values.
 
George Anzinger rewrote most of my code.  The r in idr.c is for
immediate reuse.  His version picks the lowest available bit in the
sparse bitmap.  The RESERVED_BITS comments seem to be stale.

The rational for avoiding immediate reuse of id values is to catch
application errors.   Consider:

	fd1 = open_like_call(...);
	read(fd1,...);
	close(fd1);
	fd2 = open_like_call(...);
	write(fd1...);

If fd2 has a different value than the recently closed fd1, the
error is detected immediately.

Jim Houston - Concurrent Computer Corp.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-12 17:26                             ` Jim Houston
@ 2004-02-12 18:49                               ` Tim Hockin
  2004-02-13  2:01                                 ` Jamie Lokier
  2004-02-12 22:03                               ` Andrew Morton
  1 sibling, 1 reply; 17+ messages in thread
From: Tim Hockin @ 2004-02-12 18:49 UTC (permalink / raw)
  To: Jim Houston; +Cc: Andrew Morton, torvalds, viro, linux-kernel, george

On Thu, Feb 12, 2004 at 12:26:14PM -0500, Jim Houston wrote:
> > Maybe Jim can tell us why it's there.  Certainly, the idr interface would
> > be more useful if it just returned id's which start from zero.
> 
> Hi Andrew, Everyone,
> 
> If this new use of idr.c as a sparse bitmap catches on, it might deserve
> a new flavor which would not waste the space for the pointer array
> at the lowest layer.

the only place I found using idr as-is is posix timers.  I haven't looked at
it's usage pattern much, but I assume it does use the pointers.  I guess
we're using up sizeof(void *) for every id we allocate, which is yuck.

Do we need to clone idr.c into bitmap.c and simplify?

> George Anzinger rewrote most of my code.  The r in idr.c is for
> immediate reuse.  His version picks the lowest available bit in the

That is the behavior that makes most sense, to me.

> The rational for avoiding immediate reuse of id values is to catch
> application errors.   Consider:
> 
> 	fd1 = open_like_call(...);
> 	read(fd1,...);
> 	close(fd1);
> 	fd2 = open_like_call(...);
> 	write(fd1...);
> 
> If fd2 has a different value than the recently closed fd1, the
> error is detected immediately.

Is that really worth working around in such a gross way?  No offense to the
idea, but that's a pretty dumb bug to be hacking a failsafe for :)

-- 
Tim Hockin
Sun Microsystems, Linux Software Engineering
thockin@sun.com
All opinions are my own, not Sun's

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-12 17:26                             ` Jim Houston
  2004-02-12 18:49                               ` Tim Hockin
@ 2004-02-12 22:03                               ` Andrew Morton
  2004-02-13  1:12                                 ` George Anzinger
  1 sibling, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2004-02-12 22:03 UTC (permalink / raw)
  To: Jim Houston; +Cc: thockin, torvalds, viro, linux-kernel, george

Jim Houston <jim.houston@ccur.com> wrote:
>
> On Wed, 2004-02-11 at 20:20, Andrew Morton wrote:
> > Tim Hockin <thockin@sun.com> wrote:
> > > No, it doesn't store the counter with the id.  They expect you to do that.
> > > My best understanding is that thi sis to prevent re-use of the same key.
> > > I'm not sure I grok why it is useful.  If you release a key, it should be
> > > safe to reuse.  Period.  I assume there was some use case that brought about
> > > this "feature" but if so, I don't know what it is.  The big comment about it
> > > is just confusing me.
> > 
> > Maybe Jim can tell us why it's there.  Certainly, the idr interface would
> > be more useful if it just returned id's which start from zero.
> 
> Hi Andrew, Everyone,
> 
> If this new use of idr.c as a sparse bitmap catches on,

I think it should catch on - it is a fairly common kernel requirement.  The
max_anon thing requires it, and I am also pressing it upon the scsi guys to
handle enormous numbers of disks (depends on how they end up doing that). 
In neither case is the associated pointer needed.

> When I wrote the original code, I was thinking of allocating process
> id values where there is a tradition of allocating sequential values.

File descriptors are like that too.

> George Anzinger rewrote most of my code.  The r in idr.c is for
> immediate reuse.  His version picks the lowest available bit in the
> sparse bitmap.  The RESERVED_BITS comments seem to be stale.
> 
> The rational for avoiding immediate reuse of id values is to catch
> application errors.   Consider:
> 
> 	fd1 = open_like_call(...);
> 	read(fd1,...);
> 	close(fd1);
> 	fd2 = open_like_call(...);
> 	write(fd1...);
> 
> If fd2 has a different value than the recently closed fd1, the
> error is detected immediately.
> 

In this case the debug capability is getting in the way of real-world
requirements, which is not good.

idr_pre_get() is not very good IMO.  For a start, it's racy:

	idr_pre_get();
	lock();
	idr_get_new();
	unlock();

how do we know that some other CPU didn't come in and steal our
preallocation?  That's why I (buggily) converted unnamed_dev_lock from a
spinlock to a semaphore, so we could perform the preallocation under the
same locking.

It would be better, and more idiomatic if idr_get_new() were to take a gfp
mask and to perform its own allocation.  That has its own problems and if
the code is under really heavy stress one might need to emulate
radix_tree_preload()/radix_tree_preload_end(), but for most things that's a
bit over the top.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-12 22:03                               ` Andrew Morton
@ 2004-02-13  1:12                                 ` George Anzinger
  0 siblings, 0 replies; 17+ messages in thread
From: George Anzinger @ 2004-02-13  1:12 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jim Houston, thockin, torvalds, viro, linux-kernel

Andrew Morton wrote:
> Jim Houston <jim.houston@ccur.com> wrote:
> 
>>On Wed, 2004-02-11 at 20:20, Andrew Morton wrote:
>>
>>>Tim Hockin <thockin@sun.com> wrote:
>>>
>>>>No, it doesn't store the counter with the id.  They expect you to do that.
>>>>My best understanding is that thi sis to prevent re-use of the same key.
>>>>I'm not sure I grok why it is useful.  If you release a key, it should be
>>>>safe to reuse.  Period.  I assume there was some use case that brought about
>>>>this "feature" but if so, I don't know what it is.  The big comment about it
>>>>is just confusing me.
>>>
>>>Maybe Jim can tell us why it's there.  Certainly, the idr interface would
>>>be more useful if it just returned id's which start from zero.
>>
>>Hi Andrew, Everyone,
>>
>>If this new use of idr.c as a sparse bitmap catches on,
> 
> 
> I think it should catch on - it is a fairly common kernel requirement.  The
> max_anon thing requires it, and I am also pressing it upon the scsi guys to
> handle enormous numbers of disks (depends on how they end up doing that). 
> In neither case is the associated pointer needed.
> 
> 
>>When I wrote the original code, I was thinking of allocating process
>>id values where there is a tradition of allocating sequential values.
> 
> 
> File descriptors are like that too.
> 
> 
>>George Anzinger rewrote most of my code.  The r in idr.c is for
>>immediate reuse.  His version picks the lowest available bit in the
>>sparse bitmap.  The RESERVED_BITS comments seem to be stale.
>>
>>The rational for avoiding immediate reuse of id values is to catch
>>application errors.   Consider:
>>
>>	fd1 = open_like_call(...);
>>	read(fd1,...);
>>	close(fd1);
>>	fd2 = open_like_call(...);
>>	write(fd1...);
>>
>>If fd2 has a different value than the recently closed fd1, the
>>error is detected immediately.
>>
> 
> 
> In this case the debug capability is getting in the way of real-world
> requirements, which is not good.
> 
> idr_pre_get() is not very good IMO.  For a start, it's racy:
> 
> 	idr_pre_get();
> 	lock();
> 	idr_get_new();
> 	unlock();
> 
> how do we know that some other CPU didn't come in and steal our
> preallocation?  That's why I (buggily) converted unnamed_dev_lock from a
> spinlock to a semaphore, so we could perform the preallocation under the
> same locking.
> 
> It would be better, and more idiomatic if idr_get_new() were to take a gfp
> mask and to perform its own allocation.  That has its own problems and if
> the code is under really heavy stress one might need to emulate
> radix_tree_preload()/radix_tree_preload_end(), but for most things that's a
> bit over the top.


Another option might be to use a pre allocate pool size based on the number of 
CPUs.  This does not require CPU*n elements as it is is a tree and "n" here is 
worst case so something like "n+#CPUs" elements would be enough.  (n, by the 
way, is the maximum number of levels in the tree).  Its a bit sloppy but, IMHO, 
would survive almost all storms.
> 
> 
> 

-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: PATCH - raise max_anon limit
  2004-02-12 18:49                               ` Tim Hockin
@ 2004-02-13  2:01                                 ` Jamie Lokier
  0 siblings, 0 replies; 17+ messages in thread
From: Jamie Lokier @ 2004-02-13  2:01 UTC (permalink / raw)
  To: Tim Hockin
  Cc: Jim Houston, Andrew Morton, torvalds, viro, linux-kernel, george

Tim Hockin wrote:
> > The rational for avoiding immediate reuse of id values is to catch
> > application errors.   Consider:
> > 
> > 	fd1 = open_like_call(...);
> > 	read(fd1,...);
> > 	close(fd1);
> > 	fd2 = open_like_call(...);
> > 	write(fd1...);
> > 
> > If fd2 has a different value than the recently closed fd1, the
> > error is detected immediately.
> 
> Is that really worth working around in such a gross way?  No offense to the
> idea, but that's a pretty dumb bug to be hacking a failsafe for :)

I'm pretty sure POSIX requires fd2 to be equal to fd1 if it is the
lowest free file descriptor number.

Unfortunately.  An O(1) fd allocation algorithm would be nice.

-- Jamie

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2004-02-13  2:01 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-02-06 22:15 PATCH - raise max_anon limit Tim Hockin
2004-02-07  8:55 ` Andrew Morton
2004-02-07  9:48   ` viro
2004-02-11 20:33     ` Tim Hockin
2004-02-11 20:38       ` Linus Torvalds
2004-02-11 21:09         ` Tim Hockin
2004-02-11 21:53           ` Andrew Morton
2004-02-11 22:28             ` Tim Hockin
2004-02-11 22:48               ` Andrew Morton
     [not found]                 ` <20040211233852.GN9155@sun.com>
     [not found]                   ` <20040211155754.5068332c.akpm@osdl.org>
     [not found]                     ` <20040212003840.GO9155@sun.com>
     [not found]                       ` <20040211164233.5f233595.akpm@osdl.org>
2004-02-12  1:08                         ` Tim Hockin
2004-02-12  1:20                           ` Andrew Morton
2004-02-12  2:22                             ` Tim Hockin
2004-02-12 17:26                             ` Jim Houston
2004-02-12 18:49                               ` Tim Hockin
2004-02-13  2:01                                 ` Jamie Lokier
2004-02-12 22:03                               ` Andrew Morton
2004-02-13  1:12                                 ` George Anzinger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox