public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Increase number of dynamic inodes in procfs (2.6.5)
@ 2004-04-13 19:36 Nathan Lynch
  2004-04-14  0:06 ` Andrew Morton
  0 siblings, 1 reply; 9+ messages in thread
From: Nathan Lynch @ 2004-04-13 19:36 UTC (permalink / raw)
  To: linux-kernel; +Cc: Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 902 bytes --]

Hi-

On some larger ppc64 configurations /proc/device-tree is exhausting 
procfs' dynamic (non-pid) inode range (16K).  This patch makes the 
dynamic inode range 0xf0000000-0xffffffff and changes the inode number 
allocator to use a growable linked list of bitmaps.  Smaller 
configurations are unlikely to have a need for growing the bitmap list 
beyond the initial reservation of 4096 bits, which should reduce their 
exposure to the change.

The number of dynamic entries we need to be able to support is in the 
hundreds of thousands, so extending the existing range 
(00001000-00001fff) upwards would have collided with the pid range.  The 
range I have chosen should be more than enough.

This has been tested on ppc64 and i386.  AFAICT it does not affect the 
pid /proc entries or tools that use them (top, gdb).

Patch applies cleanly to 2.6.5 and 2.6.5-mm4.  Please cc me on replies.


Nathan

[-- Attachment #2: procfs_many_inodes.patch --]
[-- Type: text/x-patch, Size: 6833 bytes --]

diff -pru linux-2.6.5/fs/proc/generic.c linux-2.6.5.new/fs/proc/generic.c
--- linux-2.6.5/fs/proc/generic.c	2004-04-13 11:33:18.000000000 -0500
+++ linux-2.6.5.new/fs/proc/generic.c	2004-04-13 11:52:17.000000000 -0500
@@ -275,24 +275,106 @@ static int xlate_proc_name(const char *n
 	return 0;
 }
 
-static unsigned long proc_alloc_map[(PROC_NDYNAMIC + BITS_PER_LONG - 1) / BITS_PER_LONG];
+/*
+ * Some systems need *lots* of proc entries.  So the proc inode map is
+ * a growable linked list of bitmaps.  Smaller systems are unlikely to
+ * need to grow the map.
+ */
+
+#define PROC_DYNAMIC_FIRST 0xF0000000UL
+#define PROC_DYNAMIC_LAST  0xFFFFFFFFUL
+#define PROC_BITS_PER_MAP  4096
+#define PROC_MAX_BITMAPS   ((PROC_DYNAMIC_LAST - PROC_DYNAMIC_FIRST) / PROC_BITS_PER_MAP + 1)
+
+struct proc_inode_map {
+	int used; /* how many bits are set */
+	struct proc_inode_map *next;
+	unsigned long map[(PROC_BITS_PER_MAP + BITS_PER_LONG - 1) / BITS_PER_LONG];
+};
 
-spinlock_t proc_alloc_map_lock = SPIN_LOCK_UNLOCKED;
+static struct proc_inode_map inode_map;
 
-static int make_inode_number(void)
+DECLARE_MUTEX(proc_alloc_map_sem);
+
+/* Allocate a new inode number, creating a new bitmap if necessary.
+ * Return 0 if we run out of inodes, since that is reserved for the
+ * root inode.
+ */
+static unsigned long get_inode_number(void)
 {
-	int i;
-	spin_lock(&proc_alloc_map_lock);
-	i = find_first_zero_bit(proc_alloc_map, PROC_NDYNAMIC);
-	if (i < 0 || i >= PROC_NDYNAMIC) {
-		i = -1;
+	int bitno; /* bit to set in map */
+	unsigned long ino = 0; /* inode number to return */
+	int map_idx = 0;
+	struct proc_inode_map *map = &inode_map;
+
+	down(&proc_alloc_map_sem);
+
+	/* Find either the first non-empty map, or the last map */
+	while (PROC_BITS_PER_MAP == map->used && map->next) {
+		map_idx++;
+		map = map->next;
+	}
+
+	/* Check for overflow */
+	if (map_idx == PROC_MAX_BITMAPS-1 && map->used == PROC_BITS_PER_MAP) {
+		printk(KERN_WARNING "procfs ran out of inodes!\n");
 		goto out;
 	}
-	set_bit(i, proc_alloc_map);
-	i += PROC_DYNAMIC_FIRST;
+
+	/* Allocate a new map if the last one is full */
+	if (PROC_BITS_PER_MAP == map->used) {
+		pr_debug("%s: extending inode map\n", __FUNCTION__);
+		map->next = kmalloc(sizeof(*map), GFP_KERNEL);
+		if (!map->next)
+			goto out;
+		map = map->next;
+		memset(map, 0, sizeof(*map));
+		bitno = 0;
+		map_idx++;
+	} else
+		bitno = find_first_zero_bit(map->map, PROC_BITS_PER_MAP);
+
+	BUG_ON(bitno < 0 || bitno >= PROC_BITS_PER_MAP);
+
+	set_bit(bitno, map->map);
+	map->used++;
+
+	ino = (map_idx * PROC_BITS_PER_MAP) + bitno + PROC_DYNAMIC_FIRST;
+
+	pr_debug("%s: setting bit %d in map %d, returning %lx\n",
+		 __FUNCTION__, bitno, map_idx, ino);
+
+	if (PROC_BITS_PER_MAP == map->used) {
+		pr_debug("%s: map #%d has filled\n", __FUNCTION__, map_idx);
+	}
+
 out:
-	spin_unlock(&proc_alloc_map_lock);
-	return i;
+	up(&proc_alloc_map_sem);
+	return ino;
+}
+
+static void release_inode_number(unsigned long inode)
+{
+	struct proc_inode_map *map = &inode_map;
+	int map_idx = (inode - PROC_DYNAMIC_FIRST) / PROC_BITS_PER_MAP;
+	int bitno = (inode - PROC_DYNAMIC_FIRST) % PROC_BITS_PER_MAP;
+
+	BUG_ON(bitno < 0 || bitno >= PROC_BITS_PER_MAP);
+
+	down(&proc_alloc_map_sem);
+
+	pr_debug("%s: releasing inode %lu, bit %d, map %d\n",
+		 __FUNCTION__, inode, bitno, map_idx);
+
+	while (map_idx--)
+		map = map->next;
+
+	clear_bit(bitno, map->map);
+	map->used--;
+
+	BUG_ON(map->used < 0 || map->used > PROC_BITS_PER_MAP);
+
+	up(&proc_alloc_map_sem);
 }
 
 static int
@@ -452,10 +534,10 @@ static struct inode_operations proc_dir_
 
 static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
 {
-	int	i;
+	unsigned long i;
 	
-	i = make_inode_number();
-	if (i < 0)
+	i = get_inode_number();
+	if (i == 0)
 		return -EAGAIN;
 	dp->low_ino = i;
 	dp->next = dir->subdir;
@@ -621,13 +703,13 @@ struct proc_dir_entry *create_proc_entry
 
 void free_proc_entry(struct proc_dir_entry *de)
 {
-	int ino = de->low_ino;
+	unsigned long ino = de->low_ino;
 
-	if (ino < PROC_DYNAMIC_FIRST ||
-	    ino >= PROC_DYNAMIC_FIRST+PROC_NDYNAMIC)
+	if (ino < PROC_DYNAMIC_FIRST)
 		return;
 	if (S_ISLNK(de->mode) && de->data)
 		kfree(de->data);
+	release_inode_number(ino);
 	kfree(de);
 }
 
@@ -653,8 +735,6 @@ void remove_proc_entry(const char *name,
 		de->next = NULL;
 		if (S_ISDIR(de->mode))
 			parent->nlink--;
-		clear_bit(de->low_ino - PROC_DYNAMIC_FIRST,
-			  proc_alloc_map);
 		proc_kill_inodes(de);
 		de->nlink = 0;
 		WARN_ON(de->subdir);
diff -pru linux-2.6.5/fs/proc/inode-alloc.txt linux-2.6.5.new/fs/proc/inode-alloc.txt
--- linux-2.6.5/fs/proc/inode-alloc.txt	2004-01-09 00:59:26.000000000 -0600
+++ linux-2.6.5.new/fs/proc/inode-alloc.txt	2004-04-13 11:41:11.000000000 -0500
@@ -4,9 +4,10 @@ Current inode allocations in the proc-fs
   00000001-00000fff	static entries	(goners)
        001		root-ino
 
-  00001000-00001fff	dynamic entries
+  00001000-00001fff	unused
   0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
-  80000000-ffffffff	unused
+  80000000-efffffff	unused
+  f0000000-ffffffff	dynamic entries
 
 Goal:
 	a) once we'll split the thing into several virtual filesystems we
diff -pru linux-2.6.5/fs/proc/inode.c linux-2.6.5.new/fs/proc/inode.c
--- linux-2.6.5/fs/proc/inode.c	2004-04-13 11:33:18.000000000 -0500
+++ linux-2.6.5.new/fs/proc/inode.c	2004-04-13 11:41:11.000000000 -0500
@@ -181,7 +181,7 @@ static int parse_options(char *options,u
 	return 1;
 }
 
-struct inode * proc_get_inode(struct super_block * sb, int ino,
+struct inode * proc_get_inode(struct super_block * sb, unsigned long ino,
 				struct proc_dir_entry * de)
 {
 	struct inode * inode;
diff -pru linux-2.6.5/include/linux/proc_fs.h linux-2.6.5.new/include/linux/proc_fs.h
--- linux-2.6.5/include/linux/proc_fs.h	2004-04-13 11:33:20.000000000 -0500
+++ linux-2.6.5.new/include/linux/proc_fs.h	2004-04-13 11:41:11.000000000 -0500
@@ -24,11 +24,6 @@ enum {
 	PROC_ROOT_INO = 1,
 };
 
-/* Finally, the dynamically allocatable proc entries are reserved: */
-
-#define PROC_DYNAMIC_FIRST 4096
-#define PROC_NDYNAMIC      16384
-
 #define PROC_SUPER_MAGIC 0x9fa0
 
 /*
@@ -53,7 +48,7 @@ typedef	int (write_proc_t)(struct file *
 typedef int (get_info_t)(char *, char **, off_t, int);
 
 struct proc_dir_entry {
-	unsigned short low_ino;
+	unsigned long low_ino;
 	unsigned short namelen;
 	const char *name;
 	mode_t mode;
@@ -102,7 +97,7 @@ extern void remove_proc_entry(const char
 
 extern struct vfsmount *proc_mnt;
 extern int proc_fill_super(struct super_block *,void *,int);
-extern struct inode * proc_get_inode(struct super_block *, int, struct proc_dir_entry *);
+extern struct inode * proc_get_inode(struct super_block *, unsigned long, struct proc_dir_entry *);
 
 extern int proc_match(int, const char *,struct proc_dir_entry *);
 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Increase number of dynamic inodes in procfs (2.6.5)
  2004-04-13 19:36 [PATCH] Increase number of dynamic inodes in procfs (2.6.5) Nathan Lynch
@ 2004-04-14  0:06 ` Andrew Morton
  2004-04-14  5:01   ` Olof Johansson
  2004-04-15  2:38   ` Nathan Lynch
  0 siblings, 2 replies; 9+ messages in thread
From: Andrew Morton @ 2004-04-14  0:06 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: linux-kernel

Nathan Lynch <nathanl@austin.ibm.com> wrote:
>
> On some larger ppc64 configurations /proc/device-tree is exhausting 
>  procfs' dynamic (non-pid) inode range (16K).  This patch makes the 
>  dynamic inode range 0xf0000000-0xffffffff

OK.

> and changes the inode number 
>  allocator to use a growable linked list of bitmaps.

This open-codes a simple version of lib/idr.c.  Please use lib/idr.c
instead.  There's an example in fs/super.c

This bit:

@@ -535,22 +537,26 @@ void emergency_remount(void)
  * filesystems which don't use real block-devices.  -- jrs
  */
 
-enum {Max_anon = 256};
-static unsigned long unnamed_dev_in_use[Max_anon/(8*sizeof(unsigned long))];
+static struct idr unnamed_dev_idr;
 static spinlock_t unnamed_dev_lock = SPIN_LOCK_UNLOCKED;/* protects the above */
 
 int set_anon_super(struct super_block *s, void *data)
 {
 	int dev;
+
 	spin_lock(&unnamed_dev_lock);
-	dev = find_first_zero_bit(unnamed_dev_in_use, Max_anon);
-	if (dev == Max_anon) {
+	if (idr_pre_get(&unnamed_dev_idr, GFP_ATOMIC) == 0) {
 		spin_unlock(&unnamed_dev_lock);
-		return -EMFILE;
+		return -ENOMEM;
 	}
-	set_bit(dev, unnamed_dev_in_use);
+	dev = idr_get_new(&unnamed_dev_idr, NULL);
 	spin_unlock(&unnamed_dev_lock);
-	s->s_dev = MKDEV(0, dev);
+
+	if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
+		idr_remove(&unnamed_dev_idr, dev);
+		return -EMFILE;
+	}
+	s->s_dev = MKDEV(0, dev & MINORMASK);
 	return 0;
 }
 
@@ -559,14 +565,20 @@ EXPORT_SYMBOL(set_anon_super);
 void kill_anon_super(struct super_block *sb)
 {
 	int slot = MINOR(sb->s_dev);
+
 	generic_shutdown_super(sb);
 	spin_lock(&unnamed_dev_lock);
-	clear_bit(slot, unnamed_dev_in_use);
+	idr_remove(&unnamed_dev_idr, slot);
 	spin_unlock(&unnamed_dev_lock);
 }
 
 EXPORT_SYMBOL(kill_anon_super);
 
+void __init unnamed_dev_init(void)
+{
+	idr_init(&unnamed_dev_idr);
+}
+
 void kill_litter_super(struct super_block *sb)
 {
 	if (sb->s_root)


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Increase number of dynamic inodes in procfs (2.6.5)
  2004-04-14  0:06 ` Andrew Morton
@ 2004-04-14  5:01   ` Olof Johansson
  2004-04-14  5:06     ` Olof Johansson
  2004-04-14  5:19     ` Andrew Morton
  2004-04-15  2:38   ` Nathan Lynch
  1 sibling, 2 replies; 9+ messages in thread
From: Olof Johansson @ 2004-04-14  5:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nathan Lynch, linux-kernel

On Tue, 13 Apr 2004, Andrew Morton wrote:

> > and changes the inode number
> >  allocator to use a growable linked list of bitmaps.
>
> This open-codes a simple version of lib/idr.c.  Please use lib/idr.c
> instead.  There's an example in fs/super.c

The drawback of using lib/idr.c is the increased memory consumption for
keeping track of the pointers that are not used. For 100k entries that's
800KB lost (64-bit arch).

I've abstracted out Martin Bligh's and Ingo Molnar's scalable bitmap
allocator that is used by the PID allocator. I was planning on using it
for allocations of mmu_context ids on ppc64, but it seems like it could be
usable in this case too.

The code is a bit rough still, but if others could put it to use I
wouldn't mind cleaning it up. Snapshot inlined below, including the
conversion of the pid allocator to use it. I've kicked it around a bit but
not given it any serious testing yet. Comments are welcome.


-Olof

Olof Johansson                                        Office: 4F005/905
Linux on Power Development                            IBM Systems Group
Email: olof@austin.ibm.com                          Phone: 512-838-9858
All opinions are my own and not those of IBM

diff -Nru a/include/linux/largemap.h b/include/linux/largemap.h
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/include/linux/largemap.h	Wed Mar 17 09:48:58 2004
@@ -0,0 +1,67 @@
+/*
+ * include/linux/largemap.h
+ * Scalable, time-bounded bitmap allocator.
+ *
+ * Based on the PID allocator:
+ *
+ * (C) 2002 William Irwin, IBM
+ * (C) 2002 Ingo Molnar, Red Hat
+ *
+ * Broken out into separate module, minor tweaks:
+ * Copyright (C) 2004 Olof Johansson, IBM Corporation
+ */
+
+#ifndef _LINUX_LARGEMAP_H
+#define _LINUX_LARGEMAP_H
+
+#include <asm/atomic.h>
+#include <asm/page.h>
+#include <asm/spinlock.h>
+
+/*
+ * block pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way a low limit
+ * value does not cause lots of bitmaps to be allocated, but
+ * the scheme scales to up to 4 million entries, runtime.
+ */
+
+struct lm_block {
+	atomic_t nr_free;
+	void    *page;
+};
+
+struct largemap {
+	unsigned int    lm_last;     /* Last allocated entry */
+	unsigned int    lm_maxlimit; /* Total max size of bitmap */
+	spinlock_t      lm_lock;     /* Protects the array->page entries */
+	struct lm_block lm_array[0]; /* Bigger depending on map size */
+};
+
+
+#define BITS_PER_PAGE		(PAGE_SIZE*8)
+#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
+
+
+/* Allocate one entry, returning the number of it.
+ * limit is the highest entry to check for if available, 0 = no limit.
+ */
+extern long lm_alloc(struct largemap *map, unsigned int limit);
+
+/* Free one entry */
+static inline void lm_free(struct largemap *map, int entry)
+{
+	int offset = entry & BITS_PER_PAGE_MASK;
+	struct lm_block *block = &map->lm_array[entry / BITS_PER_PAGE];
+
+	clear_bit(offset, block->page);
+	atomic_inc(&block->nr_free);
+}
+
+/* Allocate a bitmap. Takes max size and first actual entry to be available */
+extern struct largemap *lm_allocmap(unsigned long maxsize,
+				    unsigned long firstfree);
+
+/* Free a whole bitmap */
+extern void lm_freemap(struct largemap *map);
+
+#endif /* !_LINUX_LARGEMAP_H */
diff -Nru a/kernel/pid.c b/kernel/pid.c
--- a/kernel/pid.c	Wed Mar 17 09:48:57 2004
+++ b/kernel/pid.c	Wed Mar 17 09:48:57 2004
@@ -25,125 +25,27 @@
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/hash.h>
+#include <linux/largemap.h>

 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
 static struct list_head *pid_hash[PIDTYPE_MAX];
 static int pidhash_shift;
+static struct largemap *pidmap;

 int pid_max = PID_MAX_DEFAULT;
 int last_pid;

-#define RESERVED_PIDS		300
-
-#define PIDMAP_ENTRIES		(PID_MAX_LIMIT/PAGE_SIZE/8)
-#define BITS_PER_PAGE		(PAGE_SIZE*8)
-#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
-
-/*
- * PID-map pages start out as NULL, they get allocated upon
- * first use and are never deallocated. This way a low pid_max
- * value does not cause lots of bitmaps to be allocated, but
- * the scheme scales to up to 4 million PIDs, runtime.
- */
-typedef struct pidmap {
-	atomic_t nr_free;
-	void *page;
-} pidmap_t;
-
-static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
-	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
-
-static pidmap_t *map_limit = pidmap_array + PIDMAP_ENTRIES;
-
-static spinlock_t pidmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-
-inline void free_pidmap(int pid)
+void free_pidmap(int pid)
 {
-	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
-	int offset = pid & BITS_PER_PAGE_MASK;
-
-	clear_bit(offset, map->page);
-	atomic_inc(&map->nr_free);
-}
-
-/*
- * Here we search for the next map that has free bits left.
- * Normally the next map has free PIDs.
- */
-static inline pidmap_t *next_free_map(pidmap_t *map, int *max_steps)
-{
-	while (--*max_steps) {
-		if (++map == map_limit)
-			map = pidmap_array;
-		if (unlikely(!map->page)) {
-			unsigned long page = get_zeroed_page(GFP_KERNEL);
-			/*
-			 * Free the page if someone raced with us
-			 * installing it:
-			 */
-			spin_lock(&pidmap_lock);
-			if (map->page)
-				free_page(page);
-			else
-				map->page = (void *)page;
-			spin_unlock(&pidmap_lock);
-
-			if (!map->page)
-				break;
-		}
-		if (atomic_read(&map->nr_free))
-			return map;
-	}
-	return NULL;
+	lm_free(pidmap, pid);
 }

 int alloc_pidmap(void)
 {
-	int pid, offset, max_steps = PIDMAP_ENTRIES + 1;
-	pidmap_t *map;
+	int pid = lm_alloc(pidmap, pid_max);
+	last_pid = pid;

-	pid = last_pid + 1;
-	if (pid >= pid_max)
-		pid = RESERVED_PIDS;
-
-	offset = pid & BITS_PER_PAGE_MASK;
-	map = pidmap_array + pid / BITS_PER_PAGE;
-
-	if (likely(map->page && !test_and_set_bit(offset, map->page))) {
-		/*
-		 * There is a small window for last_pid updates to race,
-		 * but in that case the next allocation will go into the
-		 * slowpath and that fixes things up.
-		 */
-return_pid:
-		atomic_dec(&map->nr_free);
-		last_pid = pid;
-		return pid;
-	}
-
-	if (!offset || !atomic_read(&map->nr_free)) {
-next_map:
-		map = next_free_map(map, &max_steps);
-		if (!map)
-			goto failure;
-		offset = 0;
-	}
-	/*
-	 * Find the next zero bit:
-	 */
-scan_more:
-	offset = find_next_zero_bit(map->page, BITS_PER_PAGE, offset);
-	if (offset >= BITS_PER_PAGE)
-		goto next_map;
-	if (test_and_set_bit(offset, map->page))
-		goto scan_more;
-
-	/* we got the PID: */
-	pid = (map - pidmap_array) * BITS_PER_PAGE + offset;
-	goto return_pid;
-
-failure:
-	return -1;
+	return pid;
 }

 inline struct pid *find_pid(enum pid_type type, int nr)
@@ -219,7 +121,7 @@
 	for (type = 0; type < PIDTYPE_MAX; ++type)
 		if (find_pid(type, nr))
 			return;
-	free_pidmap(nr);
+	lm_free(pidmap, nr);
 }

 task_t *find_task_by_pid(int nr)
@@ -294,13 +196,12 @@
 void __init pidmap_init(void)
 {
 	int i;
-
-	pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
-	set_bit(0, pidmap_array->page);
-	atomic_dec(&pidmap_array->nr_free);
+
+	/* Allocate pid 0 by default */
+	pidmap = lm_allocmap(PID_MAX_LIMIT, 1);

 	/*
-	 * Allocate PID 0, and hash it via all PID types:
+	 * Hash pid 0 via all PID types:
 	 */

 	for (i = 0; i < PIDTYPE_MAX; i++)
diff -Nru a/lib/Makefile b/lib/Makefile
--- a/lib/Makefile	Wed Mar 17 09:48:57 2004
+++ b/lib/Makefile	Wed Mar 17 09:48:57 2004
@@ -6,7 +6,7 @@
 lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
 	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
 	 kobject.o idr.o div64.o parser.o int_sqrt.o \
-	 bitmap.o extable.o
+	 bitmap.o extable.o largemap.o

 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
diff -Nru a/lib/largemap.c b/lib/largemap.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/lib/largemap.c	Wed Mar 17 09:48:57 2004
@@ -0,0 +1,190 @@
+/*
+ * lib/largemap.c
+ * Scalable, time-bounded bitmap allocator.
+ *
+ * Based on the PID allocator:
+ *
+ * (C) 2002 William Irwin, IBM
+ * (C) 2002 Ingo Molnar, Red Hat
+ *
+ * Broken out into separate module, minor tweaks:
+ * Copyright (C) 2004 Olof Johansson, IBM Corporation
+ *
+ * We have a list of bitmap pages, which bitmaps represent the
+ * allocation space. Allocating and freeing entries is completely
+ * lockless.
+ * The worst-case allocation scenario when all but one out
+ * of entries possible are allocated already: the scanning of list
+ * entries and at most PAGE_SIZE bytes.
+ * The typical fastpath is a single successful setbit.
+ * Freeing is O(1).
+ */
+
+#include <asm/mmu_context.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/hash.h>
+#include <linux/largemap.h>
+
+/*
+ * Here we search for the next map that has free bits left.
+ * Normally the next map has a free entry.
+ *
+ * Before allocating a new page, we need to re-scan from the beginning
+ * to find pages with free entries first. Otherwise we'll keep growing
+ * as long as the last page in the array is filled, no matter if
+ * there's space before it or not.
+ */
+static inline int find_free_block(struct largemap *map, unsigned int limit)
+{
+	int i;
+	unsigned int start;
+
+	start = map->lm_last / BITS_PER_PAGE;
+
+ rescan:
+	for (i = start; i < limit; i++) {
+		if (!atomic_read(&map->lm_array[i].nr_free))
+			continue;
+
+		if (unlikely(!map->lm_array[i].page)) {
+			unsigned long page;
+
+			if (start != 0) {
+				/* Make sure all pages have been
+				 * searched.
+				 */
+				start = 0;
+				goto rescan;
+			}
+
+			page = get_zeroed_page(GFP_KERNEL);
+			/*
+			 * Free the page if someone raced with us
+			 * installing it:
+			 */
+			spin_lock(&map->lm_lock);
+			if (map->lm_array[i].page)
+				free_page(page);
+			else
+				map->lm_array[i].page = (void *)page;
+			spin_unlock(&map->lm_lock);
+
+			if (!map->lm_array[i].page) {
+				/* Allocation failed */
+				return -ENOMEM;
+			}
+		}
+		return i;
+	}
+
+	if (start != 0) {
+		start = 0;
+		goto rescan;
+	} else
+		return -ENOSPC;
+}
+
+long lm_alloc(struct largemap *map, unsigned int limit)
+{
+	int entry, offset, nextblock;
+	struct lm_block *block;
+
+	if (!limit)
+		limit = map->lm_maxlimit;
+
+	entry = map->lm_last + 1;
+	if (entry >= limit)
+		entry = 0;
+
+	BUG_ON(limit > map->lm_maxlimit);
+
+	offset = entry & BITS_PER_PAGE_MASK;
+	block = &map->lm_array[entry / BITS_PER_PAGE];
+
+	if (likely(block->page && !test_and_set_bit(offset, block->page))) {
+		/*
+		 * There is a small window for lm_last updates to race,
+		 * but in that case the next allocation will go into the
+		 * slowpath and that fixes things up.
+		 */
+return_entry:
+		atomic_dec(&block->nr_free);
+		map->lm_last = entry;
+		return entry;
+	}
+
+	if (!block->page || !atomic_read(&block->nr_free)) {
+next_block:
+		nextblock = find_free_block(map, (limit + BITS_PER_PAGE + 1) /
+					         BITS_PER_PAGE);
+		if (nextblock < 0)
+			goto failure;
+
+		block = &map->lm_array[nextblock];
+		offset = 0;
+	}
+
+scan_more:
+	offset = find_next_zero_bit(block->page, BITS_PER_PAGE, offset);
+	if (offset >= BITS_PER_PAGE)
+		goto next_block;
+	if (test_and_set_bit(offset, block->page))
+		goto scan_more;
+
+	/* we found and set a bit */
+	entry = (block - map->lm_array) * BITS_PER_PAGE + offset;
+	goto return_entry;
+
+failure:
+	return -1;
+}
+
+
+struct largemap *lm_allocmap(unsigned long maxsize, unsigned long firstfree)
+{
+	struct largemap *map;
+	int arraysize = (maxsize + PAGE_SIZE*8-1)/PAGE_SIZE/8;
+	int i;
+
+	/* We could get rid of the firstfree <= BITS_PER_PAGE
+	 * limitation by using an offset-member in the structure
+	 * instead, but with all current users firstfree is a low
+	 * value.
+	 */
+	BUG_ON(firstfree > BITS_PER_PAGE || firstfree > maxsize);
+
+	map = kmalloc(sizeof(struct largemap) +
+		      arraysize*sizeof(struct lm_block), GFP_KERNEL);
+
+	map->lm_maxlimit = maxsize;
+	map->lm_last = firstfree ? firstfree-1 : 0;
+	spin_lock_init(&map->lm_lock);
+
+	for (i = 0; i < arraysize; i++) {
+		map->lm_array[i].page = NULL;
+		atomic_set(&map->lm_array[i].nr_free, BITS_PER_PAGE);
+	}
+
+	map->lm_array[0].page = (void *)get_zeroed_page(GFP_KERNEL);
+
+	for (i = 0; i < firstfree; i++) {
+		set_bit(i, map->lm_array[0].page);
+		atomic_dec(&map->lm_array[0].nr_free);
+	}
+
+	return map;
+}
+
+void lm_freemap(struct largemap *map)
+{
+	int i;
+
+	for (i = 0; i < map->lm_maxlimit; i++)
+		if (map->lm_array[i].page)
+			free_page((unsigned long)map->lm_array[i].page);
+
+	kfree(map);
+}



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Increase number of dynamic inodes in procfs (2.6.5)
  2004-04-14  5:01   ` Olof Johansson
@ 2004-04-14  5:06     ` Olof Johansson
  2004-04-14  5:19     ` Andrew Morton
  1 sibling, 0 replies; 9+ messages in thread
From: Olof Johansson @ 2004-04-14  5:06 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nathan Lynch, linux-kernel

On Wed, 14 Apr 2004, Olof Johansson wrote:

> I've abstracted out Martin Bligh's and Ingo Molnar's scalable bitmap

Ooops, of course I mean Bill Irwin's and Ingo Molnar's scalable bitmap.
Sorry about that.


Olof Johansson                                        Office: 4F005/905
Linux on Power Development                            IBM Systems Group
Email: olof@austin.ibm.com                          Phone: 512-838-9858
All opinions are my own and not those of IBM


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Increase number of dynamic inodes in procfs (2.6.5)
  2004-04-14  5:01   ` Olof Johansson
  2004-04-14  5:06     ` Olof Johansson
@ 2004-04-14  5:19     ` Andrew Morton
  1 sibling, 0 replies; 9+ messages in thread
From: Andrew Morton @ 2004-04-14  5:19 UTC (permalink / raw)
  To: Olof Johansson; +Cc: nathanl, linux-kernel

Olof Johansson <olof@austin.ibm.com> wrote:
>
> On Tue, 13 Apr 2004, Andrew Morton wrote:
> 
>  > > and changes the inode number
>  > >  allocator to use a growable linked list of bitmaps.
>  >
>  > This open-codes a simple version of lib/idr.c.  Please use lib/idr.c
>  > instead.  There's an example in fs/super.c
> 
>  The drawback of using lib/idr.c is the increased memory consumption for
>  keeping track of the pointers that are not used. For 100k entries that's
>  800KB lost (64-bit arch).

Which is less than 2% of the space which is used by the 100k inodes.

We've previously discussed allocating the idr pointer array separately, but
the extra cache miss for the users who need it, plus the 2% argument keep
on trumping that.  (We could avoid the cache miss with zero-length-array
tricks, actually).

>  I've abstracted out Martin Bligh's and Ingo Molnar's scalable bitmap
>  allocator that is used by the PID allocator.

Your current requirement does not appear to justify this additional
infrastructure.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Increase number of dynamic inodes in procfs (2.6.5)
  2004-04-14  0:06 ` Andrew Morton
  2004-04-14  5:01   ` Olof Johansson
@ 2004-04-15  2:38   ` Nathan Lynch
  2004-04-15  2:51     ` Andrew Morton
  1 sibling, 1 reply; 9+ messages in thread
From: Nathan Lynch @ 2004-04-15  2:38 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Olof Johansson

[-- Attachment #1: Type: text/plain, Size: 514 bytes --]

Andrew Morton wrote:
> Nathan Lynch <nathanl@austin.ibm.com> wrote:
> 
>>On some larger ppc64 configurations /proc/device-tree is exhausting 
>> procfs' dynamic (non-pid) inode range (16K).  This patch makes the 
>> dynamic inode range 0xf0000000-0xffffffff
>>and changes the inode number 
>> allocator to use a growable linked list of bitmaps.
> 
> This open-codes a simple version of lib/idr.c.  Please use lib/idr.c
> instead.  There's an example in fs/super.c

Ok, thanks for the tip.  Is this better?

Nathan

[-- Attachment #2: procfs_inum_idr.patch --]
[-- Type: text/x-patch, Size: 5624 bytes --]

diff -pru linux-2.6.5-mm4/fs/proc/generic.c linux-2.6.5-mm4.new/fs/proc/generic.c
--- linux-2.6.5-mm4/fs/proc/generic.c	2004-04-14 20:32:34.000000000 -0500
+++ linux-2.6.5-mm4.new/fs/proc/generic.c	2004-04-14 20:34:57.000000000 -0500
@@ -15,6 +15,8 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/idr.h>
 #include <asm/uaccess.h>
 #include <asm/bitops.h>
 
@@ -275,24 +277,51 @@ static int xlate_proc_name(const char *n
 	return 0;
 }
 
-static unsigned long proc_alloc_map[(PROC_NDYNAMIC + BITS_PER_LONG - 1) / BITS_PER_LONG];
+static struct idr proc_inum_idr;
+static spinlock_t proc_inum_lock = SPIN_LOCK_UNLOCKED; /* protects the above */
 
-spinlock_t proc_alloc_map_lock = SPIN_LOCK_UNLOCKED;
+#define PROC_DYNAMIC_FIRST 0xF0000000UL
 
-static int make_inode_number(void)
+void __init init_proc_inum_idr(void)
 {
-	int i;
-	spin_lock(&proc_alloc_map_lock);
-	i = find_first_zero_bit(proc_alloc_map, PROC_NDYNAMIC);
-	if (i < 0 || i >= PROC_NDYNAMIC) {
-		i = -1;
-		goto out;
-	}
-	set_bit(i, proc_alloc_map);
-	i += PROC_DYNAMIC_FIRST;
-out:
-	spin_unlock(&proc_alloc_map_lock);
-	return i;
+	idr_init(&proc_inum_idr);
+}
+
+/*
+ * Return an inode number between PROC_DYNAMIC_FIRST and
+ * 0xffffffff, or zero on failure.
+ */
+static unsigned int get_inode_number(void)
+{
+	unsigned int i, inum = 0;
+
+retry:
+	if (0 == idr_pre_get(&proc_inum_idr, GFP_KERNEL))
+		return 0;
+
+	spin_lock(&proc_inum_lock);
+	i = idr_get_new(&proc_inum_idr, NULL);
+	spin_unlock(&proc_inum_lock);
+
+	if (i == -1)
+		goto retry;
+
+	inum = (i & MAX_ID_MASK) + PROC_DYNAMIC_FIRST;
+
+	/* inum will never be more than 0xf0ffffff, so no check
+	 * for overflow.
+	 */
+
+	return inum;
+}
+
+static void release_inode_number(unsigned int inum)
+{
+	int id = (inum - PROC_DYNAMIC_FIRST) | ~MAX_ID_MASK;
+
+	spin_lock(&proc_inum_lock);
+	idr_remove(&proc_inum_idr, id);
+	spin_unlock(&proc_inum_lock);
 }
 
 static int
@@ -346,7 +375,7 @@ struct dentry *proc_lookup(struct inode 
 			if (de->namelen != dentry->d_name.len)
 				continue;
 			if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
-				int ino = de->low_ino;
+				unsigned int ino = de->low_ino;
 				error = -EINVAL;
 				inode = proc_get_inode(dir->i_sb, ino, de);
 				break;
@@ -452,10 +481,10 @@ static struct inode_operations proc_dir_
 
 static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
 {
-	int	i;
+	unsigned int i;
 	
-	i = make_inode_number();
-	if (i < 0)
+	i = get_inode_number();
+	if (i == 0)
 		return -EAGAIN;
 	dp->low_ino = i;
 	dp->next = dir->subdir;
@@ -621,11 +650,13 @@ struct proc_dir_entry *create_proc_entry
 
 void free_proc_entry(struct proc_dir_entry *de)
 {
-	int ino = de->low_ino;
+	unsigned int ino = de->low_ino;
 
-	if (ino < PROC_DYNAMIC_FIRST ||
-	    ino >= PROC_DYNAMIC_FIRST+PROC_NDYNAMIC)
+	if (ino < PROC_DYNAMIC_FIRST)
 		return;
+
+	release_inode_number(ino);
+
 	if (S_ISLNK(de->mode) && de->data)
 		kfree(de->data);
 	kfree(de);
@@ -653,8 +684,6 @@ void remove_proc_entry(const char *name,
 		de->next = NULL;
 		if (S_ISDIR(de->mode))
 			parent->nlink--;
-		clear_bit(de->low_ino - PROC_DYNAMIC_FIRST,
-			  proc_alloc_map);
 		proc_kill_inodes(de);
 		de->nlink = 0;
 		WARN_ON(de->subdir);
diff -pru linux-2.6.5-mm4/fs/proc/inode-alloc.txt linux-2.6.5-mm4.new/fs/proc/inode-alloc.txt
--- linux-2.6.5-mm4/fs/proc/inode-alloc.txt	2004-01-09 00:59:26.000000000 -0600
+++ linux-2.6.5-mm4.new/fs/proc/inode-alloc.txt	2004-04-14 20:34:57.000000000 -0500
@@ -4,9 +4,10 @@ Current inode allocations in the proc-fs
   00000001-00000fff	static entries	(goners)
        001		root-ino
 
-  00001000-00001fff	dynamic entries
+  00001000-00001fff	unused
   0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
-  80000000-ffffffff	unused
+  80000000-efffffff	unused
+  f0000000-ffffffff	dynamic entries
 
 Goal:
 	a) once we'll split the thing into several virtual filesystems we
diff -pru linux-2.6.5-mm4/fs/proc/inode.c linux-2.6.5-mm4.new/fs/proc/inode.c
--- linux-2.6.5-mm4/fs/proc/inode.c	2004-04-14 20:32:34.000000000 -0500
+++ linux-2.6.5-mm4.new/fs/proc/inode.c	2004-04-14 20:34:57.000000000 -0500
@@ -181,7 +181,7 @@ static int parse_options(char *options,u
 	return 1;
 }
 
-struct inode * proc_get_inode(struct super_block * sb, int ino,
+struct inode * proc_get_inode(struct super_block * sb, unsigned int ino,
 				struct proc_dir_entry * de)
 {
 	struct inode * inode;
diff -pru linux-2.6.5-mm4/include/linux/proc_fs.h linux-2.6.5-mm4.new/include/linux/proc_fs.h
--- linux-2.6.5-mm4/include/linux/proc_fs.h	2004-04-14 20:32:35.000000000 -0500
+++ linux-2.6.5-mm4.new/include/linux/proc_fs.h	2004-04-14 20:34:57.000000000 -0500
@@ -26,9 +26,6 @@ enum {
 
 /* Finally, the dynamically allocatable proc entries are reserved: */
 
-#define PROC_DYNAMIC_FIRST 4096
-#define PROC_NDYNAMIC      16384
-
 #define PROC_SUPER_MAGIC 0x9fa0
 
 /*
@@ -53,7 +50,7 @@ typedef	int (write_proc_t)(struct file *
 typedef int (get_info_t)(char *, char **, off_t, int);
 
 struct proc_dir_entry {
-	unsigned short low_ino;
+	unsigned int low_ino;
 	unsigned short namelen;
 	const char *name;
 	mode_t mode;
@@ -102,7 +99,7 @@ extern void remove_proc_entry(const char
 
 extern struct vfsmount *proc_mnt;
 extern int proc_fill_super(struct super_block *,void *,int);
-extern struct inode * proc_get_inode(struct super_block *, int, struct proc_dir_entry *);
+extern struct inode * proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
 
 extern int proc_match(int, const char *,struct proc_dir_entry *);
 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Increase number of dynamic inodes in procfs (2.6.5)
  2004-04-15  2:38   ` Nathan Lynch
@ 2004-04-15  2:51     ` Andrew Morton
  2004-04-15  3:13       ` Nathan Lynch
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2004-04-15  2:51 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: linux-kernel, olof

Nathan Lynch <nathanl@austin.ibm.com> wrote:
>
> > This open-codes a simple version of lib/idr.c.  Please use lib/idr.c
>  > instead.  There's an example in fs/super.c
> 
>  Ok, thanks for the tip.  Is this better?

Looks OK.  How well tested was it?  Nothing calls init_proc_inum_idr().
Maybe all-zeroes happens to work.

Happily, there's an idr patch in -mm which allows us to initialise these
guys at compile time, so...


diff -puN fs/proc/generic.c~increase-number-of-dynamic-inodes-in-procfs-265-idr-init fs/proc/generic.c
--- 25/fs/proc/generic.c~increase-number-of-dynamic-inodes-in-procfs-265-idr-init	2004-04-14 19:47:41.313236640 -0700
+++ 25-akpm/fs/proc/generic.c	2004-04-14 19:48:42.484937128 -0700
@@ -277,16 +277,11 @@ static int xlate_proc_name(const char *n
 	return 0;
 }
 
-static struct idr proc_inum_idr;
+static DEFINE_IDR(proc_inum_idr);
 static spinlock_t proc_inum_lock = SPIN_LOCK_UNLOCKED; /* protects the above */
 
 #define PROC_DYNAMIC_FIRST 0xF0000000UL
 
-void __init init_proc_inum_idr(void)
-{
-	idr_init(&proc_inum_idr);
-}
-
 /*
  * Return an inode number between PROC_DYNAMIC_FIRST and
  * 0xffffffff, or zero on failure.
@@ -376,6 +371,7 @@ struct dentry *proc_lookup(struct inode 
 				continue;
 			if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
 				unsigned int ino = de->low_ino;
+
 				error = -EINVAL;
 				inode = proc_get_inode(dir->i_sb, ino, de);
 				break;

_


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Increase number of dynamic inodes in procfs (2.6.5)
  2004-04-15  2:51     ` Andrew Morton
@ 2004-04-15  3:13       ` Nathan Lynch
  2004-04-15  3:21         ` Andrew Morton
  0 siblings, 1 reply; 9+ messages in thread
From: Nathan Lynch @ 2004-04-15  3:13 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, olof

Andrew Morton wrote:
> Nathan Lynch <nathanl@austin.ibm.com> wrote:
> 
>>>This open-codes a simple version of lib/idr.c.  Please use lib/idr.c
>>
>> > instead.  There's an example in fs/super.c
>>
>> Ok, thanks for the tip.  Is this better?
> 
> 
> Looks OK.  How well tested was it?  Nothing calls init_proc_inum_idr().
> Maybe all-zeroes happens to work.

Sorry, I tested it all day; it just happens to work :)  Olof just 
pointed the error out to me, too.

I successfully allocated 152371 proc entries using this.

During testing I made sure that release_inode_number was actually 
releasing id's by inserting a call to idr_find before idr_remove, and 
using a dummy token for idr_get_new instead of NULL.  I can pass those 
bits along if you like.

BTW I found the use of idr in kernel/posix-timers.c to be more 
consistent with the comments in lib/idr.c so I emulated that.  I think 
the call to idr_remove in fs/super.c::set_anon_super needs to be holding 
unnamed_dev_lock.

Thanks,
Nathan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Increase number of dynamic inodes in procfs (2.6.5)
  2004-04-15  3:13       ` Nathan Lynch
@ 2004-04-15  3:21         ` Andrew Morton
  0 siblings, 0 replies; 9+ messages in thread
From: Andrew Morton @ 2004-04-15  3:21 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: linux-kernel, olof

Nathan Lynch <nathanl@austin.ibm.com> wrote:
>
> Andrew Morton wrote:
> > Nathan Lynch <nathanl@austin.ibm.com> wrote:
> > 
> >>>This open-codes a simple version of lib/idr.c.  Please use lib/idr.c
> >>
> >> > instead.  There's an example in fs/super.c
> >>
> >> Ok, thanks for the tip.  Is this better?
> > 
> > 
> > Looks OK.  How well tested was it?  Nothing calls init_proc_inum_idr().
> > Maybe all-zeroes happens to work.
> 
> Sorry, I tested it all day; it just happens to work :)

heh.  It gave me an instasplat with spinlock debugging enabled.

> I think 
> the call to idr_remove in fs/super.c::set_anon_super needs to be holding 
> unnamed_dev_lock.

Indeed.  I'll fix that up, thanks.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2004-04-15  3:22 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-04-13 19:36 [PATCH] Increase number of dynamic inodes in procfs (2.6.5) Nathan Lynch
2004-04-14  0:06 ` Andrew Morton
2004-04-14  5:01   ` Olof Johansson
2004-04-14  5:06     ` Olof Johansson
2004-04-14  5:19     ` Andrew Morton
2004-04-15  2:38   ` Nathan Lynch
2004-04-15  2:51     ` Andrew Morton
2004-04-15  3:13       ` Nathan Lynch
2004-04-15  3:21         ` Andrew Morton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox