public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-12 16:15 Jim Houston
  0 siblings, 0 replies; 51+ messages in thread
From: Jim Houston @ 2002-08-12 16:15 UTC (permalink / raw)
  To: frankeh, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1475 bytes --]


I was doing some similar work last week and stumbled on the
this discussion.  I have been working with the Posix timers
patch and was looking for a nice way to allocate ids for
timers.

I looked at get_pid() and also the code used for ipc/msg.c
and friends.  I thought I could do better so I wrote the
attached routines.  They are a subtle variation on the bitmap
theme.  I'm mixing a bitmap with a sparse array which is implemented
as a radix tree.  This means that the data structure
grows/shrinks as needed.  The interface is:

	id = id_new(idp, pointer);
	id_remove(idp, id);
	pointer = id_lookup(idp, id);

The idp is a pointer to the structure which defines the id space.
This provides, not only the bitmap allocation of the id but also, mapping
the id to a pointer.  In the case of pids, this would replace the
hash table used by find_task_by_pid().

So far I have been playing with this code in user space.  I'm including
my test harness.  I plan to try it with your program later today.
My program was written more to test correctness, but I do collect
a few numbers.  Here is the output:

jhouston@linux:~/id > time ./id_test
my_id.count=1
new_cnt=5002501
avr_depth = 4996
id_lookup took 101 cycles
id_new took 204 cycles
id_remove took 148 cycles 
real    0m3.476s

That's about 0.5us total to allocate an id, lookup a random id
and remove it with a table containing 5000 ids.  The times
only double for 500000 ids.

Jim Houston - Concurrent Computer Corporation.

[-- Attachment #2: id.c --]
[-- Type: text/plain, Size: 2825 bytes --]

/*
 * Small id to pointer translation service.  
 *
 * It uses a radix tree like structure as a sparse array indexed 
 * by the id to obtain the pointer.  The bitmap makes allocating
 * an new id quick.  
 */

#include "id.h"

#ifdef __KERNEL__
static kmem_cache_t *id_layer_cache;
#endif

void *id_lookup(struct id *idp, int id)
{
	int n = idp->layers * ID_BITS;
	struct id_layer *p = idp->top;

	if (id >= (1 << n))
		return(NULL);

	while (n > 0 && p) {
		n -= ID_BITS;
		p = p->ary[(id >> n) & ID_MASK];
	}
	return((void *)p);
}

static int sub_alloc(struct id_layer *p, int shift, int id, void *ptr)
{
	int n = (id >> shift) & ID_MASK;
	int bitmap = p->bitmap;
	int id_base = id & ~((1 << (shift+ID_BITS))-1);
	int v;
	
	for ( ; n <= ID_MASK; n++, id = id_base + (n << shift)) {
		if (bitmap & (1 << n))
			continue;
		if (shift == 0) {
			p->ary[n] = (struct id_layer *)ptr;
			p->bitmap |= 1<<n;
			return(id);
		}
		if (!p->ary[n])
			p->ary[n] = alloc_layer();
		if (v = sub_alloc(p->ary[n], shift-ID_BITS, id, ptr)) {
			update_bitmap(p, n);
			return(v);
		}
	}
	return(0);
}

int id_new(struct id *idp, void *ptr)
{
	int n = idp->layers * ID_BITS;
	int last = idp->last;
	struct id_layer **p, *new;
	int id, v;
	
	/*
	 * Add a new layer if the array is full or the last id
	 * was at the limit and we don't want to wrap.
	 */
	if ((last == ((1 << n)-1) && last < idp->min_wrap) ||
		idp->count == (1 << n)) {
		++idp->layers;
		n += ID_BITS;
		new = alloc_layer();
		new->ary[0] = idp->top;
		idp->top = new;
		update_bitmap(new, 0);
	}
	if (last >= ((1 << n)-1))
		last = 0;

	/*
	 * Search for a free id starting after last id allocated.
	 * If that fails wrap back to start.
	 */
	id = last+1;
	if (!(v = sub_alloc(idp->top, n-ID_BITS, id, ptr)))
		v = sub_alloc(idp->top, n-ID_BITS, 1, ptr);
	idp->last = v;
	idp->count++;
	return(v);
}


static int sub_remove(struct id_layer *p, int shift, int id)
{
	int n = (id >> shift) & ID_MASK;
	int i, bitmap, rv;
	
if (!p) {
printf("in sub_remove for id=%d called with null pointer.\n", id);
return(0);
}
	rv = 0;
	bitmap = p->bitmap & ~(1<<n);
	p->bitmap = bitmap;
	if (shift == 0) {
		p->ary[n] = NULL;
		rv = !bitmap;
	} else {
		if (sub_remove(p->ary[n], shift-ID_BITS, id)) {
			free_layer(p->ary[n]);
			p->ary[n] = 0;
			for (i = 0; i < (1 << ID_BITS); i++)
				if (p->ary[i])
					break;
			if (i == (1 << ID_BITS))
				rv = 1;
		}
	}
	return(rv);
}

void id_remove(struct id *idp, int id)
{
	sub_remove(idp->top, (idp->layers-1)*ID_BITS, id);
	idp->count--;
}

void id_init(struct id *idp, int min_wrap)
{
#ifdef __KERNEL__
	id_layer_cache = kmem_cache_create("id_layer_cache", 
		sizeof(struct id_layer), 0, 0, 0, 0);
#endif
	idp->count = 1;
	idp->last = 0;
	idp->layers = 1;
	idp->top = alloc_layer();
	idp->top->bitmap = 1;
	idp->min_wrap = min_wrap;
}


[-- Attachment #3: id.h --]
[-- Type: text/plain, Size: 1289 bytes --]

/*
 * Small id to pointer translation service avoiding fixed sized
 * tables.
 */

#define ID_BITS 5

#define ID_MASK ((1 << ID_BITS)-1)
#define ID_FULL ((1 << (1 << ID_BITS))-1)

struct id_layer {
	unsigned int	bitmap: (1<<ID_BITS);
	struct id_layer	*ary[1<<ID_BITS];
};

struct id {
	int		layers;
	int		last;
	int		count;
	int		min_wrap;
	struct id_layer *top;
};

void *id_lookup(struct id *idp, int id);
int id_new(struct id *idp, void *ptr);
void id_remove(struct id *idp, int id);
void id_init(struct id *idp, int min_wrap);

static inline update_bitmap(struct id_layer *p, int bit)
{
	if (p->ary[bit] && p->ary[bit]->bitmap == (typeof(p->bitmap))-1)
		p->bitmap |= 1<<bit;
	else
		p->bitmap &= ~(1<<bit);
}

#ifndef __KERNEL__

#ifndef NULL
#define NULL 0
#endif

static inline struct id_layer *alloc_layer()
{
	struct id_layer *new;

	new = (struct id_layer *)malloc(sizeof(struct id_layer));
	bzero((void *)new, sizeof(struct id_layer));
	return(new);
	
}

static inline void free_layer(struct id_layer *p)
{
	free((void *)p);
}

#else

extern kmem_cache_t *id_layer_cache;

static inline struct id_layer *alloc_layer()
{
	return(kmem_cache_alloc(id_layer_cache, GFP_KERNEL));
}

static inline void free_layer(struct id_layer *p)
{
	kmem_cache_free(id_layer_cache, p);
}

#endif


[-- Attachment #4: id_test.c --]
[-- Type: text/plain, Size: 2133 bytes --]

/*
 * Test program for id.c
 */

#include <stdlib.h>
#include "id.h"

struct id my_id;

main()
{
	int i, n;
	void *v;

	id_init(&my_id, 10000);
	
	random_test();
}

#define LIST_SZ 50000

struct id_list {
	int id;
	int ptr;
} id_list[LIST_SZ];

int list_cnt;
int new_cnt;
int max = 5000;
int count = 10000000;

long long avr_depth;

#define rdtsc(low,high) \
     __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
  
static inline unsigned long get_tsc(void)
{
	register unsigned long eax, edx;

	rdtsc(eax,edx);
	return(eax);
}



random_test()
{
	int n, i;
	void *v;
	unsigned long t1, t2, t3, t4;

	for (n = 0; n < count; n++) {
		/*
		 * favor insertion so we will tend to run will max
		 * id's active.
		 */
		if (list_cnt && (list_cnt > max || rand() < (RAND_MAX/4))) {
			i = rand() % list_cnt;
			v = id_lookup(&my_id, id_list[i].id);
			if ((int)v != id_list[i].ptr) {
				printf("list_cnt=%d, i=%d\n", list_cnt, i);
				printf("failed id=%d, expected %d got %d\n",
					id_list[i].id, id_list[i].ptr, (int)v);
			} else {
#if 0
				printf("rm id=%d, ptr=%d\n",
					id_list[i].id, id_list[i].ptr);
#endif
				id_remove(&my_id, id_list[i].id);
			}
			id_list[i] = id_list[--list_cnt];
		} else {
			new_cnt++;
			id_list[list_cnt].id = id_new(&my_id, (void *)new_cnt);
			id_list[list_cnt].ptr = new_cnt;
#if 0
			printf("ins id=%d, ptr=%d\n",
				id_list[list_cnt].id, id_list[list_cnt].ptr);
#endif
			list_cnt++;
			avr_depth += list_cnt;
		}
	}
t1 = get_tsc();
id_lookup(&my_id, id_list[0].id);
t2 = get_tsc();
n = id_new(&my_id, (void *)++new_cnt);
t3 = get_tsc();
id_remove(&my_id, n);
t4 = get_tsc();

	for (i = 0; i < list_cnt; i++) {
		v = id_lookup(&my_id, id_list[i].id);
		if ((int)v != id_list[i].ptr) {
			printf("failed id=%d, expected %d got %d\n",
				id_list[i].id, id_list[i], (int)v);
		}
		id_remove(&my_id, id_list[i].id);
	}
	printf("my_id.count=%d\n", my_id.count);
	printf("new_cnt=%d\n", new_cnt);
	printf("avr_depth = %d\n", (int)(avr_depth/new_cnt));
	printf("id_lookup took %d cycles\n", t2-t1);
	printf("id_new took %d cycles\n", t3-t2);
	printf("id_remove took %d cycles\n", t4-t3);
}

^ permalink raw reply	[flat|nested] 51+ messages in thread
* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-09  0:53 Hubertus Franke
  0 siblings, 0 replies; 51+ messages in thread
From: Hubertus Franke @ 2002-08-09  0:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andries Brouwer, Andrew Morton, andrea, davej, lkml, Paul Larson,
	Rik van Riel

                                                                                                               
                                                                                                               
                                                                                                               


I package my stuff up that allows to pull this into user space and run much
more efficient
pid allocation test for performance....
I'll also include the comparision numbers.  Currently remote, so I don't
have access to
that info.
Based on that though it seems with random pid deletion (we surely could
argue about
that one) there still seems reasonable benefits to "consider" a twolevel
algo.
More tomorrow.

Hubertus Franke
Enterprise Linux Group (Mgr),  Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: frankeh@us.ibm.com
(w) 914-945-2003    (fax) 914-945-4425   TL: 862-2003




                                                                                                                                     
                      Linus Torvalds                                                                                                 
                      <torvalds@transme        To:       Hubertus Franke/Watson/IBM@IBMUS                                            
                      ta.com>                  cc:       Rik van Riel <riel@conectiva.com.br>, Andries Brouwer <aebr@win.tue.nl>,    
                                                Andrew Morton <akpm@zip.com.au>, <andrea@suse.de>, <davej@suse.de>, lkml <linux-     
                      08/08/2002 06:26          kernel@vger.kernel.org>, Paul Larson <plars@austin.ibm.com>                          
                      PM                       Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 
                                                                                                                                     
                                                                                                                                     
                                                                                                                                     




On Thu, 8 Aug 2002, Linus Torvalds wrote:
>
> So let's just try Andries approach, suggested patch as follows..

"ps" seems to do ok from a visual standpoint at least up to 99 million.
Maybe it won't look that good after that, I'm too lazy to test.

The following trivial program is useful for efficiently allocating pid
numbers without blowing chunks on the VM subsystem and spending all the
time on page table updates - for people who want to test (look out: I've
got dual 2.4GHz CPU's with HT, so getting up to 10+ million was easy, your
milage may wary and at some point you should just compile a kernel that
starts higher ;).

                         Linus

---
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

int main()
{
        int i;
        for (i = 1; i < 250000; i++) {
                if (!vfork())
                        exit(1);
                if (waitpid(-1, NULL, WNOHANG) < 0)
                        perror("waitpid");
        }
        return 0;
}






^ permalink raw reply	[flat|nested] 51+ messages in thread
* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-08 21:50 Hubertus Franke
  0 siblings, 0 replies; 51+ messages in thread
From: Hubertus Franke @ 2002-08-08 21:50 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, andrea, davej, lkml, Paul Larson

                                                                                                               
                                                                                                               
                                                                                                               


Agreed ....
The mark-and-sweep is the correct method for 16-bits ! For 32-bits its not
possible !
Two level is over-engineered (as I told Paul).

If you want I can post the data that I collected comparing
(a) stock kernel
(b) my mark and sweep
(c) my double mark and sweep (try looking forward and then scan task list)
?

Let me know if you are interested.
That should clear up the issue quickly giving you some average allocation
times etc.....
as a function of random used pids.



Hubertus Franke
Enterprise Linux Group (Mgr),  Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: frankeh@us.ibm.com
(w) 914-945-2003    (fax) 914-945-4425   TL: 862-2003




                                                                                                                                     
                      Linus Torvalds                                                                                                 
                      <torvalds@transme        To:       Andrew Morton <akpm@zip.com.au>                                             
                      ta.com>                  cc:       Paul Larson <plars@austin.ibm.com>, lkml <linux-kernel@vger.kernel.org>,    
                                                <davej@suse.de>, Hubertus Franke/Watson/IBM@IBMUS, <andrea@suse.de>                  
                      08/08/2002 04:24         Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 
                      PM                                                                                                             
                                                                                                                                     
                                                                                                                                     




On Wed, 7 Aug 2002, Andrew Morton wrote:
>
> Has this been evaluated against Bill Irwin's constant-time
> allocator?  Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one.   Plus it's much nicer
> looking code.

Guys, this discussion is getting ridiculous.

Doing a bit allocator should be trivial, but it's hard to know when a bit
is to be free'd. You can't just do it at "exit()" time, because even if
pid X exits, that doesn't mean that X can be re-used: it may still be used
as a pgid or a tid by some other process Y.

So if you really want to take this approach, you need to count the uses of
"pid X", and free the bitmap entry only when that count goes to zero. I
see no such logic in Bill Irwin's code, only a comment about last use
(which doesn't explain how to notice that last use).

Without that per-pid-count thing clarified, I don't think the (otherwise
fairly straightforward) approach of Bills really flies.

For that reason I think the mark-and-sweep thing is the right thing to do,
but I think the two-level algorithm is just over-engineering and not worth
it. And I do hate that getpid_mutex thing. Having a blocking lock for
something as simple as pid allocation just smells horribly wrong to me.

Moving the pid allocation to later (so that it doesn't need to handle
operations that block in between allocation and "we're done" and the pid
allocation can be a spinlock) sounds like a fairly obvious thing to do.

I don't see why we would need the "pid" until the very last moment, at
which point we already have the tasklist lock, in fact.

And I hate overengineering.

                         Linus





^ permalink raw reply	[flat|nested] 51+ messages in thread
* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-08 21:43 Hubertus Franke
  2002-08-08 22:02 ` Linus Torvalds
  0 siblings, 1 reply; 51+ messages in thread
From: Hubertus Franke @ 2002-08-08 21:43 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Andries Brouwer, Andrew Morton, andrea, davej, lkml, Paul Larson,
	Linus Torvalds

                                                                                                               
                                                                                                               
                                                                                                               


That is true. All was done under the 16-bit assumption
My hunch is that the current algorithm might actually work quite well
for a sparsely populated pid-space (32-bits).
A bitmap-ed based solution is not possible at that point due to space
requirements.

Should be easy to figure out.

Hubertus Franke
Enterprise Linux Group (Mgr),  Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: frankeh@us.ibm.com
(w) 914-945-2003    (fax) 914-945-4425   TL: 862-2003




                                                                                                                                     
                      Rik van Riel                                                                                                   
                      <riel@conectiva.         To:       Hubertus Franke/Watson/IBM@IBMUS                                            
                      com.br>                  cc:       Andries Brouwer <aebr@win.tue.nl>, Andrew Morton <akpm@zip.com.au>,         
                                                <andrea@suse.de>, <davej@suse.de>, lkml <linux-kernel@vger.kernel.org>, Paul Larson  
                      08/08/2002 04:15          <plars@austin.ibm.com>, Linus Torvalds <torvalds@transmeta.com>                      
                      PM                       Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 
                                                                                                                                     
                                                                                                                                     
                                                                                                                                     



On Thu, 8 Aug 2002, Hubertus Franke wrote:

> Which one sounds like the best one ?
>
> Assuming that for now we have to stick to 16-bit pid_t ....

That assumption is pretty central to the debate.

I don't see the standard get_pid nor the bitmap based
get_pid scale to something larger than a 16-bit pid_t.

If we're not sure yet whether we want to keep pid_t 16
bits it might be worth putting in an algorithm that does
scale to larger numbers, if only so the switch to a larger
pid_t will be more straightforward.

kind regards,

Rik
--
             http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/                    http://distro.conectiva.com/





^ permalink raw reply	[flat|nested] 51+ messages in thread
* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-08 18:56 Hubertus Franke
  2002-08-08 19:15 ` Rik van Riel
  2002-08-08 19:18 ` Paul Larson
  0 siblings, 2 replies; 51+ messages in thread
From: Hubertus Franke @ 2002-08-08 18:56 UTC (permalink / raw)
  To: Andries Brouwer
  Cc: Andrew Morton, andrea, davej, lkml, Paul Larson, Linus Torvalds

                                                                                                               
                                                                                                               
                                                                                                               


Which one sounds like the best one ?

Assuming that for now we have to stick to 16-bit pid_t ....
There are the following patches out there ....:

(a) Vanilla version which breaks down after 22K pids and really sucks above
32.5K
(b) Bill Irwin's, which keeps track of which PID is free and which one is
not ?
(c) Andrea's patch which searches in the bitmap when we are running out
(d) Paul's patch, which I believe was based on one of my earlier submission
(03/02) that
                                 essentially switches between (a)+(c) at
the break even point.

Assuming that Paul's patch still resembles somewhat my earlier intentions:
My observation of (d) back in february was that the current approach with
using next_pid = ++last_pid in the general case is pretty good and that
only the determination of a guaranteed free range sucks due to O(N**2)
algorithm sucks and using a bitmap to determine the next safe range
through a O(N) algorithm is good. I determined the breakeven point with
random pid deletion to be ~22K where the current algorithm worked darn
well.
One difference to Andrea's patch (c) is  (if I remember his code correctly)
that
I used was that I always look forward in the bitmap for the next safe range
in the
bitmap when I exhausted the previous one without having to traverse the
task list.
Only if non is found I traverse the task list and try the bit search one
more time.

I feel (c) or (d) is a better solution over (a) and (b)...   Open for
discussion.
I have a test program that does the random pid deletion and pid allocation
in user space. All what's required is to copy the get_pid() code from the
kernel into there... Can make that available ...

I don't know what Paul has done to the patch since then ....

-- Hubertus Franke  (frankeh@watson.ibm.com)





                                                                                                                                     
                      Andries Brouwer                                                                                                
                      <aebr@win.tue.nl>        To:       Andrew Morton <akpm@zip.com.au>                                             
                                               cc:       Paul Larson <plars@austin.ibm.com>, Linus Torvalds <torvalds@transmeta.     
                      08/07/2002 08:24          com>, lkml <linux-kernel@vger.kernel.org>, davej@suse.de, Hubertus                   
                      PM                        Franke/Watson/IBM@IBMUS, andrea@suse.de                                              
                                               Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 
                                                                                                                                     
                                                                                                                                     
                                                                                                                                     



On Wed, Aug 07, 2002 at 04:06:05PM -0700, Andrew Morton wrote:

> Has this been evaluated against Bill Irwin's constant-time
> allocator?  Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one.   Plus it's much nicer
> looking code.

> #define PID_MAX                    0x8000
> #define RESERVED_PIDS        300
>
> #define MAP0_SIZE            (PID_MAX   >> BITS_PER_LONG_SHIFT)
> #define MAP1_SIZE            (MAP0_SIZE >> BITS_PER_LONG_SHIFT)
> #define MAP2_SIZE            (MAP1_SIZE >> BITS_PER_LONG_SHIFT)
>
> static unsigned long pid_map0[MAP0_SIZE];
> static unsigned long pid_map1[MAP1_SIZE];
> static unsigned long pid_map2[MAP2_SIZE];

Here it is of interest how large a pid is.
With a 64-bit pid_t it is just

  static pid_t last_pid;

  pid_t get_next_pid() {
             return ++last_pid;
  }

since 2^64 is a really large number.
Unfortunately glibc does not support this (on x86).

With a 32-bit pid_t wraparounds will occur, but very infrequently.
Thus, finding the next pid will be very fast on average, much faster
than the above algorithm, and no arrays are required.
One only loses the guaranteed constant time property.
Unless hard real time is required, this sounds like the best version.

Andries




^ permalink raw reply	[flat|nested] 51+ messages in thread
* [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-07 22:03 Paul Larson
  2002-08-07 23:06 ` Andrew Morton
  0 siblings, 1 reply; 51+ messages in thread
From: Paul Larson @ 2002-08-07 22:03 UTC (permalink / raw)
  To: Linus Torvalds, lkml; +Cc: davej, frankeh, andrea


This patch provides an improved version of get_pid() while also taking
care of the bug that causes the machine to hang when you hit PID_MAX. 
It is based on both solutions to the problem provided by Hubertus Franke
and Andrea Arcangeli.  This uses a bitmap to find an available pid and
uses Hubertus's adaptive implementation to only use this when it is more
beneficial than the old mechanism.  The getpid_mutex from AA's patch is
also carried over to avoid the race where another cpu could get the same
pid before SET_LINKS was called.

This should patch cleanly against 2.5.30 or bk current.
Please apply.

--- a/kernel/fork.c	Wed Aug  7 16:37:38 2002
+++ b/kernel/fork.c	Wed Aug  7 16:05:22 2002
@@ -50,6 +50,12 @@
 
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
 
+/*
+ * Protectes next_safe, last_pid and it avoids races
+ * between get_pid and SET_LINKS().
+ */
+static DECLARE_MUTEX(getpid_mutex);
+
 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 {
 	unsigned long flags;
@@ -129,27 +135,107 @@
 	kmem_cache_free(task_struct_cachep,tsk);
 }
 
-/* Protects next_safe and last_pid. */
-spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
+/* this should be provided in every architecture */
+#ifndef SHIFT_PER_LONG
+#if BITS_PER_LONG == 64
+#   define SHIFT_PER_LONG 6
+#elif BITS_PER_LONG == 32
+#   define SHIFT_PER_LONG 5
+#else
+#error "SHIFT_PER_LONG"
+#endif
+#endif
+
+#define RESERVED_PIDS       (300)
+#define GETPID_THRESHOLD    (22000)  /* when to switch to a mapped algo */
+#define PID_MAP_SIZE        (PID_MAX >> SHIFT_PER_LONG)
+static unsigned long pid_map[PID_MAP_SIZE];
+static int next_safe = PID_MAX;
+
+static inline void mark_pid(int pid)
+{
+	__set_bit(pid,pid_map);
+}
+
+static int get_pid_by_map(int lastpid) 
+{
+	static int mark_and_sweep = 0;
+
+	int round = 0;
+	struct task_struct *p;
+	int i;
+	unsigned long mask;
+	int fpos;
+
+
+	if (mark_and_sweep) {
+repeat:
+		mark_and_sweep = 0;
+		memset(pid_map, 0, PID_MAP_SIZE * sizeof(unsigned long));
+		lastpid = RESERVED_PIDS;
+	}
+	for_each_task(p) {
+		mark_pid(p->pid);
+		mark_pid(p->pgrp);
+		mark_pid(p->tgid);
+		mark_pid(p->session);
+	}
+
+	/* find next free pid */
+	i = (lastpid >> SHIFT_PER_LONG);
+	mask = pid_map[i] | ((1 << ((lastpid & (BITS_PER_LONG-1)))) - 1);   
+
+	while ((mask == ~0) && (++i < PID_MAP_SIZE)) 
+		mask = pid_map[i];
+
+	if (i == PID_MAP_SIZE) { 
+		if (round == 0) {
+			round = 1;
+			goto repeat;
+		}
+		next_safe = RESERVED_PIDS;
+		mark_and_sweep = 1;  /* mark next time */
+		return 0; 
+	}
+
+	fpos = ffz(mask);
+	i &= (PID_MAX-1);
+	lastpid = (i << SHIFT_PER_LONG) + fpos;
+
+	/* find next save pid */
+	mask &= ~((1 << fpos) - 1);
+
+	while ((mask == 0) && (++i < PID_MAP_SIZE)) 
+		mask = pid_map[i];
+
+	if (i==PID_MAP_SIZE) 
+		next_safe = PID_MAX;
+	else 
+		next_safe = (i << SHIFT_PER_LONG) + ffs(mask) - 1;
+	return lastpid;
+}
 
 static int get_pid(unsigned long flags)
 {
-	static int next_safe = PID_MAX;
 	struct task_struct *p;
 	int pid;
 
-	if (flags & CLONE_IDLETASK)
-		return 0;
-
-	spin_lock(&lastpid_lock);
 	if((++last_pid) & 0xffff8000) {
-		last_pid = 300;		/* Skip daemons etc. */
+		last_pid = RESERVED_PIDS;	/* Skip daemons etc. */
 		goto inside;
 	}
 	if(last_pid >= next_safe) {
 inside:
 		next_safe = PID_MAX;
 		read_lock(&tasklist_lock);
+		if (nr_threads > GETPID_THRESHOLD) {
+			last_pid = get_pid_by_map(last_pid);
+			if (last_pid == 0) {
+				last_pid = PID_MAX;
+				goto nomorepids; 
+			}
+		} else {
+			int beginpid = last_pid;
 	repeat:
 		for_each_task(p) {
 			if(p->pid == last_pid	||
@@ -158,24 +244,33 @@
 			   p->session == last_pid) {
 				if(++last_pid >= next_safe) {
 					if(last_pid & 0xffff8000)
-						last_pid = 300;
+						last_pid = RESERVED_PIDS;
 					next_safe = PID_MAX;
 				}
+				if(last_pid == beginpid)
+					goto nomorepids;
 				goto repeat;
 			}
 			if(p->pid > last_pid && next_safe > p->pid)
 				next_safe = p->pid;
 			if(p->pgrp > last_pid && next_safe > p->pgrp)
 				next_safe = p->pgrp;
+			if(p->tgid > last_pid && next_safe > p->tgid)
+				next_safe = p->tgid;
 			if(p->session > last_pid && next_safe > p->session)
 				next_safe = p->session;
 		}
+		}
 		read_unlock(&tasklist_lock);
 	}
 	pid = last_pid;
-	spin_unlock(&lastpid_lock);
 
 	return pid;
+
+nomorepids:
+	next_safe = last_pid = PID_MAX;
+	read_unlock(&tasklist_lock);
+	return 0;
 }
 
 static inline int dup_mmap(struct mm_struct * mm)
@@ -669,7 +764,14 @@
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
-	p->pid = get_pid(clone_flags);
+	down(&getpid_mutex);
+	if (clone_flags & CLONE_IDLETASK)
+		p->pid = 0;
+	else {
+		p->pid = get_pid(clone_flags);
+		if (p->pid == 0)
+			goto bad_fork_cleanup;
+	}
 	p->proc_dentry = NULL;
 
 	INIT_LIST_HEAD(&p->run_list);
@@ -793,10 +895,20 @@
 		list_add(&p->thread_group, &current->thread_group);
 	}
 
+	/*
+	 * We must do the SET_LINKS() under the getpid_mutex, to avoid
+	 * another CPU to get our same PID between the release of of the
+	 * getpid_mutex and the SET_LINKS().
+	 *
+	 * In short to avoid SMP races the new child->pid must be just visible
+	 * in the tasklist by the time we drop the getpid_mutex.
+	 */
 	SET_LINKS(p);
+
 	hash_pid(p);
 	nr_threads++;
 	write_unlock_irq(&tasklist_lock);
+	up(&getpid_mutex);
 	retval = 0;
 
 fork_out:
@@ -819,6 +931,7 @@
 bad_fork_cleanup_security:
 	security_ops->task_free_security(p);
 bad_fork_cleanup:
+	up(&getpid_mutex);
 	put_exec_domain(p->thread_info->exec_domain);
 	if (p->binfmt && p->binfmt->module)
 		__MOD_DEC_USE_COUNT(p->binfmt->module);


^ permalink raw reply	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2002-08-12 16:02 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-08-12 16:15 [PATCH] Linux-2.5 fix/improve get_pid() Jim Houston
  -- strict thread matches above, loose matches on Subject: below --
2002-08-09  0:53 Hubertus Franke
2002-08-08 21:50 Hubertus Franke
2002-08-08 21:43 Hubertus Franke
2002-08-08 22:02 ` Linus Torvalds
2002-08-08 22:26   ` Linus Torvalds
2002-08-08 23:09     ` Albert D. Cahalan
2002-08-09  3:26       ` Chris Adams
2002-08-09  7:04         ` Albert D. Cahalan
2002-08-09  8:48           ` Helge Hafting
2002-08-09 10:32             ` Alan Cox
2002-08-09  9:35               ` Andrew Morton
2002-08-09 10:37               ` Andries Brouwer
2002-08-09 13:00           ` Chris Adams
2002-08-09 14:39             ` Albert D. Cahalan
2002-08-09 13:59           ` Rik van Riel
2002-08-09 14:57             ` Albert D. Cahalan
2002-08-08 22:34   ` Paul Larson
2002-08-08 22:44     ` Rik van Riel
2002-08-08 22:37   ` Rik van Riel
2002-08-09  1:49     ` Andries Brouwer
2002-08-09 19:34   ` Paul Larson
2002-08-09 20:13     ` Paul Larson
2002-08-09 20:40     ` Andries Brouwer
2002-08-09 21:23     ` Linus Torvalds
2002-08-09 21:42       ` Linus Torvalds
2002-08-09 21:46         ` Paul Larson
2002-08-09 22:04           ` Linus Torvalds
2002-08-10 17:23             ` Jamie Lokier
2002-08-10 18:33               ` Linus Torvalds
2002-08-10 18:48                 ` Jamie Lokier
2002-08-11 20:41                   ` Alan Cox
2002-08-11 19:58                     ` Jamie Lokier
2002-08-11 21:23                       ` Alan Cox
2002-08-11 20:10                         ` Jamie Lokier
2002-08-12  8:56             ` Padraig Brady
2002-08-12 10:37               ` Alan Cox
2002-08-12  9:21                 ` Padraig Brady
2002-08-12 14:40             ` Paul Larson
2002-08-08 18:56 Hubertus Franke
2002-08-08 19:15 ` Rik van Riel
2002-08-08 19:18 ` Paul Larson
2002-08-07 22:03 Paul Larson
2002-08-07 23:06 ` Andrew Morton
2002-08-08  0:24   ` Andries Brouwer
2002-08-08 19:42     ` William Lee Irwin III
2002-08-08 20:47       ` Andries Brouwer
2002-08-08 20:24   ` Linus Torvalds
2002-08-08 21:30     ` H. Peter Anvin
2002-08-08 21:45     ` William Lee Irwin III
2002-08-09  4:42     ` William Lee Irwin III

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox