Re: [PATCH] Linux-2.5 fix/improve get

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-12 16:15 Jim Houston
  0 siblings, 0 replies; 51+ messages in thread
From: Jim Houston @ 2002-08-12 16:15 UTC (permalink / raw)
  To: frankeh, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1475 bytes --]


I was doing some similar work last week and stumbled on the
this discussion.  I have been working with the Posix timers
patch and was looking for a nice way to allocate ids for
timers.

I looked at get_pid() and also the code used for ipc/msg.c
and friends.  I thought I could do better so I wrote the
attached routines.  They are a subtle variation on the bitmap
theme.  I'm mixing a bitmap with a sparse array which is implemented
as a radix tree.  This means that the data structure
grows/shrinks as needed.  The interface is:

	id = id_new(idp, pointer);
	id_remove(idp, id);
	pointer = id_lookup(idp, id);

The idp is a pointer to the structure which defines the id space.
This provides, not only the bitmap allocation of the id but also, mapping
the id to a pointer.  In the case of pids, this would replace the
hash table used by find_task_by_pid().

So far I have been playing with this code in user space.  I'm including
my test harness.  I plan to try it with your program later today.
My program was written more to test correctness, but I do collect
a few numbers.  Here is the output:

jhouston@linux:~/id > time ./id_test
my_id.count=1
new_cnt=5002501
avr_depth = 4996
id_lookup took 101 cycles
id_new took 204 cycles
id_remove took 148 cycles 
real    0m3.476s

That's about 0.5us total to allocate an id, lookup a random id
and remove it with a table containing 5000 ids.  The times
only double for 500000 ids.

Jim Houston - Concurrent Computer Corporation.

[-- Attachment #2: id.c --]
[-- Type: text/plain, Size: 2825 bytes --]

/*
 * Small id to pointer translation service.  
 *
 * It uses a radix tree like structure as a sparse array indexed 
 * by the id to obtain the pointer.  The bitmap makes allocating
 * an new id quick.  
 */

#include "id.h"

#ifdef __KERNEL__
static kmem_cache_t *id_layer_cache;
#endif

void *id_lookup(struct id *idp, int id)
{
	int n = idp->layers * ID_BITS;
	struct id_layer *p = idp->top;

	if (id >= (1 << n))
		return(NULL);

	while (n > 0 && p) {
		n -= ID_BITS;
		p = p->ary[(id >> n) & ID_MASK];
	}
	return((void *)p);
}

static int sub_alloc(struct id_layer *p, int shift, int id, void *ptr)
{
	int n = (id >> shift) & ID_MASK;
	int bitmap = p->bitmap;
	int id_base = id & ~((1 << (shift+ID_BITS))-1);
	int v;
	
	for ( ; n <= ID_MASK; n++, id = id_base + (n << shift)) {
		if (bitmap & (1 << n))
			continue;
		if (shift == 0) {
			p->ary[n] = (struct id_layer *)ptr;
			p->bitmap |= 1<<n;
			return(id);
		}
		if (!p->ary[n])
			p->ary[n] = alloc_layer();
		if (v = sub_alloc(p->ary[n], shift-ID_BITS, id, ptr)) {
			update_bitmap(p, n);
			return(v);
		}
	}
	return(0);
}

int id_new(struct id *idp, void *ptr)
{
	int n = idp->layers * ID_BITS;
	int last = idp->last;
	struct id_layer **p, *new;
	int id, v;
	
	/*
	 * Add a new layer if the array is full or the last id
	 * was at the limit and we don't want to wrap.
	 */
	if ((last == ((1 << n)-1) && last < idp->min_wrap) ||
		idp->count == (1 << n)) {
		++idp->layers;
		n += ID_BITS;
		new = alloc_layer();
		new->ary[0] = idp->top;
		idp->top = new;
		update_bitmap(new, 0);
	}
	if (last >= ((1 << n)-1))
		last = 0;

	/*
	 * Search for a free id starting after last id allocated.
	 * If that fails wrap back to start.
	 */
	id = last+1;
	if (!(v = sub_alloc(idp->top, n-ID_BITS, id, ptr)))
		v = sub_alloc(idp->top, n-ID_BITS, 1, ptr);
	idp->last = v;
	idp->count++;
	return(v);
}


static int sub_remove(struct id_layer *p, int shift, int id)
{
	int n = (id >> shift) & ID_MASK;
	int i, bitmap, rv;
	
if (!p) {
printf("in sub_remove for id=%d called with null pointer.\n", id);
return(0);
}
	rv = 0;
	bitmap = p->bitmap & ~(1<<n);
	p->bitmap = bitmap;
	if (shift == 0) {
		p->ary[n] = NULL;
		rv = !bitmap;
	} else {
		if (sub_remove(p->ary[n], shift-ID_BITS, id)) {
			free_layer(p->ary[n]);
			p->ary[n] = 0;
			for (i = 0; i < (1 << ID_BITS); i++)
				if (p->ary[i])
					break;
			if (i == (1 << ID_BITS))
				rv = 1;
		}
	}
	return(rv);
}

void id_remove(struct id *idp, int id)
{
	sub_remove(idp->top, (idp->layers-1)*ID_BITS, id);
	idp->count--;
}

void id_init(struct id *idp, int min_wrap)
{
#ifdef __KERNEL__
	id_layer_cache = kmem_cache_create("id_layer_cache", 
		sizeof(struct id_layer), 0, 0, 0, 0);
#endif
	idp->count = 1;
	idp->last = 0;
	idp->layers = 1;
	idp->top = alloc_layer();
	idp->top->bitmap = 1;
	idp->min_wrap = min_wrap;
}


[-- Attachment #3: id.h --]
[-- Type: text/plain, Size: 1289 bytes --]

/*
 * Small id to pointer translation service avoiding fixed sized
 * tables.
 */

#define ID_BITS 5

#define ID_MASK ((1 << ID_BITS)-1)
#define ID_FULL ((1 << (1 << ID_BITS))-1)

struct id_layer {
	unsigned int	bitmap: (1<<ID_BITS);
	struct id_layer	*ary[1<<ID_BITS];
};

struct id {
	int		layers;
	int		last;
	int		count;
	int		min_wrap;
	struct id_layer *top;
};

void *id_lookup(struct id *idp, int id);
int id_new(struct id *idp, void *ptr);
void id_remove(struct id *idp, int id);
void id_init(struct id *idp, int min_wrap);

static inline update_bitmap(struct id_layer *p, int bit)
{
	if (p->ary[bit] && p->ary[bit]->bitmap == (typeof(p->bitmap))-1)
		p->bitmap |= 1<<bit;
	else
		p->bitmap &= ~(1<<bit);
}

#ifndef __KERNEL__

#ifndef NULL
#define NULL 0
#endif

static inline struct id_layer *alloc_layer()
{
	struct id_layer *new;

	new = (struct id_layer *)malloc(sizeof(struct id_layer));
	bzero((void *)new, sizeof(struct id_layer));
	return(new);
	
}

static inline void free_layer(struct id_layer *p)
{
	free((void *)p);
}

#else

extern kmem_cache_t *id_layer_cache;

static inline struct id_layer *alloc_layer()
{
	return(kmem_cache_alloc(id_layer_cache, GFP_KERNEL));
}

static inline void free_layer(struct id_layer *p)
{
	kmem_cache_free(id_layer_cache, p);
}

#endif


[-- Attachment #4: id_test.c --]
[-- Type: text/plain, Size: 2133 bytes --]

/*
 * Test program for id.c
 */

#include <stdlib.h>
#include "id.h"

struct id my_id;

main()
{
	int i, n;
	void *v;

	id_init(&my_id, 10000);
	
	random_test();
}

#define LIST_SZ 50000

struct id_list {
	int id;
	int ptr;
} id_list[LIST_SZ];

int list_cnt;
int new_cnt;
int max = 5000;
int count = 10000000;

long long avr_depth;

#define rdtsc(low,high) \
     __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
  
static inline unsigned long get_tsc(void)
{
	register unsigned long eax, edx;

	rdtsc(eax,edx);
	return(eax);
}



random_test()
{
	int n, i;
	void *v;
	unsigned long t1, t2, t3, t4;

	for (n = 0; n < count; n++) {
		/*
		 * favor insertion so we will tend to run will max
		 * id's active.
		 */
		if (list_cnt && (list_cnt > max || rand() < (RAND_MAX/4))) {
			i = rand() % list_cnt;
			v = id_lookup(&my_id, id_list[i].id);
			if ((int)v != id_list[i].ptr) {
				printf("list_cnt=%d, i=%d\n", list_cnt, i);
				printf("failed id=%d, expected %d got %d\n",
					id_list[i].id, id_list[i].ptr, (int)v);
			} else {
#if 0
				printf("rm id=%d, ptr=%d\n",
					id_list[i].id, id_list[i].ptr);
#endif
				id_remove(&my_id, id_list[i].id);
			}
			id_list[i] = id_list[--list_cnt];
		} else {
			new_cnt++;
			id_list[list_cnt].id = id_new(&my_id, (void *)new_cnt);
			id_list[list_cnt].ptr = new_cnt;
#if 0
			printf("ins id=%d, ptr=%d\n",
				id_list[list_cnt].id, id_list[list_cnt].ptr);
#endif
			list_cnt++;
			avr_depth += list_cnt;
		}
	}
t1 = get_tsc();
id_lookup(&my_id, id_list[0].id);
t2 = get_tsc();
n = id_new(&my_id, (void *)++new_cnt);
t3 = get_tsc();
id_remove(&my_id, n);
t4 = get_tsc();

	for (i = 0; i < list_cnt; i++) {
		v = id_lookup(&my_id, id_list[i].id);
		if ((int)v != id_list[i].ptr) {
			printf("failed id=%d, expected %d got %d\n",
				id_list[i].id, id_list[i], (int)v);
		}
		id_remove(&my_id, id_list[i].id);
	}
	printf("my_id.count=%d\n", my_id.count);
	printf("new_cnt=%d\n", new_cnt);
	printf("avr_depth = %d\n", (int)(avr_depth/new_cnt));
	printf("id_lookup took %d cycles\n", t2-t1);
	printf("id_new took %d cycles\n", t3-t2);
	printf("id_remove took %d cycles\n", t4-t3);
}

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-09  0:53 Hubertus Franke
  0 siblings, 0 replies; 51+ messages in thread
From: Hubertus Franke @ 2002-08-09  0:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andries Brouwer, Andrew Morton, andrea, davej, lkml, Paul Larson,
	Rik van Riel

                                                                                                               
                                                                                                               
                                                                                                               


I package my stuff up that allows to pull this into user space and run much
more efficient
pid allocation test for performance....
I'll also include the comparision numbers.  Currently remote, so I don't
have access to
that info.
Based on that though it seems with random pid deletion (we surely could
argue about
that one) there still seems reasonable benefits to "consider" a twolevel
algo.
More tomorrow.

Hubertus Franke
Enterprise Linux Group (Mgr),  Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: frankeh@us.ibm.com
(w) 914-945-2003    (fax) 914-945-4425   TL: 862-2003




                                                                                                                                     
                      Linus Torvalds                                                                                                 
                      <torvalds@transme        To:       Hubertus Franke/Watson/IBM@IBMUS                                            
                      ta.com>                  cc:       Rik van Riel <riel@conectiva.com.br>, Andries Brouwer <aebr@win.tue.nl>,    
                                                Andrew Morton <akpm@zip.com.au>, <andrea@suse.de>, <davej@suse.de>, lkml <linux-     
                      08/08/2002 06:26          kernel@vger.kernel.org>, Paul Larson <plars@austin.ibm.com>                          
                      PM                       Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 
                                                                                                                                     
                                                                                                                                     
                                                                                                                                     




On Thu, 8 Aug 2002, Linus Torvalds wrote:
>
> So let's just try Andries approach, suggested patch as follows..

"ps" seems to do ok from a visual standpoint at least up to 99 million.
Maybe it won't look that good after that, I'm too lazy to test.

The following trivial program is useful for efficiently allocating pid
numbers without blowing chunks on the VM subsystem and spending all the
time on page table updates - for people who want to test (look out: I've
got dual 2.4GHz CPU's with HT, so getting up to 10+ million was easy, your
milage may wary and at some point you should just compile a kernel that
starts higher ;).

                         Linus

---
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

int main()
{
        int i;
        for (i = 1; i < 250000; i++) {
                if (!vfork())
                        exit(1);
                if (waitpid(-1, NULL, WNOHANG) < 0)
                        perror("waitpid");
        }
        return 0;
}






^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-08 21:50 Hubertus Franke
  0 siblings, 0 replies; 51+ messages in thread
From: Hubertus Franke @ 2002-08-08 21:50 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, andrea, davej, lkml, Paul Larson

Agreed ....
The mark-and-sweep is the correct method for 16-bits ! For 32-bits its not
possible !
Two level is over-engineered (as I told Paul).

If you want I can post the data that I collected comparing
(a) stock kernel
(b) my mark and sweep
(c) my double mark and sweep (try looking forward and then scan task list)
?

Let me know if you are interested.
That should clear up the issue quickly giving you some average allocation
times etc.....
as a function of random used pids.

Hubertus Franke
Enterprise Linux Group (Mgr),  Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: frankeh@us.ibm.com
(w) 914-945-2003    (fax) 914-945-4425   TL: 862-2003

                      Linus Torvalds                                                                                                 
                      <torvalds@transme        To:       Andrew Morton <akpm@zip.com.au>                                             
                      ta.com>                  cc:       Paul Larson <plars@austin.ibm.com>, lkml <linux-kernel@vger.kernel.org>,    
                                                <davej@suse.de>, Hubertus Franke/Watson/IBM@IBMUS, <andrea@suse.de>                  
                      08/08/2002 04:24         Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 
                      PM                                                                                                             

On Wed, 7 Aug 2002, Andrew Morton wrote:
>
> Has this been evaluated against Bill Irwin's constant-time
> allocator?  Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one.   Plus it's much nicer
> looking code.

Guys, this discussion is getting ridiculous.

Doing a bit allocator should be trivial, but it's hard to know when a bit
is to be free'd. You can't just do it at "exit()" time, because even if
pid X exits, that doesn't mean that X can be re-used: it may still be used
as a pgid or a tid by some other process Y.

So if you really want to take this approach, you need to count the uses of
"pid X", and free the bitmap entry only when that count goes to zero. I
see no such logic in Bill Irwin's code, only a comment about last use
(which doesn't explain how to notice that last use).

Without that per-pid-count thing clarified, I don't think the (otherwise
fairly straightforward) approach of Bills really flies.

For that reason I think the mark-and-sweep thing is the right thing to do,
but I think the two-level algorithm is just over-engineering and not worth
it. And I do hate that getpid_mutex thing. Having a blocking lock for
something as simple as pid allocation just smells horribly wrong to me.

Moving the pid allocation to later (so that it doesn't need to handle
operations that block in between allocation and "we're done" and the pid
allocation can be a spinlock) sounds like a fairly obvious thing to do.

I don't see why we would need the "pid" until the very last moment, at
which point we already have the tasklist lock, in fact.

And I hate overengineering.

                         Linus

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-08 21:43 Hubertus Franke
  2002-08-08 22:02 ` Linus Torvalds
  0 siblings, 1 reply; 51+ messages in thread
From: Hubertus Franke @ 2002-08-08 21:43 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Andries Brouwer, Andrew Morton, andrea, davej, lkml, Paul Larson,
	Linus Torvalds

                                                                                                               
                                                                                                               
                                                                                                               


That is true. All was done under the 16-bit assumption
My hunch is that the current algorithm might actually work quite well
for a sparsely populated pid-space (32-bits).
A bitmap-ed based solution is not possible at that point due to space
requirements.

Should be easy to figure out.

Hubertus Franke
Enterprise Linux Group (Mgr),  Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: frankeh@us.ibm.com
(w) 914-945-2003    (fax) 914-945-4425   TL: 862-2003




                                                                                                                                     
                      Rik van Riel                                                                                                   
                      <riel@conectiva.         To:       Hubertus Franke/Watson/IBM@IBMUS                                            
                      com.br>                  cc:       Andries Brouwer <aebr@win.tue.nl>, Andrew Morton <akpm@zip.com.au>,         
                                                <andrea@suse.de>, <davej@suse.de>, lkml <linux-kernel@vger.kernel.org>, Paul Larson  
                      08/08/2002 04:15          <plars@austin.ibm.com>, Linus Torvalds <torvalds@transmeta.com>                      
                      PM                       Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 
                                                                                                                                     
                                                                                                                                     
                                                                                                                                     



On Thu, 8 Aug 2002, Hubertus Franke wrote:

> Which one sounds like the best one ?
>
> Assuming that for now we have to stick to 16-bit pid_t ....

That assumption is pretty central to the debate.

I don't see the standard get_pid nor the bitmap based
get_pid scale to something larger than a 16-bit pid_t.

If we're not sure yet whether we want to keep pid_t 16
bits it might be worth putting in an algorithm that does
scale to larger numbers, if only so the switch to a larger
pid_t will be more straightforward.

kind regards,

Rik
--
             http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/                    http://distro.conectiva.com/





^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 21:43 Hubertus Franke
@ 2002-08-08 22:02 ` Linus Torvalds
  2002-08-08 22:26   ` Linus Torvalds
                     ` (3 more replies)
  0 siblings, 4 replies; 51+ messages in thread
From: Linus Torvalds @ 2002-08-08 22:02 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Rik van Riel, Andries Brouwer, Andrew Morton, andrea, davej, lkml,
	Paul Larson


On Thu, 8 Aug 2002, Hubertus Franke wrote:
> 
> That is true. All was done under the 16-bit assumption
> My hunch is that the current algorithm might actually work quite well
> for a sparsely populated pid-space (32-bits).

I agree.

So let's just try Andries approach, suggested patch as follows..

(yeah, silly mask and MAX_PID, but since even the kernel uses signed
integers for some of it, this way it never gets close to being an issue).

		Linus

----
--- 1.2/include/linux/threads.h	Tue Feb  5 07:23:04 2002
+++ edited/include/linux/threads.h	Thu Aug  8 14:58:28 2002
@@ -19,6 +19,7 @@
 /*
  * This controls the maximum pid allocated to a process
  */
-#define PID_MAX 0x8000
+#define PID_MASK 0x3fffffff
+#define PID_MAX (PID_MASK+1)
 
 #endif
===== kernel/fork.c 1.57 vs edited =====
--- 1.57/kernel/fork.c	Tue Jul 30 15:49:20 2002
+++ edited/kernel/fork.c	Thu Aug  8 15:00:16 2002
@@ -142,7 +142,7 @@
 		return 0;
 
 	spin_lock(&lastpid_lock);
-	if((++last_pid) & 0xffff8000) {
+	if((++last_pid) & ~PID_MASK) {
 		last_pid = 300;		/* Skip daemons etc. */
 		goto inside;
 	}
@@ -157,7 +157,7 @@
 			   p->tgid == last_pid	||
 			   p->session == last_pid) {
 				if(++last_pid >= next_safe) {
-					if(last_pid & 0xffff8000)
+					if(last_pid & ~PID_MASK)
 						last_pid = 300;
 					next_safe = PID_MAX;
 				}


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:02 ` Linus Torvalds
@ 2002-08-08 22:26   ` Linus Torvalds
  2002-08-08 23:09     ` Albert D. Cahalan
  2002-08-08 22:34   ` Paul Larson
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 51+ messages in thread
From: Linus Torvalds @ 2002-08-08 22:26 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Rik van Riel, Andries Brouwer, Andrew Morton, andrea, davej, lkml,
	Paul Larson

On Thu, 8 Aug 2002, Linus Torvalds wrote:
> 
> So let's just try Andries approach, suggested patch as follows..

"ps" seems to do ok from a visual standpoint at least up to 99 million. 
Maybe it won't look that good after that, I'm too lazy to test.

The following trivial program is useful for efficiently allocating pid
numbers without blowing chunks on the VM subsystem and spending all the
time on page table updates - for people who want to test (look out: I've
got dual 2.4GHz CPU's with HT, so getting up to 10+ million was easy, your
milage may wary and at some point you should just compile a kernel that
starts higher ;).

		Linus

---
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

int main()
{
        int i;
        for (i = 1; i < 250000; i++) {
                if (!vfork())
                        exit(1);
                if (waitpid(-1, NULL, WNOHANG) < 0)
                        perror("waitpid");
        }
        return 0;
}

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:26   ` Linus Torvalds
@ 2002-08-08 23:09     ` Albert D. Cahalan
  2002-08-09  3:26       ` Chris Adams
  0 siblings, 1 reply; 51+ messages in thread
From: Albert D. Cahalan @ 2002-08-08 23:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, davej, lkml, Paul Larson

Linus Torvalds writes:

> "ps" seems to do ok from a visual standpoint at least up to 99 million. 
> Maybe it won't look that good after that, I'm too lazy to test.

Mind sharing what "ps -fj", "ps -lf", and "ps j" look like?
The standard tty is 80x24 BTW, and we already have serious
problems due to ever-expanding tty names.

How about a default limit of 9999, to be adjusted by
sysctl as needed?


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 23:09     ` Albert D. Cahalan
@ 2002-08-09  3:26       ` Chris Adams
  2002-08-09  7:04         ` Albert D. Cahalan
  0 siblings, 1 reply; 51+ messages in thread
From: Chris Adams @ 2002-08-09  3:26 UTC (permalink / raw)
  To: linux-kernel

Once upon a time, Albert D. Cahalan <acahalan@cs.uml.edu> said:
>Mind sharing what "ps -fj", "ps -lf", and "ps j" look like?
>The standard tty is 80x24 BTW, and we already have serious
>problems due to ever-expanding tty names.
>
>How about a default limit of 9999, to be adjusted by
>sysctl as needed?

I hope you meant 99999 (since we already have 5 digit PIDs).  It would
also seem to me that if it is adjustable, then "ps" would have to handle
it anyway, and making "ps" deal with adjustable size PIDs would be more
complex and error-prone.

Tru64 Unix 5.x uses 19 bit pids (up to 524288, so up to six digits - the
rest of the 32 bits go for cluster node, node sequence, and an unused
sign bit) without significant problems.  Their "ps" args aren't an exact
match, but they're close (lots of processes snipped):

$ ps -fj | head -2
UID         PID   PPID    C STIME    TTY             TIME CMD
cmadams  272363 301021  0.0 20:47:46 pts/1        0:00.09 -bash (bash)
$ ps -lf | head -2
       F S           UID    PID   PPID %CPU PRI  NI  RSS WCHAN    STARTED         TIME COMMAND
80c08001 I           200 272363 301021  0.0  44   0 520K wait     20:47:46     0:00.09 -bash (bash)
$ ps j | head -2
USER        PID   PPID   PGID   SESS JOBC S    TTY             TIME COMMAND
cmadams  272363 301021 272363 272363    0 I    pts/1        0:00.09 -bash (bash)
$ 

I had exactly one problem moving to 5.x with larger PIDs: the free "top"
program (Tru64 doesn't include "top") assumed 5 digit PIDs so the
columns didn't line up until I changed the output format by one column.

I routinely have 1000+ processes running on this server (many of them
short-lived things like POP checks, short CGIs, sendmail background
delivery, etc.), so having a larger PID space is important (having the
same PID reused within 15-30 seconds would be annoying).

I would like to see Linux running on this server (or at least this class
of server), and limiting the number of PIDs because of "ps" formatting
is not the way to go.

-- 
Chris Adams <cmadams@hiwaay.net>
Systems and Network Administrator - HiWAAY Internet Services
I don't speak for anybody but myself - that's enough trouble.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09  3:26       ` Chris Adams
@ 2002-08-09  7:04         ` Albert D. Cahalan
  2002-08-09  8:48           ` Helge Hafting
                             ` (2 more replies)
  0 siblings, 3 replies; 51+ messages in thread
From: Albert D. Cahalan @ 2002-08-09  7:04 UTC (permalink / raw)
  To: Chris Adams; +Cc: linux-kernel

Chris Adams writes:
> Once upon a time, Albert D. Cahalan <acahalan@cs.uml.edu> said:

>> Mind sharing what "ps -fj", "ps -lf", and "ps j" look like?
>> The standard tty is 80x24 BTW, and we already have serious
>> problems due to ever-expanding tty names.
>>
>> How about a default limit of 9999, to be adjusted by
>> sysctl as needed?
>
> I hope you meant 99999 (since we already have 5 digit PIDs).  It would
> also seem to me that if it is adjustable, then "ps" would have to handle
> it anyway, and making "ps" deal with adjustable size PIDs would be more
> complex and error-prone.

BTW, let's start with: not only "ps" is affected.
Off the top of my head: netstat, fuser, top, pstree...

I almost put 99999, but then I realized that that's silly.
For years Linux had a hard limit of about 4000 processes,
and not many people complained. It sure would be nice to
gain back a few of the columns lost to other stuff, so
that people could once again see command arguments.

The two real-word usage examples close at hand:

a. My full GNOME desktop has 48 processes.

b. The main UNIX shell server for CS students
   at this university has 79 processes.
   (Tru64, several hundred CS students, 2:25 am)

For "ps", adjustable width is a trivial addition.
Notice the UID ("ps -l", "ps -lf) handling, and
notice what signals ("ps s") do on a wide screen.
I could also just let the output be ugly, since
no normal system will have so many processes.

The problem is screen space, pure and simple. If the
default limit goes to over 1 billion, then "ps" output
must wrap lines. There is no alternative, unless you
think "System going down to reset PID numbers!" is OK.

> Tru64 Unix 5.x uses 19 bit pids (up to 524288, so up to six digits - the
> rest of the 32 bits go for cluster node, node sequence, and an unused
> sign bit) without significant problems.  Their "ps" args aren't an exact
> match, but they're close (lots of processes snipped):
> 
> $ ps -fj | head -2
> UID         PID   PPID    C STIME    TTY             TIME CMD
> cmadams  272363 301021  0.0 20:47:46 pts/1        0:00.09 -bash (bash)

Linux (and all SysV if I remember right) has 4 columns
of PID info that would need to expand:

UID        PID  PPID  PGID   SID  C STIME TTY          TIME CMD
albert   12975 12966 12975 12975  0 Aug02 pts/1    00:00:00 bash

> $ ps -lf | head -2
>        F S           UID    PID   PPID %CPU PRI  NI  RSS WCHAN    STARTED         TIME COMMAND
> 80c08001 I           200 272363 301021  0.0  44   0 520K wait     20:47:46     0:00.09 -bash (bash)

Eeew. That's broken; it won't display right on a normal
80x24 terminal. Linux "ps -lf" will just barely fit.

> $ ps j | head -2
> USER        PID   PPID   PGID   SESS JOBC S    TTY             TIME COMMAND
> cmadams  272363 301021 272363 272363    0 I    pts/1        0:00.09 -bash (bash)

For historic reasons, Linux has a whopping 5 columns
of PID info for this format:

 PPID   PID  PGID   SID TTY      TPGID STAT   UID   TIME COMMAND
    1   770   770   770 tty1     12893 S     1000   0:00 -bash
  770 12893 12893   770 tty1     12893 S     1000   0:00 /bin/sh /usr/bin/X11/st

> I routinely have 1000+ processes running on this server (many of them
> short-lived things like POP checks, short CGIs, sendmail background
> delivery, etc.), so having a larger PID space is important (having the
> same PID reused within 15-30 seconds would be annoying).

Increase the limit on this server if you wish. Problem?
I only suggest 9999 as the default. (which would actually
be just enough for you)

> I would like to see Linux running on this server (or at least this class
> of server), and limiting the number of PIDs because of "ps" formatting
> is not the way to go.

It's not just "ps", and I'm not saying you couldn't
adjust your system for a higher limit. I do think that
the out-of-box default shouldn't screw up formatting.
If you need to go past 9999, then you'll want to tweak
a few other settings as well. Low process counts are
the common case, and shouldn't be hurt by the fact that
a few people wish to run a billion processes.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09  7:04         ` Albert D. Cahalan
@ 2002-08-09  8:48           ` Helge Hafting
  2002-08-09 10:32             ` Alan Cox
  2002-08-09 13:00           ` Chris Adams
  2002-08-09 13:59           ` Rik van Riel
  2 siblings, 1 reply; 51+ messages in thread
From: Helge Hafting @ 2002-08-09  8:48 UTC (permalink / raw)
  To: Albert D. Cahalan, linux-kernel

"Albert D. Cahalan" wrote:

> The problem is screen space, pure and simple. If the
> default limit goes to over 1 billion, then "ps" output
> must wrap lines. There is no alternative, unless you
> think "System going down to reset PID numbers!" is OK.
> 

There is an alternative.  
Use 32-bit PID's, but with an additional rule for wraparound.
Simply wrap the PID when 
(nextPID > 2*number_of_processes && nextPID > 30000)
The latter one just to avoid wrapping at 10 when there are 
5 processes.  

This simple approach supports 32-bit PIDs for those 
that need them, while "ps" and friends always looks nice 
except for those who actually run large amounts of processes.
You won't get a very large PID unless you need to.

Helge Hafting

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09  8:48           ` Helge Hafting
@ 2002-08-09 10:32             ` Alan Cox
  2002-08-09  9:35               ` Andrew Morton
  2002-08-09 10:37               ` Andries Brouwer
  0 siblings, 2 replies; 51+ messages in thread
From: Alan Cox @ 2002-08-09 10:32 UTC (permalink / raw)
  To: Helge Hafting; +Cc: Albert D. Cahalan, linux-kernel

On Fri, 2002-08-09 at 09:48, Helge Hafting wrote:
> Use 32-bit PID's, but with an additional rule for wraparound.
> Simply wrap the PID when 
> (nextPID > 2*number_of_processes && nextPID > 30000)
> The latter one just to avoid wrapping at 10 when there are 
> 5 processes.  

Its much simpler to put max_pid into /proc/sys/kernel. That way we can
boot with 32000 process ids, which will ensure ancient stuff which has
16bit pid_t (old old libc) and also any old kernel interfaces which
expose it - does ipc ? 


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 10:32             ` Alan Cox
@ 2002-08-09  9:35               ` Andrew Morton
  2002-08-09 10:37               ` Andries Brouwer
  1 sibling, 0 replies; 51+ messages in thread
From: Andrew Morton @ 2002-08-09  9:35 UTC (permalink / raw)
  To: Alan Cox; +Cc: Helge Hafting, Albert D. Cahalan, linux-kernel

Alan Cox wrote:
> 
> On Fri, 2002-08-09 at 09:48, Helge Hafting wrote:
> > Use 32-bit PID's, but with an additional rule for wraparound.
> > Simply wrap the PID when
> > (nextPID > 2*number_of_processes && nextPID > 30000)
> > The latter one just to avoid wrapping at 10 when there are
> > 5 processes.
> 
> Its much simpler to put max_pid into /proc/sys/kernel. That way we can
> boot with 32000 process ids, which will ensure ancient stuff which has
> 16bit pid_t (old old libc) and also any old kernel interfaces which
> expose it - does ipc ?

procfs oopses with >65535 processes, btw.  Related to the i_ino
encoding:

	#define fake_ino(pid,ino) (((pid)<<16)|(ino))
	#define PROC_INODE_PROPER(inode) ((inode)->i_ino & ~0xffff)

it ruined Anton's evening ;)

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 10:32             ` Alan Cox
  2002-08-09  9:35               ` Andrew Morton
@ 2002-08-09 10:37               ` Andries Brouwer
  1 sibling, 0 replies; 51+ messages in thread
From: Andries Brouwer @ 2002-08-09 10:37 UTC (permalink / raw)
  To: Alan Cox; +Cc: Helge Hafting, Albert D. Cahalan, linux-kernel

On Fri, Aug 09, 2002 at 11:32:33AM +0100, Alan Cox wrote:

> Its much simpler to put max_pid into /proc/sys/kernel. That way we can
> boot with 32000 process ids, which will ensure ancient stuff which has
> 16bit pid_t (old old libc) and also any old kernel interfaces which
> expose it - does ipc ? 

I don't think it is a good idea to add lots of almost meaningless knobs.
This is not something one would like to adapt dynamically.
Someone who needs 16bit pid_t can compile her own kernel with a
suitable value of PID_MAX.

There is no old old libc with 16bit pid_t, I think.
I have libc 4.4.1 here with
	typedef int pid_t;
I have Linux 0.01 here with
	typedef int pid_t;

Yes, ipc exposed pid in a 16-bit field, namely in the fields
msg_lspid, msg_lrpid, shm_cpid, shm_lpid.
Two years ago I found all occurrences in all Linux sources,
but I forgot the details of the result; I do not recall
significant occurrences. Maybe someone will repeat this big grep.

Andries

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09  7:04         ` Albert D. Cahalan
  2002-08-09  8:48           ` Helge Hafting
@ 2002-08-09 13:00           ` Chris Adams
  2002-08-09 14:39             ` Albert D. Cahalan
  2002-08-09 13:59           ` Rik van Riel
  2 siblings, 1 reply; 51+ messages in thread
From: Chris Adams @ 2002-08-09 13:00 UTC (permalink / raw)
  To: Albert D. Cahalan; +Cc: linux-kernel

Once upon a time, Albert D. Cahalan <acahalan@cs.uml.edu> said:
> BTW, let's start with: not only "ps" is affected.
> Off the top of my head: netstat, fuser, top, pstree...

netstat: already wraps around and loses formatting when PIDs are shown
    (so no changes)
fuser: already has enough columns (because it uses "kernel" some places)
    (so no changes)
top: two line change
pstree: PIDs are displayed in "()" after process name (not formatted
    columns) (so no changes)

How many people actually _use_ PIDs on a regular basis?  When I did our
upgrade from Tru64 4.x to 5.x and got bigger PIDs, it took awhile before
anyone else even noticed.  Almost always when I'm doing something to
processes, I'm scripting it, not typing numbers.

> I almost put 99999, but then I realized that that's silly.
> For years Linux had a hard limit of about 4000 processes,
> and not many people complained. It sure would be nice to
> gain back a few of the columns lost to other stuff, so
> that people could once again see command arguments.

We're talking one or two more columns.  That's not going to magically
make it possible to see the command arguments.  If someone wants to see
the arguments, they're going to have to use "w" anyway (also, "normal"
ps with no arguments still has lots of space).

> The two real-word usage examples close at hand:
> 
> a. My full GNOME desktop has 48 processes.
> 
> b. The main UNIX shell server for CS students
>    at this university has 79 processes.
>    (Tru64, several hundred CS students, 2:25 am)

My GNOME desktop has 72 processes, and I've only got Mozilla and a
couple of xterms running right now.  My Tru64 server, with only 9 users
at 7:40 am, has 443 processes (during the business day we typically get
a lot more processes - we'll have 50-75 users logged in, lots of POP
checks running, and the usually at least once per day spam attack adding
a couple of hundred extra sendmail processes - that's when we hit 1000+
processes).

> The problem is screen space, pure and simple. If the
> default limit goes to over 1 billion, then "ps" output
> must wrap lines. There is no alternative, unless you
> think "System going down to reset PID numbers!" is OK.

Adding just one column (and going to 19 bits) allows 16 times as many
PIDs (or a sparse space 16 times as large).

> > $ ps -lf | head -2
> >        F S           UID    PID   PPID %CPU PRI  NI  RSS WCHAN    STARTED         TIME COMMAND
> > 80c08001 I           200 272363 301021  0.0  44   0 520K wait     20:47:46     0:00.09 -bash (bash)
> 
> Eeew. That's broken; it won't display right on a normal
> 80x24 terminal. Linux "ps -lf" will just barely fit.

Why does every possible output format of ps have to fit each process on
one line?  How often are these output formats used by normal people?

> > $ ps j | head -2
> > USER        PID   PPID   PGID   SESS JOBC S    TTY             TIME COMMAND
> > cmadams  272363 301021 272363 272363    0 I    pts/1        0:00.09 -bash (bash)

Note: one reason this one wrapped (instead of chopping the command) is
that Tru64 ps automatically displays everything (like lots of "w"
options) when output is not a TTY.

> It's not just "ps", and I'm not saying you couldn't
> adjust your system for a higher limit. I do think that
> the out-of-box default shouldn't screw up formatting.
> If you need to go past 9999, then you'll want to tweak
> a few other settings as well. Low process counts are
> the common case, and shouldn't be hurt by the fact that
> a few people wish to run a billion processes.

I'm not talking about a billion processes.  Also, are you going to make
all those other programs (well, "top" is the only one named above that
has to be changed) handle variable width PIDs now too?  That argument
cuts both ways.

I think setting things based on formatting of some modes of a couple of
programs that not that many users actually use is not the right way to
go.  We don't limit usernames to 8 characters anymore just because "ls"
truncates the name (not a kernel issue but the same concept).  If a
larger PID space works better for PID allocation, then go with a larger
PID space.

-- 
Chris Adams <cmadams@hiwaay.net>
Systems and Network Administrator - HiWAAY Internet Services
I don't speak for anybody but myself - that's enough trouble.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 13:00           ` Chris Adams
@ 2002-08-09 14:39             ` Albert D. Cahalan
  0 siblings, 0 replies; 51+ messages in thread
From: Albert D. Cahalan @ 2002-08-09 14:39 UTC (permalink / raw)
  To: Chris Adams; +Cc: Albert D. Cahalan, linux-kernel

Chris Adams writes:
> Once upon a time, Albert D. Cahalan <acahalan@cs.uml.edu> said:

>> BTW, let's start with: not only "ps" is affected.
>> Off the top of my head: netstat, fuser, top, pstree...
>
> netstat: already wraps around and loses formatting when PIDs are shown
>     (so no changes)

Sure, but no need to make it worse.

> fuser: already has enough columns (because it uses "kernel" some places)
>     (so no changes)
> top: two line change

Still this is screwing the common case for the truly unusual.

> pstree: PIDs are displayed in "()" after process name (not formatted
>     columns) (so no changes)
>
> How many people actually _use_ PIDs on a regular basis?  When I did our
> upgrade from Tru64 4.x to 5.x and got bigger PIDs, it took awhile before
> anyone else even noticed.  Almost always when I'm doing something to
> processes, I'm scripting it, not typing numbers.

I'm not counting Aunt Tillie.

Anybody on a terminal without cut-and-paste will suffer.

>> I almost put 99999, but then I realized that that's silly.
>> For years Linux had a hard limit of about 4000 processes,
>> and not many people complained. It sure would be nice to
>> gain back a few of the columns lost to other stuff, so
>> that people could once again see command arguments.
>
> We're talking one or two more columns.

It's as many as 25 character-columns. Linus was using
0x3fffffff as a mask. There are 5 field-columns with
PID data in the "ps j" format.

> That's not going to magically
> make it possible to see the command arguments.  If someone wants to see
> the arguments, they're going to have to use "w" anyway (also, "normal"
> ps with no arguments still has lots of space).

Huh? Try "ps -jf", which BTW has 4 PID columns. Also try "ps j".
I'm seeing arguments, and won't if PIDs get too wide.

>> The two real-word usage examples close at hand:
>> 
>> a. My full GNOME desktop has 48 processes.
>> 
>> b. The main UNIX shell server for CS students
>>    at this university has 79 processes.
>>    (Tru64, several hundred CS students, 2:25 am)
>
> My GNOME desktop has 72 processes, and I've only got Mozilla and a
> couple of xterms running right now.

Same as me, but I run Debian. Still, 72 is well below 9999.

> My Tru64 server, with only 9 users
> at 7:40 am, has 443 processes (during the business day we typically get
> a lot more processes - we'll have 50-75 users logged in, lots of POP
> checks running, and the usually at least once per day spam attack adding
> a couple of hundred extra sendmail processes - that's when we hit 1000+
> processes).

While I think this would fit under 9999, you're welcome
to adjust your system for any limit you like. I just don't
think the default should cater to systems that are both
rare and likely to have a real paid admin. It's your job to
increase the limit if you feel a need for more PID space.

>> The problem is screen space, pure and simple. If the
>> default limit goes to over 1 billion, then "ps" output
>> must wrap lines. There is no alternative, unless you
>> think "System going down to reset PID numbers!" is OK.
>
> Adding just one column (and going to 19 bits) allows 16
> times as many PIDs (or a sparse space 16 times as large).

Yes, and it trashes many common formats.

>>> $ ps -lf | head -2
>>>        F S           UID    PID   PPID %CPU PRI  NI  RSS WCHAN    STARTED         TIME COMMAND
>>> 80c08001 I           200 272363 301021  0.0  44   0 520K wait     20:47:46     0:00.09 -bash (bash)
>> 
>> Eeew. That's broken; it won't display right on a normal
>> 80x24 terminal. Linux "ps -lf" will just barely fit.
>
> Why does every possible output format of ps have to fit each process on
> one line?  How often are these output formats used by normal people?

How often are thousands of PIDs used by normal people?

>>> $ ps j | head -2
>>> USER        PID   PPID   PGID   SESS JOBC S    TTY             TIME COMMAND
>>> cmadams  272363 301021 272363 272363    0 I    pts/1        0:00.09 -bash (bash)
>
> Note: one reason this one wrapped (instead of chopping the command) is
> that Tru64 ps automatically displays everything (like lots of "w"
> options) when output is not a TTY.

It's either that or assume 80, unless you have a way to
get info from the controlling tty. I haven't looked into
that yet.

>> It's not just "ps", and I'm not saying you couldn't
>> adjust your system for a higher limit. I do think that
>> the out-of-box default shouldn't screw up formatting.
>> If you need to go past 9999, then you'll want to tweak
>> a few other settings as well. Low process counts are
>> the common case, and shouldn't be hurt by the fact that
>> a few people wish to run a billion processes.
>
> I'm not talking about a billion processes.

That's what Linus was testing with.

> Also, are you going to make
> all those other programs (well, "top" is the only one named above that
> has to be changed) handle variable width PIDs now too?  That argument
> cuts both ways.

Perhaps. You, with the odd case, can do without those programs.

> I think setting things based on formatting of some modes of a couple of
> programs that not that many users actually use is not the right way to
> go.  We don't limit usernames to 8 characters anymore just because "ls"
> truncates the name (not a kernel issue but the same concept).  If a
> larger PID space works better for PID allocation, then go with a larger
> PID space.

Arrrgh! That's a bug in ls, and yes you should limit usernames.
Think about:

Albert_Roberts
Albert_Robertson
Albert_Robbens

I think this is a much more popular problem than PID issues.
Lots of people with an NT background don't see why a long
username might cause problems. (names may be assigned by
corporate policy, and Linux admins must adapt) Maybe I could
do something about this if the default PID limit was 9999.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09  7:04         ` Albert D. Cahalan
  2002-08-09  8:48           ` Helge Hafting
  2002-08-09 13:00           ` Chris Adams
@ 2002-08-09 13:59           ` Rik van Riel
  2002-08-09 14:57             ` Albert D. Cahalan
  2 siblings, 1 reply; 51+ messages in thread
From: Rik van Riel @ 2002-08-09 13:59 UTC (permalink / raw)
  To: Albert D. Cahalan; +Cc: Chris Adams, linux-kernel

On Fri, 9 Aug 2002, Albert D. Cahalan wrote:

> I almost put 99999, but then I realized that that's silly.
> For years Linux had a hard limit of about 4000 processes,

That limit was removed some 2 years ago.

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 13:59           ` Rik van Riel
@ 2002-08-09 14:57             ` Albert D. Cahalan
  0 siblings, 0 replies; 51+ messages in thread
From: Albert D. Cahalan @ 2002-08-09 14:57 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Albert D. Cahalan, Chris Adams, linux-kernel

Rik van Riel writes:
> On Fri, 9 Aug 2002, Albert D. Cahalan wrote:
>
>> I almost put 99999, but then I realized that that's silly.
>> For years Linux had a hard limit of about 4000 processes,
>
> That limit was removed some 2 years ago.

Sure. Linux is 11 years old now. Also remember that that
was the _hard_ limit. We had a soft limit near 1000,
with a kernel recompile needed to increase it. Almost
everybody was satisfied with this.

Personally I'd change my adjustable PID limit to 999,
but I wouldn't suggest that as a default. 9999 should
cover almost all systems.

Self-adjusting could be nice. Then everybody starts
with 9, and gains a digit whenever space is 50% full.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:02 ` Linus Torvalds
  2002-08-08 22:26   ` Linus Torvalds
@ 2002-08-08 22:34   ` Paul Larson
  2002-08-08 22:44     ` Rik van Riel
  2002-08-08 22:37   ` Rik van Riel
  2002-08-09 19:34   ` Paul Larson
  3 siblings, 1 reply; 51+ messages in thread
From: Paul Larson @ 2002-08-08 22:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On Thu, 2002-08-08 at 17:02, Linus Torvalds wrote:
> 
> On Thu, 8 Aug 2002, Hubertus Franke wrote:
> > 
> > That is true. All was done under the 16-bit assumption
> > My hunch is that the current algorithm might actually work quite well
> > for a sparsely populated pid-space (32-bits).
> 
> I agree.
The original issue that I had with all of this is the fact that if the
current algorithm can't find an available pid, it just sits there
churning forever and hangs the machine.  My original patch was really
just a very basic fix for that (see the 2.4 tree).  This makes it far
more unlikely for us to max out, but if we do aren't we just going to
have the same trouble all over?

Thanks,
Paul Larson


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:34   ` Paul Larson
@ 2002-08-08 22:44     ` Rik van Riel
  0 siblings, 0 replies; 51+ messages in thread
From: Rik van Riel @ 2002-08-08 22:44 UTC (permalink / raw)
  To: Paul Larson
  Cc: Linus Torvalds, Hubertus Franke, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On 8 Aug 2002, Paul Larson wrote:

> The original issue that I had with all of this is the fact that if the
> current algorithm can't find an available pid, it just sits there
> churning forever and hangs the machine.  My original patch was really
> just a very basic fix for that (see the 2.4 tree).  This makes it far
> more unlikely for us to max out, but if we do aren't we just going to
> have the same trouble all over?

You'll need at least 330 million tasks to run out.

At a minimum kernel memory allocation of about 8 kB per
task, that's about 2600 GB of kernel data structures.

I'm not sure we'll hit that limit, ever. Not because
we won't have a TB of kernel data space at some point
in the future, but because 330 million tasks is a lot
more than we'd want to manage with just a few CPUs ;)

kind regards,

Rik
-- 
	http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/		http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:02 ` Linus Torvalds
  2002-08-08 22:26   ` Linus Torvalds
  2002-08-08 22:34   ` Paul Larson
@ 2002-08-08 22:37   ` Rik van Riel
  2002-08-09  1:49     ` Andries Brouwer
  2002-08-09 19:34   ` Paul Larson
  3 siblings, 1 reply; 51+ messages in thread
From: Rik van Riel @ 2002-08-08 22:37 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Andries Brouwer, Andrew Morton, andrea, davej,
	lkml, Paul Larson

On Thu, 8 Aug 2002, Linus Torvalds wrote:
> On Thu, 8 Aug 2002, Hubertus Franke wrote:
> >
> > That is true. All was done under the 16-bit assumption
> > My hunch is that the current algorithm might actually work quite well
> > for a sparsely populated pid-space (32-bits).
>
> I agree.
>
> So let's just try Andries approach, suggested patch as follows..

Hmmm, I wonder how badly the system will behave when
we need to reset last_pid and next_safe with 30000
pids in use ...

regards,

Rik
-- 
	http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/		http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:37   ` Rik van Riel
@ 2002-08-09  1:49     ` Andries Brouwer
  0 siblings, 0 replies; 51+ messages in thread
From: Andries Brouwer @ 2002-08-09  1:49 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Linus Torvalds, Hubertus Franke, Andrew Morton, andrea, davej,
	lkml, Paul Larson

On Thu, Aug 08, 2002 at 07:37:08PM -0300, Rik van Riel wrote:

> Hmmm, I wonder how badly the system will behave when
> we need to reset last_pid and next_safe with 30000
> pids in use ...

On average very well, that is, the time spent in finding
the next pid is very small on average, but once in a while
there is a small hiccup.
On a hard real time system it may be reasonable to choose
for an average that is ten times higher and avoid the hiccup.

For systems that really have lots of very short-lived
processes, one might do what I suggested a moment ago
for a MP context: have the processes live in several lists,
that each only use part of the pid-space.

(One wants for_each_task to take a limited amount of time.
With N tasks, and sqrt(N) list heads one first picks the
shortest list, then loops over the list, all in time of order
sqrt(N). That works well, certainly up to a million processes.)

Andries

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:02 ` Linus Torvalds
                     ` (2 preceding siblings ...)
  2002-08-08 22:37   ` Rik van Riel
@ 2002-08-09 19:34   ` Paul Larson
  2002-08-09 20:13     ` Paul Larson
                       ` (2 more replies)
  3 siblings, 3 replies; 51+ messages in thread
From: Paul Larson @ 2002-08-09 19:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

I suspect that it would actually require more than just this.  I tried
this with the same test I've been using and had several failed attepmts
at low numbers by getting wierd unexpected signals (like 28), and then
one that ran for a much longer time and produced an oops with random
garbage to the console (trying to extract this now).

-Paul Larson

On Thu, 2002-08-08 at 17:02, Linus Torvalds wrote:
> ----
> --- 1.2/include/linux/threads.h	Tue Feb  5 07:23:04 2002
> +++ edited/include/linux/threads.h	Thu Aug  8 14:58:28 2002
> @@ -19,6 +19,7 @@
>  /*
>   * This controls the maximum pid allocated to a process
>   */
> -#define PID_MAX 0x8000
> +#define PID_MASK 0x3fffffff
> +#define PID_MAX (PID_MASK+1)
>  
>  #endif
> ===== kernel/fork.c 1.57 vs edited =====
> --- 1.57/kernel/fork.c	Tue Jul 30 15:49:20 2002
> +++ edited/kernel/fork.c	Thu Aug  8 15:00:16 2002
> @@ -142,7 +142,7 @@
>  		return 0;
>  
>  	spin_lock(&lastpid_lock);
> -	if((++last_pid) & 0xffff8000) {
> +	if((++last_pid) & ~PID_MASK) {
>  		last_pid = 300;		/* Skip daemons etc. */
>  		goto inside;
>  	}
> @@ -157,7 +157,7 @@
>  			   p->tgid == last_pid	||
>  			   p->session == last_pid) {
>  				if(++last_pid >= next_safe) {
> -					if(last_pid & 0xffff8000)
> +					if(last_pid & ~PID_MASK)
>  						last_pid = 300;
>  					next_safe = PID_MAX;
>  				}
> 
> 



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 19:34   ` Paul Larson
@ 2002-08-09 20:13     ` Paul Larson
  2002-08-09 20:40     ` Andries Brouwer
  2002-08-09 21:23     ` Linus Torvalds
  2 siblings, 0 replies; 51+ messages in thread
From: Paul Larson @ 2002-08-09 20:13 UTC (permalink / raw)
  To: Paul Larson
  Cc: Linus Torvalds, Hubertus Franke, Rik van Riel, Andries Brouwer,
	Andrew Morton, andrea, Dave Jones, lkml

On Fri, 2002-08-09 at 14:34, Paul Larson wrote:
> I suspect that it would actually require more than just this.  I tried
> this with the same test I've been using and had several failed attepmts
> at low numbers by getting wierd unexpected signals (like 28), and then
> one that ran for a much longer time and produced an oops with random
> garbage to the console (trying to extract this now).
> 
> -Paul Larson
Here's the ksymoops output minus the unprintable chars that got sent
after it:

Unable to handle kernel paging request at virtual address 32313256
c015cb38
*pde = 37184001
Oops: 0000
CPU:    4
0010:[<c015cb38>]    Not tainted
Using defaults from ksymoops -t elf32-i386 -a i386
EFLAGS: 00010202
eax: d9768090   ebx: d9768090   ecx: 00000020   edx: 32313232
esi: c015cb10   edi: d9768090   ebp: 00000008   esp: f6cb9e34
ds: 0018   es: 0018   ss: 0018
Stack: c0151403 d9768090 d9768090 f6dabaa0 c01515ad d9768090 f6dabab8
c014f49b
d9768090 f6dabaa0 c0139fa3 00000020 00000001 000001d0 c013a21a c013a170
000001f8 f6cb9e7c 00000020 00000001 000001d0 000041e6 c014f8f0 00000010
Call Trace: [<c0151403>] [<c01515ad>] [<c014f49b>] [<c0139fa3>]
[<c013a21a>]
[<c013a170>] [<c014f8f0>] [<c0131f9e>] [<c0131fec>] [<c0132c64>]
[<c0132fa2>]
[<c0133010>] [<c0116717>] [<c0117186>] [<c011793e>] [<c0105bc5>]
[<c0107173>]
Code: 8b 42 24 85 c0 74 0b f0 ff 48 10 8b 42 24 83 48 14 08 52 e8

>>EIP; c015cb38 <proc_delete_inode+28/50>   <=====
Trace; c0151403 <generic_delete_inode+83/b0>
Trace; c01515ad <iput+5d/60>
Trace; c014f49b <prune_dcache+eb/180>
Trace; c0139fa3 <pdflush_operation+83/90>
Trace; c013a21a <wakeup_bdflush+1a/20>
Trace; c013a170 <background_writeout+0/90>
Trace; c014f8f0 <shrink_dcache_memory+20/40>
Trace; c0131f9e <shrink_caches+7e/a0>
Trace; c0131fec <try_to_free_pages+2c/50>
Trace; c0132c64 <balance_classzone+44/1f0>
Trace; c0132fa2 <__alloc_pages+192/1f0>
Trace; c0133010 <__get_free_pages+10/20>
Trace; c0116717 <dup_task_struct+17/70>
Trace; c0117186 <copy_process+56/7f0>
Trace; c011793e <do_fork+1e/a0>
Trace; c0105bc5 <sys_fork+15/30>
Trace; c0107173 <syscall_call+7/b>
Code;  c015cb38 <proc_delete_inode+28/50>
00000000 <_EIP>:
Code;  c015cb38 <proc_delete_inode+28/50>   <=====
   0:   8b 42 24                  mov    0x24(%edx),%eax   <=====
Code;  c015cb3b <proc_delete_inode+2b/50>
   3:   85 c0                     test   %eax,%eax
Code;  c015cb3d <proc_delete_inode+2d/50>
   5:   74 0b                     je     12 <_EIP+0x12> c015cb4a
<proc_delete_inode+3a/50>
Code;  c015cb3f <proc_delete_inode+2f/50>
   7:   f0 ff 48 10               lock decl 0x10(%eax)
Code;  c015cb43 <proc_delete_inode+33/50>
   b:   8b 42 24                  mov    0x24(%edx),%eax
Code;  c015cb46 <proc_delete_inode+36/50>
   e:   83 48 14 08               orl    $0x8,0x14(%eax)
Code;  c015cb4a <proc_delete_inode+3a/50>
  12:   52                        push   %edx
Code;  c015cb4b <proc_delete_inode+3b/50>
  13:   e8 00 00 00 00            call   18 <_EIP+0x18> c015cb50
<proc_delete_inode+40/50>


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 19:34   ` Paul Larson
  2002-08-09 20:13     ` Paul Larson
@ 2002-08-09 20:40     ` Andries Brouwer
  2002-08-09 21:23     ` Linus Torvalds
  2 siblings, 0 replies; 51+ messages in thread
From: Andries Brouwer @ 2002-08-09 20:40 UTC (permalink / raw)
  To: Paul Larson
  Cc: Linus Torvalds, Hubertus Franke, Rik van Riel, Andrew Morton,
	andrea, Dave Jones, lkml

On Fri, Aug 09, 2002 at 02:34:17PM -0500, Paul Larson wrote:

> I suspect that it would actually require more than just this.  I tried
> this with the same test I've been using and had several failed attepmts
> at low numbers by getting wierd unexpected signals (like 28), and then
> one that ran for a much longer time and produced an oops with random
> garbage to the console (trying to extract this now).

Not much more. Around 2.3.40 I have run with a large PID_MAX for a long time.
The patch that I submitted is still visible on the net various places
(I just tried  Andries pid_max  and found a few).

At that time the only other change (other than <linux/threads.h> and
kernel/fork.c) was in proc/base.c, namely

	-#define fake_ino(pid,ino) (((pid)<<16)|(ino)) 
	+#define fake_ino(pid,ino) (((1)<<16)|(ino)) 

and

	- if (!pid) 
	-         goto out; 
	- if (pid & 0xffff0000) 
	+ if (pid <= 0 || pid >= PID_MAX) 
                  goto out; 

(plucked from google output).

I have not checked precisely what change is appropriate in procfs today.


Andries

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 19:34   ` Paul Larson
  2002-08-09 20:13     ` Paul Larson
  2002-08-09 20:40     ` Andries Brouwer
@ 2002-08-09 21:23     ` Linus Torvalds
  2002-08-09 21:42       ` Linus Torvalds
  2 siblings, 1 reply; 51+ messages in thread
From: Linus Torvalds @ 2002-08-09 21:23 UTC (permalink / raw)
  To: Paul Larson
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On 9 Aug 2002, Paul Larson wrote:
>
> I suspect that it would actually require more than just this. 

There are various /proc paths, Andries had a full patch at some point a 
long time ago.

My point was not so much that this is sufficient, but that the approach is 
valid. I'd rather go with the simple approach (that solves two problems) 
than with a much more complex approach (that solves only one).

And yes, I doubt I actually want to give all 30 bits to pid space in the 
long run (the per-node pid space argument is still valid), but it's better 
to start with 30 bits and actively trying to break things and then phasing 
it down later than to be timid and never even see the stuff that breaks.

		Linus

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 21:23     ` Linus Torvalds
@ 2002-08-09 21:42       ` Linus Torvalds
  2002-08-09 21:46         ` Paul Larson
  0 siblings, 1 reply; 51+ messages in thread
From: Linus Torvalds @ 2002-08-09 21:42 UTC (permalink / raw)
  To: Paul Larson
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On Fri, 9 Aug 2002, Linus Torvalds wrote:
>
> There are various /proc paths, Andries had a full patch at some point a
> long time ago.

Hmm.. Giving them a quick glance-over, the /proc issues look like they
shouldn't be there in 2.5.x anyway, since the inode number really is
largely just a random number in 2.5 and all the real information is
squirelled away at path open time.

There's certainly a possibility for some cleanups, though.

		Linus

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 21:42       ` Linus Torvalds
@ 2002-08-09 21:46         ` Paul Larson
  2002-08-09 22:04           ` Linus Torvalds
  0 siblings, 1 reply; 51+ messages in thread
From: Paul Larson @ 2002-08-09 21:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On Fri, 2002-08-09 at 16:42, Linus Torvalds wrote:

> Hmm.. Giving them a quick glance-over, the /proc issues look like they
> shouldn't be there in 2.5.x anyway, since the inode number really is
> largely just a random number in 2.5 and all the real information is
> squirelled away at path open time.
> 
> There's certainly a possibility for some cleanups, though.
So for now then, should I dig out my original (minimal) patch that
*just* fixed the "loop forever even if we're out of pids" problem?  Even
if we increase PID_MAX to something obscenely high, I think we should
still be checking for this.

-Paul Larson


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 21:46         ` Paul Larson
@ 2002-08-09 22:04           ` Linus Torvalds
  2002-08-10 17:23             ` Jamie Lokier
                               ` (2 more replies)
  0 siblings, 3 replies; 51+ messages in thread
From: Linus Torvalds @ 2002-08-09 22:04 UTC (permalink / raw)
  To: Paul Larson
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On 9 Aug 2002, Paul Larson wrote:
> On Fri, 2002-08-09 at 16:42, Linus Torvalds wrote:
> > Hmm.. Giving them a quick glance-over, the /proc issues look like they
> > shouldn't be there in 2.5.x anyway, since the inode number really is
> > largely just a random number in 2.5 and all the real information is
> > squirelled away at path open time.

It looks like the biggest impact on /proc would be that the /proc/<pid> 
inodes wouldn't be 10%% unique any more, which in turn means that an 
old-style /bin/pwd that actually walks the tree backwards and checks the 
inode number would occasionally fail.

That in turn makes me suspect that we'd better off just biting the bullet 
and makign the inode numbers almost completely static, and forcing that 
particular failure mode early rather than hit it randomly due to unlucky 
timing.

Doing a simple strace shows that all the systems I have regular access to
use the "getcwd()" system call anyway, which gets this right on /proc (and
other filesystems that do not guarantee unique inode numbers)

> So for now then, should I dig out my original (minimal) patch that
> *just* fixed the "loop forever even if we're out of pids" problem?  Even
> if we increase PID_MAX to something obscenely high, I think we should
> still be checking for this.

Ehh, considering that especially with a 30-bit pid, there's _no_ way we'd
run out without some other serious problems hitting us long before (out of
memory being the obvious one), I don't think even that is an issue.

With a minimum of 8kB / pid for process overhead, you need to have at 
least 43 bits of physical address space completely populated just to cover 
the memory uses of that many pid's.

		Linus

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 22:04           ` Linus Torvalds
@ 2002-08-10 17:23             ` Jamie Lokier
  2002-08-10 18:33               ` Linus Torvalds
  2002-08-12  8:56             ` Padraig Brady
  2002-08-12 14:40             ` Paul Larson
  2 siblings, 1 reply; 51+ messages in thread
From: Jamie Lokier @ 2002-08-10 17:23 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul Larson, Hubertus Franke, Rik van Riel, Andries Brouwer,
	Andrew Morton, andrea, Dave Jones, lkml

Linus Torvalds wrote:
> Doing a simple strace shows that all the systems I have regular access to
> use the "getcwd()" system call anyway, which gets this right on /proc (and
> other filesystems that do not guarantee unique inode numbers)

Oh dear -- what of programs that assume duplicate inode numbers are hard
links, and therefore assume the same contents will be found in each
duplicate?

-- Jamie

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-10 17:23             ` Jamie Lokier
@ 2002-08-10 18:33               ` Linus Torvalds
  2002-08-10 18:48                 ` Jamie Lokier
  0 siblings, 1 reply; 51+ messages in thread
From: Linus Torvalds @ 2002-08-10 18:33 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Paul Larson, Hubertus Franke, Rik van Riel, Andries Brouwer,
	Andrew Morton, andrea, Dave Jones, lkml


On Sat, 10 Aug 2002, Jamie Lokier wrote:
>
> Linus Torvalds wrote:
> > Doing a simple strace shows that all the systems I have regular access to
> > use the "getcwd()" system call anyway, which gets this right on /proc (and
> > other filesystems that do not guarantee unique inode numbers)
> 
> Oh dear -- what of programs that assume duplicate inode numbers are hard
> links, and therefore assume the same contents will be found in each
> duplicate?

Well, anybody who tries to back up /proc with "tar" is in for some 
surprises anyway ;)

			Linus


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-10 18:33               ` Linus Torvalds
@ 2002-08-10 18:48                 ` Jamie Lokier
  2002-08-11 20:41                   ` Alan Cox
  0 siblings, 1 reply; 51+ messages in thread
From: Jamie Lokier @ 2002-08-10 18:48 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul Larson, Hubertus Franke, Rik van Riel, Andries Brouwer,
	Andrew Morton, andrea, Dave Jones, lkml

Linus Torvalds wrote:
> > Oh dear -- what of programs that assume duplicate inode numbers are hard
> > links, and therefore assume the same contents will be found in each
> > duplicate?
> 
> Well, anybody who tries to back up /proc with "tar" is in for some 
> surprises anyway ;)

I was thinking of an over-intelligent `find'.  But hey, as long as this
is only for the weird and wonderful /proc :-)

-- Jamie

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-10 18:48                 ` Jamie Lokier
@ 2002-08-11 20:41                   ` Alan Cox
  2002-08-11 19:58                     ` Jamie Lokier
  0 siblings, 1 reply; 51+ messages in thread
From: Alan Cox @ 2002-08-11 20:41 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Linus Torvalds, Paul Larson, Hubertus Franke, Rik van Riel,
	Andries Brouwer, Andrew Morton, andrea, Dave Jones, lkml

On Sat, 2002-08-10 at 19:48, Jamie Lokier wrote:
> Linus Torvalds wrote:
> > > Oh dear -- what of programs that assume duplicate inode numbers are hard
> > > links, and therefore assume the same contents will be found in each
> > > duplicate?
> > 
> > Well, anybody who tries to back up /proc with "tar" is in for some 
> > surprises anyway ;)
> 
> I was thinking of an over-intelligent `find'.  But hey, as long as this
> is only for the weird and wonderful /proc :-)

If they hold both handles open and stat them and find the same inode
number then yes its a bug. We have lots of room for inode numbers to
handle 2^30 processes and allow for 2^30 other files


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-11 20:41                   ` Alan Cox
@ 2002-08-11 19:58                     ` Jamie Lokier
  2002-08-11 21:23                       ` Alan Cox
  0 siblings, 1 reply; 51+ messages in thread
From: Jamie Lokier @ 2002-08-11 19:58 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linus Torvalds, Paul Larson, Hubertus Franke, Rik van Riel,
	Andries Brouwer, Andrew Morton, andrea, Dave Jones, lkml

Alan Cox wrote:
> If they hold both handles open and stat them and find the same inode
> number then yes its a bug. We have lots of room for inode numbers to
> handle 2^30 processes and allow for 2^30 other files

So, in general, the way to detect hard links requires both objects to be
open at the same time?  I was sure it was enough to stat(), and check
(st1.st_ino == st2.st_ino && st1.st_dev == st2.st_dev).

Admittedly, one of the object could be renamed or deleted in that time
so it's not 100% reliable on changing filesystems.

Ah well.

-- Jamie

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-11 19:58                     ` Jamie Lokier
@ 2002-08-11 21:23                       ` Alan Cox
  2002-08-11 20:10                         ` Jamie Lokier
  0 siblings, 1 reply; 51+ messages in thread
From: Alan Cox @ 2002-08-11 21:23 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Linus Torvalds, Paul Larson, Hubertus Franke, Rik van Riel,
	Andries Brouwer, Andrew Morton, andrea, Dave Jones, lkml

On Sun, 2002-08-11 at 20:58, Jamie Lokier wrote:
> Alan Cox wrote:
> > If they hold both handles open and stat them and find the same inode
> > number then yes its a bug. We have lots of room for inode numbers to
> > handle 2^30 processes and allow for 2^30 other files
> 
> So, in general, the way to detect hard links requires both objects to be
> open at the same time?  I was sure it was enough to stat(), and check
> (st1.st_ino == st2.st_ino && st1.st_dev == st2.st_dev).
> 
> Admittedly, one of the object could be renamed or deleted in that time
> so it's not 100% reliable on changing filesystems.

Hence you need both open at the same time. 

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-11 21:23                       ` Alan Cox
@ 2002-08-11 20:10                         ` Jamie Lokier
  0 siblings, 0 replies; 51+ messages in thread
From: Jamie Lokier @ 2002-08-11 20:10 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linus Torvalds, Paul Larson, Hubertus Franke, Rik van Riel,
	Andries Brouwer, Andrew Morton, andrea, Dave Jones, lkml

Alan Cox wrote:
> > So, in general, the way to detect hard links requires both objects to be
> > open at the same time?  I was sure it was enough to stat(), and check
> > (st1.st_ino == st2.st_ino && st1.st_dev == st2.st_dev).
> > 
> > Admittedly, one of the object could be renamed or deleted in that time
> > so it's not 100% reliable on changing filesystems.
> 
> Hence you need both open at the same time. 

... unless you know that what you're looking at isn't changing, or you
only guarantee correct results for parts of the filesystem which don't
change (`tar', `find' etc.)

Again, /proc is exempt! :)

-- Jamie

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 22:04           ` Linus Torvalds
  2002-08-10 17:23             ` Jamie Lokier
@ 2002-08-12  8:56             ` Padraig Brady
  2002-08-12 10:37               ` Alan Cox
  2002-08-12 14:40             ` Paul Larson
  2 siblings, 1 reply; 51+ messages in thread
From: Padraig Brady @ 2002-08-12  8:56 UTC (permalink / raw)
  To: lkml

Linus Torvalds wrote:
> On 9 Aug 2002, Paul Larson wrote:
> 
>>On Fri, 2002-08-09 at 16:42, Linus Torvalds wrote:
>>
>>>Hmm.. Giving them a quick glance-over, the /proc issues look like they
>>>shouldn't be there in 2.5.x anyway, since the inode number really is
>>>largely just a random number in 2.5 and all the real information is
>>>squirelled away at path open time.
>>
> 
> It looks like the biggest impact on /proc would be that the /proc/<pid> 
> inodes wouldn't be 10%% unique any more, which in turn means that an 
> old-style /bin/pwd that actually walks the tree backwards and checks the 
> inode number would occasionally fail.
> 
> That in turn makes me suspect that we'd better off just biting the bullet 
> and makign the inode numbers almost completely static, and forcing that 
> particular failure mode early rather than hit it randomly due to unlucky 
> timing.
> 
> Doing a simple strace shows that all the systems I have regular access to
> use the "getcwd()" system call anyway, which gets this right on /proc (and
> other filesystems that do not guarantee unique inode numbers)

Anyone care to clarify which filesystems don't
have unique inode numbers. I always thought you
could uniquely identify any file using a device,inode
tuple? Fair enough proc is "special" but can/should
you not assume unique inodes within all other filesystems?

Also why can't you allocate a unique number in /proc?

thanks,
Pádraig.


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-12  8:56             ` Padraig Brady
@ 2002-08-12 10:37               ` Alan Cox
  2002-08-12  9:21                 ` Padraig Brady
  0 siblings, 1 reply; 51+ messages in thread
From: Alan Cox @ 2002-08-12 10:37 UTC (permalink / raw)
  To: Padraig Brady; +Cc: lkml

On Mon, 2002-08-12 at 09:56, Padraig Brady wrote:
> Anyone care to clarify which filesystems don't
> have unique inode numbers. I always thought you
> could uniquely identify any file using a device,inode
> tuple? Fair enough proc is "special" but can/should
> you not assume unique inodes within all other filesystems?

2.4 functions correctly here in all the stuff I've looked at. I can't
speak for 2.5. In the 2.4 case you must be holding the files open during
the comparison. Some file systems like MSDOS invent inodes as they go,
never issuing the same one to two objects at the same time but happily
reissuing different inode numbers the next time around.

That breaks NFS but not much else

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-12 10:37               ` Alan Cox
@ 2002-08-12  9:21                 ` Padraig Brady
  0 siblings, 0 replies; 51+ messages in thread
From: Padraig Brady @ 2002-08-12  9:21 UTC (permalink / raw)
  To: lkml

Alan Cox wrote:
> On Mon, 2002-08-12 at 09:56, Padraig Brady wrote:
> 
>>Anyone care to clarify which filesystems don't
>>have unique inode numbers. I always thought you
>>could uniquely identify any file using a device,inode
>>tuple? Fair enough proc is "special" but can/should
>>you not assume unique inodes within all other filesystems?
> 
> 
> 2.4 functions correctly here in all the stuff I've looked at.

cool.

> I can't speak for 2.5. In the 2.4 case you must be holding the files open 
 > during the comparison. Some file systems like MSDOS invent inodes as
 > they go, never issuing the same one to two objects at the same time but
 > happily reissuing different inode numbers the next time around.

Sure, no hardlinks so who cares what the number is, as long
as it's unique.

> That breaks NFS but not much else

hmm..

thanks,
Pádraig.


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 22:04           ` Linus Torvalds
  2002-08-10 17:23             ` Jamie Lokier
  2002-08-12  8:56             ` Padraig Brady
@ 2002-08-12 14:40             ` Paul Larson
  2 siblings, 0 replies; 51+ messages in thread
From: Paul Larson @ 2002-08-12 14:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

Dave McCracken had a patch a while back that should probably be
considered with all this too. 
http://marc.theaimsgroup.com/?l=linux-kernel&m=100506843702466&w=2

This fixes the calculation of the max number of tasks so that it doesn't
consider highmem since it can't use use that anyway.

-Paul Larson

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-08 18:56 Hubertus Franke
  2002-08-08 19:15 ` Rik van Riel
  2002-08-08 19:18 ` Paul Larson
  0 siblings, 2 replies; 51+ messages in thread
From: Hubertus Franke @ 2002-08-08 18:56 UTC (permalink / raw)
  To: Andries Brouwer
  Cc: Andrew Morton, andrea, davej, lkml, Paul Larson, Linus Torvalds

Which one sounds like the best one ?

Assuming that for now we have to stick to 16-bit pid_t ....
There are the following patches out there ....:

(a) Vanilla version which breaks down after 22K pids and really sucks above
32.5K
(b) Bill Irwin's, which keeps track of which PID is free and which one is
not ?
(c) Andrea's patch which searches in the bitmap when we are running out
(d) Paul's patch, which I believe was based on one of my earlier submission
(03/02) that
                                 essentially switches between (a)+(c) at
the break even point.

Assuming that Paul's patch still resembles somewhat my earlier intentions:
My observation of (d) back in february was that the current approach with
using next_pid = ++last_pid in the general case is pretty good and that
only the determination of a guaranteed free range sucks due to O(N**2)
algorithm sucks and using a bitmap to determine the next safe range
through a O(N) algorithm is good. I determined the breakeven point with
random pid deletion to be ~22K where the current algorithm worked darn
well.
One difference to Andrea's patch (c) is  (if I remember his code correctly)
that
I used was that I always look forward in the bitmap for the next safe range
in the
bitmap when I exhausted the previous one without having to traverse the
task list.
Only if non is found I traverse the task list and try the bit search one
more time.

I feel (c) or (d) is a better solution over (a) and (b)...   Open for
discussion.
I have a test program that does the random pid deletion and pid allocation
in user space. All what's required is to copy the get_pid() code from the
kernel into there... Can make that available ...

I don't know what Paul has done to the patch since then ....

-- Hubertus Franke  (frankeh@watson.ibm.com)

                      Andries Brouwer                                                                                                
                      <aebr@win.tue.nl>        To:       Andrew Morton <akpm@zip.com.au>                                             
                                               cc:       Paul Larson <plars@austin.ibm.com>, Linus Torvalds <torvalds@transmeta.     
                      08/07/2002 08:24          com>, lkml <linux-kernel@vger.kernel.org>, davej@suse.de, Hubertus                   
                      PM                        Franke/Watson/IBM@IBMUS, andrea@suse.de                                              
                                               Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 

On Wed, Aug 07, 2002 at 04:06:05PM -0700, Andrew Morton wrote:

> Has this been evaluated against Bill Irwin's constant-time
> allocator?  Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one.   Plus it's much nicer
> looking code.

> #define PID_MAX                    0x8000
> #define RESERVED_PIDS        300
>
> #define MAP0_SIZE            (PID_MAX   >> BITS_PER_LONG_SHIFT)
> #define MAP1_SIZE            (MAP0_SIZE >> BITS_PER_LONG_SHIFT)
> #define MAP2_SIZE            (MAP1_SIZE >> BITS_PER_LONG_SHIFT)
>
> static unsigned long pid_map0[MAP0_SIZE];
> static unsigned long pid_map1[MAP1_SIZE];
> static unsigned long pid_map2[MAP2_SIZE];

Here it is of interest how large a pid is.
With a 64-bit pid_t it is just

  static pid_t last_pid;

  pid_t get_next_pid() {
             return ++last_pid;
  }

since 2^64 is a really large number.
Unfortunately glibc does not support this (on x86).

With a 32-bit pid_t wraparounds will occur, but very infrequently.
Thus, finding the next pid will be very fast on average, much faster
than the above algorithm, and no arrays are required.
One only loses the guaranteed constant time property.
Unless hard real time is required, this sounds like the best version.

Andries

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 18:56 Hubertus Franke
@ 2002-08-08 19:15 ` Rik van Riel
  2002-08-08 19:18 ` Paul Larson
  1 sibling, 0 replies; 51+ messages in thread
From: Rik van Riel @ 2002-08-08 19:15 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Andries Brouwer, Andrew Morton, andrea, davej, lkml, Paul Larson,
	Linus Torvalds

On Thu, 8 Aug 2002, Hubertus Franke wrote:

> Which one sounds like the best one ?
>
> Assuming that for now we have to stick to 16-bit pid_t ....

That assumption is pretty central to the debate.

I don't see the standard get_pid nor the bitmap based
get_pid scale to something larger than a 16-bit pid_t.

If we're not sure yet whether we want to keep pid_t 16
bits it might be worth putting in an algorithm that does
scale to larger numbers, if only so the switch to a larger
pid_t will be more straightforward.

kind regards,

Rik
-- 
	http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/		http://distro.conectiva.com/

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 18:56 Hubertus Franke
  2002-08-08 19:15 ` Rik van Riel
@ 2002-08-08 19:18 ` Paul Larson
  1 sibling, 0 replies; 51+ messages in thread
From: Paul Larson @ 2002-08-08 19:18 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Andries Brouwer, Andrew Morton, andrea, Dave Jones, lkml,
	Linus Torvalds

On Thu, 2002-08-08 at 13:56, Hubertus Franke wrote:
> (a) Vanilla version which breaks down after 22K pids and 
> really sucks above 32.5K
> (b) Bill Irwin's, which keeps track of which PID is free and 
> which one is not ?
> (c) Andrea's patch which searches in the bitmap when we are 
> running out
> (d) Paul's patch, which I believe was based on one of my earlier 
> submission (03/02) that essentially switches between (a)+(c) at 
> the break even point.

> I feel (c) or (d) is a better solution over (a) and (b)...   Open for
> discussion.
> I have a test program that does the random pid deletion and pid allocation
> in user space. All what's required is to copy the get_pid() code from the
> kernel into there... Can make that available ...
> 
> I don't know what Paul has done to the patch since then ....

It's pretty much the same, with a couple of small changes.  After
forward porting your patch and the one from AA's patches (hch says that
one was actually done by Ihno Krumreich, thanks for the info), I added
the fix from (c) for the race, and moved the check for if(flags &
CLONE_IDLETASK) outside of get_pid into copy_process() since there's no
need to call get_pid and return just to find out that we need to use 0.

-Paul Larson


^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-07 22:03 Paul Larson
  2002-08-07 23:06 ` Andrew Morton
  0 siblings, 1 reply; 51+ messages in thread
From: Paul Larson @ 2002-08-07 22:03 UTC (permalink / raw)
  To: Linus Torvalds, lkml; +Cc: davej, frankeh, andrea


This patch provides an improved version of get_pid() while also taking
care of the bug that causes the machine to hang when you hit PID_MAX. 
It is based on both solutions to the problem provided by Hubertus Franke
and Andrea Arcangeli.  This uses a bitmap to find an available pid and
uses Hubertus's adaptive implementation to only use this when it is more
beneficial than the old mechanism.  The getpid_mutex from AA's patch is
also carried over to avoid the race where another cpu could get the same
pid before SET_LINKS was called.

This should patch cleanly against 2.5.30 or bk current.
Please apply.

--- a/kernel/fork.c	Wed Aug  7 16:37:38 2002
+++ b/kernel/fork.c	Wed Aug  7 16:05:22 2002
@@ -50,6 +50,12 @@
 
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
 
+/*
+ * Protectes next_safe, last_pid and it avoids races
+ * between get_pid and SET_LINKS().
+ */
+static DECLARE_MUTEX(getpid_mutex);
+
 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 {
 	unsigned long flags;
@@ -129,27 +135,107 @@
 	kmem_cache_free(task_struct_cachep,tsk);
 }
 
-/* Protects next_safe and last_pid. */
-spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
+/* this should be provided in every architecture */
+#ifndef SHIFT_PER_LONG
+#if BITS_PER_LONG == 64
+#   define SHIFT_PER_LONG 6
+#elif BITS_PER_LONG == 32
+#   define SHIFT_PER_LONG 5
+#else
+#error "SHIFT_PER_LONG"
+#endif
+#endif
+
+#define RESERVED_PIDS       (300)
+#define GETPID_THRESHOLD    (22000)  /* when to switch to a mapped algo */
+#define PID_MAP_SIZE        (PID_MAX >> SHIFT_PER_LONG)
+static unsigned long pid_map[PID_MAP_SIZE];
+static int next_safe = PID_MAX;
+
+static inline void mark_pid(int pid)
+{
+	__set_bit(pid,pid_map);
+}
+
+static int get_pid_by_map(int lastpid) 
+{
+	static int mark_and_sweep = 0;
+
+	int round = 0;
+	struct task_struct *p;
+	int i;
+	unsigned long mask;
+	int fpos;
+
+
+	if (mark_and_sweep) {
+repeat:
+		mark_and_sweep = 0;
+		memset(pid_map, 0, PID_MAP_SIZE * sizeof(unsigned long));
+		lastpid = RESERVED_PIDS;
+	}
+	for_each_task(p) {
+		mark_pid(p->pid);
+		mark_pid(p->pgrp);
+		mark_pid(p->tgid);
+		mark_pid(p->session);
+	}
+
+	/* find next free pid */
+	i = (lastpid >> SHIFT_PER_LONG);
+	mask = pid_map[i] | ((1 << ((lastpid & (BITS_PER_LONG-1)))) - 1);   
+
+	while ((mask == ~0) && (++i < PID_MAP_SIZE)) 
+		mask = pid_map[i];
+
+	if (i == PID_MAP_SIZE) { 
+		if (round == 0) {
+			round = 1;
+			goto repeat;
+		}
+		next_safe = RESERVED_PIDS;
+		mark_and_sweep = 1;  /* mark next time */
+		return 0; 
+	}
+
+	fpos = ffz(mask);
+	i &= (PID_MAX-1);
+	lastpid = (i << SHIFT_PER_LONG) + fpos;
+
+	/* find next save pid */
+	mask &= ~((1 << fpos) - 1);
+
+	while ((mask == 0) && (++i < PID_MAP_SIZE)) 
+		mask = pid_map[i];
+
+	if (i==PID_MAP_SIZE) 
+		next_safe = PID_MAX;
+	else 
+		next_safe = (i << SHIFT_PER_LONG) + ffs(mask) - 1;
+	return lastpid;
+}
 
 static int get_pid(unsigned long flags)
 {
-	static int next_safe = PID_MAX;
 	struct task_struct *p;
 	int pid;
 
-	if (flags & CLONE_IDLETASK)
-		return 0;
-
-	spin_lock(&lastpid_lock);
 	if((++last_pid) & 0xffff8000) {
-		last_pid = 300;		/* Skip daemons etc. */
+		last_pid = RESERVED_PIDS;	/* Skip daemons etc. */
 		goto inside;
 	}
 	if(last_pid >= next_safe) {
 inside:
 		next_safe = PID_MAX;
 		read_lock(&tasklist_lock);
+		if (nr_threads > GETPID_THRESHOLD) {
+			last_pid = get_pid_by_map(last_pid);
+			if (last_pid == 0) {
+				last_pid = PID_MAX;
+				goto nomorepids; 
+			}
+		} else {
+			int beginpid = last_pid;
 	repeat:
 		for_each_task(p) {
 			if(p->pid == last_pid	||
@@ -158,24 +244,33 @@
 			   p->session == last_pid) {
 				if(++last_pid >= next_safe) {
 					if(last_pid & 0xffff8000)
-						last_pid = 300;
+						last_pid = RESERVED_PIDS;
 					next_safe = PID_MAX;
 				}
+				if(last_pid == beginpid)
+					goto nomorepids;
 				goto repeat;
 			}
 			if(p->pid > last_pid && next_safe > p->pid)
 				next_safe = p->pid;
 			if(p->pgrp > last_pid && next_safe > p->pgrp)
 				next_safe = p->pgrp;
+			if(p->tgid > last_pid && next_safe > p->tgid)
+				next_safe = p->tgid;
 			if(p->session > last_pid && next_safe > p->session)
 				next_safe = p->session;
 		}
+		}
 		read_unlock(&tasklist_lock);
 	}
 	pid = last_pid;
-	spin_unlock(&lastpid_lock);
 
 	return pid;
+
+nomorepids:
+	next_safe = last_pid = PID_MAX;
+	read_unlock(&tasklist_lock);
+	return 0;
 }
 
 static inline int dup_mmap(struct mm_struct * mm)
@@ -669,7 +764,14 @@
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
-	p->pid = get_pid(clone_flags);
+	down(&getpid_mutex);
+	if (clone_flags & CLONE_IDLETASK)
+		p->pid = 0;
+	else {
+		p->pid = get_pid(clone_flags);
+		if (p->pid == 0)
+			goto bad_fork_cleanup;
+	}
 	p->proc_dentry = NULL;
 
 	INIT_LIST_HEAD(&p->run_list);
@@ -793,10 +895,20 @@
 		list_add(&p->thread_group, &current->thread_group);
 	}
 
+	/*
+	 * We must do the SET_LINKS() under the getpid_mutex, to avoid
+	 * another CPU to get our same PID between the release of of the
+	 * getpid_mutex and the SET_LINKS().
+	 *
+	 * In short to avoid SMP races the new child->pid must be just visible
+	 * in the tasklist by the time we drop the getpid_mutex.
+	 */
 	SET_LINKS(p);
+
 	hash_pid(p);
 	nr_threads++;
 	write_unlock_irq(&tasklist_lock);
+	up(&getpid_mutex);
 	retval = 0;
 
 fork_out:
@@ -819,6 +931,7 @@
 bad_fork_cleanup_security:
 	security_ops->task_free_security(p);
 bad_fork_cleanup:
+	up(&getpid_mutex);
 	put_exec_domain(p->thread_info->exec_domain);
 	if (p->binfmt && p->binfmt->module)
 		__MOD_DEC_USE_COUNT(p->binfmt->module);


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-07 22:03 Paul Larson
@ 2002-08-07 23:06 ` Andrew Morton
  2002-08-08  0:24   ` Andries Brouwer
  2002-08-08 20:24   ` Linus Torvalds
  0 siblings, 2 replies; 51+ messages in thread
From: Andrew Morton @ 2002-08-07 23:06 UTC (permalink / raw)
  To: Paul Larson; +Cc: Linus Torvalds, lkml, davej, frankeh, andrea

Paul Larson wrote:
> 
> This patch provides an improved version of get_pid() while also taking
> care of the bug that causes the machine to hang when you hit PID_MAX.

Has this been evaluated against Bill Irwin's constant-time
allocator?  Bill says it has slightly worse normal-case and
vastly improved worst-case performance over the stock allocator.
Not sure how it stacks up against this one.   Plus it's much nicer
looking code.

He's shy, so.....



#include <linux/compiler.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/init.h>

/*
 * pid allocator
 * (C) 2002 William Irwin, IBM
 *
 * The strategy is to maintain a tower of bitmaps where a bitmap above
 * another in each bit accounts whether any pid's are available in the
 * space tracked by BITS_PER_LONG bits of the level below. The bitmaps
 * must be marked on allocation and also release, hence some
 * infrastructure for detecting when the last user of a pid releases it
 * must be in place.
 *
 * This general strategy is simple in concept and enforces highly
 * deterministic bounds on the search time for the next pid.
 */

#define PID_MAX		0x8000
#define RESERVED_PIDS	300

#define MAP0_SIZE	(PID_MAX   >> BITS_PER_LONG_SHIFT)
#define MAP1_SIZE	(MAP0_SIZE >> BITS_PER_LONG_SHIFT)
#define MAP2_SIZE	(MAP1_SIZE >> BITS_PER_LONG_SHIFT)

#define MAP0_SHIFT	BITS_PER_LONG_SHIFT
#define MAP1_SHIFT	(2*BITS_PER_LONG_SHIFT)
#define MAP2_SHIFT	(3*BITS_PER_LONG_SHIFT)

#define PID_MAP_MASK	(BITS_PER_LONG - 1)

#define PID_MAP_DEPTH	(ARRAY_SIZE(pid_map) - 1)

static unsigned long pid_map0[MAP0_SIZE];
static unsigned long pid_map1[MAP1_SIZE];
static unsigned long pid_map2[MAP2_SIZE];

static unsigned long * pid_map[] = { pid_map0, pid_map1, pid_map2, NULL, };

unsigned long last_pid = 0;
unsigned long npids = 0;

static const int map_shifts[] =
	{	0,
		BITS_PER_LONG_SHIFT,
		BITS_PER_LONG_SHIFT*2,
		BITS_PER_LONG_SHIFT*3,
		BITS_PER_LONG_SHIFT*4,
	};

static inline int pid_map_shift(int depth)
{
	return map_shifts[depth+1];
}

static spinlock_t pid_lock = SPIN_LOCK_UNLOCKED;

void free_pid(unsigned long pid)
{
	unsigned long **map = pid_map;

	spin_lock(&pid_lock);
	while (*map) {
		int bit = pid & PID_MAP_MASK;
		pid >>= BITS_PER_LONG_SHIFT;
		__clear_bit(bit, &(*map)[pid]);
		++map;
	}
	--npids;
	spin_unlock(&pid_lock);
}

static inline int whole_block_used(int level, unsigned long pid)
{
	return pid_map[level][pid >> pid_map_shift(level)] == ~0UL;
}

static inline void mark_pid(unsigned long pid)
{
	int level;
	for (level = 0; level < PID_MAP_DEPTH; ++level) {
		int shift, bit;
		unsigned long entry;

		shift = pid_map_shift(level);
		entry = pid >> shift;
		bit   = (pid >> (shift - BITS_PER_LONG_SHIFT)) & PID_MAP_MASK;
		if (level == 0 || whole_block_used(level - 1, pid))
			__set_bit(bit, &pid_map[level][entry]);
		else
			break;
	}
	++npids;
}

static inline int pid_map_limit(int depth)
{
	return PID_MAX >> pid_map_shift(depth);
}

#ifdef PID_ALLOC_EXAMPLE
/*
 * the pid allocation traverses the bitmaps by recursively ffz'ing
 * through down the tower of maps. Some additional logic is required
 * to enforce lower limits, but the following example of how one
 * would perform this search without the lower limit may well prove
 * enlightening for those interested in the mechanics of the algorithm.
 */
static long alloc_pid_from_zero(void)
{
	unsigned long pid = 0;
	int level;

	for (level = PID_MAP_DEPTH - 1; level >= 0; --level) {
		unsigned long entry = pid_map[level][pid];

		if (unlikely(entry == ~0UL))
			return ~0UL;

		pid = (pid << BITS_PER_LONG_SHIFT) + ffz(pid_map[level][pid]);
	}
	return pid;
}
#endif /* PID_ALLOC_EXAMPLE */


static const unsigned long pid_max_levels[] =
	{	PID_MAX >> BITS_PER_LONG_SHIFT,
		PID_MAX >> (BITS_PER_LONG_SHIFT*2),
		PID_MAX >> (BITS_PER_LONG_SHIFT*3),
		PID_MAX >> (BITS_PER_LONG_SHIFT*4),
	};

static inline unsigned long pid_map_digit(int level, unsigned long limit)
{
	return (limit >> pid_map_shift(level-1)) & PID_MAP_MASK;
}

/*
 * Scratch space for storing the digits of the limit, all accesses
 * protected by the pid_lock.
 */
static unsigned long limit_digits[4];

/*
 * This is not a high-performance implementation. alloc_pid_after()
 * can be made highly compact with some effort, but this is instead
 * meant to be clear. As the cost of fork() is dominated by much
 * more expensive operations and the cost of this is constant-bounded
 * by a very low constant, the gains from manual optimization here
 * are marginal.
 */
static long alloc_pid_after(unsigned long limit)
{
	unsigned long pid = 0;
	int level;

	/*
	 * The limit passed to us is a strict lower limit. It is more
	 * convenient to work with <= constraints.
	 */
	++limit;
	if (unlikely(limit == PID_MAX))
		return ~0UL;

	for (level = 0; level < PID_MAP_DEPTH; ++level) {
		limit_digits[level] = limit & PID_MAP_MASK;
		limit >>= BITS_PER_LONG_SHIFT;
	}

	/*
	 * Now the lowest available pid number above limit is
	 * reconstructed by ffz'ing down the bitmap and checking
	 * each digit against the digits of the limit for
	 * dictionary ordering. If the check should fail, it's
	 * fixed up by using the maximum of the two digits. The
	 * dictionary ordering on digits also means that a
	 * greater significant digit found in the bitmap
	 * invalidates all further comparisons, which requires
	 * fallback to the pure recursive ffz algorithm outlined
	 * above in order to be handled.
	 */
	for (level = PID_MAP_DEPTH - 1; level >= 0; --level) {
		unsigned long bit, digit;

		if (unlikely(pid >= pid_max_levels[level]))
			return ~0UL;

		bit = ffz(pid_map[level][pid]);
		digit = limit_digits[level];

		if (unlikely(bit < digit))
			bit = digit;

		pid = (pid << BITS_PER_LONG_SHIFT) + bit;

		/*
		 * This is not an optimization; if this check
		 * should succeed the digit comparisons above
		 * are no longer valid and (pessimistically)
		 * incorrect first available pid's are found.
		 * 
		 */
		if (likely(bit > digit)) {
			--level;
			goto finish_just_ffz;
		}
	}
out:
	if (pid < PID_MAX)
		return pid;
	else
		return ~0UL;

finish_just_ffz:
	/*
	 * Now revert to the pure recursive ffz algorithm with
	 * the slight variation of not beginning at a fixed level,
	 * because it is no longer valid to perform comparisons
	 * of the digit obtained by ffz'ing the bitmap against the
	 * digits of the limit.
	 */
	while (level >= 0) {
		unsigned long bit;

		if (unlikely(pid >= pid_max_levels[level]))
			return ~0UL;

		bit = ffz(pid_map[level][pid]);
		pid = (pid << BITS_PER_LONG_SHIFT) + bit;
		--level;
	}

	goto out;
}

int alloc_pid(void)
{
	unsigned long pid;

	spin_lock(&pid_lock);
	BUG_ON(last_pid >= PID_MAX);
	pid = alloc_pid_after(last_pid);
	if (unlikely(pid == ~0UL)) {
		pid = alloc_pid_after(RESERVED_PIDS);
		if (unlikely(pid == ~0UL))
			goto out;
		BUG_ON(pid < RESERVED_PIDS);
	} else
		BUG_ON(pid <= last_pid);
	last_pid = pid;
	mark_pid(pid);
out:
	spin_unlock(&pid_lock);
	return (int)pid;
}

void __init pid_init(void)
{
	mark_pid(0);
}

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-07 23:06 ` Andrew Morton
@ 2002-08-08  0:24   ` Andries Brouwer
  2002-08-08 19:42     ` William Lee Irwin III
  2002-08-08 20:24   ` Linus Torvalds
  1 sibling, 1 reply; 51+ messages in thread
From: Andries Brouwer @ 2002-08-08  0:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Paul Larson, Linus Torvalds, lkml, davej, frankeh, andrea

On Wed, Aug 07, 2002 at 04:06:05PM -0700, Andrew Morton wrote:

> Has this been evaluated against Bill Irwin's constant-time
> allocator?  Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one.   Plus it's much nicer
> looking code.

> #define PID_MAX		0x8000
> #define RESERVED_PIDS	300
> 
> #define MAP0_SIZE	(PID_MAX   >> BITS_PER_LONG_SHIFT)
> #define MAP1_SIZE	(MAP0_SIZE >> BITS_PER_LONG_SHIFT)
> #define MAP2_SIZE	(MAP1_SIZE >> BITS_PER_LONG_SHIFT)
> 
> static unsigned long pid_map0[MAP0_SIZE];
> static unsigned long pid_map1[MAP1_SIZE];
> static unsigned long pid_map2[MAP2_SIZE];

Here it is of interest how large a pid is.
With a 64-bit pid_t it is just

  static pid_t last_pid;

  pid_t get_next_pid() {
	return ++last_pid;
  }

since 2^64 is a really large number.
Unfortunately glibc does not support this (on x86).

With a 32-bit pid_t wraparounds will occur, but very infrequently.
Thus, finding the next pid will be very fast on average, much faster
than the above algorithm, and no arrays are required.
One only loses the guaranteed constant time property.
Unless hard real time is required, this sounds like the best version.

Andries

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08  0:24   ` Andries Brouwer
@ 2002-08-08 19:42     ` William Lee Irwin III
  2002-08-08 20:47       ` Andries Brouwer
  0 siblings, 1 reply; 51+ messages in thread
From: William Lee Irwin III @ 2002-08-08 19:42 UTC (permalink / raw)
  To: Andries Brouwer
  Cc: Andrew Morton, Paul Larson, Linus Torvalds, lkml, davej, frankeh,
	andrea

On Thu, Aug 08, 2002 at 02:24:19AM +0200, Andries Brouwer wrote:
> Here it is of interest how large a pid is.
> With a 64-bit pid_t it is just
>   static pid_t last_pid;
>   pid_t get_next_pid() {
> 	return ++last_pid;
>   }
> since 2^64 is a really large number.
> Unfortunately glibc does not support this (on x86).
> With a 32-bit pid_t wraparounds will occur, but very infrequently.
> Thus, finding the next pid will be very fast on average, much faster
> than the above algorithm, and no arrays are required.
> One only loses the guaranteed constant time property.
> Unless hard real time is required, this sounds like the best version.

The goal of the work that produced this was to remove the global
tasklist. Changing ABI's and/or breaking userspace was not "within the
rules" of that. My allocator relies on that other infrastructure for
notification of release of pid's, and is really only meant to remove
the reliance of fork() on the tasklist. That work is probably more
relevant to heavy tty usage than forkbombs, despite the O(1) get_pid().
I am glad people happen to like various tidbits of it, though. =)

Cheers,
Bill

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 19:42     ` William Lee Irwin III
@ 2002-08-08 20:47       ` Andries Brouwer
  0 siblings, 0 replies; 51+ messages in thread
From: Andries Brouwer @ 2002-08-08 20:47 UTC (permalink / raw)
  To: William Lee Irwin III, Andrew Morton, Paul Larson, Linus Torvalds,
	lkml, davej, frankeh, andrea

On Thu, Aug 08, 2002 at 12:42:38PM -0700, William Lee Irwin III wrote:

> The goal of the work that produced this was to remove the global
> tasklist. Changing ABI's and/or breaking userspace was not "within the
> rules" of that.

It feels wrong to add such complexity and simultaneously keep
such a small pid_t.

Very soon 30000 processes will not be enough.

Using a 32-bit pid_t does not break userspace. Indeed, user space uses
a 32-bit pid_t today. The only complaint I have heard was from
Albert Cahalan who maintains ps and was afraid that the ps output
would become uglier if pids would get more digits.

It is a real pity that going to a 64-bit pid is impossible (on x64).

Many algorithms can be really efficient if you have a large space
to work in. For example, I do not know what your motivation was
for wanting to remove the global tasklist. It is certainly needed
for sending signals. But if you want to avoid access to global stuff
in a MP situation, then it is easy to partition the pid space
so that each processor only gives out pids in its own region.
(So that simultaneous forks do not interfere.)

Andries

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-07 23:06 ` Andrew Morton
  2002-08-08  0:24   ` Andries Brouwer
@ 2002-08-08 20:24   ` Linus Torvalds
  2002-08-08 21:30     ` H. Peter Anvin
                       ` (2 more replies)
  1 sibling, 3 replies; 51+ messages in thread
From: Linus Torvalds @ 2002-08-08 20:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Paul Larson, lkml, davej, frankeh, andrea

On Wed, 7 Aug 2002, Andrew Morton wrote:
>
> Has this been evaluated against Bill Irwin's constant-time
> allocator?  Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one.   Plus it's much nicer
> looking code.

Guys, this discussion is getting ridiculous.

Doing a bit allocator should be trivial, but it's hard to know when a bit
is to be free'd. You can't just do it at "exit()" time, because even if
pid X exits, that doesn't mean that X can be re-used: it may still be used
as a pgid or a tid by some other process Y.

So if you really want to take this approach, you need to count the uses of
"pid X", and free the bitmap entry only when that count goes to zero. I
see no such logic in Bill Irwin's code, only a comment about last use
(which doesn't explain how to notice that last use).

Without that per-pid-count thing clarified, I don't think the (otherwise
fairly straightforward) approach of Bills really flies.

For that reason I think the mark-and-sweep thing is the right thing to do,
but I think the two-level algorithm is just over-engineering and not worth
it. And I do hate that getpid_mutex thing. Having a blocking lock for
something as simple as pid allocation just smells horribly wrong to me.

Moving the pid allocation to later (so that it doesn't need to handle
operations that block in between allocation and "we're done" and the pid
allocation can be a spinlock) sounds like a fairly obvious thing to do. 

I don't see why we would need the "pid" until the very last moment, at 
which point we already have the tasklist lock, in fact.

And I hate overengineering.

		Linus

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 20:24   ` Linus Torvalds
@ 2002-08-08 21:30     ` H. Peter Anvin
  2002-08-08 21:45     ` William Lee Irwin III
  2002-08-09  4:42     ` William Lee Irwin III
  2 siblings, 0 replies; 51+ messages in thread
From: H. Peter Anvin @ 2002-08-08 21:30 UTC (permalink / raw)
  To: linux-kernel

Followup to:  <Pine.LNX.4.44.0208081312330.8705-100000@home.transmeta.com>
By author:    Linus Torvalds <torvalds@transmeta.com>
In newsgroup: linux.dev.kernel
> 
> Guys, this discussion is getting ridiculous.
> 
> Doing a bit allocator should be trivial, but it's hard to know when a bit
> is to be free'd. You can't just do it at "exit()" time, because even if
> pid X exits, that doesn't mean that X can be re-used: it may still be used
> as a pgid or a tid by some other process Y.
> 
> So if you really want to take this approach, you need to count the uses of
> "pid X", and free the bitmap entry only when that count goes to zero. I
> see no such logic in Bill Irwin's code, only a comment about last use
> (which doesn't explain how to notice that last use).
> 

Even so, we need to maintain Not Recently Used semantic.  A discussion
on #kernel seems to have ended up with recommending a design target of
"no pid reuse within 30 seconds", with 1 second being an absolute
requirement.

	-hpa
-- 
<hpa@transmeta.com> at work, <hpa@zytor.com> in private!
"Unix gives you enough rope to shoot yourself in the foot."
http://www.zytor.com/~hpa/puzzle.txt	<amsp@zytor.com>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 20:24   ` Linus Torvalds
  2002-08-08 21:30     ` H. Peter Anvin
@ 2002-08-08 21:45     ` William Lee Irwin III
  2002-08-09  4:42     ` William Lee Irwin III
  2 siblings, 0 replies; 51+ messages in thread
From: William Lee Irwin III @ 2002-08-08 21:45 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Paul Larson, lkml, davej, frankeh, andrea

On Thu, Aug 08, 2002 at 01:24:35PM -0700, Linus Torvalds wrote:
> Without that per-pid-count thing clarified, I don't think the (otherwise
> fairly straightforward) approach of Bills really flies.

I implemented the rest of it, based on maintaining hashtables for the
tgid, pgid, and sid as well as the pid itself. get_pid() was not the
focus of it.


Cheers,
Bill

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 20:24   ` Linus Torvalds
  2002-08-08 21:30     ` H. Peter Anvin
  2002-08-08 21:45     ` William Lee Irwin III
@ 2002-08-09  4:42     ` William Lee Irwin III
  2 siblings, 0 replies; 51+ messages in thread
From: William Lee Irwin III @ 2002-08-09  4:42 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Paul Larson, lkml, davej, frankeh, andrea

On Thu, Aug 08, 2002 at 01:24:35PM -0700, Linus Torvalds wrote:
> So if you really want to take this approach, you need to count the uses of
> "pid X", and free the bitmap entry only when that count goes to zero. I
> see no such logic in Bill Irwin's code, only a comment about last use
> (which doesn't explain how to notice that last use).
> Without that per-pid-count thing clarified, I don't think the (otherwise
> fairly straightforward) approach of Bills really flies.

One big thing to bear in mind is that it is actually part of a much
larger work, one which is not centered around get_pid(), and which is
not yet ready for inclusion, or even widespread review. So please give
me time to finish it, and defer judgment until it is complete.

(1) akpm did not post the full patch, only the "after" picture of one file.
(2) The per-id accounting is properly implemented, with caveats
	unrelated to the general accounting method. Yes, I am well
	aware of the need to be notified on release at points other
	than exit(), and I have implemented that notification.
(3) The patch as it is intended to be is largely a tty and job control
	cleanup. get_pid() changes are required as the central feature
	is the removal of the list of all tasks, upon which the current
	get_pid() relies.
(4) pid hashing actually creates idtag objects for something guaranteed
	to be unique. This is so stupid I consider it a bug.
(5) The patch is not yet finished.

Please defer judgment until I am ready to present as a finished work what
is now a work in progress and barely if even out of the "debug" phase.

The last fully-ported version of the patch, which was originally put
on-line only to facilitate communication with reviewers and
contributors, prior to the initial release (and by a very large margin)
is available from the following URL:

ftp://ftp.kernel.org/pub/linux/kernel/people/wli/task_mgmt/for_each_task-2.5.23-1

This patch does not contain a complete implementation of what I would
like to present when I feel ready to submit it.

While I thought I came up with something "nifty" in the way of a
get_pid() as a result of this work, its primary focus is really to
clean up tty and job control code. As it stands now, it does very
little in the way of cleaning it up, only converting it to use the new
infrastructure as a replacement for for_each_task() in the most
straightforward and braindead ways imaginable. Several bugs are known
to exist, but the full patch with all fixes has not yet been ported to
current mainline, and I won't have time to devote to it for some time.
This patch needs much further work, and that work is not yet finished.
Please defer judgment until I can actually finish it. This will
probably have to wait until 2.7 or even later.

Thanks,
Bill

^ permalink raw reply	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2002-08-12 16:02 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-08-12 16:15 [PATCH] Linux-2.5 fix/improve get_pid() Jim Houston
  -- strict thread matches above, loose matches on Subject: below --
2002-08-09  0:53 Hubertus Franke
2002-08-08 21:50 Hubertus Franke
2002-08-08 21:43 Hubertus Franke
2002-08-08 22:02 ` Linus Torvalds
2002-08-08 22:26   ` Linus Torvalds
2002-08-08 23:09     ` Albert D. Cahalan
2002-08-09  3:26       ` Chris Adams
2002-08-09  7:04         ` Albert D. Cahalan
2002-08-09  8:48           ` Helge Hafting
2002-08-09 10:32             ` Alan Cox
2002-08-09  9:35               ` Andrew Morton
2002-08-09 10:37               ` Andries Brouwer
2002-08-09 13:00           ` Chris Adams
2002-08-09 14:39             ` Albert D. Cahalan
2002-08-09 13:59           ` Rik van Riel
2002-08-09 14:57             ` Albert D. Cahalan
2002-08-08 22:34   ` Paul Larson
2002-08-08 22:44     ` Rik van Riel
2002-08-08 22:37   ` Rik van Riel
2002-08-09  1:49     ` Andries Brouwer
2002-08-09 19:34   ` Paul Larson
2002-08-09 20:13     ` Paul Larson
2002-08-09 20:40     ` Andries Brouwer
2002-08-09 21:23     ` Linus Torvalds
2002-08-09 21:42       ` Linus Torvalds
2002-08-09 21:46         ` Paul Larson
2002-08-09 22:04           ` Linus Torvalds
2002-08-10 17:23             ` Jamie Lokier
2002-08-10 18:33               ` Linus Torvalds
2002-08-10 18:48                 ` Jamie Lokier
2002-08-11 20:41                   ` Alan Cox
2002-08-11 19:58                     ` Jamie Lokier
2002-08-11 21:23                       ` Alan Cox
2002-08-11 20:10                         ` Jamie Lokier
2002-08-12  8:56             ` Padraig Brady
2002-08-12 10:37               ` Alan Cox
2002-08-12  9:21                 ` Padraig Brady
2002-08-12 14:40             ` Paul Larson
2002-08-08 18:56 Hubertus Franke
2002-08-08 19:15 ` Rik van Riel
2002-08-08 19:18 ` Paul Larson
2002-08-07 22:03 Paul Larson
2002-08-07 23:06 ` Andrew Morton
2002-08-08  0:24   ` Andries Brouwer
2002-08-08 19:42     ` William Lee Irwin III
2002-08-08 20:47       ` Andries Brouwer
2002-08-08 20:24   ` Linus Torvalds
2002-08-08 21:30     ` H. Peter Anvin
2002-08-08 21:45     ` William Lee Irwin III
2002-08-09  4:42     ` William Lee Irwin III

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox