[PATCH] Linux-2.5 fix/improve get

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-07 22:03 Paul Larson
  2002-08-07 23:06 ` Andrew Morton
  0 siblings, 1 reply; 56+ messages in thread
From: Paul Larson @ 2002-08-07 22:03 UTC (permalink / raw)
  To: Linus Torvalds, lkml; +Cc: davej, frankeh, andrea


This patch provides an improved version of get_pid() while also taking
care of the bug that causes the machine to hang when you hit PID_MAX. 
It is based on both solutions to the problem provided by Hubertus Franke
and Andrea Arcangeli.  This uses a bitmap to find an available pid and
uses Hubertus's adaptive implementation to only use this when it is more
beneficial than the old mechanism.  The getpid_mutex from AA's patch is
also carried over to avoid the race where another cpu could get the same
pid before SET_LINKS was called.

This should patch cleanly against 2.5.30 or bk current.
Please apply.

--- a/kernel/fork.c	Wed Aug  7 16:37:38 2002
+++ b/kernel/fork.c	Wed Aug  7 16:05:22 2002
@@ -50,6 +50,12 @@
 
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
 
+/*
+ * Protectes next_safe, last_pid and it avoids races
+ * between get_pid and SET_LINKS().
+ */
+static DECLARE_MUTEX(getpid_mutex);
+
 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 {
 	unsigned long flags;
@@ -129,27 +135,107 @@
 	kmem_cache_free(task_struct_cachep,tsk);
 }
 
-/* Protects next_safe and last_pid. */
-spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
+/* this should be provided in every architecture */
+#ifndef SHIFT_PER_LONG
+#if BITS_PER_LONG == 64
+#   define SHIFT_PER_LONG 6
+#elif BITS_PER_LONG == 32
+#   define SHIFT_PER_LONG 5
+#else
+#error "SHIFT_PER_LONG"
+#endif
+#endif
+
+#define RESERVED_PIDS       (300)
+#define GETPID_THRESHOLD    (22000)  /* when to switch to a mapped algo */
+#define PID_MAP_SIZE        (PID_MAX >> SHIFT_PER_LONG)
+static unsigned long pid_map[PID_MAP_SIZE];
+static int next_safe = PID_MAX;
+
+static inline void mark_pid(int pid)
+{
+	__set_bit(pid,pid_map);
+}
+
+static int get_pid_by_map(int lastpid) 
+{
+	static int mark_and_sweep = 0;
+
+	int round = 0;
+	struct task_struct *p;
+	int i;
+	unsigned long mask;
+	int fpos;
+
+
+	if (mark_and_sweep) {
+repeat:
+		mark_and_sweep = 0;
+		memset(pid_map, 0, PID_MAP_SIZE * sizeof(unsigned long));
+		lastpid = RESERVED_PIDS;
+	}
+	for_each_task(p) {
+		mark_pid(p->pid);
+		mark_pid(p->pgrp);
+		mark_pid(p->tgid);
+		mark_pid(p->session);
+	}
+
+	/* find next free pid */
+	i = (lastpid >> SHIFT_PER_LONG);
+	mask = pid_map[i] | ((1 << ((lastpid & (BITS_PER_LONG-1)))) - 1);   
+
+	while ((mask == ~0) && (++i < PID_MAP_SIZE)) 
+		mask = pid_map[i];
+
+	if (i == PID_MAP_SIZE) { 
+		if (round == 0) {
+			round = 1;
+			goto repeat;
+		}
+		next_safe = RESERVED_PIDS;
+		mark_and_sweep = 1;  /* mark next time */
+		return 0; 
+	}
+
+	fpos = ffz(mask);
+	i &= (PID_MAX-1);
+	lastpid = (i << SHIFT_PER_LONG) + fpos;
+
+	/* find next save pid */
+	mask &= ~((1 << fpos) - 1);
+
+	while ((mask == 0) && (++i < PID_MAP_SIZE)) 
+		mask = pid_map[i];
+
+	if (i==PID_MAP_SIZE) 
+		next_safe = PID_MAX;
+	else 
+		next_safe = (i << SHIFT_PER_LONG) + ffs(mask) - 1;
+	return lastpid;
+}
 
 static int get_pid(unsigned long flags)
 {
-	static int next_safe = PID_MAX;
 	struct task_struct *p;
 	int pid;
 
-	if (flags & CLONE_IDLETASK)
-		return 0;
-
-	spin_lock(&lastpid_lock);
 	if((++last_pid) & 0xffff8000) {
-		last_pid = 300;		/* Skip daemons etc. */
+		last_pid = RESERVED_PIDS;	/* Skip daemons etc. */
 		goto inside;
 	}
 	if(last_pid >= next_safe) {
 inside:
 		next_safe = PID_MAX;
 		read_lock(&tasklist_lock);
+		if (nr_threads > GETPID_THRESHOLD) {
+			last_pid = get_pid_by_map(last_pid);
+			if (last_pid == 0) {
+				last_pid = PID_MAX;
+				goto nomorepids; 
+			}
+		} else {
+			int beginpid = last_pid;
 	repeat:
 		for_each_task(p) {
 			if(p->pid == last_pid	||
@@ -158,24 +244,33 @@
 			   p->session == last_pid) {
 				if(++last_pid >= next_safe) {
 					if(last_pid & 0xffff8000)
-						last_pid = 300;
+						last_pid = RESERVED_PIDS;
 					next_safe = PID_MAX;
 				}
+				if(last_pid == beginpid)
+					goto nomorepids;
 				goto repeat;
 			}
 			if(p->pid > last_pid && next_safe > p->pid)
 				next_safe = p->pid;
 			if(p->pgrp > last_pid && next_safe > p->pgrp)
 				next_safe = p->pgrp;
+			if(p->tgid > last_pid && next_safe > p->tgid)
+				next_safe = p->tgid;
 			if(p->session > last_pid && next_safe > p->session)
 				next_safe = p->session;
 		}
+		}
 		read_unlock(&tasklist_lock);
 	}
 	pid = last_pid;
-	spin_unlock(&lastpid_lock);
 
 	return pid;
+
+nomorepids:
+	next_safe = last_pid = PID_MAX;
+	read_unlock(&tasklist_lock);
+	return 0;
 }
 
 static inline int dup_mmap(struct mm_struct * mm)
@@ -669,7 +764,14 @@
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
-	p->pid = get_pid(clone_flags);
+	down(&getpid_mutex);
+	if (clone_flags & CLONE_IDLETASK)
+		p->pid = 0;
+	else {
+		p->pid = get_pid(clone_flags);
+		if (p->pid == 0)
+			goto bad_fork_cleanup;
+	}
 	p->proc_dentry = NULL;
 
 	INIT_LIST_HEAD(&p->run_list);
@@ -793,10 +895,20 @@
 		list_add(&p->thread_group, &current->thread_group);
 	}
 
+	/*
+	 * We must do the SET_LINKS() under the getpid_mutex, to avoid
+	 * another CPU to get our same PID between the release of of the
+	 * getpid_mutex and the SET_LINKS().
+	 *
+	 * In short to avoid SMP races the new child->pid must be just visible
+	 * in the tasklist by the time we drop the getpid_mutex.
+	 */
 	SET_LINKS(p);
+
 	hash_pid(p);
 	nr_threads++;
 	write_unlock_irq(&tasklist_lock);
+	up(&getpid_mutex);
 	retval = 0;
 
 fork_out:
@@ -819,6 +931,7 @@
 bad_fork_cleanup_security:
 	security_ops->task_free_security(p);
 bad_fork_cleanup:
+	up(&getpid_mutex);
 	put_exec_domain(p->thread_info->exec_domain);
 	if (p->binfmt && p->binfmt->module)
 		__MOD_DEC_USE_COUNT(p->binfmt->module);


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-07 22:03 [PATCH] Linux-2.5 fix/improve get_pid() Paul Larson
@ 2002-08-07 23:06 ` Andrew Morton
  2002-08-08  0:24   ` Andries Brouwer
  2002-08-08 20:24   ` [PATCH] Linux-2.5 fix/improve get_pid() Linus Torvalds
  0 siblings, 2 replies; 56+ messages in thread
From: Andrew Morton @ 2002-08-07 23:06 UTC (permalink / raw)
  To: Paul Larson; +Cc: Linus Torvalds, lkml, davej, frankeh, andrea

Paul Larson wrote:
> 
> This patch provides an improved version of get_pid() while also taking
> care of the bug that causes the machine to hang when you hit PID_MAX.

Has this been evaluated against Bill Irwin's constant-time
allocator?  Bill says it has slightly worse normal-case and
vastly improved worst-case performance over the stock allocator.
Not sure how it stacks up against this one.   Plus it's much nicer
looking code.

He's shy, so.....



#include <linux/compiler.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/init.h>

/*
 * pid allocator
 * (C) 2002 William Irwin, IBM
 *
 * The strategy is to maintain a tower of bitmaps where a bitmap above
 * another in each bit accounts whether any pid's are available in the
 * space tracked by BITS_PER_LONG bits of the level below. The bitmaps
 * must be marked on allocation and also release, hence some
 * infrastructure for detecting when the last user of a pid releases it
 * must be in place.
 *
 * This general strategy is simple in concept and enforces highly
 * deterministic bounds on the search time for the next pid.
 */

#define PID_MAX		0x8000
#define RESERVED_PIDS	300

#define MAP0_SIZE	(PID_MAX   >> BITS_PER_LONG_SHIFT)
#define MAP1_SIZE	(MAP0_SIZE >> BITS_PER_LONG_SHIFT)
#define MAP2_SIZE	(MAP1_SIZE >> BITS_PER_LONG_SHIFT)

#define MAP0_SHIFT	BITS_PER_LONG_SHIFT
#define MAP1_SHIFT	(2*BITS_PER_LONG_SHIFT)
#define MAP2_SHIFT	(3*BITS_PER_LONG_SHIFT)

#define PID_MAP_MASK	(BITS_PER_LONG - 1)

#define PID_MAP_DEPTH	(ARRAY_SIZE(pid_map) - 1)

static unsigned long pid_map0[MAP0_SIZE];
static unsigned long pid_map1[MAP1_SIZE];
static unsigned long pid_map2[MAP2_SIZE];

static unsigned long * pid_map[] = { pid_map0, pid_map1, pid_map2, NULL, };

unsigned long last_pid = 0;
unsigned long npids = 0;

static const int map_shifts[] =
	{	0,
		BITS_PER_LONG_SHIFT,
		BITS_PER_LONG_SHIFT*2,
		BITS_PER_LONG_SHIFT*3,
		BITS_PER_LONG_SHIFT*4,
	};

static inline int pid_map_shift(int depth)
{
	return map_shifts[depth+1];
}

static spinlock_t pid_lock = SPIN_LOCK_UNLOCKED;

void free_pid(unsigned long pid)
{
	unsigned long **map = pid_map;

	spin_lock(&pid_lock);
	while (*map) {
		int bit = pid & PID_MAP_MASK;
		pid >>= BITS_PER_LONG_SHIFT;
		__clear_bit(bit, &(*map)[pid]);
		++map;
	}
	--npids;
	spin_unlock(&pid_lock);
}

static inline int whole_block_used(int level, unsigned long pid)
{
	return pid_map[level][pid >> pid_map_shift(level)] == ~0UL;
}

static inline void mark_pid(unsigned long pid)
{
	int level;
	for (level = 0; level < PID_MAP_DEPTH; ++level) {
		int shift, bit;
		unsigned long entry;

		shift = pid_map_shift(level);
		entry = pid >> shift;
		bit   = (pid >> (shift - BITS_PER_LONG_SHIFT)) & PID_MAP_MASK;
		if (level == 0 || whole_block_used(level - 1, pid))
			__set_bit(bit, &pid_map[level][entry]);
		else
			break;
	}
	++npids;
}

static inline int pid_map_limit(int depth)
{
	return PID_MAX >> pid_map_shift(depth);
}

#ifdef PID_ALLOC_EXAMPLE
/*
 * the pid allocation traverses the bitmaps by recursively ffz'ing
 * through down the tower of maps. Some additional logic is required
 * to enforce lower limits, but the following example of how one
 * would perform this search without the lower limit may well prove
 * enlightening for those interested in the mechanics of the algorithm.
 */
static long alloc_pid_from_zero(void)
{
	unsigned long pid = 0;
	int level;

	for (level = PID_MAP_DEPTH - 1; level >= 0; --level) {
		unsigned long entry = pid_map[level][pid];

		if (unlikely(entry == ~0UL))
			return ~0UL;

		pid = (pid << BITS_PER_LONG_SHIFT) + ffz(pid_map[level][pid]);
	}
	return pid;
}
#endif /* PID_ALLOC_EXAMPLE */


static const unsigned long pid_max_levels[] =
	{	PID_MAX >> BITS_PER_LONG_SHIFT,
		PID_MAX >> (BITS_PER_LONG_SHIFT*2),
		PID_MAX >> (BITS_PER_LONG_SHIFT*3),
		PID_MAX >> (BITS_PER_LONG_SHIFT*4),
	};

static inline unsigned long pid_map_digit(int level, unsigned long limit)
{
	return (limit >> pid_map_shift(level-1)) & PID_MAP_MASK;
}

/*
 * Scratch space for storing the digits of the limit, all accesses
 * protected by the pid_lock.
 */
static unsigned long limit_digits[4];

/*
 * This is not a high-performance implementation. alloc_pid_after()
 * can be made highly compact with some effort, but this is instead
 * meant to be clear. As the cost of fork() is dominated by much
 * more expensive operations and the cost of this is constant-bounded
 * by a very low constant, the gains from manual optimization here
 * are marginal.
 */
static long alloc_pid_after(unsigned long limit)
{
	unsigned long pid = 0;
	int level;

	/*
	 * The limit passed to us is a strict lower limit. It is more
	 * convenient to work with <= constraints.
	 */
	++limit;
	if (unlikely(limit == PID_MAX))
		return ~0UL;

	for (level = 0; level < PID_MAP_DEPTH; ++level) {
		limit_digits[level] = limit & PID_MAP_MASK;
		limit >>= BITS_PER_LONG_SHIFT;
	}

	/*
	 * Now the lowest available pid number above limit is
	 * reconstructed by ffz'ing down the bitmap and checking
	 * each digit against the digits of the limit for
	 * dictionary ordering. If the check should fail, it's
	 * fixed up by using the maximum of the two digits. The
	 * dictionary ordering on digits also means that a
	 * greater significant digit found in the bitmap
	 * invalidates all further comparisons, which requires
	 * fallback to the pure recursive ffz algorithm outlined
	 * above in order to be handled.
	 */
	for (level = PID_MAP_DEPTH - 1; level >= 0; --level) {
		unsigned long bit, digit;

		if (unlikely(pid >= pid_max_levels[level]))
			return ~0UL;

		bit = ffz(pid_map[level][pid]);
		digit = limit_digits[level];

		if (unlikely(bit < digit))
			bit = digit;

		pid = (pid << BITS_PER_LONG_SHIFT) + bit;

		/*
		 * This is not an optimization; if this check
		 * should succeed the digit comparisons above
		 * are no longer valid and (pessimistically)
		 * incorrect first available pid's are found.
		 * 
		 */
		if (likely(bit > digit)) {
			--level;
			goto finish_just_ffz;
		}
	}
out:
	if (pid < PID_MAX)
		return pid;
	else
		return ~0UL;

finish_just_ffz:
	/*
	 * Now revert to the pure recursive ffz algorithm with
	 * the slight variation of not beginning at a fixed level,
	 * because it is no longer valid to perform comparisons
	 * of the digit obtained by ffz'ing the bitmap against the
	 * digits of the limit.
	 */
	while (level >= 0) {
		unsigned long bit;

		if (unlikely(pid >= pid_max_levels[level]))
			return ~0UL;

		bit = ffz(pid_map[level][pid]);
		pid = (pid << BITS_PER_LONG_SHIFT) + bit;
		--level;
	}

	goto out;
}

int alloc_pid(void)
{
	unsigned long pid;

	spin_lock(&pid_lock);
	BUG_ON(last_pid >= PID_MAX);
	pid = alloc_pid_after(last_pid);
	if (unlikely(pid == ~0UL)) {
		pid = alloc_pid_after(RESERVED_PIDS);
		if (unlikely(pid == ~0UL))
			goto out;
		BUG_ON(pid < RESERVED_PIDS);
	} else
		BUG_ON(pid <= last_pid);
	last_pid = pid;
	mark_pid(pid);
out:
	spin_unlock(&pid_lock);
	return (int)pid;
}

void __init pid_init(void)
{
	mark_pid(0);
}

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-07 23:06 ` Andrew Morton
@ 2002-08-08  0:24   ` Andries Brouwer
  2002-08-08 19:42     ` William Lee Irwin III
  2002-08-09 11:22     ` Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches Hubertus Franke
  2002-08-08 20:24   ` [PATCH] Linux-2.5 fix/improve get_pid() Linus Torvalds
  1 sibling, 2 replies; 56+ messages in thread
From: Andries Brouwer @ 2002-08-08  0:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Paul Larson, Linus Torvalds, lkml, davej, frankeh, andrea

On Wed, Aug 07, 2002 at 04:06:05PM -0700, Andrew Morton wrote:

> Has this been evaluated against Bill Irwin's constant-time
> allocator?  Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one.   Plus it's much nicer
> looking code.

> #define PID_MAX		0x8000
> #define RESERVED_PIDS	300
> 
> #define MAP0_SIZE	(PID_MAX   >> BITS_PER_LONG_SHIFT)
> #define MAP1_SIZE	(MAP0_SIZE >> BITS_PER_LONG_SHIFT)
> #define MAP2_SIZE	(MAP1_SIZE >> BITS_PER_LONG_SHIFT)
> 
> static unsigned long pid_map0[MAP0_SIZE];
> static unsigned long pid_map1[MAP1_SIZE];
> static unsigned long pid_map2[MAP2_SIZE];

Here it is of interest how large a pid is.
With a 64-bit pid_t it is just

  static pid_t last_pid;

  pid_t get_next_pid() {
	return ++last_pid;
  }

since 2^64 is a really large number.
Unfortunately glibc does not support this (on x86).

With a 32-bit pid_t wraparounds will occur, but very infrequently.
Thus, finding the next pid will be very fast on average, much faster
than the above algorithm, and no arrays are required.
One only loses the guaranteed constant time property.
Unless hard real time is required, this sounds like the best version.

Andries

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-08 18:56 Hubertus Franke
  2002-08-08 19:15 ` Rik van Riel
  2002-08-08 19:18 ` Paul Larson
  0 siblings, 2 replies; 56+ messages in thread
From: Hubertus Franke @ 2002-08-08 18:56 UTC (permalink / raw)
  To: Andries Brouwer
  Cc: Andrew Morton, andrea, davej, lkml, Paul Larson, Linus Torvalds

Which one sounds like the best one ?

Assuming that for now we have to stick to 16-bit pid_t ....
There are the following patches out there ....:

(a) Vanilla version which breaks down after 22K pids and really sucks above
32.5K
(b) Bill Irwin's, which keeps track of which PID is free and which one is
not ?
(c) Andrea's patch which searches in the bitmap when we are running out
(d) Paul's patch, which I believe was based on one of my earlier submission
(03/02) that
                                 essentially switches between (a)+(c) at
the break even point.

Assuming that Paul's patch still resembles somewhat my earlier intentions:
My observation of (d) back in february was that the current approach with
using next_pid = ++last_pid in the general case is pretty good and that
only the determination of a guaranteed free range sucks due to O(N**2)
algorithm sucks and using a bitmap to determine the next safe range
through a O(N) algorithm is good. I determined the breakeven point with
random pid deletion to be ~22K where the current algorithm worked darn
well.
One difference to Andrea's patch (c) is  (if I remember his code correctly)
that
I used was that I always look forward in the bitmap for the next safe range
in the
bitmap when I exhausted the previous one without having to traverse the
task list.
Only if non is found I traverse the task list and try the bit search one
more time.

I feel (c) or (d) is a better solution over (a) and (b)...   Open for
discussion.
I have a test program that does the random pid deletion and pid allocation
in user space. All what's required is to copy the get_pid() code from the
kernel into there... Can make that available ...

I don't know what Paul has done to the patch since then ....

-- Hubertus Franke  (frankeh@watson.ibm.com)

                      Andries Brouwer                                                                                                
                      <aebr@win.tue.nl>        To:       Andrew Morton <akpm@zip.com.au>                                             
                                               cc:       Paul Larson <plars@austin.ibm.com>, Linus Torvalds <torvalds@transmeta.     
                      08/07/2002 08:24          com>, lkml <linux-kernel@vger.kernel.org>, davej@suse.de, Hubertus                   
                      PM                        Franke/Watson/IBM@IBMUS, andrea@suse.de                                              
                                               Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 

On Wed, Aug 07, 2002 at 04:06:05PM -0700, Andrew Morton wrote:

> Has this been evaluated against Bill Irwin's constant-time
> allocator?  Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one.   Plus it's much nicer
> looking code.

> #define PID_MAX                    0x8000
> #define RESERVED_PIDS        300
>
> #define MAP0_SIZE            (PID_MAX   >> BITS_PER_LONG_SHIFT)
> #define MAP1_SIZE            (MAP0_SIZE >> BITS_PER_LONG_SHIFT)
> #define MAP2_SIZE            (MAP1_SIZE >> BITS_PER_LONG_SHIFT)
>
> static unsigned long pid_map0[MAP0_SIZE];
> static unsigned long pid_map1[MAP1_SIZE];
> static unsigned long pid_map2[MAP2_SIZE];

Here it is of interest how large a pid is.
With a 64-bit pid_t it is just

  static pid_t last_pid;

  pid_t get_next_pid() {
             return ++last_pid;
  }

since 2^64 is a really large number.
Unfortunately glibc does not support this (on x86).

With a 32-bit pid_t wraparounds will occur, but very infrequently.
Thus, finding the next pid will be very fast on average, much faster
than the above algorithm, and no arrays are required.
One only loses the guaranteed constant time property.
Unless hard real time is required, this sounds like the best version.

Andries

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 18:56 Hubertus Franke
@ 2002-08-08 19:15 ` Rik van Riel
  2002-08-08 19:18 ` Paul Larson
  1 sibling, 0 replies; 56+ messages in thread
From: Rik van Riel @ 2002-08-08 19:15 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Andries Brouwer, Andrew Morton, andrea, davej, lkml, Paul Larson,
	Linus Torvalds

On Thu, 8 Aug 2002, Hubertus Franke wrote:

> Which one sounds like the best one ?
>
> Assuming that for now we have to stick to 16-bit pid_t ....

That assumption is pretty central to the debate.

I don't see the standard get_pid nor the bitmap based
get_pid scale to something larger than a 16-bit pid_t.

If we're not sure yet whether we want to keep pid_t 16
bits it might be worth putting in an algorithm that does
scale to larger numbers, if only so the switch to a larger
pid_t will be more straightforward.

kind regards,

Rik
-- 
	http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/		http://distro.conectiva.com/

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 18:56 Hubertus Franke
  2002-08-08 19:15 ` Rik van Riel
@ 2002-08-08 19:18 ` Paul Larson
  1 sibling, 0 replies; 56+ messages in thread
From: Paul Larson @ 2002-08-08 19:18 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Andries Brouwer, Andrew Morton, andrea, Dave Jones, lkml,
	Linus Torvalds

On Thu, 2002-08-08 at 13:56, Hubertus Franke wrote:
> (a) Vanilla version which breaks down after 22K pids and 
> really sucks above 32.5K
> (b) Bill Irwin's, which keeps track of which PID is free and 
> which one is not ?
> (c) Andrea's patch which searches in the bitmap when we are 
> running out
> (d) Paul's patch, which I believe was based on one of my earlier 
> submission (03/02) that essentially switches between (a)+(c) at 
> the break even point.

> I feel (c) or (d) is a better solution over (a) and (b)...   Open for
> discussion.
> I have a test program that does the random pid deletion and pid allocation
> in user space. All what's required is to copy the get_pid() code from the
> kernel into there... Can make that available ...
> 
> I don't know what Paul has done to the patch since then ....

It's pretty much the same, with a couple of small changes.  After
forward porting your patch and the one from AA's patches (hch says that
one was actually done by Ihno Krumreich, thanks for the info), I added
the fix from (c) for the race, and moved the check for if(flags &
CLONE_IDLETASK) outside of get_pid into copy_process() since there's no
need to call get_pid and return just to find out that we need to use 0.

-Paul Larson


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08  0:24   ` Andries Brouwer
@ 2002-08-08 19:42     ` William Lee Irwin III
  2002-08-08 20:47       ` Andries Brouwer
  2002-08-09 11:22     ` Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches Hubertus Franke
  1 sibling, 1 reply; 56+ messages in thread
From: William Lee Irwin III @ 2002-08-08 19:42 UTC (permalink / raw)
  To: Andries Brouwer
  Cc: Andrew Morton, Paul Larson, Linus Torvalds, lkml, davej, frankeh,
	andrea

On Thu, Aug 08, 2002 at 02:24:19AM +0200, Andries Brouwer wrote:
> Here it is of interest how large a pid is.
> With a 64-bit pid_t it is just
>   static pid_t last_pid;
>   pid_t get_next_pid() {
> 	return ++last_pid;
>   }
> since 2^64 is a really large number.
> Unfortunately glibc does not support this (on x86).
> With a 32-bit pid_t wraparounds will occur, but very infrequently.
> Thus, finding the next pid will be very fast on average, much faster
> than the above algorithm, and no arrays are required.
> One only loses the guaranteed constant time property.
> Unless hard real time is required, this sounds like the best version.

The goal of the work that produced this was to remove the global
tasklist. Changing ABI's and/or breaking userspace was not "within the
rules" of that. My allocator relies on that other infrastructure for
notification of release of pid's, and is really only meant to remove
the reliance of fork() on the tasklist. That work is probably more
relevant to heavy tty usage than forkbombs, despite the O(1) get_pid().
I am glad people happen to like various tidbits of it, though. =)


Cheers,
Bill

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-07 23:06 ` Andrew Morton
  2002-08-08  0:24   ` Andries Brouwer
@ 2002-08-08 20:24   ` Linus Torvalds
  2002-08-08 21:30     ` H. Peter Anvin
                       ` (2 more replies)
  1 sibling, 3 replies; 56+ messages in thread
From: Linus Torvalds @ 2002-08-08 20:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Paul Larson, lkml, davej, frankeh, andrea

On Wed, 7 Aug 2002, Andrew Morton wrote:
>
> Has this been evaluated against Bill Irwin's constant-time
> allocator?  Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one.   Plus it's much nicer
> looking code.

Guys, this discussion is getting ridiculous.

Doing a bit allocator should be trivial, but it's hard to know when a bit
is to be free'd. You can't just do it at "exit()" time, because even if
pid X exits, that doesn't mean that X can be re-used: it may still be used
as a pgid or a tid by some other process Y.

So if you really want to take this approach, you need to count the uses of
"pid X", and free the bitmap entry only when that count goes to zero. I
see no such logic in Bill Irwin's code, only a comment about last use
(which doesn't explain how to notice that last use).

Without that per-pid-count thing clarified, I don't think the (otherwise
fairly straightforward) approach of Bills really flies.

For that reason I think the mark-and-sweep thing is the right thing to do,
but I think the two-level algorithm is just over-engineering and not worth
it. And I do hate that getpid_mutex thing. Having a blocking lock for
something as simple as pid allocation just smells horribly wrong to me.

Moving the pid allocation to later (so that it doesn't need to handle
operations that block in between allocation and "we're done" and the pid
allocation can be a spinlock) sounds like a fairly obvious thing to do. 

I don't see why we would need the "pid" until the very last moment, at 
which point we already have the tasklist lock, in fact.

And I hate overengineering.

		Linus

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 19:42     ` William Lee Irwin III
@ 2002-08-08 20:47       ` Andries Brouwer
  0 siblings, 0 replies; 56+ messages in thread
From: Andries Brouwer @ 2002-08-08 20:47 UTC (permalink / raw)
  To: William Lee Irwin III, Andrew Morton, Paul Larson, Linus Torvalds,
	lkml, davej, frankeh, andrea

On Thu, Aug 08, 2002 at 12:42:38PM -0700, William Lee Irwin III wrote:

> The goal of the work that produced this was to remove the global
> tasklist. Changing ABI's and/or breaking userspace was not "within the
> rules" of that.

It feels wrong to add such complexity and simultaneously keep
such a small pid_t.

Very soon 30000 processes will not be enough.

Using a 32-bit pid_t does not break userspace. Indeed, user space uses
a 32-bit pid_t today. The only complaint I have heard was from
Albert Cahalan who maintains ps and was afraid that the ps output
would become uglier if pids would get more digits.

It is a real pity that going to a 64-bit pid is impossible (on x64).

Many algorithms can be really efficient if you have a large space
to work in. For example, I do not know what your motivation was
for wanting to remove the global tasklist. It is certainly needed
for sending signals. But if you want to avoid access to global stuff
in a MP situation, then it is easy to partition the pid space
so that each processor only gives out pids in its own region.
(So that simultaneous forks do not interfere.)

Andries

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 20:24   ` [PATCH] Linux-2.5 fix/improve get_pid() Linus Torvalds
@ 2002-08-08 21:30     ` H. Peter Anvin
  2002-08-08 21:45     ` William Lee Irwin III
  2002-08-09  4:42     ` William Lee Irwin III
  2 siblings, 0 replies; 56+ messages in thread
From: H. Peter Anvin @ 2002-08-08 21:30 UTC (permalink / raw)
  To: linux-kernel

Followup to:  <Pine.LNX.4.44.0208081312330.8705-100000@home.transmeta.com>
By author:    Linus Torvalds <torvalds@transmeta.com>
In newsgroup: linux.dev.kernel
> 
> Guys, this discussion is getting ridiculous.
> 
> Doing a bit allocator should be trivial, but it's hard to know when a bit
> is to be free'd. You can't just do it at "exit()" time, because even if
> pid X exits, that doesn't mean that X can be re-used: it may still be used
> as a pgid or a tid by some other process Y.
> 
> So if you really want to take this approach, you need to count the uses of
> "pid X", and free the bitmap entry only when that count goes to zero. I
> see no such logic in Bill Irwin's code, only a comment about last use
> (which doesn't explain how to notice that last use).
> 

Even so, we need to maintain Not Recently Used semantic.  A discussion
on #kernel seems to have ended up with recommending a design target of
"no pid reuse within 30 seconds", with 1 second being an absolute
requirement.

	-hpa
-- 
<hpa@transmeta.com> at work, <hpa@zytor.com> in private!
"Unix gives you enough rope to shoot yourself in the foot."
http://www.zytor.com/~hpa/puzzle.txt	<amsp@zytor.com>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-08 21:43 Hubertus Franke
  2002-08-08 22:02 ` Linus Torvalds
  0 siblings, 1 reply; 56+ messages in thread
From: Hubertus Franke @ 2002-08-08 21:43 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Andries Brouwer, Andrew Morton, andrea, davej, lkml, Paul Larson,
	Linus Torvalds

                                                                                                               
                                                                                                               
                                                                                                               


That is true. All was done under the 16-bit assumption
My hunch is that the current algorithm might actually work quite well
for a sparsely populated pid-space (32-bits).
A bitmap-ed based solution is not possible at that point due to space
requirements.

Should be easy to figure out.

Hubertus Franke
Enterprise Linux Group (Mgr),  Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: frankeh@us.ibm.com
(w) 914-945-2003    (fax) 914-945-4425   TL: 862-2003




                                                                                                                                     
                      Rik van Riel                                                                                                   
                      <riel@conectiva.         To:       Hubertus Franke/Watson/IBM@IBMUS                                            
                      com.br>                  cc:       Andries Brouwer <aebr@win.tue.nl>, Andrew Morton <akpm@zip.com.au>,         
                                                <andrea@suse.de>, <davej@suse.de>, lkml <linux-kernel@vger.kernel.org>, Paul Larson  
                      08/08/2002 04:15          <plars@austin.ibm.com>, Linus Torvalds <torvalds@transmeta.com>                      
                      PM                       Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 
                                                                                                                                     
                                                                                                                                     
                                                                                                                                     



On Thu, 8 Aug 2002, Hubertus Franke wrote:

> Which one sounds like the best one ?
>
> Assuming that for now we have to stick to 16-bit pid_t ....

That assumption is pretty central to the debate.

I don't see the standard get_pid nor the bitmap based
get_pid scale to something larger than a 16-bit pid_t.

If we're not sure yet whether we want to keep pid_t 16
bits it might be worth putting in an algorithm that does
scale to larger numbers, if only so the switch to a larger
pid_t will be more straightforward.

kind regards,

Rik
--
             http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/                    http://distro.conectiva.com/





^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 20:24   ` [PATCH] Linux-2.5 fix/improve get_pid() Linus Torvalds
  2002-08-08 21:30     ` H. Peter Anvin
@ 2002-08-08 21:45     ` William Lee Irwin III
  2002-08-09  4:42     ` William Lee Irwin III
  2 siblings, 0 replies; 56+ messages in thread
From: William Lee Irwin III @ 2002-08-08 21:45 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Paul Larson, lkml, davej, frankeh, andrea

On Thu, Aug 08, 2002 at 01:24:35PM -0700, Linus Torvalds wrote:
> Without that per-pid-count thing clarified, I don't think the (otherwise
> fairly straightforward) approach of Bills really flies.

I implemented the rest of it, based on maintaining hashtables for the
tgid, pgid, and sid as well as the pid itself. get_pid() was not the
focus of it.


Cheers,
Bill

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-08 21:50 Hubertus Franke
  0 siblings, 0 replies; 56+ messages in thread
From: Hubertus Franke @ 2002-08-08 21:50 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, andrea, davej, lkml, Paul Larson

Agreed ....
The mark-and-sweep is the correct method for 16-bits ! For 32-bits its not
possible !
Two level is over-engineered (as I told Paul).

If you want I can post the data that I collected comparing
(a) stock kernel
(b) my mark and sweep
(c) my double mark and sweep (try looking forward and then scan task list)
?

Let me know if you are interested.
That should clear up the issue quickly giving you some average allocation
times etc.....
as a function of random used pids.

Hubertus Franke
Enterprise Linux Group (Mgr),  Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: frankeh@us.ibm.com
(w) 914-945-2003    (fax) 914-945-4425   TL: 862-2003

                      Linus Torvalds                                                                                                 
                      <torvalds@transme        To:       Andrew Morton <akpm@zip.com.au>                                             
                      ta.com>                  cc:       Paul Larson <plars@austin.ibm.com>, lkml <linux-kernel@vger.kernel.org>,    
                                                <davej@suse.de>, Hubertus Franke/Watson/IBM@IBMUS, <andrea@suse.de>                  
                      08/08/2002 04:24         Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 
                      PM                                                                                                             

On Wed, 7 Aug 2002, Andrew Morton wrote:
>
> Has this been evaluated against Bill Irwin's constant-time
> allocator?  Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one.   Plus it's much nicer
> looking code.

Guys, this discussion is getting ridiculous.

Doing a bit allocator should be trivial, but it's hard to know when a bit
is to be free'd. You can't just do it at "exit()" time, because even if
pid X exits, that doesn't mean that X can be re-used: it may still be used
as a pgid or a tid by some other process Y.

So if you really want to take this approach, you need to count the uses of
"pid X", and free the bitmap entry only when that count goes to zero. I
see no such logic in Bill Irwin's code, only a comment about last use
(which doesn't explain how to notice that last use).

Without that per-pid-count thing clarified, I don't think the (otherwise
fairly straightforward) approach of Bills really flies.

For that reason I think the mark-and-sweep thing is the right thing to do,
but I think the two-level algorithm is just over-engineering and not worth
it. And I do hate that getpid_mutex thing. Having a blocking lock for
something as simple as pid allocation just smells horribly wrong to me.

Moving the pid allocation to later (so that it doesn't need to handle
operations that block in between allocation and "we're done" and the pid
allocation can be a spinlock) sounds like a fairly obvious thing to do.

I don't see why we would need the "pid" until the very last moment, at
which point we already have the tasklist lock, in fact.

And I hate overengineering.

                         Linus

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 21:43 Hubertus Franke
@ 2002-08-08 22:02 ` Linus Torvalds
  2002-08-08 22:26   ` Linus Torvalds
                     ` (3 more replies)
  0 siblings, 4 replies; 56+ messages in thread
From: Linus Torvalds @ 2002-08-08 22:02 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Rik van Riel, Andries Brouwer, Andrew Morton, andrea, davej, lkml,
	Paul Larson


On Thu, 8 Aug 2002, Hubertus Franke wrote:
> 
> That is true. All was done under the 16-bit assumption
> My hunch is that the current algorithm might actually work quite well
> for a sparsely populated pid-space (32-bits).

I agree.

So let's just try Andries approach, suggested patch as follows..

(yeah, silly mask and MAX_PID, but since even the kernel uses signed
integers for some of it, this way it never gets close to being an issue).

		Linus

----
--- 1.2/include/linux/threads.h	Tue Feb  5 07:23:04 2002
+++ edited/include/linux/threads.h	Thu Aug  8 14:58:28 2002
@@ -19,6 +19,7 @@
 /*
  * This controls the maximum pid allocated to a process
  */
-#define PID_MAX 0x8000
+#define PID_MASK 0x3fffffff
+#define PID_MAX (PID_MASK+1)
 
 #endif
===== kernel/fork.c 1.57 vs edited =====
--- 1.57/kernel/fork.c	Tue Jul 30 15:49:20 2002
+++ edited/kernel/fork.c	Thu Aug  8 15:00:16 2002
@@ -142,7 +142,7 @@
 		return 0;
 
 	spin_lock(&lastpid_lock);
-	if((++last_pid) & 0xffff8000) {
+	if((++last_pid) & ~PID_MASK) {
 		last_pid = 300;		/* Skip daemons etc. */
 		goto inside;
 	}
@@ -157,7 +157,7 @@
 			   p->tgid == last_pid	||
 			   p->session == last_pid) {
 				if(++last_pid >= next_safe) {
-					if(last_pid & 0xffff8000)
+					if(last_pid & ~PID_MASK)
 						last_pid = 300;
 					next_safe = PID_MAX;
 				}


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:02 ` Linus Torvalds
@ 2002-08-08 22:26   ` Linus Torvalds
  2002-08-08 23:09     ` Albert D. Cahalan
  2002-08-08 22:34   ` Paul Larson
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 56+ messages in thread
From: Linus Torvalds @ 2002-08-08 22:26 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Rik van Riel, Andries Brouwer, Andrew Morton, andrea, davej, lkml,
	Paul Larson

On Thu, 8 Aug 2002, Linus Torvalds wrote:
> 
> So let's just try Andries approach, suggested patch as follows..

"ps" seems to do ok from a visual standpoint at least up to 99 million. 
Maybe it won't look that good after that, I'm too lazy to test.

The following trivial program is useful for efficiently allocating pid
numbers without blowing chunks on the VM subsystem and spending all the
time on page table updates - for people who want to test (look out: I've
got dual 2.4GHz CPU's with HT, so getting up to 10+ million was easy, your
milage may wary and at some point you should just compile a kernel that
starts higher ;).

		Linus

---
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

int main()
{
        int i;
        for (i = 1; i < 250000; i++) {
                if (!vfork())
                        exit(1);
                if (waitpid(-1, NULL, WNOHANG) < 0)
                        perror("waitpid");
        }
        return 0;
}

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:02 ` Linus Torvalds
  2002-08-08 22:26   ` Linus Torvalds
@ 2002-08-08 22:34   ` Paul Larson
  2002-08-08 22:44     ` Rik van Riel
  2002-08-08 22:37   ` Rik van Riel
  2002-08-09 19:34   ` Paul Larson
  3 siblings, 1 reply; 56+ messages in thread
From: Paul Larson @ 2002-08-08 22:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On Thu, 2002-08-08 at 17:02, Linus Torvalds wrote:
> 
> On Thu, 8 Aug 2002, Hubertus Franke wrote:
> > 
> > That is true. All was done under the 16-bit assumption
> > My hunch is that the current algorithm might actually work quite well
> > for a sparsely populated pid-space (32-bits).
> 
> I agree.
The original issue that I had with all of this is the fact that if the
current algorithm can't find an available pid, it just sits there
churning forever and hangs the machine.  My original patch was really
just a very basic fix for that (see the 2.4 tree).  This makes it far
more unlikely for us to max out, but if we do aren't we just going to
have the same trouble all over?

Thanks,
Paul Larson


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:02 ` Linus Torvalds
  2002-08-08 22:26   ` Linus Torvalds
  2002-08-08 22:34   ` Paul Larson
@ 2002-08-08 22:37   ` Rik van Riel
  2002-08-09  1:49     ` Andries Brouwer
  2002-08-09 19:34   ` Paul Larson
  3 siblings, 1 reply; 56+ messages in thread
From: Rik van Riel @ 2002-08-08 22:37 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Andries Brouwer, Andrew Morton, andrea, davej,
	lkml, Paul Larson

On Thu, 8 Aug 2002, Linus Torvalds wrote:
> On Thu, 8 Aug 2002, Hubertus Franke wrote:
> >
> > That is true. All was done under the 16-bit assumption
> > My hunch is that the current algorithm might actually work quite well
> > for a sparsely populated pid-space (32-bits).
>
> I agree.
>
> So let's just try Andries approach, suggested patch as follows..

Hmmm, I wonder how badly the system will behave when
we need to reset last_pid and next_safe with 30000
pids in use ...

regards,

Rik
-- 
	http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/		http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:34   ` Paul Larson
@ 2002-08-08 22:44     ` Rik van Riel
  0 siblings, 0 replies; 56+ messages in thread
From: Rik van Riel @ 2002-08-08 22:44 UTC (permalink / raw)
  To: Paul Larson
  Cc: Linus Torvalds, Hubertus Franke, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On 8 Aug 2002, Paul Larson wrote:

> The original issue that I had with all of this is the fact that if the
> current algorithm can't find an available pid, it just sits there
> churning forever and hangs the machine.  My original patch was really
> just a very basic fix for that (see the 2.4 tree).  This makes it far
> more unlikely for us to max out, but if we do aren't we just going to
> have the same trouble all over?

You'll need at least 330 million tasks to run out.

At a minimum kernel memory allocation of about 8 kB per
task, that's about 2600 GB of kernel data structures.

I'm not sure we'll hit that limit, ever. Not because
we won't have a TB of kernel data space at some point
in the future, but because 330 million tasks is a lot
more than we'd want to manage with just a few CPUs ;)

kind regards,

Rik
-- 
	http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/		http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:26   ` Linus Torvalds
@ 2002-08-08 23:09     ` Albert D. Cahalan
  2002-08-09  3:26       ` Chris Adams
  0 siblings, 1 reply; 56+ messages in thread
From: Albert D. Cahalan @ 2002-08-08 23:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, davej, lkml, Paul Larson

Linus Torvalds writes:

> "ps" seems to do ok from a visual standpoint at least up to 99 million. 
> Maybe it won't look that good after that, I'm too lazy to test.

Mind sharing what "ps -fj", "ps -lf", and "ps j" look like?
The standard tty is 80x24 BTW, and we already have serious
problems due to ever-expanding tty names.

How about a default limit of 9999, to be adjusted by
sysctl as needed?


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-09  0:53 Hubertus Franke
  0 siblings, 0 replies; 56+ messages in thread
From: Hubertus Franke @ 2002-08-09  0:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andries Brouwer, Andrew Morton, andrea, davej, lkml, Paul Larson,
	Rik van Riel

                                                                                                               
                                                                                                               
                                                                                                               


I package my stuff up that allows to pull this into user space and run much
more efficient
pid allocation test for performance....
I'll also include the comparision numbers.  Currently remote, so I don't
have access to
that info.
Based on that though it seems with random pid deletion (we surely could
argue about
that one) there still seems reasonable benefits to "consider" a twolevel
algo.
More tomorrow.

Hubertus Franke
Enterprise Linux Group (Mgr),  Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: frankeh@us.ibm.com
(w) 914-945-2003    (fax) 914-945-4425   TL: 862-2003




                                                                                                                                     
                      Linus Torvalds                                                                                                 
                      <torvalds@transme        To:       Hubertus Franke/Watson/IBM@IBMUS                                            
                      ta.com>                  cc:       Rik van Riel <riel@conectiva.com.br>, Andries Brouwer <aebr@win.tue.nl>,    
                                                Andrew Morton <akpm@zip.com.au>, <andrea@suse.de>, <davej@suse.de>, lkml <linux-     
                      08/08/2002 06:26          kernel@vger.kernel.org>, Paul Larson <plars@austin.ibm.com>                          
                      PM                       Subject:  Re: [PATCH] Linux-2.5 fix/improve get_pid()                                 
                                                                                                                                     
                                                                                                                                     
                                                                                                                                     




On Thu, 8 Aug 2002, Linus Torvalds wrote:
>
> So let's just try Andries approach, suggested patch as follows..

"ps" seems to do ok from a visual standpoint at least up to 99 million.
Maybe it won't look that good after that, I'm too lazy to test.

The following trivial program is useful for efficiently allocating pid
numbers without blowing chunks on the VM subsystem and spending all the
time on page table updates - for people who want to test (look out: I've
got dual 2.4GHz CPU's with HT, so getting up to 10+ million was easy, your
milage may wary and at some point you should just compile a kernel that
starts higher ;).

                         Linus

---
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

int main()
{
        int i;
        for (i = 1; i < 250000; i++) {
                if (!vfork())
                        exit(1);
                if (waitpid(-1, NULL, WNOHANG) < 0)
                        perror("waitpid");
        }
        return 0;
}






^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:37   ` Rik van Riel
@ 2002-08-09  1:49     ` Andries Brouwer
  0 siblings, 0 replies; 56+ messages in thread
From: Andries Brouwer @ 2002-08-09  1:49 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Linus Torvalds, Hubertus Franke, Andrew Morton, andrea, davej,
	lkml, Paul Larson

On Thu, Aug 08, 2002 at 07:37:08PM -0300, Rik van Riel wrote:

> Hmmm, I wonder how badly the system will behave when
> we need to reset last_pid and next_safe with 30000
> pids in use ...

On average very well, that is, the time spent in finding
the next pid is very small on average, but once in a while
there is a small hiccup.
On a hard real time system it may be reasonable to choose
for an average that is ten times higher and avoid the hiccup.

For systems that really have lots of very short-lived
processes, one might do what I suggested a moment ago
for a MP context: have the processes live in several lists,
that each only use part of the pid-space.

(One wants for_each_task to take a limited amount of time.
With N tasks, and sqrt(N) list heads one first picks the
shortest list, then loops over the list, all in time of order
sqrt(N). That works well, certainly up to a million processes.)

Andries

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 23:09     ` Albert D. Cahalan
@ 2002-08-09  3:26       ` Chris Adams
  2002-08-09  7:04         ` Albert D. Cahalan
  0 siblings, 1 reply; 56+ messages in thread
From: Chris Adams @ 2002-08-09  3:26 UTC (permalink / raw)
  To: linux-kernel

Once upon a time, Albert D. Cahalan <acahalan@cs.uml.edu> said:
>Mind sharing what "ps -fj", "ps -lf", and "ps j" look like?
>The standard tty is 80x24 BTW, and we already have serious
>problems due to ever-expanding tty names.
>
>How about a default limit of 9999, to be adjusted by
>sysctl as needed?

I hope you meant 99999 (since we already have 5 digit PIDs).  It would
also seem to me that if it is adjustable, then "ps" would have to handle
it anyway, and making "ps" deal with adjustable size PIDs would be more
complex and error-prone.

Tru64 Unix 5.x uses 19 bit pids (up to 524288, so up to six digits - the
rest of the 32 bits go for cluster node, node sequence, and an unused
sign bit) without significant problems.  Their "ps" args aren't an exact
match, but they're close (lots of processes snipped):

$ ps -fj | head -2
UID         PID   PPID    C STIME    TTY             TIME CMD
cmadams  272363 301021  0.0 20:47:46 pts/1        0:00.09 -bash (bash)
$ ps -lf | head -2
       F S           UID    PID   PPID %CPU PRI  NI  RSS WCHAN    STARTED         TIME COMMAND
80c08001 I           200 272363 301021  0.0  44   0 520K wait     20:47:46     0:00.09 -bash (bash)
$ ps j | head -2
USER        PID   PPID   PGID   SESS JOBC S    TTY             TIME COMMAND
cmadams  272363 301021 272363 272363    0 I    pts/1        0:00.09 -bash (bash)
$ 

I had exactly one problem moving to 5.x with larger PIDs: the free "top"
program (Tru64 doesn't include "top") assumed 5 digit PIDs so the
columns didn't line up until I changed the output format by one column.

I routinely have 1000+ processes running on this server (many of them
short-lived things like POP checks, short CGIs, sendmail background
delivery, etc.), so having a larger PID space is important (having the
same PID reused within 15-30 seconds would be annoying).

I would like to see Linux running on this server (or at least this class
of server), and limiting the number of PIDs because of "ps" formatting
is not the way to go.

-- 
Chris Adams <cmadams@hiwaay.net>
Systems and Network Administrator - HiWAAY Internet Services
I don't speak for anybody but myself - that's enough trouble.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 20:24   ` [PATCH] Linux-2.5 fix/improve get_pid() Linus Torvalds
  2002-08-08 21:30     ` H. Peter Anvin
  2002-08-08 21:45     ` William Lee Irwin III
@ 2002-08-09  4:42     ` William Lee Irwin III
  2 siblings, 0 replies; 56+ messages in thread
From: William Lee Irwin III @ 2002-08-09  4:42 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Paul Larson, lkml, davej, frankeh, andrea

On Thu, Aug 08, 2002 at 01:24:35PM -0700, Linus Torvalds wrote:
> So if you really want to take this approach, you need to count the uses of
> "pid X", and free the bitmap entry only when that count goes to zero. I
> see no such logic in Bill Irwin's code, only a comment about last use
> (which doesn't explain how to notice that last use).
> Without that per-pid-count thing clarified, I don't think the (otherwise
> fairly straightforward) approach of Bills really flies.

One big thing to bear in mind is that it is actually part of a much
larger work, one which is not centered around get_pid(), and which is
not yet ready for inclusion, or even widespread review. So please give
me time to finish it, and defer judgment until it is complete.

(1) akpm did not post the full patch, only the "after" picture of one file.
(2) The per-id accounting is properly implemented, with caveats
	unrelated to the general accounting method. Yes, I am well
	aware of the need to be notified on release at points other
	than exit(), and I have implemented that notification.
(3) The patch as it is intended to be is largely a tty and job control
	cleanup. get_pid() changes are required as the central feature
	is the removal of the list of all tasks, upon which the current
	get_pid() relies.
(4) pid hashing actually creates idtag objects for something guaranteed
	to be unique. This is so stupid I consider it a bug.
(5) The patch is not yet finished.

Please defer judgment until I am ready to present as a finished work what
is now a work in progress and barely if even out of the "debug" phase.

The last fully-ported version of the patch, which was originally put
on-line only to facilitate communication with reviewers and
contributors, prior to the initial release (and by a very large margin)
is available from the following URL:

ftp://ftp.kernel.org/pub/linux/kernel/people/wli/task_mgmt/for_each_task-2.5.23-1

This patch does not contain a complete implementation of what I would
like to present when I feel ready to submit it.

While I thought I came up with something "nifty" in the way of a
get_pid() as a result of this work, its primary focus is really to
clean up tty and job control code. As it stands now, it does very
little in the way of cleaning it up, only converting it to use the new
infrastructure as a replacement for for_each_task() in the most
straightforward and braindead ways imaginable. Several bugs are known
to exist, but the full patch with all fixes has not yet been ported to
current mainline, and I won't have time to devote to it for some time.
This patch needs much further work, and that work is not yet finished.
Please defer judgment until I can actually finish it. This will
probably have to wait until 2.7 or even later.

Thanks,
Bill

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09  3:26       ` Chris Adams
@ 2002-08-09  7:04         ` Albert D. Cahalan
  2002-08-09  8:48           ` Helge Hafting
                             ` (2 more replies)
  0 siblings, 3 replies; 56+ messages in thread
From: Albert D. Cahalan @ 2002-08-09  7:04 UTC (permalink / raw)
  To: Chris Adams; +Cc: linux-kernel

Chris Adams writes:
> Once upon a time, Albert D. Cahalan <acahalan@cs.uml.edu> said:

>> Mind sharing what "ps -fj", "ps -lf", and "ps j" look like?
>> The standard tty is 80x24 BTW, and we already have serious
>> problems due to ever-expanding tty names.
>>
>> How about a default limit of 9999, to be adjusted by
>> sysctl as needed?
>
> I hope you meant 99999 (since we already have 5 digit PIDs).  It would
> also seem to me that if it is adjustable, then "ps" would have to handle
> it anyway, and making "ps" deal with adjustable size PIDs would be more
> complex and error-prone.

BTW, let's start with: not only "ps" is affected.
Off the top of my head: netstat, fuser, top, pstree...

I almost put 99999, but then I realized that that's silly.
For years Linux had a hard limit of about 4000 processes,
and not many people complained. It sure would be nice to
gain back a few of the columns lost to other stuff, so
that people could once again see command arguments.

The two real-word usage examples close at hand:

a. My full GNOME desktop has 48 processes.

b. The main UNIX shell server for CS students
   at this university has 79 processes.
   (Tru64, several hundred CS students, 2:25 am)

For "ps", adjustable width is a trivial addition.
Notice the UID ("ps -l", "ps -lf) handling, and
notice what signals ("ps s") do on a wide screen.
I could also just let the output be ugly, since
no normal system will have so many processes.

The problem is screen space, pure and simple. If the
default limit goes to over 1 billion, then "ps" output
must wrap lines. There is no alternative, unless you
think "System going down to reset PID numbers!" is OK.

> Tru64 Unix 5.x uses 19 bit pids (up to 524288, so up to six digits - the
> rest of the 32 bits go for cluster node, node sequence, and an unused
> sign bit) without significant problems.  Their "ps" args aren't an exact
> match, but they're close (lots of processes snipped):
> 
> $ ps -fj | head -2
> UID         PID   PPID    C STIME    TTY             TIME CMD
> cmadams  272363 301021  0.0 20:47:46 pts/1        0:00.09 -bash (bash)

Linux (and all SysV if I remember right) has 4 columns
of PID info that would need to expand:

UID        PID  PPID  PGID   SID  C STIME TTY          TIME CMD
albert   12975 12966 12975 12975  0 Aug02 pts/1    00:00:00 bash

> $ ps -lf | head -2
>        F S           UID    PID   PPID %CPU PRI  NI  RSS WCHAN    STARTED         TIME COMMAND
> 80c08001 I           200 272363 301021  0.0  44   0 520K wait     20:47:46     0:00.09 -bash (bash)

Eeew. That's broken; it won't display right on a normal
80x24 terminal. Linux "ps -lf" will just barely fit.

> $ ps j | head -2
> USER        PID   PPID   PGID   SESS JOBC S    TTY             TIME COMMAND
> cmadams  272363 301021 272363 272363    0 I    pts/1        0:00.09 -bash (bash)

For historic reasons, Linux has a whopping 5 columns
of PID info for this format:

 PPID   PID  PGID   SID TTY      TPGID STAT   UID   TIME COMMAND
    1   770   770   770 tty1     12893 S     1000   0:00 -bash
  770 12893 12893   770 tty1     12893 S     1000   0:00 /bin/sh /usr/bin/X11/st

> I routinely have 1000+ processes running on this server (many of them
> short-lived things like POP checks, short CGIs, sendmail background
> delivery, etc.), so having a larger PID space is important (having the
> same PID reused within 15-30 seconds would be annoying).

Increase the limit on this server if you wish. Problem?
I only suggest 9999 as the default. (which would actually
be just enough for you)

> I would like to see Linux running on this server (or at least this class
> of server), and limiting the number of PIDs because of "ps" formatting
> is not the way to go.

It's not just "ps", and I'm not saying you couldn't
adjust your system for a higher limit. I do think that
the out-of-box default shouldn't screw up formatting.
If you need to go past 9999, then you'll want to tweak
a few other settings as well. Low process counts are
the common case, and shouldn't be hurt by the fact that
a few people wish to run a billion processes.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09  7:04         ` Albert D. Cahalan
@ 2002-08-09  8:48           ` Helge Hafting
  2002-08-09 10:32             ` Alan Cox
  2002-08-09 13:00           ` Chris Adams
  2002-08-09 13:59           ` Rik van Riel
  2 siblings, 1 reply; 56+ messages in thread
From: Helge Hafting @ 2002-08-09  8:48 UTC (permalink / raw)
  To: Albert D. Cahalan, linux-kernel

"Albert D. Cahalan" wrote:

> The problem is screen space, pure and simple. If the
> default limit goes to over 1 billion, then "ps" output
> must wrap lines. There is no alternative, unless you
> think "System going down to reset PID numbers!" is OK.
> 

There is an alternative.  
Use 32-bit PID's, but with an additional rule for wraparound.
Simply wrap the PID when 
(nextPID > 2*number_of_processes && nextPID > 30000)
The latter one just to avoid wrapping at 10 when there are 
5 processes.  

This simple approach supports 32-bit PIDs for those 
that need them, while "ps" and friends always looks nice 
except for those who actually run large amounts of processes.
You won't get a very large PID unless you need to.

Helge Hafting

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 10:32             ` Alan Cox
@ 2002-08-09  9:35               ` Andrew Morton
  2002-08-09 10:37               ` Andries Brouwer
  1 sibling, 0 replies; 56+ messages in thread
From: Andrew Morton @ 2002-08-09  9:35 UTC (permalink / raw)
  To: Alan Cox; +Cc: Helge Hafting, Albert D. Cahalan, linux-kernel

Alan Cox wrote:
> 
> On Fri, 2002-08-09 at 09:48, Helge Hafting wrote:
> > Use 32-bit PID's, but with an additional rule for wraparound.
> > Simply wrap the PID when
> > (nextPID > 2*number_of_processes && nextPID > 30000)
> > The latter one just to avoid wrapping at 10 when there are
> > 5 processes.
> 
> Its much simpler to put max_pid into /proc/sys/kernel. That way we can
> boot with 32000 process ids, which will ensure ancient stuff which has
> 16bit pid_t (old old libc) and also any old kernel interfaces which
> expose it - does ipc ?

procfs oopses with >65535 processes, btw.  Related to the i_ino
encoding:

	#define fake_ino(pid,ino) (((pid)<<16)|(ino))
	#define PROC_INODE_PROPER(inode) ((inode)->i_ino & ~0xffff)

it ruined Anton's evening ;)

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09  8:48           ` Helge Hafting
@ 2002-08-09 10:32             ` Alan Cox
  2002-08-09  9:35               ` Andrew Morton
  2002-08-09 10:37               ` Andries Brouwer
  0 siblings, 2 replies; 56+ messages in thread
From: Alan Cox @ 2002-08-09 10:32 UTC (permalink / raw)
  To: Helge Hafting; +Cc: Albert D. Cahalan, linux-kernel

On Fri, 2002-08-09 at 09:48, Helge Hafting wrote:
> Use 32-bit PID's, but with an additional rule for wraparound.
> Simply wrap the PID when 
> (nextPID > 2*number_of_processes && nextPID > 30000)
> The latter one just to avoid wrapping at 10 when there are 
> 5 processes.  

Its much simpler to put max_pid into /proc/sys/kernel. That way we can
boot with 32000 process ids, which will ensure ancient stuff which has
16bit pid_t (old old libc) and also any old kernel interfaces which
expose it - does ipc ? 


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 10:32             ` Alan Cox
  2002-08-09  9:35               ` Andrew Morton
@ 2002-08-09 10:37               ` Andries Brouwer
  1 sibling, 0 replies; 56+ messages in thread
From: Andries Brouwer @ 2002-08-09 10:37 UTC (permalink / raw)
  To: Alan Cox; +Cc: Helge Hafting, Albert D. Cahalan, linux-kernel

On Fri, Aug 09, 2002 at 11:32:33AM +0100, Alan Cox wrote:

> Its much simpler to put max_pid into /proc/sys/kernel. That way we can
> boot with 32000 process ids, which will ensure ancient stuff which has
> 16bit pid_t (old old libc) and also any old kernel interfaces which
> expose it - does ipc ? 

I don't think it is a good idea to add lots of almost meaningless knobs.
This is not something one would like to adapt dynamically.
Someone who needs 16bit pid_t can compile her own kernel with a
suitable value of PID_MAX.

There is no old old libc with 16bit pid_t, I think.
I have libc 4.4.1 here with
	typedef int pid_t;
I have Linux 0.01 here with
	typedef int pid_t;

Yes, ipc exposed pid in a 16-bit field, namely in the fields
msg_lspid, msg_lrpid, shm_cpid, shm_lpid.
Two years ago I found all occurrences in all Linux sources,
but I forgot the details of the result; I do not recall
significant occurrences. Maybe someone will repeat this big grep.

Andries

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches
  2002-08-08  0:24   ` Andries Brouwer
  2002-08-08 19:42     ` William Lee Irwin III
@ 2002-08-09 11:22     ` Hubertus Franke
  2002-08-09 15:36       ` Andries Brouwer
  2002-08-09 16:05       ` Linus Torvalds
  1 sibling, 2 replies; 56+ messages in thread
From: Hubertus Franke @ 2002-08-09 11:22 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, Andries Brouwer
  Cc: Paul Larson, lkml, andrea, gh

[-- Attachment #1: Type: text/plain, Size: 17126 bytes --]



Folks, below is the analysis that I promised yesterday.
Attached is also the harness program that brings this into userspace and 
computes basic overhead for pid allocation in a random setting.
The tar file contains the following stuff and represents the
status last time I gave this consideration. I had posted it to
lkml but other than from Andrea I had not received any feedback 
and dropped the issue.

total 52
   4 -rw-rw-r--    1 frankeh  frankeh       141 Mar 28 11:25 Makefile
  12 -rw-rw-r--    1 frankeh  frankeh      8306 Mar 22 16:59 res-2
  16 -rw-rw-r--    1 frankeh  frankeh     13268 Mar 20 10:11 getpid1.c
  16 -rw-rw-r--    1 frankeh  frankeh     15124 Mar 14 18:19 res-1
   4 -rwxrw-r--    1 frankeh  frankeh       316 Mar 13 12:01 bm

getpid1 is the harness. bm is the batch driver.
res-1 and res-2 are two result files that each were mangled together from
the outputs of <bm> executed on different machines.
I attach res-1 here , which I posted earlier in March, so you can read through 
it and draw your own conclusions with respect on where we should go with 
this.
It might be worthwile to independenly redo the test and also include
Andrea's <getpid> there, allthough it resembles <algo-1>. 

Volunteers ?

Note that based on the results I got, I still favor a version of the
mark and sweep that continues to go forward to find the next range
rather than always start from the beginning again.
I am not really sure whether Bill's version can be easily integrated.

The second part of the message (res-1) also experiments with a partial
maximum safe pid-range, i.e., once a range of 256 free pids has been 
established we stop the mark and sweep. That looks very competitive 
as well. I gives too simple improvements
(a) bitmask can be located on the stack  
(b) we could potentially deal with 32-bit pid numbers as we can limit
    the bitmask to partial space of the pid range.

-- Hubertus

----------------<previous post>-----------------------------------------------

I implemented an alternative version of getpid, that for large thread counts
( > 210000), provides "significantly" better performance as shown in attached
performance analysis. This is particulary viable for PID_MAX=32768.

-- Hubertus Franke <frankeh@watson.ibm.com>

---------------------------------------------------------------------------------

Currently the getpid algorithm works as follows:
At any given time an interval of [ last_pid .. next_safe ) is known to hold     
unused pids. Initially the interval is set to [0 .. 32767]

Hence to allocate a new pid the following is sufficient:

       if (++last_pid < next_safe) return last_pid;

However, if we move out of the safe interval, the next safe interval needs 
to be established first.
This is currently done by a repetive search 

repeat:
     foralltasks(p) {
        if (p uses lastpid) { last_pid++; goto repeat; }
        /* narrow [ last_pid .. next_safe ) */
        if (p->pids in [ last_pid .. next_safe ) ) next_safe = p->pid
     }

Particulary for large number of tasks, this can lead to frequent exercise of
the repeat resulting in a O(N^2) algorithm. We call this : <algo-0>.

Instead I have provided an alternative mechanism that at the time
of determining the next interval marks a bitmask by walking the tasklist
once [ O(N) ] and then finding the proper bit offsets to mark the next free
interval starting from last_pid. The bitmap requires 4096 bytes.
This is <algo-1>.

An optimization to this to keep the last bitmap instead of clearing it
with every search. Only if we fail to obtain a free range, then we have
to go back and clear the bitmap and redo the search one more time.
This is <algo-2>.

I dragged the various algorithms into a userlevel test program to figure
out where the cut off points are with PID_MAX=32768. In this testprogram
I maintain A tasks, and for 10 rounds (delete D random tasks and 
reallocate D tasks again) resulting in T=10*D total measured allocations.

Si states how many interval searches where needed for algo-i.
Gi states the average overhead per get_pid for algo-i in usecs.

Based on that one should use the current algorithm until ~ 22K tasks and
beyond that use algo-2. Only the last 15 tasks are a bit faster under algo-1.
We can safely ignore that case.

Based on that providing an adaptive implementation seems the right choice.
The patch for 2.5.7-pre1 is attached.


executed program example: getpid -c 2 -s 10 -e 100 -d 10 -t <0,1> 
        0 is old 1 is new algo 2.

   A     D      T |      S0           G0 |    S1        G1 |    S2        G2 
----------------------------------------------------------------------------
   10    10    80 |       1         0.34 |     1      0.59 |    1       0.81 
   20    10   100 |       1         0.30 |     1      0.49 |    1       0.64 
   30    10   100 |       1         0.29 |     1      0.55 |    1       0.65 
   40    10   100 |       1         0.35 |     1      0.51 |    1       0.65 
   50    10   100 |       1         0.35 |     1      0.54 |    1       0.67 
   60    10   100 |       2         0.38 |    21      1.95 |    2       0.79 
   70    10   100 |       1         0.39 |     1      0.59 |    1       0.76 
   80    10   100 |       1         0.41 |     1      0.62 |    1       0.76 
  100    50   500 |       2         0.22 |    63      1.26 |    2       0.30 
  150    50   500 |       3         0.24 |    12      0.56 |    4       0.36 
  200    50   500 |       3         0.27 |    56      2.26 |    5       0.46 
  250    50   500 |       2         0.26 |   119      5.63 |    6       0.54 
  300    50   500 |       3         0.32 |   148      8.73 |    9       0.76 
  350    50   500 |       5         0.45 |   168     11.51 |    6       0.76 
  400    50   500 |       4         0.44 |    90      7.28 |   10       1.10 
  450    50   500 |       6         0.61 |   143     13.08 |    7       0.97 
  500    50   500 |       6         0.65 |   100     10.47 |    7       1.06 
  550    50   500 |       5         0.63 |    71      8.10 |    9       1.34 
  600    50   500 |       7         0.86 |   115     14.32 |   14       2.04 
  650    50   500 |       8         1.00 |   112     15.08 |   13       2.07 
  700    50   500 |       8         1.06 |   127     18.12 |   10       1.79 
  750    50   500 |       8         1.26 |    62      9.73 |   15       2.73 
  800    50   500 |      11         1.68 |    92     15.14 |   12       2.42 
  850    50   500 |      14         2.03 |    78     13.73 |   13       2.67 
  900    50   500 |      21         3.17 |   102     18.74 |   27       5.18 
 1000  1000  9980 |       1         0.18 |     4      0.19 |    1       0.18 
 2000  1000 10000 |      76         1.22 |  3604     53.03 |  322       4.81 
 3000  1000 10000 |     161         3.84 |  4502    112.24 |  606      15.49 
 4000  1000 10000 |     359        11.17 |  4912    183.37 |  901      33.76 
 5000  1000 10000 |     539        23.33 |  4949    257.35 | 1165      59.91 
 6000  1000 10000 |     724        43.42 |  4918    349.59 | 1498     104.36 
 7000  1000 10000 |    1026        85.38 |  4886    447.58 | 1835     165.08 
 8000  1000 10000 |    1228       126.45 |  4870    565.29 | 2084     234.73 
 9000  1000 10000 |    1516       193.62 |  4826    699.85 | 2489     354.27 
10000  1000 10000 |    1818       289.32 |  4910    843.32 | 2763     472.47 
11000  1000 10000 |    2093       389.33 |  5005   1023.08 | 3095     629.70 
12000  1000 10000 |    2305       506.23 |  5095   1194.71 | 3277     773.06 
13000  1000 10000 |    2680       683.66 |  5289   1424.81 | 3711    1003.67 
14000  1000 10000 |    2959       853.10 |  5358   1602.05 | 3878    1172.70 
15000  1000 10000 |    3167      1037.79 |  5539   1835.64 | 4301    1436.40 
16000  1000 10000 |    3466      1272.80 |  5669   2087.03 | 4485    1635.48 
17000  1000 10000 |    3743      1539.06 |  5932   2338.50 | 4844    1924.27 
18000  1000 10000 |    4069      1869.63 |  6097   2613.60 | 5218    2232.52 
19000  1000 10000 |    4293      2183.98 |  6242   2866.34 | 5501    2519.60 
20000  1000 10000 |    4616      2607.10 |  6508   3175.90 | 5770    2823.98 
21000  1000 10000 |    4974      3119.34 |  6700   3460.95 | 6161    3183.73 
22000  1000 10000 |    5177      3609.28 |  6944   3788.19 | 6389    3492.97 =
23000  1000 10000 |    5483      4214.03 |  7183   4136.25 | 6665    3823.38 
24000  1000 10000 |    5838      4971.60 |  7404   4460.62 | 6982    4199.61 
25000  1000 10000 |    6183      5880.92 |  7736   4891.80 | 7209    4546.18 
26000  1000 10000 |    6413      6829.07 |  7890   5210.85 | 7533    4939.12 
27000  1000 10000 |    6748      8132.96 |  8148   5598.19 | 7959    5442.25 
28000  1000 10000 |    7139     10065.52 |  8445   6047.42 | 8140    5767.13 
29000  1000 10000 |    7638     12967.20 |  8736   6475.23 | 8501    6250.86 
30000  1000 10000 |    8178     16991.05 |  8994   6907.40 | 8911    6791.97 
32000    50   500 |     482     26446.69 |   488   7405.63 |  487    7494.39 
32050    50   500 |     488     34769.89 |   488   7463.11 |  486    7541.61 
32100    50   500 |     489     44564.86 |   493   7593.99 |  486    7589.02 
32150    50   500 |     486     58150.58 |   487   7549.96 |  492    7731.18 
32200    50   500 |     490     64875.38 |   495   7721.82 |  497    7854.59 
32250    50   500 |     491     81790.21 |   491   7697.57 |  490    7795.12 
32300    50   500 |     489     88975.62 |   493   7763.04 |  495    7909.77 
32350    50   500 |     489    115797.38 |   492   7782.34 |  495    7967.86 
32400    50   500 |     490    120958.50 |   497   7898.45 |  496    8018.98 
32450    50   500 |     492    147541.84 |   493   7874.27 |  492    7982.34 
32500    50   500 |     493    175498.39 |   495   7940.18 |  495    8060.97 
32550    50   500 |     492    207229.29 |   496   7973.88 |  498    8134.02 
32600    50   500 |     495    267057.05 |   498   8028.86 |  498    8171.97 
32650    50   500 |     492    375722.28 |   500   8088.30 |  498    8213.85 
32700    50   500 |     497    528321.07 |   500   8110.51 |  499    8267.67 
32751     1    10 |      10    259785.80 |    10   7851.50 |   10    8549.30 
32752     1    10 |      10   1121285.60 |    10   7846.30 |   10    8556.10 
32753     1    10 |      10    383729.50 |    10   7848.60 |   10    8562.20 
32754     1    10 |      10   1061467.50 |    10   7849.80 |   10    8564.40 
32755     1    10 |      10    612726.50 |    10   7853.00 |   10    8553.90 
32756     1    10 |      10   1725559.90 |    10   7851.90 |   10    8553.00 
32757     1    10 |      10   1679818.50 |    10   7847.80 |   10    8552.10 
32758     1    10 |      10   2991838.60 |    10   7865.70 |   10    8557.20 
32759     1    10 |      10   883388.90  |    10   7859.40 |   10    8562.00 
32760     1    10 |      10   4830405.90 |    10   7850.50 |   10    9336.60 
32761     1    10 |      10   7105809.20 |    10   7863.90 |   10    9337.20 
32762     1    10 |      10   7919703.40 |    10   7867.10 |   10    9340.70 
32763     1    10 |      10   1537522.50 |    10   7869.40 |   10    9340.70 
32764     1    10 |      10   6173019.20 |    10   7866.60 |   10    9340.00 
32765     1    10 |      10   8104105.00 |    10   7876.20 |   10   10112.80 
32766     1    10 |      10  16145415.40 |    10   7880.80 |   10   10893.50 
32767     1    10 |      10  16135267.10 |    10   7878.60 |   10   11674.40 


Other variants are possible, for instance if 4096 bytes is too much
(hell I don't know how that could be), one can break it up into smaller
search chunks (e.g. 256 bytes).

Another alternative is to allocate the page on the first occasion of
getting into get_pid_my_map....

In the following I give a comparative result between algo-2 and
algo-2 with a max interval size of 256. The times are very comparative.
Also the search count values are identical, but in 2 cases suggesting
that a interval size particular for large thread counts of 256 is certainly 
sufficient, but it brings some small overhead. Question to answer is 
wether setting aside an extra page is such a crime..... 

   A     D      T |    S2        G2 |  S2-256     G2-256
-------------------------------------------------------
   10    10    80 |    1       0.81 |      1       0.84
   20    10   100 |    1       0.64 |      1       0.67
   30    10   100 |    1       0.65 |      1       0.68
   40    10   100 |    1       0.65 |      1       0.69
   50    10   100 |    1       0.67 |      1       0.71
   60    10   100 |    2       0.79 |      2       0.82
   70    10   100 |    1       0.76 |      1       0.76
   80    10   100 |    1       0.76 |      1       0.79
  100    50   500 |    2       0.30 |      2       0.31
  150    50   500 |    4       0.36 |      5       0.39  <=
  200    50   500 |    5       0.46 |      5       0.46
  250    50   500 |    6       0.54 |      6       0.55
  300    50   500 |    9       0.76 |      9       0.76
  350    50   500 |    6       0.76 |      6       0.75
  400    50   500 |   10       1.10 |     10       1.10
  450    50   500 |    7       0.97 |      7       0.97
  500    50   500 |    7       1.06 |      7       1.06
  550    50   500 |    9       1.34 |      9       1.35
  600    50   500 |   14       2.04 |     14       2.06
  650    50   500 |   13       2.07 |     13       2.09
  700    50   500 |   10       1.79 |     10       1.82
  750    50   500 |   15       2.73 |     15       2.69
  800    50   500 |   12       2.42 |     12       2.38
  850    50   500 |   13       2.67 |     13       2.66
  900    50   500 |   27       5.18 |     27       5.25
 1000  1000  9980 |    1       0.18 |      3       0.19 <=
 2000  1000 10000 |  322       4.81 |    322       4.84
 3000  1000 10000 |  606      15.49 |    606      15.55
 4000  1000 10000 |  901      33.76 |    901      34.42
 5000  1000 10000 | 1165      59.91 |   1165      62.35
 6000  1000 10000 | 1498     104.36 |   1498     105.55
 7000  1000 10000 | 1835     165.08 |   1835     174.82
 8000  1000 10000 | 2084     234.73 |   2084     244.18
 9000  1000 10000 | 2489     354.27 |   2489     372.11
10000  1000 10000 | 2763     472.47 |   2763     486.73
11000  1000 10000 | 3095     629.70 |   3095     648.31
12000  1000 10000 | 3277     773.06 |   3277     784.75
13000  1000 10000 | 3711    1003.67 |   3711    1006.94
14000  1000 10000 | 3878    1172.70 |   3878    1168.71
15000  1000 10000 | 4301    1436.40 |   4301    1429.89
16000  1000 10000 | 4485    1635.48 |   4485    1620.90
17000  1000 10000 | 4844    1924.27 |   4844    1904.92
18000  1000 10000 | 5218    2232.52 |   5218    2218.80
19000  1000 10000 | 5501    2519.60 |   5501    2508.83
20000  1000 10000 | 5770    2823.98 |   5770    2895.66
21000  1000 10000 | 6161    3183.73 |   6161    3307.54
22000  1000 10000 | 6389    3492.97 |   6389    3620.53
23000  1000 10000 | 6665    3823.38 |   6665    3995.63
24000  1000 10000 | 6982    4199.61 |   6982    4347.95
25000  1000 10000 | 7209    4546.18 |   7209    4701.95
26000  1000 10000 | 7533    4939.12 |   7533    5088.13
27000  1000 10000 | 7959    5442.25 |   7959    5599.85
28000  1000 10000 | 8140    5767.13 |   8140    5817.86
29000  1000 10000 | 8501    6250.86 |   8501    6250.30
30000  1000 10000 | 8911    6791.97 |   8911    6788.51
32000    50   500 |  487    7494.39 |    487    7493.47
32050    50   500 |  486    7541.61 |    486    7541.05
32100    50   500 |  486    7589.02 |    486    7586.12
32150    50   500 |  492    7731.18 |    492    7728.76
32200    50   500 |  497    7854.59 |    497    7854.94
32250    50   500 |  490    7795.12 |    490    7783.10
32300    50   500 |  495    7909.77 |    495    7902.70
32350    50   500 |  495    7967.86 |    495    7946.20
32400    50   500 |  496    8018.98 |    496    7999.34
32450    50   500 |  492    7982.34 |    492    7962.93
32500    50   500 |  495    8060.97 |    495    8048.18
32550    50   500 |  498    8134.02 |    498    8122.08
32600    50   500 |  498    8171.97 |    498    8169.34
32650    50   500 |  498    8213.85 |    498    8209.95
32700    50   500 |  499    8267.67 |    499    8266.13
32751     1    10 |   10    8549.30 |     10    8629.00
32752     1    10 |   10    8556.10 |     10    8636.30
32753     1    10 |   10    8562.20 |     10    8632.00
32754     1    10 |   10    8564.40 |     10    8633.40
32755     1    10 |   10    8553.90 |     10    8635.40
32756     1    10 |   10    8553.00 |     10    8637.60
32757     1    10 |   10    8552.10 |     10    8640.90
32758     1    10 |   10    8557.20 |     10    8644.90
32759     1    10 |   10    8562.00 |     10    8644.10
32760     1    10 |   10    9336.60 |     10    9436.10
32761     1    10 |   10    9337.20 |     10    9435.60
32762     1    10 |   10    9340.70 |     10    9439.10
32763     1    10 |   10    9340.70 |     10    9433.60
32764     1    10 |   10    9340.00 |     10    9440.60
32765     1    10 |   10   10112.80 |     10   10228.40
32766     1    10 |   10   10893.50 |     10   11023.50
32767     1    10 |   10   11674.40 |     10   11813.70


[-- Attachment #2: gp.tar.gz --]
[-- Type: application/x-tgz, Size: 10619 bytes --]

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09  7:04         ` Albert D. Cahalan
  2002-08-09  8:48           ` Helge Hafting
@ 2002-08-09 13:00           ` Chris Adams
  2002-08-09 14:39             ` Albert D. Cahalan
  2002-08-09 13:59           ` Rik van Riel
  2 siblings, 1 reply; 56+ messages in thread
From: Chris Adams @ 2002-08-09 13:00 UTC (permalink / raw)
  To: Albert D. Cahalan; +Cc: linux-kernel

Once upon a time, Albert D. Cahalan <acahalan@cs.uml.edu> said:
> BTW, let's start with: not only "ps" is affected.
> Off the top of my head: netstat, fuser, top, pstree...

netstat: already wraps around and loses formatting when PIDs are shown
    (so no changes)
fuser: already has enough columns (because it uses "kernel" some places)
    (so no changes)
top: two line change
pstree: PIDs are displayed in "()" after process name (not formatted
    columns) (so no changes)

How many people actually _use_ PIDs on a regular basis?  When I did our
upgrade from Tru64 4.x to 5.x and got bigger PIDs, it took awhile before
anyone else even noticed.  Almost always when I'm doing something to
processes, I'm scripting it, not typing numbers.

> I almost put 99999, but then I realized that that's silly.
> For years Linux had a hard limit of about 4000 processes,
> and not many people complained. It sure would be nice to
> gain back a few of the columns lost to other stuff, so
> that people could once again see command arguments.

We're talking one or two more columns.  That's not going to magically
make it possible to see the command arguments.  If someone wants to see
the arguments, they're going to have to use "w" anyway (also, "normal"
ps with no arguments still has lots of space).

> The two real-word usage examples close at hand:
> 
> a. My full GNOME desktop has 48 processes.
> 
> b. The main UNIX shell server for CS students
>    at this university has 79 processes.
>    (Tru64, several hundred CS students, 2:25 am)

My GNOME desktop has 72 processes, and I've only got Mozilla and a
couple of xterms running right now.  My Tru64 server, with only 9 users
at 7:40 am, has 443 processes (during the business day we typically get
a lot more processes - we'll have 50-75 users logged in, lots of POP
checks running, and the usually at least once per day spam attack adding
a couple of hundred extra sendmail processes - that's when we hit 1000+
processes).

> The problem is screen space, pure and simple. If the
> default limit goes to over 1 billion, then "ps" output
> must wrap lines. There is no alternative, unless you
> think "System going down to reset PID numbers!" is OK.

Adding just one column (and going to 19 bits) allows 16 times as many
PIDs (or a sparse space 16 times as large).

> > $ ps -lf | head -2
> >        F S           UID    PID   PPID %CPU PRI  NI  RSS WCHAN    STARTED         TIME COMMAND
> > 80c08001 I           200 272363 301021  0.0  44   0 520K wait     20:47:46     0:00.09 -bash (bash)
> 
> Eeew. That's broken; it won't display right on a normal
> 80x24 terminal. Linux "ps -lf" will just barely fit.

Why does every possible output format of ps have to fit each process on
one line?  How often are these output formats used by normal people?

> > $ ps j | head -2
> > USER        PID   PPID   PGID   SESS JOBC S    TTY             TIME COMMAND
> > cmadams  272363 301021 272363 272363    0 I    pts/1        0:00.09 -bash (bash)

Note: one reason this one wrapped (instead of chopping the command) is
that Tru64 ps automatically displays everything (like lots of "w"
options) when output is not a TTY.

> It's not just "ps", and I'm not saying you couldn't
> adjust your system for a higher limit. I do think that
> the out-of-box default shouldn't screw up formatting.
> If you need to go past 9999, then you'll want to tweak
> a few other settings as well. Low process counts are
> the common case, and shouldn't be hurt by the fact that
> a few people wish to run a billion processes.

I'm not talking about a billion processes.  Also, are you going to make
all those other programs (well, "top" is the only one named above that
has to be changed) handle variable width PIDs now too?  That argument
cuts both ways.

I think setting things based on formatting of some modes of a couple of
programs that not that many users actually use is not the right way to
go.  We don't limit usernames to 8 characters anymore just because "ls"
truncates the name (not a kernel issue but the same concept).  If a
larger PID space works better for PID allocation, then go with a larger
PID space.

-- 
Chris Adams <cmadams@hiwaay.net>
Systems and Network Administrator - HiWAAY Internet Services
I don't speak for anybody but myself - that's enough trouble.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09  7:04         ` Albert D. Cahalan
  2002-08-09  8:48           ` Helge Hafting
  2002-08-09 13:00           ` Chris Adams
@ 2002-08-09 13:59           ` Rik van Riel
  2002-08-09 14:57             ` Albert D. Cahalan
  2 siblings, 1 reply; 56+ messages in thread
From: Rik van Riel @ 2002-08-09 13:59 UTC (permalink / raw)
  To: Albert D. Cahalan; +Cc: Chris Adams, linux-kernel

On Fri, 9 Aug 2002, Albert D. Cahalan wrote:

> I almost put 99999, but then I realized that that's silly.
> For years Linux had a hard limit of about 4000 processes,

That limit was removed some 2 years ago.

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 13:00           ` Chris Adams
@ 2002-08-09 14:39             ` Albert D. Cahalan
  0 siblings, 0 replies; 56+ messages in thread
From: Albert D. Cahalan @ 2002-08-09 14:39 UTC (permalink / raw)
  To: Chris Adams; +Cc: Albert D. Cahalan, linux-kernel

Chris Adams writes:
> Once upon a time, Albert D. Cahalan <acahalan@cs.uml.edu> said:

>> BTW, let's start with: not only "ps" is affected.
>> Off the top of my head: netstat, fuser, top, pstree...
>
> netstat: already wraps around and loses formatting when PIDs are shown
>     (so no changes)

Sure, but no need to make it worse.

> fuser: already has enough columns (because it uses "kernel" some places)
>     (so no changes)
> top: two line change

Still this is screwing the common case for the truly unusual.

> pstree: PIDs are displayed in "()" after process name (not formatted
>     columns) (so no changes)
>
> How many people actually _use_ PIDs on a regular basis?  When I did our
> upgrade from Tru64 4.x to 5.x and got bigger PIDs, it took awhile before
> anyone else even noticed.  Almost always when I'm doing something to
> processes, I'm scripting it, not typing numbers.

I'm not counting Aunt Tillie.

Anybody on a terminal without cut-and-paste will suffer.

>> I almost put 99999, but then I realized that that's silly.
>> For years Linux had a hard limit of about 4000 processes,
>> and not many people complained. It sure would be nice to
>> gain back a few of the columns lost to other stuff, so
>> that people could once again see command arguments.
>
> We're talking one or two more columns.

It's as many as 25 character-columns. Linus was using
0x3fffffff as a mask. There are 5 field-columns with
PID data in the "ps j" format.

> That's not going to magically
> make it possible to see the command arguments.  If someone wants to see
> the arguments, they're going to have to use "w" anyway (also, "normal"
> ps with no arguments still has lots of space).

Huh? Try "ps -jf", which BTW has 4 PID columns. Also try "ps j".
I'm seeing arguments, and won't if PIDs get too wide.

>> The two real-word usage examples close at hand:
>> 
>> a. My full GNOME desktop has 48 processes.
>> 
>> b. The main UNIX shell server for CS students
>>    at this university has 79 processes.
>>    (Tru64, several hundred CS students, 2:25 am)
>
> My GNOME desktop has 72 processes, and I've only got Mozilla and a
> couple of xterms running right now.

Same as me, but I run Debian. Still, 72 is well below 9999.

> My Tru64 server, with only 9 users
> at 7:40 am, has 443 processes (during the business day we typically get
> a lot more processes - we'll have 50-75 users logged in, lots of POP
> checks running, and the usually at least once per day spam attack adding
> a couple of hundred extra sendmail processes - that's when we hit 1000+
> processes).

While I think this would fit under 9999, you're welcome
to adjust your system for any limit you like. I just don't
think the default should cater to systems that are both
rare and likely to have a real paid admin. It's your job to
increase the limit if you feel a need for more PID space.

>> The problem is screen space, pure and simple. If the
>> default limit goes to over 1 billion, then "ps" output
>> must wrap lines. There is no alternative, unless you
>> think "System going down to reset PID numbers!" is OK.
>
> Adding just one column (and going to 19 bits) allows 16
> times as many PIDs (or a sparse space 16 times as large).

Yes, and it trashes many common formats.

>>> $ ps -lf | head -2
>>>        F S           UID    PID   PPID %CPU PRI  NI  RSS WCHAN    STARTED         TIME COMMAND
>>> 80c08001 I           200 272363 301021  0.0  44   0 520K wait     20:47:46     0:00.09 -bash (bash)
>> 
>> Eeew. That's broken; it won't display right on a normal
>> 80x24 terminal. Linux "ps -lf" will just barely fit.
>
> Why does every possible output format of ps have to fit each process on
> one line?  How often are these output formats used by normal people?

How often are thousands of PIDs used by normal people?

>>> $ ps j | head -2
>>> USER        PID   PPID   PGID   SESS JOBC S    TTY             TIME COMMAND
>>> cmadams  272363 301021 272363 272363    0 I    pts/1        0:00.09 -bash (bash)
>
> Note: one reason this one wrapped (instead of chopping the command) is
> that Tru64 ps automatically displays everything (like lots of "w"
> options) when output is not a TTY.

It's either that or assume 80, unless you have a way to
get info from the controlling tty. I haven't looked into
that yet.

>> It's not just "ps", and I'm not saying you couldn't
>> adjust your system for a higher limit. I do think that
>> the out-of-box default shouldn't screw up formatting.
>> If you need to go past 9999, then you'll want to tweak
>> a few other settings as well. Low process counts are
>> the common case, and shouldn't be hurt by the fact that
>> a few people wish to run a billion processes.
>
> I'm not talking about a billion processes.

That's what Linus was testing with.

> Also, are you going to make
> all those other programs (well, "top" is the only one named above that
> has to be changed) handle variable width PIDs now too?  That argument
> cuts both ways.

Perhaps. You, with the odd case, can do without those programs.

> I think setting things based on formatting of some modes of a couple of
> programs that not that many users actually use is not the right way to
> go.  We don't limit usernames to 8 characters anymore just because "ls"
> truncates the name (not a kernel issue but the same concept).  If a
> larger PID space works better for PID allocation, then go with a larger
> PID space.

Arrrgh! That's a bug in ls, and yes you should limit usernames.
Think about:

Albert_Roberts
Albert_Robertson
Albert_Robbens

I think this is a much more popular problem than PID issues.
Lots of people with an NT background don't see why a long
username might cause problems. (names may be assigned by
corporate policy, and Linux admins must adapt) Maybe I could
do something about this if the default PID limit was 9999.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 13:59           ` Rik van Riel
@ 2002-08-09 14:57             ` Albert D. Cahalan
  0 siblings, 0 replies; 56+ messages in thread
From: Albert D. Cahalan @ 2002-08-09 14:57 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Albert D. Cahalan, Chris Adams, linux-kernel

Rik van Riel writes:
> On Fri, 9 Aug 2002, Albert D. Cahalan wrote:
>
>> I almost put 99999, but then I realized that that's silly.
>> For years Linux had a hard limit of about 4000 processes,
>
> That limit was removed some 2 years ago.

Sure. Linux is 11 years old now. Also remember that that
was the _hard_ limit. We had a soft limit near 1000,
with a kernel recompile needed to increase it. Almost
everybody was satisfied with this.

Personally I'd change my adjustable PID limit to 999,
but I wouldn't suggest that as a default. 9999 should
cover almost all systems.

Self-adjusting could be nice. Then everybody starts
with 9, and gains a digit whenever space is 50% full.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches
  2002-08-09 11:22     ` Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches Hubertus Franke
@ 2002-08-09 15:36       ` Andries Brouwer
  2002-08-09 18:14         ` Hubertus Franke
  2002-08-09 16:05       ` Linus Torvalds
  1 sibling, 1 reply; 56+ messages in thread
From: Andries Brouwer @ 2002-08-09 15:36 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Linus Torvalds, Andrew Morton, Paul Larson, lkml, andrea, gh

On Fri, Aug 09, 2002 at 07:22:08AM -0400, Hubertus Franke wrote:

> Particulary for large number of tasks, this can lead to frequent exercise of
> the repeat resulting in a O(N^2) algorithm. We call this : <algo-0>.

Your math is flawed. The O(N^2) happens only when the name space for pid's
has the same order of magnitude as the number N of processes.
Now consider N=100000 with 31-bit name space. In a series of
2.10^9 forks you have to do the loop fewer than N times and
N^2 / 2.10^9 = 5. You see that on average for each fork there
are 5 comparisons.
For N=1000000 you rearrange the task list as I described yesterday
so that each loop takes time sqrt(N), and altogether N.sqrt(N)
comparisons are needed in a series of 2.10^9 forks.
That is 0.5 comparisons per fork.

You see that thanks to the large pid space things get really
efficient. Ugly constructions are only needed when a large fraction
of all possible pids is actually in use, or when you need hard
real time guarantees.

Andries

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches
  2002-08-09 11:22     ` Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches Hubertus Franke
  2002-08-09 15:36       ` Andries Brouwer
@ 2002-08-09 16:05       ` Linus Torvalds
  2002-08-09 18:18         ` Hubertus Franke
  1 sibling, 1 reply; 56+ messages in thread
From: Linus Torvalds @ 2002-08-09 16:05 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Andrew Morton, Andries Brouwer, Paul Larson, lkml, andrea, gh

On Fri, 9 Aug 2002, Hubertus Franke wrote:
> 
> I dragged the various algorithms into a userlevel test program to figure
> out where the cut off points are with PID_MAX=32768. In this testprogram
> I maintain A tasks, and for 10 rounds (delete D random tasks and 
> reallocate D tasks again) resulting in T=10*D total measured allocations.

Mind re-doing that with PID_MAX=999999 or similar? The whole point of the
current simple algorithm is that the common case (nay, done right, the
_only_ case) is where the number of threads << PID_MAX.

That certainly used to be true with PID_MAX=32768 (not many people may
realize it, but in 1991 the maximum number of tasks in the system was
limited to 63, simply because of how the VM carved out the 4GB address
space).

Things have changed, but considering that some people think that 32k
threads are a limitation already, and that the current code should work
fine (and be pretty much optimal) with a larger PID_MAX, I really think 
it's unfair to not even benchmark that case..

		Linus

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches
  2002-08-09 15:36       ` Andries Brouwer
@ 2002-08-09 18:14         ` Hubertus Franke
  0 siblings, 0 replies; 56+ messages in thread
From: Hubertus Franke @ 2002-08-09 18:14 UTC (permalink / raw)
  To: Andries Brouwer
  Cc: Linus Torvalds, Andrew Morton, Paul Larson, lkml, andrea, gh

On Friday 09 August 2002 11:36 am, Andries Brouwer wrote:

!!!!!!!!!!! You are in a different space !!!!!!!!
All work was done under the assumption of 16-bit pid_t.
I stated yesterday already that for NumTasks substantially smaller
than the pid_t supported size, this won't be a problem
as your analysis states and my data also states.

You have two choices 
(a) move Linux up to 32-bit pid_t
(b) stick within the current 16-bit discussion.

> On Fri, Aug 09, 2002 at 07:22:08AM -0400, Hubertus Franke wrote:
> > Particulary for large number of tasks, this can lead to frequent exercise
> > of the repeat resulting in a O(N^2) algorithm. We call this : <algo-0>.
>
> Your math is flawed. The O(N^2) happens only when the name space for pid's
> has the same order of magnitude as the number N of processes.
> Now consider N=100000 with 31-bit name space. In a series of
> 2.10^9 forks you have to do the loop fewer than N times and
> N^2 / 2.10^9 = 5. You see that on average for each fork there
> are 5 comparisons.
> For N=1000000 you rearrange the task list as I described yesterday
> so that each loop takes time sqrt(N), and altogether N.sqrt(N)
> comparisons are needed in a series of 2.10^9 forks.
> That is 0.5 comparisons per fork.
>
> You see that thanks to the large pid space things get really
> efficient. Ugly constructions are only needed when a large fraction
> of all possible pids is actually in use, or when you need hard
> real time guarantees.
>
> Andries


-- 
-- Hubertus Franke  (frankeh@watson.ibm.com)

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches
  2002-08-09 16:05       ` Linus Torvalds
@ 2002-08-09 18:18         ` Hubertus Franke
  0 siblings, 0 replies; 56+ messages in thread
From: Hubertus Franke @ 2002-08-09 18:18 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Andries Brouwer, Paul Larson, lkml, andrea, gh

On Friday 09 August 2002 12:05 pm, Linus Torvalds wrote:
> On Fri, 9 Aug 2002, Hubertus Franke wrote:
> > I dragged the various algorithms into a userlevel test program to figure
> > out where the cut off points are with PID_MAX=32768. In this testprogram
> > I maintain A tasks, and for 10 rounds (delete D random tasks and
> > reallocate D tasks again) resulting in T=10*D total measured allocations.
>
> Mind re-doing that with PID_MAX=999999 or similar? The whole point of the
> current simple algorithm is that the common case (nay, done right, the
> _only_ case) is where the number of threads << PID_MAX.
>

Don't have time right now...
Simply look at the numbers for the ratio you are expected.
I would be very surprise if the relative curves would change
when moving to 132K tasks and also populate the pid space only by
let's say 25%.

Otherwise, Paul can you run this....

> That certainly used to be true with PID_MAX=32768 (not many people may
> realize it, but in 1991 the maximum number of tasks in the system was
> limited to 63, simply because of how the VM carved out the 4GB address
> space).
>
> Things have changed, but considering that some people think that 32k
> threads are a limitation already, and that the current code should work
> fine (and be pretty much optimal) with a larger PID_MAX, I really think
> it's unfair to not even benchmark that case..
>
> 		Linus

-- 
-- Hubertus Franke  (frankeh@watson.ibm.com)

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-08 22:02 ` Linus Torvalds
                     ` (2 preceding siblings ...)
  2002-08-08 22:37   ` Rik van Riel
@ 2002-08-09 19:34   ` Paul Larson
  2002-08-09 20:13     ` Paul Larson
                       ` (2 more replies)
  3 siblings, 3 replies; 56+ messages in thread
From: Paul Larson @ 2002-08-09 19:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

I suspect that it would actually require more than just this.  I tried
this with the same test I've been using and had several failed attepmts
at low numbers by getting wierd unexpected signals (like 28), and then
one that ran for a much longer time and produced an oops with random
garbage to the console (trying to extract this now).

-Paul Larson

On Thu, 2002-08-08 at 17:02, Linus Torvalds wrote:
> ----
> --- 1.2/include/linux/threads.h	Tue Feb  5 07:23:04 2002
> +++ edited/include/linux/threads.h	Thu Aug  8 14:58:28 2002
> @@ -19,6 +19,7 @@
>  /*
>   * This controls the maximum pid allocated to a process
>   */
> -#define PID_MAX 0x8000
> +#define PID_MASK 0x3fffffff
> +#define PID_MAX (PID_MASK+1)
>  
>  #endif
> ===== kernel/fork.c 1.57 vs edited =====
> --- 1.57/kernel/fork.c	Tue Jul 30 15:49:20 2002
> +++ edited/kernel/fork.c	Thu Aug  8 15:00:16 2002
> @@ -142,7 +142,7 @@
>  		return 0;
>  
>  	spin_lock(&lastpid_lock);
> -	if((++last_pid) & 0xffff8000) {
> +	if((++last_pid) & ~PID_MASK) {
>  		last_pid = 300;		/* Skip daemons etc. */
>  		goto inside;
>  	}
> @@ -157,7 +157,7 @@
>  			   p->tgid == last_pid	||
>  			   p->session == last_pid) {
>  				if(++last_pid >= next_safe) {
> -					if(last_pid & 0xffff8000)
> +					if(last_pid & ~PID_MASK)
>  						last_pid = 300;
>  					next_safe = PID_MAX;
>  				}
> 
> 



^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 19:34   ` Paul Larson
@ 2002-08-09 20:13     ` Paul Larson
  2002-08-09 20:40     ` Andries Brouwer
  2002-08-09 21:23     ` Linus Torvalds
  2 siblings, 0 replies; 56+ messages in thread
From: Paul Larson @ 2002-08-09 20:13 UTC (permalink / raw)
  To: Paul Larson
  Cc: Linus Torvalds, Hubertus Franke, Rik van Riel, Andries Brouwer,
	Andrew Morton, andrea, Dave Jones, lkml

On Fri, 2002-08-09 at 14:34, Paul Larson wrote:
> I suspect that it would actually require more than just this.  I tried
> this with the same test I've been using and had several failed attepmts
> at low numbers by getting wierd unexpected signals (like 28), and then
> one that ran for a much longer time and produced an oops with random
> garbage to the console (trying to extract this now).
> 
> -Paul Larson
Here's the ksymoops output minus the unprintable chars that got sent
after it:

Unable to handle kernel paging request at virtual address 32313256
c015cb38
*pde = 37184001
Oops: 0000
CPU:    4
0010:[<c015cb38>]    Not tainted
Using defaults from ksymoops -t elf32-i386 -a i386
EFLAGS: 00010202
eax: d9768090   ebx: d9768090   ecx: 00000020   edx: 32313232
esi: c015cb10   edi: d9768090   ebp: 00000008   esp: f6cb9e34
ds: 0018   es: 0018   ss: 0018
Stack: c0151403 d9768090 d9768090 f6dabaa0 c01515ad d9768090 f6dabab8
c014f49b
d9768090 f6dabaa0 c0139fa3 00000020 00000001 000001d0 c013a21a c013a170
000001f8 f6cb9e7c 00000020 00000001 000001d0 000041e6 c014f8f0 00000010
Call Trace: [<c0151403>] [<c01515ad>] [<c014f49b>] [<c0139fa3>]
[<c013a21a>]
[<c013a170>] [<c014f8f0>] [<c0131f9e>] [<c0131fec>] [<c0132c64>]
[<c0132fa2>]
[<c0133010>] [<c0116717>] [<c0117186>] [<c011793e>] [<c0105bc5>]
[<c0107173>]
Code: 8b 42 24 85 c0 74 0b f0 ff 48 10 8b 42 24 83 48 14 08 52 e8

>>EIP; c015cb38 <proc_delete_inode+28/50>   <=====
Trace; c0151403 <generic_delete_inode+83/b0>
Trace; c01515ad <iput+5d/60>
Trace; c014f49b <prune_dcache+eb/180>
Trace; c0139fa3 <pdflush_operation+83/90>
Trace; c013a21a <wakeup_bdflush+1a/20>
Trace; c013a170 <background_writeout+0/90>
Trace; c014f8f0 <shrink_dcache_memory+20/40>
Trace; c0131f9e <shrink_caches+7e/a0>
Trace; c0131fec <try_to_free_pages+2c/50>
Trace; c0132c64 <balance_classzone+44/1f0>
Trace; c0132fa2 <__alloc_pages+192/1f0>
Trace; c0133010 <__get_free_pages+10/20>
Trace; c0116717 <dup_task_struct+17/70>
Trace; c0117186 <copy_process+56/7f0>
Trace; c011793e <do_fork+1e/a0>
Trace; c0105bc5 <sys_fork+15/30>
Trace; c0107173 <syscall_call+7/b>
Code;  c015cb38 <proc_delete_inode+28/50>
00000000 <_EIP>:
Code;  c015cb38 <proc_delete_inode+28/50>   <=====
   0:   8b 42 24                  mov    0x24(%edx),%eax   <=====
Code;  c015cb3b <proc_delete_inode+2b/50>
   3:   85 c0                     test   %eax,%eax
Code;  c015cb3d <proc_delete_inode+2d/50>
   5:   74 0b                     je     12 <_EIP+0x12> c015cb4a
<proc_delete_inode+3a/50>
Code;  c015cb3f <proc_delete_inode+2f/50>
   7:   f0 ff 48 10               lock decl 0x10(%eax)
Code;  c015cb43 <proc_delete_inode+33/50>
   b:   8b 42 24                  mov    0x24(%edx),%eax
Code;  c015cb46 <proc_delete_inode+36/50>
   e:   83 48 14 08               orl    $0x8,0x14(%eax)
Code;  c015cb4a <proc_delete_inode+3a/50>
  12:   52                        push   %edx
Code;  c015cb4b <proc_delete_inode+3b/50>
  13:   e8 00 00 00 00            call   18 <_EIP+0x18> c015cb50
<proc_delete_inode+40/50>


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 19:34   ` Paul Larson
  2002-08-09 20:13     ` Paul Larson
@ 2002-08-09 20:40     ` Andries Brouwer
  2002-08-09 21:23     ` Linus Torvalds
  2 siblings, 0 replies; 56+ messages in thread
From: Andries Brouwer @ 2002-08-09 20:40 UTC (permalink / raw)
  To: Paul Larson
  Cc: Linus Torvalds, Hubertus Franke, Rik van Riel, Andrew Morton,
	andrea, Dave Jones, lkml

On Fri, Aug 09, 2002 at 02:34:17PM -0500, Paul Larson wrote:

> I suspect that it would actually require more than just this.  I tried
> this with the same test I've been using and had several failed attepmts
> at low numbers by getting wierd unexpected signals (like 28), and then
> one that ran for a much longer time and produced an oops with random
> garbage to the console (trying to extract this now).

Not much more. Around 2.3.40 I have run with a large PID_MAX for a long time.
The patch that I submitted is still visible on the net various places
(I just tried  Andries pid_max  and found a few).

At that time the only other change (other than <linux/threads.h> and
kernel/fork.c) was in proc/base.c, namely

	-#define fake_ino(pid,ino) (((pid)<<16)|(ino)) 
	+#define fake_ino(pid,ino) (((1)<<16)|(ino)) 

and

	- if (!pid) 
	-         goto out; 
	- if (pid & 0xffff0000) 
	+ if (pid <= 0 || pid >= PID_MAX) 
                  goto out; 

(plucked from google output).

I have not checked precisely what change is appropriate in procfs today.


Andries

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 19:34   ` Paul Larson
  2002-08-09 20:13     ` Paul Larson
  2002-08-09 20:40     ` Andries Brouwer
@ 2002-08-09 21:23     ` Linus Torvalds
  2002-08-09 21:42       ` Linus Torvalds
  2 siblings, 1 reply; 56+ messages in thread
From: Linus Torvalds @ 2002-08-09 21:23 UTC (permalink / raw)
  To: Paul Larson
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On 9 Aug 2002, Paul Larson wrote:
>
> I suspect that it would actually require more than just this. 

There are various /proc paths, Andries had a full patch at some point a 
long time ago.

My point was not so much that this is sufficient, but that the approach is 
valid. I'd rather go with the simple approach (that solves two problems) 
than with a much more complex approach (that solves only one).

And yes, I doubt I actually want to give all 30 bits to pid space in the 
long run (the per-node pid space argument is still valid), but it's better 
to start with 30 bits and actively trying to break things and then phasing 
it down later than to be timid and never even see the stuff that breaks.

		Linus

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 21:23     ` Linus Torvalds
@ 2002-08-09 21:42       ` Linus Torvalds
  2002-08-09 21:46         ` Paul Larson
  0 siblings, 1 reply; 56+ messages in thread
From: Linus Torvalds @ 2002-08-09 21:42 UTC (permalink / raw)
  To: Paul Larson
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On Fri, 9 Aug 2002, Linus Torvalds wrote:
>
> There are various /proc paths, Andries had a full patch at some point a
> long time ago.

Hmm.. Giving them a quick glance-over, the /proc issues look like they
shouldn't be there in 2.5.x anyway, since the inode number really is
largely just a random number in 2.5 and all the real information is
squirelled away at path open time.

There's certainly a possibility for some cleanups, though.

		Linus

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 21:42       ` Linus Torvalds
@ 2002-08-09 21:46         ` Paul Larson
  2002-08-09 22:04           ` Linus Torvalds
  0 siblings, 1 reply; 56+ messages in thread
From: Paul Larson @ 2002-08-09 21:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On Fri, 2002-08-09 at 16:42, Linus Torvalds wrote:

> Hmm.. Giving them a quick glance-over, the /proc issues look like they
> shouldn't be there in 2.5.x anyway, since the inode number really is
> largely just a random number in 2.5 and all the real information is
> squirelled away at path open time.
> 
> There's certainly a possibility for some cleanups, though.
So for now then, should I dig out my original (minimal) patch that
*just* fixed the "loop forever even if we're out of pids" problem?  Even
if we increase PID_MAX to something obscenely high, I think we should
still be checking for this.

-Paul Larson


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 21:46         ` Paul Larson
@ 2002-08-09 22:04           ` Linus Torvalds
  2002-08-10 17:23             ` Jamie Lokier
                               ` (2 more replies)
  0 siblings, 3 replies; 56+ messages in thread
From: Linus Torvalds @ 2002-08-09 22:04 UTC (permalink / raw)
  To: Paul Larson
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

On 9 Aug 2002, Paul Larson wrote:
> On Fri, 2002-08-09 at 16:42, Linus Torvalds wrote:
> > Hmm.. Giving them a quick glance-over, the /proc issues look like they
> > shouldn't be there in 2.5.x anyway, since the inode number really is
> > largely just a random number in 2.5 and all the real information is
> > squirelled away at path open time.

It looks like the biggest impact on /proc would be that the /proc/<pid> 
inodes wouldn't be 10%% unique any more, which in turn means that an 
old-style /bin/pwd that actually walks the tree backwards and checks the 
inode number would occasionally fail.

That in turn makes me suspect that we'd better off just biting the bullet 
and makign the inode numbers almost completely static, and forcing that 
particular failure mode early rather than hit it randomly due to unlucky 
timing.

Doing a simple strace shows that all the systems I have regular access to
use the "getcwd()" system call anyway, which gets this right on /proc (and
other filesystems that do not guarantee unique inode numbers)

> So for now then, should I dig out my original (minimal) patch that
> *just* fixed the "loop forever even if we're out of pids" problem?  Even
> if we increase PID_MAX to something obscenely high, I think we should
> still be checking for this.

Ehh, considering that especially with a 30-bit pid, there's _no_ way we'd
run out without some other serious problems hitting us long before (out of
memory being the obvious one), I don't think even that is an issue.

With a minimum of 8kB / pid for process overhead, you need to have at 
least 43 bits of physical address space completely populated just to cover 
the memory uses of that many pid's.

		Linus

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 22:04           ` Linus Torvalds
@ 2002-08-10 17:23             ` Jamie Lokier
  2002-08-10 18:33               ` Linus Torvalds
  2002-08-12  8:56             ` Padraig Brady
  2002-08-12 14:40             ` Paul Larson
  2 siblings, 1 reply; 56+ messages in thread
From: Jamie Lokier @ 2002-08-10 17:23 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul Larson, Hubertus Franke, Rik van Riel, Andries Brouwer,
	Andrew Morton, andrea, Dave Jones, lkml

Linus Torvalds wrote:
> Doing a simple strace shows that all the systems I have regular access to
> use the "getcwd()" system call anyway, which gets this right on /proc (and
> other filesystems that do not guarantee unique inode numbers)

Oh dear -- what of programs that assume duplicate inode numbers are hard
links, and therefore assume the same contents will be found in each
duplicate?

-- Jamie

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-10 17:23             ` Jamie Lokier
@ 2002-08-10 18:33               ` Linus Torvalds
  2002-08-10 18:48                 ` Jamie Lokier
  0 siblings, 1 reply; 56+ messages in thread
From: Linus Torvalds @ 2002-08-10 18:33 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Paul Larson, Hubertus Franke, Rik van Riel, Andries Brouwer,
	Andrew Morton, andrea, Dave Jones, lkml


On Sat, 10 Aug 2002, Jamie Lokier wrote:
>
> Linus Torvalds wrote:
> > Doing a simple strace shows that all the systems I have regular access to
> > use the "getcwd()" system call anyway, which gets this right on /proc (and
> > other filesystems that do not guarantee unique inode numbers)
> 
> Oh dear -- what of programs that assume duplicate inode numbers are hard
> links, and therefore assume the same contents will be found in each
> duplicate?

Well, anybody who tries to back up /proc with "tar" is in for some 
surprises anyway ;)

			Linus


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-10 18:33               ` Linus Torvalds
@ 2002-08-10 18:48                 ` Jamie Lokier
  2002-08-11 20:41                   ` Alan Cox
  0 siblings, 1 reply; 56+ messages in thread
From: Jamie Lokier @ 2002-08-10 18:48 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul Larson, Hubertus Franke, Rik van Riel, Andries Brouwer,
	Andrew Morton, andrea, Dave Jones, lkml

Linus Torvalds wrote:
> > Oh dear -- what of programs that assume duplicate inode numbers are hard
> > links, and therefore assume the same contents will be found in each
> > duplicate?
> 
> Well, anybody who tries to back up /proc with "tar" is in for some 
> surprises anyway ;)

I was thinking of an over-intelligent `find'.  But hey, as long as this
is only for the weird and wonderful /proc :-)

-- Jamie

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-11 20:41                   ` Alan Cox
@ 2002-08-11 19:58                     ` Jamie Lokier
  2002-08-11 21:23                       ` Alan Cox
  0 siblings, 1 reply; 56+ messages in thread
From: Jamie Lokier @ 2002-08-11 19:58 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linus Torvalds, Paul Larson, Hubertus Franke, Rik van Riel,
	Andries Brouwer, Andrew Morton, andrea, Dave Jones, lkml

Alan Cox wrote:
> If they hold both handles open and stat them and find the same inode
> number then yes its a bug. We have lots of room for inode numbers to
> handle 2^30 processes and allow for 2^30 other files

So, in general, the way to detect hard links requires both objects to be
open at the same time?  I was sure it was enough to stat(), and check
(st1.st_ino == st2.st_ino && st1.st_dev == st2.st_dev).

Admittedly, one of the object could be renamed or deleted in that time
so it's not 100% reliable on changing filesystems.

Ah well.

-- Jamie

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-11 21:23                       ` Alan Cox
@ 2002-08-11 20:10                         ` Jamie Lokier
  0 siblings, 0 replies; 56+ messages in thread
From: Jamie Lokier @ 2002-08-11 20:10 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linus Torvalds, Paul Larson, Hubertus Franke, Rik van Riel,
	Andries Brouwer, Andrew Morton, andrea, Dave Jones, lkml

Alan Cox wrote:
> > So, in general, the way to detect hard links requires both objects to be
> > open at the same time?  I was sure it was enough to stat(), and check
> > (st1.st_ino == st2.st_ino && st1.st_dev == st2.st_dev).
> > 
> > Admittedly, one of the object could be renamed or deleted in that time
> > so it's not 100% reliable on changing filesystems.
> 
> Hence you need both open at the same time. 

... unless you know that what you're looking at isn't changing, or you
only guarantee correct results for parts of the filesystem which don't
change (`tar', `find' etc.)

Again, /proc is exempt! :)

-- Jamie

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-10 18:48                 ` Jamie Lokier
@ 2002-08-11 20:41                   ` Alan Cox
  2002-08-11 19:58                     ` Jamie Lokier
  0 siblings, 1 reply; 56+ messages in thread
From: Alan Cox @ 2002-08-11 20:41 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Linus Torvalds, Paul Larson, Hubertus Franke, Rik van Riel,
	Andries Brouwer, Andrew Morton, andrea, Dave Jones, lkml

On Sat, 2002-08-10 at 19:48, Jamie Lokier wrote:
> Linus Torvalds wrote:
> > > Oh dear -- what of programs that assume duplicate inode numbers are hard
> > > links, and therefore assume the same contents will be found in each
> > > duplicate?
> > 
> > Well, anybody who tries to back up /proc with "tar" is in for some 
> > surprises anyway ;)
> 
> I was thinking of an over-intelligent `find'.  But hey, as long as this
> is only for the weird and wonderful /proc :-)

If they hold both handles open and stat them and find the same inode
number then yes its a bug. We have lots of room for inode numbers to
handle 2^30 processes and allow for 2^30 other files


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-11 19:58                     ` Jamie Lokier
@ 2002-08-11 21:23                       ` Alan Cox
  2002-08-11 20:10                         ` Jamie Lokier
  0 siblings, 1 reply; 56+ messages in thread
From: Alan Cox @ 2002-08-11 21:23 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Linus Torvalds, Paul Larson, Hubertus Franke, Rik van Riel,
	Andries Brouwer, Andrew Morton, andrea, Dave Jones, lkml

On Sun, 2002-08-11 at 20:58, Jamie Lokier wrote:
> Alan Cox wrote:
> > If they hold both handles open and stat them and find the same inode
> > number then yes its a bug. We have lots of room for inode numbers to
> > handle 2^30 processes and allow for 2^30 other files
> 
> So, in general, the way to detect hard links requires both objects to be
> open at the same time?  I was sure it was enough to stat(), and check
> (st1.st_ino == st2.st_ino && st1.st_dev == st2.st_dev).
> 
> Admittedly, one of the object could be renamed or deleted in that time
> so it's not 100% reliable on changing filesystems.

Hence you need both open at the same time. 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 22:04           ` Linus Torvalds
  2002-08-10 17:23             ` Jamie Lokier
@ 2002-08-12  8:56             ` Padraig Brady
  2002-08-12 10:37               ` Alan Cox
  2002-08-12 14:40             ` Paul Larson
  2 siblings, 1 reply; 56+ messages in thread
From: Padraig Brady @ 2002-08-12  8:56 UTC (permalink / raw)
  To: lkml

Linus Torvalds wrote:
> On 9 Aug 2002, Paul Larson wrote:
> 
>>On Fri, 2002-08-09 at 16:42, Linus Torvalds wrote:
>>
>>>Hmm.. Giving them a quick glance-over, the /proc issues look like they
>>>shouldn't be there in 2.5.x anyway, since the inode number really is
>>>largely just a random number in 2.5 and all the real information is
>>>squirelled away at path open time.
>>
> 
> It looks like the biggest impact on /proc would be that the /proc/<pid> 
> inodes wouldn't be 10%% unique any more, which in turn means that an 
> old-style /bin/pwd that actually walks the tree backwards and checks the 
> inode number would occasionally fail.
> 
> That in turn makes me suspect that we'd better off just biting the bullet 
> and makign the inode numbers almost completely static, and forcing that 
> particular failure mode early rather than hit it randomly due to unlucky 
> timing.
> 
> Doing a simple strace shows that all the systems I have regular access to
> use the "getcwd()" system call anyway, which gets this right on /proc (and
> other filesystems that do not guarantee unique inode numbers)

Anyone care to clarify which filesystems don't
have unique inode numbers. I always thought you
could uniquely identify any file using a device,inode
tuple? Fair enough proc is "special" but can/should
you not assume unique inodes within all other filesystems?

Also why can't you allocate a unique number in /proc?

thanks,
Pádraig.


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-12 10:37               ` Alan Cox
@ 2002-08-12  9:21                 ` Padraig Brady
  0 siblings, 0 replies; 56+ messages in thread
From: Padraig Brady @ 2002-08-12  9:21 UTC (permalink / raw)
  To: lkml

Alan Cox wrote:
> On Mon, 2002-08-12 at 09:56, Padraig Brady wrote:
> 
>>Anyone care to clarify which filesystems don't
>>have unique inode numbers. I always thought you
>>could uniquely identify any file using a device,inode
>>tuple? Fair enough proc is "special" but can/should
>>you not assume unique inodes within all other filesystems?
> 
> 
> 2.4 functions correctly here in all the stuff I've looked at.

cool.

> I can't speak for 2.5. In the 2.4 case you must be holding the files open 
 > during the comparison. Some file systems like MSDOS invent inodes as
 > they go, never issuing the same one to two objects at the same time but
 > happily reissuing different inode numbers the next time around.

Sure, no hardlinks so who cares what the number is, as long
as it's unique.

> That breaks NFS but not much else

hmm..

thanks,
Pádraig.


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-12  8:56             ` Padraig Brady
@ 2002-08-12 10:37               ` Alan Cox
  2002-08-12  9:21                 ` Padraig Brady
  0 siblings, 1 reply; 56+ messages in thread
From: Alan Cox @ 2002-08-12 10:37 UTC (permalink / raw)
  To: Padraig Brady; +Cc: lkml

On Mon, 2002-08-12 at 09:56, Padraig Brady wrote:
> Anyone care to clarify which filesystems don't
> have unique inode numbers. I always thought you
> could uniquely identify any file using a device,inode
> tuple? Fair enough proc is "special" but can/should
> you not assume unique inodes within all other filesystems?

2.4 functions correctly here in all the stuff I've looked at. I can't
speak for 2.5. In the 2.4 case you must be holding the files open during
the comparison. Some file systems like MSDOS invent inodes as they go,
never issuing the same one to two objects at the same time but happily
reissuing different inode numbers the next time around.

That breaks NFS but not much else

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
  2002-08-09 22:04           ` Linus Torvalds
  2002-08-10 17:23             ` Jamie Lokier
  2002-08-12  8:56             ` Padraig Brady
@ 2002-08-12 14:40             ` Paul Larson
  2 siblings, 0 replies; 56+ messages in thread
From: Paul Larson @ 2002-08-12 14:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hubertus Franke, Rik van Riel, Andries Brouwer, Andrew Morton,
	andrea, Dave Jones, lkml

Dave McCracken had a patch a while back that should probably be
considered with all this too. 
http://marc.theaimsgroup.com/?l=linux-kernel&m=100506843702466&w=2

This fixes the calculation of the max number of tasks so that it doesn't
consider highmem since it can't use use that anyway.

-Paul Larson

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH] Linux-2.5 fix/improve get_pid()
@ 2002-08-12 16:15 Jim Houston
  0 siblings, 0 replies; 56+ messages in thread
From: Jim Houston @ 2002-08-12 16:15 UTC (permalink / raw)
  To: frankeh, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1475 bytes --]


I was doing some similar work last week and stumbled on the
this discussion.  I have been working with the Posix timers
patch and was looking for a nice way to allocate ids for
timers.

I looked at get_pid() and also the code used for ipc/msg.c
and friends.  I thought I could do better so I wrote the
attached routines.  They are a subtle variation on the bitmap
theme.  I'm mixing a bitmap with a sparse array which is implemented
as a radix tree.  This means that the data structure
grows/shrinks as needed.  The interface is:

	id = id_new(idp, pointer);
	id_remove(idp, id);
	pointer = id_lookup(idp, id);

The idp is a pointer to the structure which defines the id space.
This provides, not only the bitmap allocation of the id but also, mapping
the id to a pointer.  In the case of pids, this would replace the
hash table used by find_task_by_pid().

So far I have been playing with this code in user space.  I'm including
my test harness.  I plan to try it with your program later today.
My program was written more to test correctness, but I do collect
a few numbers.  Here is the output:

jhouston@linux:~/id > time ./id_test
my_id.count=1
new_cnt=5002501
avr_depth = 4996
id_lookup took 101 cycles
id_new took 204 cycles
id_remove took 148 cycles 
real    0m3.476s

That's about 0.5us total to allocate an id, lookup a random id
and remove it with a table containing 5000 ids.  The times
only double for 500000 ids.

Jim Houston - Concurrent Computer Corporation.

[-- Attachment #2: id.c --]
[-- Type: text/plain, Size: 2825 bytes --]

/*
 * Small id to pointer translation service.  
 *
 * It uses a radix tree like structure as a sparse array indexed 
 * by the id to obtain the pointer.  The bitmap makes allocating
 * an new id quick.  
 */

#include "id.h"

#ifdef __KERNEL__
static kmem_cache_t *id_layer_cache;
#endif

void *id_lookup(struct id *idp, int id)
{
	int n = idp->layers * ID_BITS;
	struct id_layer *p = idp->top;

	if (id >= (1 << n))
		return(NULL);

	while (n > 0 && p) {
		n -= ID_BITS;
		p = p->ary[(id >> n) & ID_MASK];
	}
	return((void *)p);
}

static int sub_alloc(struct id_layer *p, int shift, int id, void *ptr)
{
	int n = (id >> shift) & ID_MASK;
	int bitmap = p->bitmap;
	int id_base = id & ~((1 << (shift+ID_BITS))-1);
	int v;
	
	for ( ; n <= ID_MASK; n++, id = id_base + (n << shift)) {
		if (bitmap & (1 << n))
			continue;
		if (shift == 0) {
			p->ary[n] = (struct id_layer *)ptr;
			p->bitmap |= 1<<n;
			return(id);
		}
		if (!p->ary[n])
			p->ary[n] = alloc_layer();
		if (v = sub_alloc(p->ary[n], shift-ID_BITS, id, ptr)) {
			update_bitmap(p, n);
			return(v);
		}
	}
	return(0);
}

int id_new(struct id *idp, void *ptr)
{
	int n = idp->layers * ID_BITS;
	int last = idp->last;
	struct id_layer **p, *new;
	int id, v;
	
	/*
	 * Add a new layer if the array is full or the last id
	 * was at the limit and we don't want to wrap.
	 */
	if ((last == ((1 << n)-1) && last < idp->min_wrap) ||
		idp->count == (1 << n)) {
		++idp->layers;
		n += ID_BITS;
		new = alloc_layer();
		new->ary[0] = idp->top;
		idp->top = new;
		update_bitmap(new, 0);
	}
	if (last >= ((1 << n)-1))
		last = 0;

	/*
	 * Search for a free id starting after last id allocated.
	 * If that fails wrap back to start.
	 */
	id = last+1;
	if (!(v = sub_alloc(idp->top, n-ID_BITS, id, ptr)))
		v = sub_alloc(idp->top, n-ID_BITS, 1, ptr);
	idp->last = v;
	idp->count++;
	return(v);
}


static int sub_remove(struct id_layer *p, int shift, int id)
{
	int n = (id >> shift) & ID_MASK;
	int i, bitmap, rv;
	
if (!p) {
printf("in sub_remove for id=%d called with null pointer.\n", id);
return(0);
}
	rv = 0;
	bitmap = p->bitmap & ~(1<<n);
	p->bitmap = bitmap;
	if (shift == 0) {
		p->ary[n] = NULL;
		rv = !bitmap;
	} else {
		if (sub_remove(p->ary[n], shift-ID_BITS, id)) {
			free_layer(p->ary[n]);
			p->ary[n] = 0;
			for (i = 0; i < (1 << ID_BITS); i++)
				if (p->ary[i])
					break;
			if (i == (1 << ID_BITS))
				rv = 1;
		}
	}
	return(rv);
}

void id_remove(struct id *idp, int id)
{
	sub_remove(idp->top, (idp->layers-1)*ID_BITS, id);
	idp->count--;
}

void id_init(struct id *idp, int min_wrap)
{
#ifdef __KERNEL__
	id_layer_cache = kmem_cache_create("id_layer_cache", 
		sizeof(struct id_layer), 0, 0, 0, 0);
#endif
	idp->count = 1;
	idp->last = 0;
	idp->layers = 1;
	idp->top = alloc_layer();
	idp->top->bitmap = 1;
	idp->min_wrap = min_wrap;
}


[-- Attachment #3: id.h --]
[-- Type: text/plain, Size: 1289 bytes --]

/*
 * Small id to pointer translation service avoiding fixed sized
 * tables.
 */

#define ID_BITS 5

#define ID_MASK ((1 << ID_BITS)-1)
#define ID_FULL ((1 << (1 << ID_BITS))-1)

struct id_layer {
	unsigned int	bitmap: (1<<ID_BITS);
	struct id_layer	*ary[1<<ID_BITS];
};

struct id {
	int		layers;
	int		last;
	int		count;
	int		min_wrap;
	struct id_layer *top;
};

void *id_lookup(struct id *idp, int id);
int id_new(struct id *idp, void *ptr);
void id_remove(struct id *idp, int id);
void id_init(struct id *idp, int min_wrap);

static inline update_bitmap(struct id_layer *p, int bit)
{
	if (p->ary[bit] && p->ary[bit]->bitmap == (typeof(p->bitmap))-1)
		p->bitmap |= 1<<bit;
	else
		p->bitmap &= ~(1<<bit);
}

#ifndef __KERNEL__

#ifndef NULL
#define NULL 0
#endif

static inline struct id_layer *alloc_layer()
{
	struct id_layer *new;

	new = (struct id_layer *)malloc(sizeof(struct id_layer));
	bzero((void *)new, sizeof(struct id_layer));
	return(new);
	
}

static inline void free_layer(struct id_layer *p)
{
	free((void *)p);
}

#else

extern kmem_cache_t *id_layer_cache;

static inline struct id_layer *alloc_layer()
{
	return(kmem_cache_alloc(id_layer_cache, GFP_KERNEL));
}

static inline void free_layer(struct id_layer *p)
{
	kmem_cache_free(id_layer_cache, p);
}

#endif


[-- Attachment #4: id_test.c --]
[-- Type: text/plain, Size: 2133 bytes --]

/*
 * Test program for id.c
 */

#include <stdlib.h>
#include "id.h"

struct id my_id;

main()
{
	int i, n;
	void *v;

	id_init(&my_id, 10000);
	
	random_test();
}

#define LIST_SZ 50000

struct id_list {
	int id;
	int ptr;
} id_list[LIST_SZ];

int list_cnt;
int new_cnt;
int max = 5000;
int count = 10000000;

long long avr_depth;

#define rdtsc(low,high) \
     __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
  
static inline unsigned long get_tsc(void)
{
	register unsigned long eax, edx;

	rdtsc(eax,edx);
	return(eax);
}



random_test()
{
	int n, i;
	void *v;
	unsigned long t1, t2, t3, t4;

	for (n = 0; n < count; n++) {
		/*
		 * favor insertion so we will tend to run will max
		 * id's active.
		 */
		if (list_cnt && (list_cnt > max || rand() < (RAND_MAX/4))) {
			i = rand() % list_cnt;
			v = id_lookup(&my_id, id_list[i].id);
			if ((int)v != id_list[i].ptr) {
				printf("list_cnt=%d, i=%d\n", list_cnt, i);
				printf("failed id=%d, expected %d got %d\n",
					id_list[i].id, id_list[i].ptr, (int)v);
			} else {
#if 0
				printf("rm id=%d, ptr=%d\n",
					id_list[i].id, id_list[i].ptr);
#endif
				id_remove(&my_id, id_list[i].id);
			}
			id_list[i] = id_list[--list_cnt];
		} else {
			new_cnt++;
			id_list[list_cnt].id = id_new(&my_id, (void *)new_cnt);
			id_list[list_cnt].ptr = new_cnt;
#if 0
			printf("ins id=%d, ptr=%d\n",
				id_list[list_cnt].id, id_list[list_cnt].ptr);
#endif
			list_cnt++;
			avr_depth += list_cnt;
		}
	}
t1 = get_tsc();
id_lookup(&my_id, id_list[0].id);
t2 = get_tsc();
n = id_new(&my_id, (void *)++new_cnt);
t3 = get_tsc();
id_remove(&my_id, n);
t4 = get_tsc();

	for (i = 0; i < list_cnt; i++) {
		v = id_lookup(&my_id, id_list[i].id);
		if ((int)v != id_list[i].ptr) {
			printf("failed id=%d, expected %d got %d\n",
				id_list[i].id, id_list[i], (int)v);
		}
		id_remove(&my_id, id_list[i].id);
	}
	printf("my_id.count=%d\n", my_id.count);
	printf("new_cnt=%d\n", new_cnt);
	printf("avr_depth = %d\n", (int)(avr_depth/new_cnt));
	printf("id_lookup took %d cycles\n", t2-t1);
	printf("id_new took %d cycles\n", t3-t2);
	printf("id_remove took %d cycles\n", t4-t3);
}

^ permalink raw reply	[flat|nested] 56+ messages in thread

end of thread, other threads:[~2002-08-12 16:02 UTC | newest]

Thread overview: 56+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-08-07 22:03 [PATCH] Linux-2.5 fix/improve get_pid() Paul Larson
2002-08-07 23:06 ` Andrew Morton
2002-08-08  0:24   ` Andries Brouwer
2002-08-08 19:42     ` William Lee Irwin III
2002-08-08 20:47       ` Andries Brouwer
2002-08-09 11:22     ` Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches Hubertus Franke
2002-08-09 15:36       ` Andries Brouwer
2002-08-09 18:14         ` Hubertus Franke
2002-08-09 16:05       ` Linus Torvalds
2002-08-09 18:18         ` Hubertus Franke
2002-08-08 20:24   ` [PATCH] Linux-2.5 fix/improve get_pid() Linus Torvalds
2002-08-08 21:30     ` H. Peter Anvin
2002-08-08 21:45     ` William Lee Irwin III
2002-08-09  4:42     ` William Lee Irwin III
  -- strict thread matches above, loose matches on Subject: below --
2002-08-08 18:56 Hubertus Franke
2002-08-08 19:15 ` Rik van Riel
2002-08-08 19:18 ` Paul Larson
2002-08-08 21:43 Hubertus Franke
2002-08-08 22:02 ` Linus Torvalds
2002-08-08 22:26   ` Linus Torvalds
2002-08-08 23:09     ` Albert D. Cahalan
2002-08-09  3:26       ` Chris Adams
2002-08-09  7:04         ` Albert D. Cahalan
2002-08-09  8:48           ` Helge Hafting
2002-08-09 10:32             ` Alan Cox
2002-08-09  9:35               ` Andrew Morton
2002-08-09 10:37               ` Andries Brouwer
2002-08-09 13:00           ` Chris Adams
2002-08-09 14:39             ` Albert D. Cahalan
2002-08-09 13:59           ` Rik van Riel
2002-08-09 14:57             ` Albert D. Cahalan
2002-08-08 22:34   ` Paul Larson
2002-08-08 22:44     ` Rik van Riel
2002-08-08 22:37   ` Rik van Riel
2002-08-09  1:49     ` Andries Brouwer
2002-08-09 19:34   ` Paul Larson
2002-08-09 20:13     ` Paul Larson
2002-08-09 20:40     ` Andries Brouwer
2002-08-09 21:23     ` Linus Torvalds
2002-08-09 21:42       ` Linus Torvalds
2002-08-09 21:46         ` Paul Larson
2002-08-09 22:04           ` Linus Torvalds
2002-08-10 17:23             ` Jamie Lokier
2002-08-10 18:33               ` Linus Torvalds
2002-08-10 18:48                 ` Jamie Lokier
2002-08-11 20:41                   ` Alan Cox
2002-08-11 19:58                     ` Jamie Lokier
2002-08-11 21:23                       ` Alan Cox
2002-08-11 20:10                         ` Jamie Lokier
2002-08-12  8:56             ` Padraig Brady
2002-08-12 10:37               ` Alan Cox
2002-08-12  9:21                 ` Padraig Brady
2002-08-12 14:40             ` Paul Larson
2002-08-08 21:50 Hubertus Franke
2002-08-09  0:53 Hubertus Franke
2002-08-12 16:15 Jim Houston

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox