public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* get_pid fixes against 2.4.19pre7
@ 2002-04-26 11:44 Andrea Arcangeli
  2002-04-26 11:53 ` Russell King
  2002-04-26 13:16 ` Hubertus Franke
  0 siblings, 2 replies; 5+ messages in thread
From: Andrea Arcangeli @ 2002-04-26 11:44 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-kernel, Ihno Krumreich, Linus Torvalds

Hello,

Could you have a look at these get_pid fixes? Besides the deadlocking
while running out of pids and reducing from quadratic to linear the
complexity of get_pids, it also addresses a longstanding non trivial
race present in 2.2 too that can lead to pid collisions even on UP
(noticed today while merging two more fixes from Ihno on the other
part). the fix reduces a bit scalability of simultaneous forks from
different cpus, but it's obviously right at least. Putting non-ready
tasks into the tasklists asks for troubles (signals...).  For more
details see the comment at the end of the patch. If you've suggestions
they're welcome, thanks.

Patch in pre7aa2 against 2.4.19pre7:

diff -urN 2.4.19pre7/include/linux/threads.h get_pid-1/include/linux/threads.h
--- 2.4.19pre7/include/linux/threads.h	Thu Apr 18 07:51:30 2002
+++ get_pid-1/include/linux/threads.h	Fri Apr 26 09:10:30 2002
@@ -19,6 +19,6 @@
 /*
  * This controls the maximum pid allocated to a process
  */
-#define PID_MAX 0x8000
+#define PID_NR 0x8000
 
 #endif
diff -urN 2.4.19pre7/kernel/fork.c get_pid-1/kernel/fork.c
--- 2.4.19pre7/kernel/fork.c	Tue Apr 16 08:12:09 2002
+++ get_pid-1/kernel/fork.c	Fri Apr 26 10:25:33 2002
@@ -37,6 +37,12 @@
 
 struct task_struct *pidhash[PIDHASH_SZ];
 
+/*
+ * Protects next_unsafe and last_pid, and avoids races
+ * between get_pid and SET_LINKS().
+ */
+static DECLARE_MUTEX(getpid_mutex);
+
 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 {
 	unsigned long flags;
@@ -79,51 +85,105 @@
 	init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 }
 
-/* Protects next_safe and last_pid. */
-spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
-
+/*
+ *	Get the next free pid for a new process/thread.
+ *
+ *	Strategy: last_pid and next_unsafe (excluded) are an interval where all pids
+ *		  are free, so next pid is just last_pid + 1 if it's also < next_unsafe.
+ *		  If last_pid + 1 >= next_unsafe the interval is completely used.
+ *		  In this case a bitmap with all used pids/tgids/pgrps/sessions
+ *		  is created. This bitmap is searched for the next free pid and next_unsafe.
+ *		  If all pids are used, a kernel warning is issued.
+ */
 static int get_pid(unsigned long flags)
 {
-	static int next_safe = PID_MAX;
+	static int next_unsafe = PID_NR;
+#define PID_FIRST	2 /* pid 1 is init, first usable pid is 2 */
+#define PID_BITMAP_SIZE	((((PID_NR + 7) / 8) + sizeof(long) - 1 ) / (sizeof(long)))
+	/*
+	 * Even if this could be local per-thread, keep it static and protected by
+	 * the lock because we don't want to overflow the stack and we wouldn't
+	 * SMP scale better anyways. It doesn't waste disk space because it's in
+	 * the .bss.
+	 */
+	static unsigned long pid_bitmap[PID_BITMAP_SIZE];
+
+	/* from here the stuff on the stack */
 	struct task_struct *p;
-	int pid;
+	int pid, found_pid;
 
 	if (flags & CLONE_PID)
 		return current->pid;
 
-	spin_lock(&lastpid_lock);
-	if((++last_pid) & 0xffff8000) {
-		last_pid = 300;		/* Skip daemons etc. */
-		goto inside;
-	}
-	if(last_pid >= next_safe) {
-inside:
-		next_safe = PID_MAX;
+	pid = last_pid + 1;
+	if (pid >= next_unsafe) {
+		next_unsafe = PID_NR;
+		memset(pid_bitmap, 0, PID_BITMAP_SIZE*sizeof(long));
+
 		read_lock(&tasklist_lock);
-	repeat:
+		/*
+		 * Build the bitmap and calc next_unsafe.
+		 */
 		for_each_task(p) {
-			if(p->pid == last_pid	||
-			   p->pgrp == last_pid	||
-			   p->tgid == last_pid	||
-			   p->session == last_pid) {
-				if(++last_pid >= next_safe) {
-					if(last_pid & 0xffff8000)
-						last_pid = 300;
-					next_safe = PID_MAX;
+			set_bit(p->pid, pid_bitmap);
+			set_bit(p->pgrp, pid_bitmap);
+			set_bit(p->tgid, pid_bitmap);
+			set_bit(p->session, pid_bitmap);
+
+			if (next_unsafe > p->pid && p->pid > pid)
+				next_unsafe = p->pid;
+			if (next_unsafe > p->pgrp && p->pgrp > pid)
+				next_unsafe = p->pgrp;
+			if (next_unsafe > p->tgid && p->tgid > pid)
+				next_unsafe = p->tgid;
+			if (next_unsafe > p->session && p->session > pid)
+				next_unsafe = p->session;
+		}
+
+		/*
+		 * Release the tasklist_lock, after the unlock it may happen that
+		 * a pid is freed while it's still marked in use
+		 * in the pid_bitmap[].
+		 */
+		read_unlock(&tasklist_lock);
+
+		found_pid = find_next_zero_bit(pid_bitmap, PID_NR, pid);
+		if (found_pid >= PID_NR) {
+			next_unsafe = 0; /* depends on PID_FIRST > 0 */
+			found_pid = find_next_zero_bit(pid_bitmap, pid, PID_FIRST);
+			/* We scanned the whole bitmap without finding a free pid. */
+			if (found_pid >= pid) {
+				static long last_get_pid_warning;
+				if ((unsigned long) (jiffies - last_get_pid_warning) >= HZ) {
+					printk(KERN_NOTICE "No more PIDs (PID_NR = %d)\n", PID_NR);
+					last_get_pid_warning = jiffies;
 				}
-				goto repeat;
+				return -1;
+			}
+		}
+
+		pid = found_pid;
+
+		if (pid > next_unsafe) {
+			/* recalc next_unsafe by looking for the next bit set in the bitmap */
+			unsigned long * start = pid_bitmap;
+			unsigned long * p = start + (pid / (sizeof(long) * 8));
+			unsigned long * end = pid_bitmap + PID_BITMAP_SIZE;
+			unsigned long mask = ~((1UL << (pid & ((sizeof(long) * 8 - 1)))) - 1);
+
+			*p &= (mask << 1);
+
+			while (p < end) {
+				if (*p) {
+					next_unsafe = ffz(~*p) + (p - start) * sizeof(long) * 8;
+					break;
+				}
+				p++;
 			}
-			if(p->pid > last_pid && next_safe > p->pid)
-				next_safe = p->pid;
-			if(p->pgrp > last_pid && next_safe > p->pgrp)
-				next_safe = p->pgrp;
-			if(p->session > last_pid && next_safe > p->session)
-				next_safe = p->session;
 		}
-		read_unlock(&tasklist_lock);
 	}
-	pid = last_pid;
-	spin_unlock(&lastpid_lock);
+
+	last_pid = pid;
 
 	return pid;
 }
@@ -623,7 +683,10 @@
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
+	down(&getpid_mutex);
 	p->pid = get_pid(clone_flags);
+	if (p->pid < 0) /* valid pids are >= 0 */
+		goto bad_fork_cleanup;
 
 	p->run_list.next = NULL;
 	p->run_list.prev = NULL;
@@ -730,7 +793,17 @@
 		list_add(&p->thread_group, &current->thread_group);
 	}
 
+	/*
+	 * We must do the SET_LINKS() under the getpid_mutex, to avoid
+	 * another CPU to get our same PID between the release of the
+	 * getpid_mutex and the SET_LINKS().
+	 *
+	 * In short to avoid SMP races the new child->pid must be just visible
+	 * in the tasklist by the time we drop the getpid_mutex.
+	 */
 	SET_LINKS(p);
+	up(&getpid_mutex);
+
 	hash_pid(p);
 	nr_threads++;
 	write_unlock_irq(&tasklist_lock);
@@ -757,6 +830,7 @@
 bad_fork_cleanup_files:
 	exit_files(p); /* blocking */
 bad_fork_cleanup:
+	up(&getpid_mutex);
 	put_exec_domain(p->exec_domain);
 	if (p->binfmt && p->binfmt->module)
 		__MOD_DEC_USE_COUNT(p->binfmt->module);

Andrea

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: get_pid fixes against 2.4.19pre7
  2002-04-26 11:44 get_pid fixes against 2.4.19pre7 Andrea Arcangeli
@ 2002-04-26 11:53 ` Russell King
  2002-04-26 11:57   ` Andrea Arcangeli
  2002-04-26 13:16 ` Hubertus Franke
  1 sibling, 1 reply; 5+ messages in thread
From: Russell King @ 2002-04-26 11:53 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Marcelo Tosatti, linux-kernel, Ihno Krumreich, Linus Torvalds

On Fri, Apr 26, 2002 at 01:44:09PM +0200, Andrea Arcangeli wrote:
> +			set_bit(p->pid, pid_bitmap);
> +			set_bit(p->pgrp, pid_bitmap);
> +			set_bit(p->tgid, pid_bitmap);
> +			set_bit(p->session, pid_bitmap);

Since we're running under a lock, do we really need the guaranteed
atomic (and therefore expensive) set_bit(), or would __set_bit()
suffice?

-- 
Russell King (rmk@arm.linux.org.uk)                The developer of ARM Linux
             http://www.arm.linux.org.uk/personal/aboutme.html


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: get_pid fixes against 2.4.19pre7
  2002-04-26 11:53 ` Russell King
@ 2002-04-26 11:57   ` Andrea Arcangeli
  0 siblings, 0 replies; 5+ messages in thread
From: Andrea Arcangeli @ 2002-04-26 11:57 UTC (permalink / raw)
  To: Russell King
  Cc: Marcelo Tosatti, linux-kernel, Ihno Krumreich, Linus Torvalds

On Fri, Apr 26, 2002 at 12:53:47PM +0100, Russell King wrote:
> On Fri, Apr 26, 2002 at 01:44:09PM +0200, Andrea Arcangeli wrote:
> > +			set_bit(p->pid, pid_bitmap);
> > +			set_bit(p->pgrp, pid_bitmap);
> > +			set_bit(p->tgid, pid_bitmap);
> > +			set_bit(p->session, pid_bitmap);
> 
> Since we're running under a lock, do we really need the guaranteed
> atomic (and therefore expensive) set_bit(), or would __set_bit()
> suffice?

__set_bit definitely suffices, thanks.

Andrea

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: get_pid fixes against 2.4.19pre7
  2002-04-26 11:44 get_pid fixes against 2.4.19pre7 Andrea Arcangeli
  2002-04-26 11:53 ` Russell King
@ 2002-04-26 13:16 ` Hubertus Franke
  2002-04-27  0:25   ` Andrea Arcangeli
  1 sibling, 1 reply; 5+ messages in thread
From: Hubertus Franke @ 2002-04-26 13:16 UTC (permalink / raw)
  To: Andrea Arcangeli, Marcelo Tosatti
  Cc: linux-kernel, Ihno Krumreich, Linus Torvalds

On Friday 26 April 2002 07:44 am, Andrea Arcangeli wrote:
> Hello,
>
> Could you have a look at these get_pid fixes? Besides the deadlocking
> while running out of pids and reducing from quadratic to linear the
> complexity of get_pids, it also addresses a longstanding non trivial
> race present in 2.2 too that can lead to pid collisions even on UP
> (noticed today while merging two more fixes from Ihno on the other
> part). the fix reduces a bit scalability of simultaneous forks from
> different cpus, but it's obviously right at least. Putting non-ready
> tasks into the tasklists asks for troubles (signals...).  For more
> details see the comment at the end of the patch. If you've suggestions
> they're welcome, thanks.
>
>

Andrea, is this a consolidation of some of the work that I posted earlier 
with the following patch for 2.4.17 and some of the discussions that followed
from that. 

At least from my perspective, I ran some tests and recognized that the
break even point between the two algorithms was about 22K pids.

Hence, I switched between the two algorithms at that break even point.

If you want I can send you the test program and you can measure
for yourself how your stuff is doing. The test program uses the
algo in user space and hence you can get some quick and decent answers.
Let me know if you like it.

-- 
-- Hubertus Franke  (frankeh@watson.ibm.com)


diff -urbN linux-2.4.17/kernel/fork.c linux-2.4.17-pid/kernel/fork.c
--- linux-2.4.17/kernel/fork.c	Wed Nov 21 13:18:42 2001
+++ linux-2.4.17-pid/kernel/fork.c	Thu Mar 28 17:05:33 2002
@@ -81,23 +81,111 @@
 /* Protects next_safe and last_pid. */
 spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
 
+/* this should be provided in every architecture */
+#ifndef SHIFT_PER_LONG
+#if BITS_PER_LONG == 64
+#   define SHIFT_PER_LONG 6
+#elif BITS_PER_LONG == 32
+#   define SHIFT_PER_LONG 5
+#else
+#error "SHIFT_PER_LONG"
+#endif
+#endif
+
+#define RESERVED_PIDS       (300)
+#define GETPID_THRESHOLD    (22000)  /* when to switch to a mapped algo */
+#define PID_MAP_SIZE        (PID_MAX >> SHIFT_PER_LONG)
+static unsigned long pid_map[PID_MAP_SIZE];
+static int next_safe = PID_MAX;
+
+static inline void mark_pid(int pid)
+{
+	__set_bit(pid,pid_map);
+}
+
+static int get_pid_by_map(int lastpid) 
+{
+	static int mark_and_sweep = 0;
+
+	int round = 0;
+        struct task_struct *p;
+        int i;
+	unsigned long mask;
+	int fpos;
+
+
+	if (mark_and_sweep) {
+repeat:
+		mark_and_sweep = 0;
+		memset(pid_map, 0, PID_MAP_SIZE * sizeof(unsigned long));
+		lastpid = RESERVED_PIDS;
+	}
+	for_each_task(p) {
+		mark_pid(p->pid);
+		mark_pid(p->pgrp);
+		mark_pid(p->tgid);
+		mark_pid(p->session);
+	}
+
+	/* find next free pid */
+	i = (lastpid >> SHIFT_PER_LONG);
+	mask = pid_map[i] | ((1 << ((lastpid & (BITS_PER_LONG-1)))) - 1);   
+	
+	while ((mask == ~0) && (++i < PID_MAP_SIZE)) 
+		mask = pid_map[i];
+	
+	if (i == PID_MAP_SIZE) { 
+		if (round == 0) {
+			round = 1;
+			goto repeat;
+		}
+		next_safe = RESERVED_PIDS;
+		mark_and_sweep = 1;  /* mark next time */
+		return 0; 
+	}
+
+	fpos = ffz(mask);
+	i &= (PID_MAX-1);
+	lastpid = (i << SHIFT_PER_LONG) + fpos;
+
+	/* find next save pid */
+	mask &= ~((1 << fpos) - 1);
+
+	while ((mask == 0) && (++i < PID_MAP_SIZE)) 
+		mask = pid_map[i];
+
+	if (i==PID_MAP_SIZE) 
+		next_safe = PID_MAX;
+	else 
+		next_safe = (i << SHIFT_PER_LONG) + ffs(mask) - 1;
+	return lastpid;
+}
+
 static int get_pid(unsigned long flags)
 {
-	static int next_safe = PID_MAX;
 	struct task_struct *p;
+	int pid;
 
 	if (flags & CLONE_PID)
 		return current->pid;
 
 	spin_lock(&lastpid_lock);
 	if((++last_pid) & 0xffff8000) {
-		last_pid = 300;		/* Skip daemons etc. */
+		last_pid = RESERVED_PIDS;		/* Skip daemons etc. */
 		goto inside;
 	}
 	if(last_pid >= next_safe) {
 inside:
 		next_safe = PID_MAX;
 		read_lock(&tasklist_lock);
+		if (nr_threads > GETPID_THRESHOLD) {
+			last_pid = get_pid_by_map(last_pid);
+			if (last_pid == 0) {
+				last_pid = PID_MAX;
+				goto nomorepids; 
+			}
+		} else {
+			int beginpid = last_pid;
 	repeat:
 		for_each_task(p) {
 			if(p->pid == last_pid	||
@@ -106,9 +194,11 @@
 			   p->session == last_pid) {
 				if(++last_pid >= next_safe) {
 					if(last_pid & 0xffff8000)
-						last_pid = 300;
+							last_pid = RESERVED_PIDS;
 					next_safe = PID_MAX;
 				}
+					if(last_pid == beginpid)
+						goto nomorepids;
 				goto repeat;
 			}
 			if(p->pid > last_pid && next_safe > p->pid)
@@ -117,12 +207,22 @@
 				next_safe = p->pgrp;
 			if(p->session > last_pid && next_safe > p->session)
 				next_safe = p->session;
+				if(p->tgid > last_pid && next_safe > p->tgid)
+					next_safe = p->tgid;
+			}
 		}
 		read_unlock(&tasklist_lock);
 	}
+	pid = last_pid;
 	spin_unlock(&lastpid_lock);
 
-	return last_pid;
+	return pid;
+
+nomorepids:
+        next_safe = last_pid = PID_MAX;
+        read_unlock(&tasklist_lock);
+        spin_unlock(&lastpid_lock);
+        return 0;
 }
 
 static inline int dup_mmap(struct mm_struct * mm)

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: get_pid fixes against 2.4.19pre7
  2002-04-26 13:16 ` Hubertus Franke
@ 2002-04-27  0:25   ` Andrea Arcangeli
  0 siblings, 0 replies; 5+ messages in thread
From: Andrea Arcangeli @ 2002-04-27  0:25 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Marcelo Tosatti, linux-kernel, Ihno Krumreich, Linus Torvalds

On Fri, Apr 26, 2002 at 09:16:46AM -0400, Hubertus Franke wrote:
> On Friday 26 April 2002 07:44 am, Andrea Arcangeli wrote:
> > Hello,
> >
> > Could you have a look at these get_pid fixes? Besides the deadlocking
> > while running out of pids and reducing from quadratic to linear the
> > complexity of get_pids, it also addresses a longstanding non trivial
> > race present in 2.2 too that can lead to pid collisions even on UP
> > (noticed today while merging two more fixes from Ihno on the other
> > part). the fix reduces a bit scalability of simultaneous forks from
> > different cpus, but it's obviously right at least. Putting non-ready
> > tasks into the tasklists asks for troubles (signals...).  For more
> > details see the comment at the end of the patch. If you've suggestions
> > they're welcome, thanks.
> >
> >
> 
> Andrea, is this a consolidation of some of the work that I posted earlier 
> with the following patch for 2.4.17 and some of the discussions that followed
> from that. 
> 
> At least from my prospect I ran some test and recognized that the
> break even point between the two algorithms was about 22K pids.
> 
> Hence, I switched between the two algorithms at that break even point.

Nice to hear.

> 
> If you want I can send you the test program and you can measure
> for yourself how your stuff is doing. The test program uses the
> algo in user space and hence you can get some quick and decent answers.
> Let me know if you like it.

Thanks, it is welcome. If you've time it would also be nice if you could
review and/or test the patch with your proggy so that we can get those
fixes included in mainline, the more eyes on it the better :).

Latest version with the __set_bit optimization from Russell is appended:

diff -urN 2.4.19pre7/include/linux/threads.h getpid/include/linux/threads.h
--- 2.4.19pre7/include/linux/threads.h	Thu Apr 18 07:51:30 2002
+++ getpid/include/linux/threads.h	Sat Apr 27 00:47:59 2002
@@ -19,6 +19,6 @@
 /*
  * This controls the maximum pid allocated to a process
  */
-#define PID_MAX 0x8000
+#define PID_NR 0x8000
 
 #endif
diff -urN 2.4.19pre7/kernel/fork.c getpid/kernel/fork.c
--- 2.4.19pre7/kernel/fork.c	Tue Apr 16 08:12:09 2002
+++ getpid/kernel/fork.c	Sat Apr 27 00:48:15 2002
@@ -37,6 +37,12 @@
 
 struct task_struct *pidhash[PIDHASH_SZ];
 
+/*
+ * Protects next_unsafe and last_pid, and avoids races
+ * between get_pid and SET_LINKS().
+ */
+static DECLARE_MUTEX(getpid_mutex);
+
 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 {
 	unsigned long flags;
@@ -79,51 +85,105 @@
 	init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 }
 
-/* Protects next_safe and last_pid. */
-spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
-
+/*
+ *	Get the next free pid for a new process/thread.
+ *
+ *	Strategy: last_pid and next_unsafe (excluded) are an interval where all pids
+ *		  are free, so next pid is just last_pid + 1 if it's also < next_unsafe.
+ *		  If last_pid + 1 >= next_unsafe the interval is completely used.
+ *		  In this case a bitmap with all used pids/tgids/pgrps/sessions
+ *		  is created. This bitmap is searched for the next free pid and next_unsafe.
+ *		  If all pids are used, a kernel warning is issued.
+ */
 static int get_pid(unsigned long flags)
 {
-	static int next_safe = PID_MAX;
+	static int next_unsafe = PID_NR;
+#define PID_FIRST	2 /* pid 1 is init, first usable pid is 2 */
+#define PID_BITMAP_SIZE	((((PID_NR + 7) / 8) + sizeof(long) - 1 ) / (sizeof(long)))
+	/*
+	 * Even if this could be local per-thread, keep it static and protected by
+	 * the lock because we don't want to overflow the stack and we wouldn't
+	 * SMP scale better anyways. It doesn't waste disk space because it's in
+	 * the .bss.
+	 */
+	static unsigned long pid_bitmap[PID_BITMAP_SIZE];
+
+	/* from here the stuff on the stack */
 	struct task_struct *p;
-	int pid;
+	int pid, found_pid;
 
 	if (flags & CLONE_PID)
 		return current->pid;
 
-	spin_lock(&lastpid_lock);
-	if((++last_pid) & 0xffff8000) {
-		last_pid = 300;		/* Skip daemons etc. */
-		goto inside;
-	}
-	if(last_pid >= next_safe) {
-inside:
-		next_safe = PID_MAX;
+	pid = last_pid + 1;
+	if (pid >= next_unsafe) {
+		next_unsafe = PID_NR;
+		memset(pid_bitmap, 0, PID_BITMAP_SIZE*sizeof(long));
+
 		read_lock(&tasklist_lock);
-	repeat:
+		/*
+		 * Build the bitmap and calc next_unsafe.
+		 */
 		for_each_task(p) {
-			if(p->pid == last_pid	||
-			   p->pgrp == last_pid	||
-			   p->tgid == last_pid	||
-			   p->session == last_pid) {
-				if(++last_pid >= next_safe) {
-					if(last_pid & 0xffff8000)
-						last_pid = 300;
-					next_safe = PID_MAX;
+			__set_bit(p->pid, pid_bitmap);
+			__set_bit(p->pgrp, pid_bitmap);
+			__set_bit(p->tgid, pid_bitmap);
+			__set_bit(p->session, pid_bitmap);
+
+			if (next_unsafe > p->pid && p->pid > pid)
+				next_unsafe = p->pid;
+			if (next_unsafe > p->pgrp && p->pgrp > pid)
+				next_unsafe = p->pgrp;
+			if (next_unsafe > p->tgid && p->tgid > pid)
+				next_unsafe = p->tgid;
+			if (next_unsafe > p->session && p->session > pid)
+				next_unsafe = p->session;
+		}
+
+		/*
+		 * Release the tasklist_lock, after the unlock it may happen that
+		 * a pid is freed while it's still marked in use
+		 * in the pid_bitmap[].
+		 */
+		read_unlock(&tasklist_lock);
+
+		found_pid = find_next_zero_bit(pid_bitmap, PID_NR, pid);
+		if (found_pid >= PID_NR) {
+			next_unsafe = 0; /* depends on PID_FIRST > 0 */
+			found_pid = find_next_zero_bit(pid_bitmap, pid, PID_FIRST);
+			/* We scanned the whole bitmap without finding a free pid. */
+			if (found_pid >= pid) {
+				static long last_get_pid_warning;
+				if ((unsigned long) (jiffies - last_get_pid_warning) >= HZ) {
+					printk(KERN_NOTICE "No more PIDs (PID_NR = %d)\n", PID_NR);
+					last_get_pid_warning = jiffies;
 				}
-				goto repeat;
+				return -1;
+			}
+		}
+
+		pid = found_pid;
+
+		if (pid > next_unsafe) {
+			/* recalc next_unsafe by looking for the next bit set in the bitmap */
+			unsigned long * start = pid_bitmap;
+			unsigned long * p = start + (pid / (sizeof(long) * 8));
+			unsigned long * end = pid_bitmap + PID_BITMAP_SIZE;
+			unsigned long mask = ~((1UL << (pid & ((sizeof(long) * 8 - 1)))) - 1);
+
+			*p &= (mask << 1);
+
+			while (p < end) {
+				if (*p) {
+					next_unsafe = ffz(~*p) + (p - start) * sizeof(long) * 8;
+					break;
+				}
+				p++;
 			}
-			if(p->pid > last_pid && next_safe > p->pid)
-				next_safe = p->pid;
-			if(p->pgrp > last_pid && next_safe > p->pgrp)
-				next_safe = p->pgrp;
-			if(p->session > last_pid && next_safe > p->session)
-				next_safe = p->session;
 		}
-		read_unlock(&tasklist_lock);
 	}
-	pid = last_pid;
-	spin_unlock(&lastpid_lock);
+
+	last_pid = pid;
 
 	return pid;
 }
@@ -623,7 +683,10 @@
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
+	down(&getpid_mutex);
 	p->pid = get_pid(clone_flags);
+	if (p->pid < 0) /* valid pids are >= 0 */
+		goto bad_fork_cleanup;
 
 	p->run_list.next = NULL;
 	p->run_list.prev = NULL;
@@ -730,7 +793,17 @@
 		list_add(&p->thread_group, &current->thread_group);
 	}
 
+	/*
+	 * We must do the SET_LINKS() under the getpid_mutex, to avoid
+	 * another CPU to get our same PID between the release of the
+	 * getpid_mutex and the SET_LINKS().
+	 *
+	 * In short to avoid SMP races the new child->pid must be just visible
+	 * in the tasklist by the time we drop the getpid_mutex.
+	 */
 	SET_LINKS(p);
+	up(&getpid_mutex);
+
 	hash_pid(p);
 	nr_threads++;
 	write_unlock_irq(&tasklist_lock);
@@ -757,6 +830,7 @@
 bad_fork_cleanup_files:
 	exit_files(p); /* blocking */
 bad_fork_cleanup:
+	up(&getpid_mutex);
 	put_exec_domain(p->exec_domain);
 	if (p->binfmt && p->binfmt->module)
 		__MOD_DEC_USE_COUNT(p->binfmt->module);


Andrea

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2002-04-27  0:25 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-04-26 11:44 get_pid fixes against 2.4.19pre7 Andrea Arcangeli
2002-04-26 11:53 ` Russell King
2002-04-26 11:57   ` Andrea Arcangeli
2002-04-26 13:16 ` Hubertus Franke
2002-04-27  0:25   ` Andrea Arcangeli

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox