[PATCH]IPC locks breaking down with RCU

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH]IPC locks breaking down with RCU
@ 2002-10-18  0:14 mingming cao
  2002-10-20 13:14 ` Hugh Dickins
  0 siblings, 1 reply; 51+ messages in thread
From: mingming cao @ 2002-10-18  0:14 UTC (permalink / raw)
  To: torvalds, akpm, Hugh Dickins, linux-kernel; +Cc: cmm, dipankar

[-- Attachment #1: Type: text/plain, Size: 1197 bytes --]

Hi Linus,

This is the latest version of the ipc lock patch.  It breaks down the
three global IPC locks into one lock per IPC ID,  also addresses the
cache line bouncing problem  introduced in the original patch. The
original post could be found at:
http://marc.theaimsgroup.com/?l=linux-kernel&m=102980357802682&w=2
\x18
The original patch breaks down the global IPC locks, yet added another
layer of locking to protect the IPC ID array in case of resizing. Some
concern was raised that the read/write lock may cause cache line
bouncing.

Since write lock is only used when the array is dynamically resized, 
RCU seems perfectly fit for this situation.  By doing so it could reduce
the possible lock contention in some applications where the IPC
resources are heavily used, without introducing cache line bouncing.

Besides the RCU changes, it also remove the redundant ipc_lockall() and
ipc_unlockall() as suggested by Hugh Dickins.

Patch is against 2.5.43 kernel. It requires Dipankar Sarma's
read_barrier_depends RCU helper patch:
http://marc.theaimsgroup.com/?l=linux-kernel&m=103479438017486&w=2

We use the ipc lock on OracleApps and it gave us the best number. 
Please include.

Mingming Cao

[-- Attachment #2: ipclock-rcu-2543.patch --]
[-- Type: text/plain, Size: 5233 bytes --]

Binary files linux-2.5.43/arch/i386/boot/compressed/vmlinux.bin.gz and linux-2.5.43-ipc/arch/i386/boot/compressed/vmlinux.bin.gz differ
diff -urN -X dontdiff linux-2.5.43/include/linux/ipc.h linux-2.5.43-ipc/include/linux/ipc.h
--- linux-2.5.43/include/linux/ipc.h	Tue Oct 15 20:26:43 2002
+++ linux-2.5.43-ipc/include/linux/ipc.h	Wed Oct 16 09:48:28 2002
@@ -56,6 +56,7 @@
 /* used by in-kernel data structures */
 struct kern_ipc_perm
 {
+	spinlock_t	lock;
 	key_t		key;
 	uid_t		uid;
 	gid_t		gid;
diff -urN -X dontdiff linux-2.5.43/ipc/shm.c linux-2.5.43-ipc/ipc/shm.c
--- linux-2.5.43/ipc/shm.c	Tue Oct 15 20:28:22 2002
+++ linux-2.5.43-ipc/ipc/shm.c	Wed Oct 16 09:48:28 2002
@@ -38,8 +38,6 @@
 
 #define shm_lock(id)	((struct shmid_kernel*)ipc_lock(&shm_ids,id))
 #define shm_unlock(id)	ipc_unlock(&shm_ids,id)
-#define shm_lockall()	ipc_lockall(&shm_ids)
-#define shm_unlockall()	ipc_unlockall(&shm_ids)
 #define shm_get(id)	((struct shmid_kernel*)ipc_get(&shm_ids,id))
 #define shm_buildid(id, seq) \
 	ipc_buildid(&shm_ids, id, seq)
@@ -409,14 +407,12 @@
 
 		memset(&shm_info,0,sizeof(shm_info));
 		down(&shm_ids.sem);
-		shm_lockall();
 		shm_info.used_ids = shm_ids.in_use;
 		shm_get_stat (&shm_info.shm_rss, &shm_info.shm_swp);
 		shm_info.shm_tot = shm_tot;
 		shm_info.swap_attempts = 0;
 		shm_info.swap_successes = 0;
 		err = shm_ids.max_id;
-		shm_unlockall();
 		up(&shm_ids.sem);
 		if(copy_to_user (buf, &shm_info, sizeof(shm_info)))
 			return -EFAULT;
diff -urN -X dontdiff linux-2.5.43/ipc/util.c linux-2.5.43-ipc/ipc/util.c
--- linux-2.5.43/ipc/util.c	Tue Oct 15 20:27:54 2002
+++ linux-2.5.43-ipc/ipc/util.c	Wed Oct 16 09:48:28 2002
@@ -92,8 +92,10 @@
 {
 	int id;
 	struct kern_ipc_perm* p;
+	int max_id = ids->max_id;
 
-	for (id = 0; id <= ids->max_id; id++) {
+	read_barrier_depends();
+	for (id = 0; id <= max_id; id++) {
 		p = ids->entries[id].p;
 		if(p==NULL)
 			continue;
@@ -106,8 +108,8 @@
 static int grow_ary(struct ipc_ids* ids, int newsize)
 {
 	struct ipc_id* new;
-	struct ipc_id* old;
 	int i;
+	struct rcu_ipc_array *arg = NULL;
 
 	if(newsize > IPCMNI)
 		newsize = IPCMNI;
@@ -121,14 +123,19 @@
 	for(i=ids->size;i<newsize;i++) {
 		new[i].p = NULL;
 	}
+	arg = (struct rcu_ipc_array *) kmalloc(sizeof(*arg), GFP_KERNEL);
+	if(arg == NULL)
+		return ids->size;
+	arg->entries = ids->entries;
+	arg->size = ids->size;
+	
 	spin_lock(&ids->ary);
-
-	old = ids->entries;
 	ids->entries = new;
-	i = ids->size;
+	wmb();
 	ids->size = newsize;
 	spin_unlock(&ids->ary);
-	ipc_free(old, sizeof(struct ipc_id)*i);
+
+	call_rcu(&arg->rh, ipc_free_callback, arg);
 	return ids->size;
 }
 
@@ -166,7 +173,9 @@
 	if(ids->seq > ids->seq_max)
 		ids->seq = 0;
 
-	spin_lock(&ids->ary);
+	new->lock = SPIN_LOCK_UNLOCKED;
+	rcu_read_lock();
+	spin_lock(&new->lock);
 	ids->entries[id].p = new;
 	return id;
 }
@@ -188,6 +197,7 @@
 	int lid = id % SEQ_MULTIPLIER;
 	if(lid >= ids->size)
 		BUG();
+	rmb();
 	p = ids->entries[lid].p;
 	ids->entries[lid].p = NULL;
 	if(p==NULL)
@@ -239,7 +249,12 @@
 	else
 		kfree(ptr);
 }
-
+static void ipc_free_callback(void * arg)
+{
+	struct rcu_ipc_array *a = (struct rcu_ipc_array *)arg;
+	ipc_free(a->entries, a->size);
+	kfree(arg);
+}
 /**
  *	ipcperms	-	check IPC permissions
  *	@ipcp: IPC permission set
diff -urN -X dontdiff linux-2.5.43/ipc/util.h linux-2.5.43-ipc/ipc/util.h
--- linux-2.5.43/ipc/util.h	Tue Oct 15 20:28:24 2002
+++ linux-2.5.43-ipc/ipc/util.h	Wed Oct 16 09:48:28 2002
@@ -4,6 +4,7 @@
  *
  * ipc helper functions (c) 1999 Manfred Spraul <manfreds@colorfullife.com>
  */
+#include <linux/rcupdate.h>
 
 #define USHRT_MAX 0xffff
 #define SEQ_MULTIPLIER	(IPCMNI)
@@ -12,6 +13,12 @@
 void msg_init (void);
 void shm_init (void);
 
+struct rcu_ipc_array {
+	struct rcu_head rh;
+	struct ipc_id* entries;
+	int size;
+};
+
 struct ipc_ids {
 	int size;
 	int in_use;
@@ -44,11 +51,7 @@
  */
 void* ipc_alloc(int size);
 void ipc_free(void* ptr, int size);
-
-extern inline void ipc_lockall(struct ipc_ids* ids)
-{
-	spin_lock(&ids->ary);
-}
+void ipc_free_callback(void* arg);
 
 extern inline struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id)
 {
@@ -56,32 +59,43 @@
 	int lid = id % SEQ_MULTIPLIER;
 	if(lid >= ids->size)
 		return NULL;
-
+	rmb();
 	out = ids->entries[lid].p;
 	return out;
 }
 
-extern inline void ipc_unlockall(struct ipc_ids* ids)
-{
-	spin_unlock(&ids->ary);
-}
 extern inline struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
 {
 	struct kern_ipc_perm* out;
 	int lid = id % SEQ_MULTIPLIER;
-	if(lid >= ids->size)
-		return NULL;
 
-	spin_lock(&ids->ary);
+	rcu_read_lock();
+	if(lid >= ids->size) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	rmb();
 	out = ids->entries[lid].p;
-	if(out==NULL)
-		spin_unlock(&ids->ary);
+	if(out==NULL) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	spin_lock(&out->lock);
 	return out;
 }
 
 extern inline void ipc_unlock(struct ipc_ids* ids, int id)
 {
-	spin_unlock(&ids->ary);
+	int lid = id % SEQ_MULTIPLIER;
+	struct kern_ipc_perm* out;
+
+        if(lid >= ids->size)
+		return;
+	rmb();	
+	out = ids->entries[lid].p;
+	if (out)
+		spin_unlock(&out->lock);
+	rcu_read_unlock();
 }
 
 extern inline int ipc_buildid(struct ipc_ids* ids, int id, int seq)

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]IPC locks breaking down with RCU
  2002-10-18  0:14 [PATCH]IPC locks breaking down with RCU mingming cao
@ 2002-10-20 13:14 ` Hugh Dickins
  2002-10-20 17:27   ` Hugh Dickins
  2002-10-21 18:07   ` mingming cao
  0 siblings, 2 replies; 51+ messages in thread
From: Hugh Dickins @ 2002-10-20 13:14 UTC (permalink / raw)
  To: mingming cao; +Cc: Andrew Morton, linux-kernel, dipankar

On Thu, 17 Oct 2002, mingming cao wrote:
> Hi Linus,
> 
> This is the latest version of the ipc lock patch.  It breaks down the
> three global IPC locks into one lock per IPC ID,  also addresses the
> cache line bouncing problem  introduced in the original patch. The
> original post could be found at:
> http://marc.theaimsgroup.com/?l=linux-kernel&m=102980357802682&w=2
> \x18
> The original patch breaks down the global IPC locks, yet added another
> layer of locking to protect the IPC ID array in case of resizing. Some
> concern was raised that the read/write lock may cause cache line
> bouncing.
> 
> Since write lock is only used when the array is dynamically resized, 
> RCU seems perfectly fit for this situation.  By doing so it could reduce
> the possible lock contention in some applications where the IPC
> resources are heavily used, without introducing cache line bouncing.
> 
> Besides the RCU changes, it also remove the redundant ipc_lockall() and
> ipc_unlockall() as suggested by Hugh Dickins.
> 
> Patch is against 2.5.43 kernel. It requires Dipankar Sarma's
> read_barrier_depends RCU helper patch:
> http://marc.theaimsgroup.com/?l=linux-kernel&m=103479438017486&w=2
> 
> We use the ipc lock on OracleApps and it gave us the best number. 
> Please include.

This looks very good to me: I'm glad you found the RCU idea works out.
No need for performance numbers, this is now clearly the right way to
go.  And read_barrier_depends is in 2.5.44, so no problem there.

I'm ignorant of RCU, and my mind goes mushy around memory barriers,
but I expect you've consulted the best there; and I'll be wanting to
refer to this implementation as a nice example of how to use RCU.
But please make a couple of small cleanups, unless you disagree.

Now delete spinlock_t ary and all references to it: only grow_ary
is using it, and it's already protected by sem, and we'd be in
trouble with concurrent allocations if it were not.

And I'd be happier to see ipc_unlock without those conditionals i.e.
delete the "if(lid >= ids->size) return;" and the "if (out)" - they
seem to encourage calling ipc_unlock where ipc_lock did not succeed,
but that would be unsafe.  If you found somewhere that's being done,
I think we need to fix that place, not work around it in ipc_unlock.

Linus is away this week (so I've left him off, to avoid clogging up
/dev/null): perhaps Andrew could take your patch into his -mm tree
when you've made those changes (or persuaded us against)?

Hugh

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]IPC locks breaking down with RCU
  2002-10-20 13:14 ` Hugh Dickins
@ 2002-10-20 17:27   ` Hugh Dickins
  2002-10-21 18:11     ` mingming cao
  2002-10-21 18:07   ` mingming cao
  1 sibling, 1 reply; 51+ messages in thread
From: Hugh Dickins @ 2002-10-20 17:27 UTC (permalink / raw)
  To: mingming cao; +Cc: Andrew Morton, linux-kernel, dipankar

On Sun, 20 Oct 2002, Hugh Dickins wrote:
> 
> This looks very good to me: I'm glad you found the RCU idea works out.
> ...
> And I'd be happier to see ipc_unlock without those conditionals i.e.
> delete the "if(lid >= ids->size) return;" and the "if (out)" - they
> seem to encourage calling ipc_unlock where ipc_lock did not succeed,
> but that would be unsafe.  If you found somewhere that's being done,
> I think we need to fix that place, not work around it in ipc_unlock.

Sorry, MingMing, it doesn't look so good to me now.

The "if(lid >= ids->size) return;" still looks unnecessary,
but I think I see why you have "if (out)" in ipc_unlock: because
of ipc_rmid, which has already nulled out entries[lid].p, yes?

A minor point is, wouldn't that skipping of spin_unlock leave you
with the wrong preempt count, on a CONFIG_PREEMPT y configuration?
But that's easily dealt with.

A much more serious point: we could certainly bring the ipc_rmid
and ipc_unlock much closer together; but however close we bring them
(unlock implicit within rmid), there will still be a race with one
cpu in ipc_lock spinning on out->lock, while we in ipc_rmid null
entries[lid].p and unlock and free the structure containing that lock.

I think you're going to have to extend RCU to freeing the entry
(though this is a much less exceptional case than growing the array,
so less clear to me that RCU is appropriate here), if you're to
avoid reverting to the earlier rwlock or embedded spinlock designs.

Perhaps there's a simpler solution - ask around - but I don't see it.

Hugh

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]IPC locks breaking down with RCU
  2002-10-20 13:14 ` Hugh Dickins
  2002-10-20 17:27   ` Hugh Dickins
@ 2002-10-21 18:07   ` mingming cao
  1 sibling, 0 replies; 51+ messages in thread
From: mingming cao @ 2002-10-21 18:07 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Andrew Morton, linux-kernel, dipankar

Hugh Dickins wrote:
> 
> I'm ignorant of RCU, and my mind goes mushy around memory barriers,
> but I expect you've consulted the best there; and I'll be wanting to
> refer to this implementation as a nice example of how to use RCU.

Yes the RCU patch author Dipankar has already reviewed the memory
barriers in ipclock patch.  

> Now delete spinlock_t ary and all references to it: only grow_ary
> is using it, and it's already protected by sem, and we'd be in
> trouble with concurrent allocations if it were not.
> 
Oh, right. grow_ary does not need spinlock_t ary anymore.:-)

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]IPC locks breaking down with RCU
  2002-10-20 17:27   ` Hugh Dickins
@ 2002-10-21 18:11     ` mingming cao
  2002-10-21 19:00       ` Hugh Dickins
  2002-10-21 19:18       ` [PATCH]IPC locks breaking down with RCU Dipankar Sarma
  0 siblings, 2 replies; 51+ messages in thread
From: mingming cao @ 2002-10-21 18:11 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Andrew Morton, linux-kernel, dipankar

Hugh Dickins wrote:
> 
> The "if(lid >= ids->size) return;" still looks unnecessary,
> but I think I see why you have "if (out)" in ipc_unlock: because
> of ipc_rmid, which has already nulled out entries[lid].p, yes?
>

Thanks a lot for your comments.  Yes.  That's the consideration.

> A minor point is, wouldn't that skipping of spin_unlock leave you
> with the wrong preempt count, on a CONFIG_PREEMPT y configuration?
> But that's easily dealt with.
> 
> A much more serious point: we could certainly bring the ipc_rmid
> and ipc_unlock much closer together; but however close we bring them
> (unlock implicit within rmid), there will still be a race with one
> cpu in ipc_lock spinning on out->lock, while we in ipc_rmid null
> entries[lid].p and unlock and free the structure containing that lock.
>

Thanks for pointing this out.  This is a issue that has to be addressed. 

A simple solution I could think of for this problem, moving the per_id
lock out of the kern_ipc_perm structure, and put it in the ipc_id
structure. Actually I did this way at the first time,  then I agreed
with you that moving the per_id lock into there kern_ipc_perm structure
will help reduce cacheline bouncing.  

I think that having the per_id lock stay out of the structure it
protects will easy the job of ipc_rmid(), also will avoid the wrong
preempt count problem caused by the additional check "if (out)" in
ipc_unlock() as you mentioned above.

Is this solution looks good to you? If so, I will update the patch for
2.5.44 soon.

Mingming

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]IPC locks breaking down with RCU
  2002-10-21 18:11     ` mingming cao
@ 2002-10-21 19:00       ` Hugh Dickins
  2002-10-24 21:49         ` [PATCH]updated ipc lock patch mingming cao
  2002-10-21 19:18       ` [PATCH]IPC locks breaking down with RCU Dipankar Sarma
  1 sibling, 1 reply; 51+ messages in thread
From: Hugh Dickins @ 2002-10-21 19:00 UTC (permalink / raw)
  To: mingming cao; +Cc: Andrew Morton, linux-kernel, dipankar

On Mon, 21 Oct 2002, mingming cao wrote:
> Hugh Dickins wrote:
> > A much more serious point: we could certainly bring the ipc_rmid
> > and ipc_unlock much closer together; but however close we bring them
> > (unlock implicit within rmid), there will still be a race with one
> > cpu in ipc_lock spinning on out->lock, while we in ipc_rmid null
> > entries[lid].p and unlock and free the structure containing that lock.
> 
> A simple solution I could think of for this problem, moving the per_id
> lock out of the kern_ipc_perm structure, and put it in the ipc_id
> structure. Actually I did this way at the first time,  then I agreed
> with you that moving the per_id lock into there kern_ipc_perm structure
> will help reduce cacheline bouncing.  
> 
> I think that having the per_id lock stay out of the structure it
> protects will easy the job of ipc_rmid(), also will avoid the wrong
> preempt count problem caused by the additional check "if (out)" in
> ipc_unlock() as you mentioned above.
> 
> Is this solution looks good to you? If so, I will update the patch for
> 2.5.44 soon.

Sorry, no, doesn't look good to me.  I do agree that it's an easy
solution to this problem, but there's no point in having gone the RCU
route to avoid cacheline bouncing, if you now reintroduce it all here.

I believe, as I said, that you'll have to go further, applying RCU
to freeing the entries themselves.  I had toyed with the idea of never
freeing entries once allocated, which is a similarly simple solution;
rejected that as in general too wasteful; and concluded that RCU is a
reasonable compromise - it amounts to lazy freeing, I think.

I'm happy to be overruled by someone who understands these cacheline
bounce issues better than we do, but nobody else has spoken up yet.

Hugh


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]IPC locks breaking down with RCU
  2002-10-21 18:11     ` mingming cao
  2002-10-21 19:00       ` Hugh Dickins
@ 2002-10-21 19:18       ` Dipankar Sarma
  2002-10-21 19:36         ` Hugh Dickins
  2002-10-21 19:41         ` mingming cao
  1 sibling, 2 replies; 51+ messages in thread
From: Dipankar Sarma @ 2002-10-21 19:18 UTC (permalink / raw)
  To: mingming cao; +Cc: Hugh Dickins, Andrew Morton, linux-kernel

On Mon, Oct 21, 2002 at 11:11:15AM -0700, mingming cao wrote:
> A simple solution I could think of for this problem, moving the per_id
> lock out of the kern_ipc_perm structure, and put it in the ipc_id
> structure. Actually I did this way at the first time,  then I agreed
> with you that moving the per_id lock into there kern_ipc_perm structure
> will help reduce cacheline bouncing.  
> 
> I think that having the per_id lock stay out of the structure it
> protects will easy the job of ipc_rmid(), also will avoid the wrong
> preempt count problem caused by the additional check "if (out)" in
> ipc_unlock() as you mentioned above.

I took a quick look at the original ipc code and I don't understand
something - it seems to me the ipc_ids structs are protected by the semaphore
inside for all operations, so why do we need the spinlock in the
first place ? Am I missing something here ?

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]IPC locks breaking down with RCU
  2002-10-21 19:18       ` [PATCH]IPC locks breaking down with RCU Dipankar Sarma
@ 2002-10-21 19:36         ` Hugh Dickins
  2002-10-21 19:41         ` mingming cao
  1 sibling, 0 replies; 51+ messages in thread
From: Hugh Dickins @ 2002-10-21 19:36 UTC (permalink / raw)
  To: Dipankar Sarma; +Cc: mingming cao, Andrew Morton, linux-kernel

On Tue, 22 Oct 2002, Dipankar Sarma wrote:
> 
> I took a quick look at the original ipc code and I don't understand
> something - it seems to me the ipc_ids structs are protected by the semaphore
> inside for all operations, so why do we need the spinlock in the
> first place ? Am I missing something here ?

I made that mistake too at first, Mingming set me straight.
Many of the entry points down() the ipc_ids.sem semaphore, but the
most significant ones do not.  ipc/sem.c is probably the best example
(if confusing, since it involves quite different meanings of semaphore):
sys_semop() is the frequent, fast entry point, uses sem_lock without down.

Hugh

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]IPC locks breaking down with RCU
  2002-10-21 19:18       ` [PATCH]IPC locks breaking down with RCU Dipankar Sarma
  2002-10-21 19:36         ` Hugh Dickins
@ 2002-10-21 19:41         ` mingming cao
  2002-10-21 20:14           ` Dipankar Sarma
  1 sibling, 1 reply; 51+ messages in thread
From: mingming cao @ 2002-10-21 19:41 UTC (permalink / raw)
  To: dipankar; +Cc: Hugh Dickins, Andrew Morton, linux-kernel

Dipankar Sarma wrote:
> 
> I took a quick look at the original ipc code and I don't understand
> something - it seems to me the ipc_ids structs are protected by the semaphore
> inside for all operations, so why do we need the spinlock in the
> first place ? Am I missing something here ?

The semaphore is used to protect the fields in ipc_ids structure, while
the spinlock is used to protect IPC ids. For the current implementation,
there is one spinlock for all IPC ids of the same type(i.e. for all
messages queues).  The patch is intend to breaks down the global
spinlock and have a lock per IPC id.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]IPC locks breaking down with RCU
  2002-10-21 19:41         ` mingming cao
@ 2002-10-21 20:14           ` Dipankar Sarma
  0 siblings, 0 replies; 51+ messages in thread
From: Dipankar Sarma @ 2002-10-21 20:14 UTC (permalink / raw)
  To: mingming cao; +Cc: Hugh Dickins, Andrew Morton, linux-kernel

On Mon, Oct 21, 2002 at 12:41:58PM -0700, mingming cao wrote:
> > I took a quick look at the original ipc code and I don't understand
> > something - it seems to me the ipc_ids structs are protected by the semaphore
> > inside for all operations, so why do we need the spinlock in the
> > first place ? Am I missing something here ?
> 
> The semaphore is used to protect the fields in ipc_ids structure, while
> the spinlock is used to protect IPC ids. For the current implementation,
> there is one spinlock for all IPC ids of the same type(i.e. for all
> messages queues).  The patch is intend to breaks down the global

Well, if the semaphore is grabbed then the critical section is serialized
including accessing of IPC ids, so there would be no need to have
a separate spinlock for the IPC ids. Hugh pointed out the right reason,
semaphore IPCs use the spinlock for serialization in many paths,
not the semaphore in ipc_ids. 

Hugh's point is right - you have two data structures to protect - ipc_id
arrays and the kern_ipc_perms attached to it. I would use a single
spinlock in ipc_ids to serialize all the updates and use RCU for both
to implement safe lockfree lookups. For the second, you would probably
want to add a rcu_head to struct kern_ipc_perms and an RCU callback
to free it.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH]updated ipc lock patch
  2002-10-21 19:00       ` Hugh Dickins
@ 2002-10-24 21:49         ` mingming cao
  2002-10-24 22:29           ` Andrew Morton
  0 siblings, 1 reply; 51+ messages in thread
From: mingming cao @ 2002-10-24 21:49 UTC (permalink / raw)
  To: Hugh Dickins, Andrew Morton, manfred
  Cc: linux-kernel, dipankar, lse-tech, cmm

[-- Attachment #1: Type: text/plain, Size: 1465 bytes --]

Hi Andrew,

Here is the updated ipc lock patch:

- It greatly reduces the lock contention by having one lock per id. The
global spinlock is removed and a spinlock is added in kern_ipc_perm
structure.

- Uses ReadCopyUpdate in grow_ary() for locking-free resizing.

- In the places where ipc_rmid() is called, delay calling ipc_free() to
RCU callbacks.  This is to prevent ipc_lock() returning an invalid
pointer after ipc_rmid().  In addition, use the workqueue to enable RCU
freeing vmalloced entries.

Also some other changes:
- Remove redundant ipc_lockall/ipc_unlockall
- Now ipc_unlock() directly takes IPC ID pointer as argument, avoid
extra looking up the array.

The changes are made based on the input from Huge Dickens, Manfred
Spraul and Dipankar Sarma. In addition, Cliff White has run OSDL's dbt1
test on a 2 way against the earlier version of this patch. Results shows
about 2-6% improvement on the average number of transactions per
second.  Here is the summary of his tests:

                        2.5.42-mm2      2.5.42-mm2-ipclock
----------------------------------------------------------
Average over 5 runs	85.0 BT		89.8 BT
Std Deviation 5 runs	7.4  BT		1.0 BT

Average over 4 best 	88.15 BT	90.2 BT
Std Deviation 4 best	2.8 BT		0.5 BT

Full details of the tests could be found here:
http://www.osdl.org/projects/dbt1prfrns/results/mingming/index.html

patch is against 2.5.44-mm4.  Please include or give any feedback.

Thanks,

Mingming Cao

[-- Attachment #2: ipclock-2544mm4.patch --]
[-- Type: text/plain, Size: 17754 bytes --]

diff -urN 2544-mm4/include/linux/ipc.h 2544-mm4-ipc/include/linux/ipc.h
--- 2544-mm4/include/linux/ipc.h	Fri Oct 18 21:00:42 2002
+++ 2544-mm4-ipc/include/linux/ipc.h	Thu Oct 24 13:59:24 2002
@@ -56,6 +56,8 @@
 /* used by in-kernel data structures */
 struct kern_ipc_perm
 {
+	spinlock_t	lock;
+	int		deleted;
 	key_t		key;
 	uid_t		uid;
 	gid_t		gid;
diff -urN 2544-mm4/ipc/msg.c 2544-mm4-ipc/ipc/msg.c
--- 2544-mm4/ipc/msg.c	Fri Oct 18 21:00:43 2002
+++ 2544-mm4-ipc/ipc/msg.c	Thu Oct 24 13:59:24 2002
@@ -65,7 +65,7 @@
 static struct ipc_ids msg_ids;
 
 #define msg_lock(id)	((struct msg_queue*)ipc_lock(&msg_ids,id))
-#define msg_unlock(id)	ipc_unlock(&msg_ids,id)
+#define msg_unlock(msq)	ipc_unlock(&(msq)->q_perm)
 #define msg_rmid(id)	((struct msg_queue*)ipc_rmid(&msg_ids,id))
 #define msg_checkid(msq, msgid)	\
 	ipc_checkid(&msg_ids,&msq->q_perm,msgid)
@@ -122,7 +122,7 @@
 	INIT_LIST_HEAD(&msq->q_messages);
 	INIT_LIST_HEAD(&msq->q_receivers);
 	INIT_LIST_HEAD(&msq->q_senders);
-	msg_unlock(id);
+	msg_unlock(msq);
 
 	return msg_buildid(id,msq->q_perm.seq);
 }
@@ -271,7 +271,7 @@
 
 	expunge_all(msq,-EIDRM);
 	ss_wakeup(&msq->q_senders,1);
-	msg_unlock(id);
+	msg_unlock(msq);
 		
 	tmp = msq->q_messages.next;
 	while(tmp != &msq->q_messages) {
@@ -282,7 +282,7 @@
 	}
 	atomic_sub(msq->q_cbytes, &msg_bytes);
 	security_ops->msg_queue_free_security(msq);
-	kfree(msq);
+	ipc_rcu_free(msq, sizeof(struct msg_queue));
 }
 
 asmlinkage long sys_msgget (key_t key, int msgflg)
@@ -308,7 +308,7 @@
 			ret = -EACCES;
 		else
 			ret = msg_buildid(id, msq->q_perm.seq);
-		msg_unlock(id);
+		msg_unlock(msq);
 	}
 	up(&msg_ids.sem);
 	return ret;
@@ -488,7 +488,7 @@
 		tbuf.msg_qbytes = msq->q_qbytes;
 		tbuf.msg_lspid  = msq->q_lspid;
 		tbuf.msg_lrpid  = msq->q_lrpid;
-		msg_unlock(msqid);
+		msg_unlock(msq);
 		if (copy_msqid_to_user(buf, &tbuf, version))
 			return -EFAULT;
 		return success_return;
@@ -541,7 +541,7 @@
 		 * due to a larger queue size.
 		 */
 		ss_wakeup(&msq->q_senders,0);
-		msg_unlock(msqid);
+		msg_unlock(msq);
 		break;
 	}
 	case IPC_RMID:
@@ -553,10 +553,10 @@
 	up(&msg_ids.sem);
 	return err;
 out_unlock_up:
-	msg_unlock(msqid);
+	msg_unlock(msq);
 	goto out_up;
 out_unlock:
-	msg_unlock(msqid);
+	msg_unlock(msq);
 	return err;
 }
 
@@ -651,7 +651,7 @@
 			goto out_unlock_free;
 		}
 		ss_add(msq, &s);
-		msg_unlock(msqid);
+		msg_unlock(msq);
 		schedule();
 		current->state= TASK_RUNNING;
 
@@ -684,7 +684,7 @@
 	msg = NULL;
 
 out_unlock_free:
-	msg_unlock(msqid);
+	msg_unlock(msq);
 out_free:
 	if(msg!=NULL)
 		free_msg(msg);
@@ -766,7 +766,7 @@
 		atomic_sub(msg->m_ts,&msg_bytes);
 		atomic_dec(&msg_hdrs);
 		ss_wakeup(&msq->q_senders,0);
-		msg_unlock(msqid);
+		msg_unlock(msq);
 out_success:
 		msgsz = (msgsz > msg->m_ts) ? msg->m_ts : msgsz;
 		if (put_user (msg->m_type, &msgp->mtype) ||
@@ -777,7 +777,6 @@
 		return msgsz;
 	} else
 	{
-		struct msg_queue *t;
 		/* no message waiting. Prepare for pipelined
 		 * receive.
 		 */
@@ -795,7 +794,7 @@
 		 	msr_d.r_maxsize = msgsz;
 		msr_d.r_msg = ERR_PTR(-EAGAIN);
 		current->state = TASK_INTERRUPTIBLE;
-		msg_unlock(msqid);
+		msg_unlock(msq);
 
 		schedule();
 		current->state = TASK_RUNNING;
@@ -804,21 +803,19 @@
 		if(!IS_ERR(msg)) 
 			goto out_success;
 
-		t = msg_lock(msqid);
-		if(t==NULL)
-			msqid=-1;
+		msq = msg_lock(msqid);
 		msg = (struct msg_msg*)msr_d.r_msg;
 		if(!IS_ERR(msg)) {
 			/* our message arived while we waited for
 			 * the spinlock. Process it.
 			 */
-			if(msqid!=-1)
-				msg_unlock(msqid);
+			if(msq)
+				msg_unlock(msq);
 			goto out_success;
 		}
 		err = PTR_ERR(msg);
 		if(err == -EAGAIN) {
-			if(msqid==-1)
+			if(!msq)
 				BUG();
 			list_del(&msr_d.r_list);
 			if (signal_pending(current))
@@ -828,8 +825,8 @@
 		}
 	}
 out_unlock:
-	if(msqid!=-1)
-		msg_unlock(msqid);
+	if(msq)
+		msg_unlock(msq);
 	return err;
 }
 
@@ -862,7 +859,7 @@
 				msq->q_stime,
 				msq->q_rtime,
 				msq->q_ctime);
-			msg_unlock(i);
+			msg_unlock(msq);
 
 			pos += len;
 			if(pos < offset) {
diff -urN 2544-mm4/ipc/sem.c 2544-mm4-ipc/ipc/sem.c
--- 2544-mm4/ipc/sem.c	Fri Oct 18 21:01:48 2002
+++ 2544-mm4-ipc/ipc/sem.c	Thu Oct 24 13:59:24 2002
@@ -69,7 +69,7 @@
 
 
 #define sem_lock(id)	((struct sem_array*)ipc_lock(&sem_ids,id))
-#define sem_unlock(id)	ipc_unlock(&sem_ids,id)
+#define sem_unlock(sma)	ipc_unlock(&(sma)->sem_perm)
 #define sem_rmid(id)	((struct sem_array*)ipc_rmid(&sem_ids,id))
 #define sem_checkid(sma, semid)	\
 	ipc_checkid(&sem_ids,&sma->sem_perm,semid)
@@ -156,7 +156,7 @@
 	/* sma->undo = NULL; */
 	sma->sem_nsems = nsems;
 	sma->sem_ctime = CURRENT_TIME;
-	sem_unlock(id);
+	sem_unlock(sma);
 
 	return sem_buildid(id, sma->sem_perm.seq);
 }
@@ -189,7 +189,7 @@
 			err = -EACCES;
 		else
 			err = sem_buildid(id, sma->sem_perm.seq);
-		sem_unlock(id);
+		sem_unlock(sma);
 	}
 
 	up(&sem_ids.sem);
@@ -205,12 +205,12 @@
 	if(smanew==NULL)
 		return -EIDRM;
 	if(smanew != sma || sem_checkid(sma,semid) || sma->sem_nsems != nsems) {
-		sem_unlock(semid);
+		sem_unlock(smanew);
 		return -EIDRM;
 	}
 
 	if (ipcperms(&sma->sem_perm, flg)) {
-		sem_unlock(semid);
+		sem_unlock(smanew);
 		return -EACCES;
 	}
 	return 0;
@@ -423,12 +423,12 @@
 		q->prev = NULL;
 		wake_up_process(q->sleeper); /* doesn't sleep */
 	}
-	sem_unlock(id);
+	sem_unlock(sma);
 
 	used_sems -= sma->sem_nsems;
 	size = sizeof (*sma) + sma->sem_nsems * sizeof (struct sem);
 	security_ops->sem_free_security(sma);
-	ipc_free(sma, size);
+	ipc_rcu_free(sma, size);
 }
 
 static unsigned long copy_semid_to_user(void *buf, struct semid64_ds *in, int version)
@@ -456,6 +456,7 @@
 static int semctl_nolock(int semid, int semnum, int cmd, int version, union semun arg)
 {
 	int err = -EINVAL;
+	struct sem_array *sma;
 
 	switch(cmd) {
 	case IPC_INFO:
@@ -489,7 +490,6 @@
 	}
 	case SEM_STAT:
 	{
-		struct sem_array *sma;
 		struct semid64_ds tbuf;
 		int id;
 
@@ -511,7 +511,7 @@
 		tbuf.sem_otime  = sma->sem_otime;
 		tbuf.sem_ctime  = sma->sem_ctime;
 		tbuf.sem_nsems  = sma->sem_nsems;
-		sem_unlock(semid);
+		sem_unlock(sma);
 		if (copy_semid_to_user (arg.buf, &tbuf, version))
 			return -EFAULT;
 		return id;
@@ -521,7 +521,7 @@
 	}
 	return err;
 out_unlock:
-	sem_unlock(semid);
+	sem_unlock(sma);
 	return err;
 }
 
@@ -555,7 +555,7 @@
 		int i;
 
 		if(nsems > SEMMSL_FAST) {
-			sem_unlock(semid);			
+			sem_unlock(sma);			
 			sem_io = ipc_alloc(sizeof(ushort)*nsems);
 			if(sem_io == NULL)
 				return -ENOMEM;
@@ -566,7 +566,7 @@
 
 		for (i = 0; i < sma->sem_nsems; i++)
 			sem_io[i] = sma->sem_base[i].semval;
-		sem_unlock(semid);
+		sem_unlock(sma);
 		err = 0;
 		if(copy_to_user(array, sem_io, nsems*sizeof(ushort)))
 			err = -EFAULT;
@@ -577,7 +577,7 @@
 		int i;
 		struct sem_undo *un;
 
-		sem_unlock(semid);
+		sem_unlock(sma);
 
 		if(nsems > SEMMSL_FAST) {
 			sem_io = ipc_alloc(sizeof(ushort)*nsems);
@@ -619,7 +619,7 @@
 		tbuf.sem_otime  = sma->sem_otime;
 		tbuf.sem_ctime  = sma->sem_ctime;
 		tbuf.sem_nsems  = sma->sem_nsems;
-		sem_unlock(semid);
+		sem_unlock(sma);
 		if (copy_semid_to_user (arg.buf, &tbuf, version))
 			return -EFAULT;
 		return 0;
@@ -665,7 +665,7 @@
 	}
 	}
 out_unlock:
-	sem_unlock(semid);
+	sem_unlock(sma);
 out_free:
 	if(sem_io != fast_sem_io)
 		ipc_free(sem_io, sizeof(ushort)*nsems);
@@ -750,18 +750,18 @@
 		ipcp->mode = (ipcp->mode & ~S_IRWXUGO)
 				| (setbuf.mode & S_IRWXUGO);
 		sma->sem_ctime = CURRENT_TIME;
-		sem_unlock(semid);
+		sem_unlock(sma);
 		err = 0;
 		break;
 	default:
-		sem_unlock(semid);
+		sem_unlock(sma);
 		err = -EINVAL;
 		break;
 	}
 	return err;
 
 out_unlock:
-	sem_unlock(semid);
+	sem_unlock(sma);
 	return err;
 }
 
@@ -914,7 +914,7 @@
 	saved_add_count = 0;
 	if (current->sysvsem.undo_list != NULL)
 		saved_add_count = current->sysvsem.undo_list->add_count;
-	sem_unlock(semid);
+	sem_unlock(sma);
 	unlock_semundo();
 
 	error = get_undo_list(&undo_list);
@@ -1052,18 +1052,17 @@
 	current->sysvsem.sleep_list = &queue;
 
 	for (;;) {
-		struct sem_array* tmp;
 		queue.status = -EINTR;
 		queue.sleeper = current;
 		current->state = TASK_INTERRUPTIBLE;
-		sem_unlock(semid);
+		sem_unlock(sma);
 		unlock_semundo();
 
 		schedule();
 
 		lock_semundo();
-		tmp = sem_lock(semid);
-		if(tmp==NULL) {
+		sma = sem_lock(semid);
+		if(sma==NULL) {
 			if(queue.prev != NULL)
 				BUG();
 			current->sysvsem.sleep_list = NULL;
@@ -1098,7 +1097,7 @@
 	if (alter)
 		update_queue (sma);
 out_unlock_semundo_free:
-	sem_unlock(semid);
+	sem_unlock(sma);
 out_semundo_free:
 	unlock_semundo();
 out_free:
@@ -1185,7 +1184,7 @@
 			remove_from_queue(q->sma,q);
 		}
 		if(sma!=NULL)
-			sem_unlock(semid);
+			sem_unlock(sma);
 	}
 
 	undo_list = current->sysvsem.undo_list;
@@ -1233,7 +1232,7 @@
 		/* maybe some queued-up processes were waiting for this */
 		update_queue(sma);
 next_entry:
-		sem_unlock(semid);
+		sem_unlock(sma);
 	}
 	__exit_semundo(current);
 
@@ -1265,7 +1264,7 @@
 				sma->sem_perm.cgid,
 				sma->sem_otime,
 				sma->sem_ctime);
-			sem_unlock(i);
+			sem_unlock(sma);
 
 			pos += len;
 			if(pos < offset) {
diff -urN 2544-mm4/ipc/shm.c 2544-mm4-ipc/ipc/shm.c
--- 2544-mm4/ipc/shm.c	Thu Oct 24 09:22:14 2002
+++ 2544-mm4-ipc/ipc/shm.c	Thu Oct 24 13:59:24 2002
@@ -38,9 +38,7 @@
 static struct ipc_ids shm_ids;
 
 #define shm_lock(id)	((struct shmid_kernel*)ipc_lock(&shm_ids,id))
-#define shm_unlock(id)	ipc_unlock(&shm_ids,id)
-#define shm_lockall()	ipc_lockall(&shm_ids)
-#define shm_unlockall()	ipc_unlockall(&shm_ids)
+#define shm_unlock(shp)	ipc_unlock(&(shp)->shm_perm)
 #define shm_get(id)	((struct shmid_kernel*)ipc_get(&shm_ids,id))
 #define shm_buildid(id, seq) \
 	ipc_buildid(&shm_ids, id, seq)
@@ -93,7 +91,7 @@
 	shp->shm_atim = CURRENT_TIME;
 	shp->shm_lprid = current->pid;
 	shp->shm_nattch++;
-	shm_unlock(id);
+	shm_unlock(shp);
 }
 
 /* This is called by fork, once for every shm attach. */
@@ -114,7 +112,7 @@
 {
 	shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	shm_rmid (shp->id);
-	shm_unlock(shp->id);
+	shm_unlock(shp);
 	if (!is_file_hugepages(shp->shm_file))
 		shmem_lock(shp->shm_file, 0);
 	fput (shp->shm_file);
@@ -145,7 +143,7 @@
 	   shp->shm_flags & SHM_DEST)
 		shm_destroy (shp);
 	else
-		shm_unlock(id);
+		shm_unlock(shp);
 	up (&shm_ids.sem);
 }
 
@@ -225,7 +223,7 @@
 	else
 		file->f_op = &shm_file_operations;
 	shm_tot += numpages;
-	shm_unlock (id);
+	shm_unlock(shp);
 	return shp->id;
 
 no_id:
@@ -261,7 +259,7 @@
 			err = -EACCES;
 		else
 			err = shm_buildid(id, shp->shm_perm.seq);
-		shm_unlock(id);
+		shm_unlock(shp);
 	}
 	up(&shm_ids.sem);
 
@@ -421,14 +419,12 @@
 
 		memset(&shm_info,0,sizeof(shm_info));
 		down(&shm_ids.sem);
-		shm_lockall();
 		shm_info.used_ids = shm_ids.in_use;
 		shm_get_stat (&shm_info.shm_rss, &shm_info.shm_swp);
 		shm_info.shm_tot = shm_tot;
 		shm_info.swap_attempts = 0;
 		shm_info.swap_successes = 0;
 		err = shm_ids.max_id;
-		shm_unlockall();
 		up(&shm_ids.sem);
 		if(copy_to_user (buf, &shm_info, sizeof(shm_info))) {
 			err = -EFAULT;
@@ -470,7 +466,7 @@
 		tbuf.shm_cpid	= shp->shm_cprid;
 		tbuf.shm_lpid	= shp->shm_lprid;
 		tbuf.shm_nattch	= shp->shm_nattch;
-		shm_unlock(shmid);
+		shm_unlock(shp);
 		if(copy_shmid_to_user (buf, &tbuf, version))
 			err = -EFAULT;
 		else
@@ -505,7 +501,7 @@
 				shmem_lock(shp->shm_file, 0);
 			shp->shm_flags &= ~SHM_LOCKED;
 		}
-		shm_unlock(shmid);
+		shm_unlock(shp);
 		goto out;
 	}
 	case IPC_RMID:
@@ -538,7 +534,7 @@
 			shp->shm_flags |= SHM_DEST;
 			/* Do not find it any more */
 			shp->shm_perm.key = IPC_PRIVATE;
-			shm_unlock(shmid);
+			shm_unlock(shp);
 		} else
 			shm_destroy (shp);
 		up(&shm_ids.sem);
@@ -581,12 +577,12 @@
 
 	err = 0;
 out_unlock_up:
-	shm_unlock(shmid);
+	shm_unlock(shp);
 out_up:
 	up(&shm_ids.sem);
 	goto out;
 out_unlock:
-	shm_unlock(shmid);
+	shm_unlock(shp);
 out:
 	return err;
 }
@@ -646,18 +642,18 @@
 	}
 	err = shm_checkid(shp,shmid);
 	if (err) {
-		shm_unlock(shmid);
+		shm_unlock(shp);
 		goto out;
 	}
 	if (ipcperms(&shp->shm_perm, acc_mode)) {
-		shm_unlock(shmid);
+		shm_unlock(shp);
 		err = -EACCES;
 		goto out;
 	}
 	file = shp->shm_file;
 	size = file->f_dentry->d_inode->i_size;
 	shp->shm_nattch++;
-	shm_unlock(shmid);
+	shm_unlock(shp);
 
 	down_write(&current->mm->mmap_sem);
 	if (addr && !(shmflg & SHM_REMAP)) {
@@ -686,7 +682,7 @@
 	   shp->shm_flags & SHM_DEST)
 		shm_destroy (shp);
 	else
-		shm_unlock(shmid);
+		shm_unlock(shp);
 	up (&shm_ids.sem);
 
 	*raddr = (unsigned long) user_addr;
@@ -764,7 +760,7 @@
 				shp->shm_atim,
 				shp->shm_dtim,
 				shp->shm_ctim);
-			shm_unlock(i);
+			shm_unlock(shp);
 
 			pos += len;
 			if(pos < offset) {
diff -urN 2544-mm4/ipc/util.c 2544-mm4-ipc/ipc/util.c
--- 2544-mm4/ipc/util.c	Fri Oct 18 21:01:49 2002
+++ 2544-mm4-ipc/ipc/util.c	Thu Oct 24 13:59:24 2002
@@ -8,6 +8,8 @@
  *            Chris Evans, <chris@ferret.lmh.ox.ac.uk>
  * Nov 1999 - ipc helper functions, unified SMP locking
  *	      Manfred Spraul <manfreds@colorfullife.com>
+ * Oct 2002 - One lock per IPC id. RCU ipc_free for lock-free grow_ary().
+ *            Mingming Cao <cmm@us.ibm.com>
  */
 
 #include <linux/config.h>
@@ -75,7 +77,6 @@
 		printk(KERN_ERR "ipc_init_ids() failed, ipc service disabled.\n");
 		ids->size = 0;
 	}
-	ids->ary = SPIN_LOCK_UNLOCKED;
 	for(i=0;i<ids->size;i++)
 		ids->entries[i].p = NULL;
 }
@@ -92,8 +93,10 @@
 {
 	int id;
 	struct kern_ipc_perm* p;
+	int max_id = ids->max_id;
 
-	for (id = 0; id <= ids->max_id; id++) {
+	read_barrier_depends();
+	for (id = 0; id <= max_id; id++) {
 		p = ids->entries[id].p;
 		if(p==NULL)
 			continue;
@@ -121,14 +124,14 @@
 	for(i=ids->size;i<newsize;i++) {
 		new[i].p = NULL;
 	}
-	spin_lock(&ids->ary);
-
 	old = ids->entries;
-	ids->entries = new;
 	i = ids->size;
+	
+	ids->entries = new;
+	wmb();
 	ids->size = newsize;
-	spin_unlock(&ids->ary);
-	ipc_free(old, sizeof(struct ipc_id)*i);
+
+	ipc_rcu_free(old, sizeof(struct ipc_id)*i);
 	return ids->size;
 }
 
@@ -166,7 +169,10 @@
 	if(ids->seq > ids->seq_max)
 		ids->seq = 0;
 
-	spin_lock(&ids->ary);
+	new->lock = SPIN_LOCK_UNLOCKED;
+	new->deleted = 0;
+	rcu_read_lock();
+	spin_lock(&new->lock);
 	ids->entries[id].p = new;
 	return id;
 }
@@ -188,6 +194,7 @@
 	int lid = id % SEQ_MULTIPLIER;
 	if(lid >= ids->size)
 		BUG();
+	rmb();
 	p = ids->entries[lid].p;
 	ids->entries[lid].p = NULL;
 	if(p==NULL)
@@ -202,6 +209,7 @@
 		} while (ids->entries[lid].p == NULL);
 		ids->max_id = lid;
 	}
+	p->deleted = 1;
 	return p;
 }
 
@@ -240,6 +248,44 @@
 		kfree(ptr);
 }
 
+/* 
+ * Since RCU callback function is called in bh,
+ * we need to defer the vfree to schedule_work
+ */
+static void ipc_free_scheduled(void* arg)
+{
+	struct rcu_ipc_free *a = (struct rcu_ipc_free *)arg;
+	vfree(a->ptr);
+	kfree(a);
+}
+
+static void ipc_free_callback(void* arg)
+{
+	struct rcu_ipc_free *a = (struct rcu_ipc_free *)arg;
+	/* 
+	 * if data is vmalloced, then we need to delay the free
+	 */
+	if (a->size > PAGE_SIZE) {
+		INIT_WORK(&a->work, ipc_free_scheduled, arg);
+		schedule_work(&a->work);
+	} else {
+		kfree(a->ptr);
+		kfree(a);
+	}
+}
+
+void ipc_rcu_free(void* ptr, int size)
+{
+	struct rcu_ipc_free* arg;
+
+	arg = (struct rcu_ipc_free *) kmalloc(sizeof(*arg), GFP_KERNEL);
+	if (arg == NULL)
+		return;
+	arg->ptr = ptr;
+	arg->size = size;
+	call_rcu(&arg->rcu_head, ipc_free_callback, arg);
+}
+
 /**
  *	ipcperms	-	check IPC permissions
  *	@ipcp: IPC permission set
diff -urN 2544-mm4/ipc/util.h 2544-mm4-ipc/ipc/util.h
--- 2544-mm4/ipc/util.h	Fri Oct 18 21:01:57 2002
+++ 2544-mm4-ipc/ipc/util.h	Thu Oct 24 13:59:24 2002
@@ -4,6 +4,8 @@
  *
  * ipc helper functions (c) 1999 Manfred Spraul <manfreds@colorfullife.com>
  */
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
 
 #define USHRT_MAX 0xffff
 #define SEQ_MULTIPLIER	(IPCMNI)
@@ -12,6 +14,13 @@
 void msg_init (void);
 void shm_init (void);
 
+struct rcu_ipc_free {
+	struct rcu_head		rcu_head;
+	void 			*ptr;
+	int 			size;
+	struct work_struct	work;
+};
+
 struct ipc_ids {
 	int size;
 	int in_use;
@@ -19,7 +28,6 @@
 	unsigned short seq;
 	unsigned short seq_max;
 	struct semaphore sem;	
-	spinlock_t ary;
 	struct ipc_id* entries;
 };
 
@@ -44,11 +52,7 @@
  */
 void* ipc_alloc(int size);
 void ipc_free(void* ptr, int size);
-
-extern inline void ipc_lockall(struct ipc_ids* ids)
-{
-	spin_lock(&ids->ary);
-}
+void ipc_rcu_free(void* arg, int size);
 
 extern inline struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id)
 {
@@ -56,32 +60,44 @@
 	int lid = id % SEQ_MULTIPLIER;
 	if(lid >= ids->size)
 		return NULL;
-
+	rmb();
 	out = ids->entries[lid].p;
 	return out;
 }
 
-extern inline void ipc_unlockall(struct ipc_ids* ids)
-{
-	spin_unlock(&ids->ary);
-}
 extern inline struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
 {
 	struct kern_ipc_perm* out;
 	int lid = id % SEQ_MULTIPLIER;
-	if(lid >= ids->size)
-		return NULL;
 
-	spin_lock(&ids->ary);
+	rcu_read_lock();
+	if(lid >= ids->size) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	rmb();
 	out = ids->entries[lid].p;
-	if(out==NULL)
-		spin_unlock(&ids->ary);
+	if(out == NULL) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	spin_lock(&out->lock);
+	
+	/* ipc_rmid() may have already freed the ID while ipc_lock
+	 * was spinning: here verify that the structure is still valid
+	 */
+	if (out->deleted) {
+		spin_unlock(&out->lock);
+		rcu_read_unlock();
+		return NULL;
+	}
 	return out;
 }
 
-extern inline void ipc_unlock(struct ipc_ids* ids, int id)
+extern inline void ipc_unlock(struct kern_ipc_perm* perm)
 {
-	spin_unlock(&ids->ary);
+	spin_unlock(&perm->lock);
+	rcu_read_unlock();
 }
 
 extern inline int ipc_buildid(struct ipc_ids* ids, int id, int seq)

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-24 21:49         ` [PATCH]updated ipc lock patch mingming cao
@ 2002-10-24 22:29           ` Andrew Morton
  2002-10-24 22:56             ` Hugh Dickins
                               ` (3 more replies)
  0 siblings, 4 replies; 51+ messages in thread
From: Andrew Morton @ 2002-10-24 22:29 UTC (permalink / raw)
  To: cmm; +Cc: Hugh Dickins, manfred, linux-kernel, dipankar, lse-tech

mingming cao wrote:
> 
> Hi Andrew,
> 
> Here is the updated ipc lock patch:

Well I can get you a bit of testing and attention, but I'm afraid
my knowledge of the IPC code is negligible.

So to be able to commend this change to Linus I'd have to rely on
assurances from people who _do_ understand IPC (Hugh?) and on lots
of testing.

So yes, I'll include it, and would solicit success reports from
people who are actually exercising that code path, thanks.

> http://www.osdl.org/projects/dbt1prfrns/results/mingming/index.html

DBT1 is really interesting, and I'm glad the OSDL team have
put it together.  If people would only stop sending me patches
I'd be using it ;)

Could someone please help explain the results?  Comparing, say,
http://www.osdl.org/projects/dbt1prfrns/results/mingming/run.2cpu.42-mm2.r5/index.html
and
http://www.osdl.org/projects/dbt1prfrns/results/mingming/run.18.r5/index.html

It would appear that 2.5 completely smoked 2.4 on response time,
yet the overall bogotransactions/sec is significantly lower.
What should we conclude from this?

Also I see:

	14.7 minute duration
and
	Time for DBT run 19:36

What is the 14.7 minutes referring to?

Also:

	2.5: Time for key creation 1:27
	2.4: Time for key creation 14:24
versus:
	2.5: Time for table creation 16:48
	2.4: Time for table creation 8:58

So it's all rather confusing.  Masses of numbers usually _are_
confusing.  What really adds tons of value to such an exercise is
for the person who ran the test to write up some conclusions.  To
tell the developers what went well, what went poorly, what areas
to focus on, etc.  To use your own judgement to tell us what to
zoom in on.

Is that something which could be added?

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-24 22:29           ` Andrew Morton
@ 2002-10-24 22:56             ` Hugh Dickins
  2002-10-24 23:30               ` Andrew Morton
  2002-10-24 23:23             ` mingming cao
                               ` (2 subsequent siblings)
  3 siblings, 1 reply; 51+ messages in thread
From: Hugh Dickins @ 2002-10-24 22:56 UTC (permalink / raw)
  To: Andrew Morton; +Cc: cmm, manfred, linux-kernel, dipankar, lse-tech

On Thu, 24 Oct 2002, Andrew Morton wrote:
> mingming cao wrote:
> > 
> > Hi Andrew,
> > 
> > Here is the updated ipc lock patch:
> 
> Well I can get you a bit of testing and attention, but I'm afraid
> my knowledge of the IPC code is negligible.
> 
> So to be able to commend this change to Linus I'd have to rely on
> assurances from people who _do_ understand IPC (Hugh?) and on lots
> of testing.
> 
> So yes, I'll include it, and would solicit success reports from
> people who are actually exercising that code path, thanks.

Manfred and I have both reviewed the patch (or the 2.5.44 version)
and we both recommend it highly (well, let Manfred speak for himself).

I can't claim great expertise on IPC (never on msg, but some on shm and
sem), but (unless there's an error we've missed) there's no change to
IPC functionality here - it's an exercise in "self-evidently" better
locking (there used to be just one spinlock covering all e.g. sems),
with RCU to avoid the dirty cacheline bouncing in earlier version.

And I rarely exercise IPC paths, except when testing if I change
something: I do hope someone else can vouch for it in practice,
we believe Mingming has devised a fine patch here.

Hugh

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-24 22:29           ` Andrew Morton
  2002-10-24 22:56             ` Hugh Dickins
@ 2002-10-24 23:23             ` mingming cao
  2002-10-25 14:21               ` [Lse-tech] " Paul Larson
  2002-10-25  0:38             ` Cliff White
  2002-10-31 17:52             ` [Lse-tech] Re: [PATCH]updated ipc lock patch [PERFORMANCE RESULTS] Bill Hartner
  3 siblings, 1 reply; 51+ messages in thread
From: mingming cao @ 2002-10-24 23:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Hugh Dickins, manfred, linux-kernel, dipankar, lse-tech

Andrew Morton wrote:
> 
> mingming cao wrote:
> >
> > Hi Andrew,
> >
> > Here is the updated ipc lock patch:
> 
> Well I can get you a bit of testing and attention, but I'm afraid
> my knowledge of the IPC code is negligible.
> 
> So to be able to commend this change to Linus I'd have to rely on
> assurances from people who _do_ understand IPC (Hugh?) and on lots
> of testing.

Thanks for your quick feedback.  I did LTP tests on it--it passed(well,
I saw a failure on shmctl(), but the failure was there since 2.5.43
kernel).  I will do more stress tests on it soon.

Mingming

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-24 22:56             ` Hugh Dickins
@ 2002-10-24 23:30               ` Andrew Morton
  2002-10-24 23:59                 ` Hugh Dickins
                                   ` (4 more replies)
  0 siblings, 5 replies; 51+ messages in thread
From: Andrew Morton @ 2002-10-24 23:30 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: cmm, manfred, linux-kernel, dipankar, lse-tech

Hugh Dickins wrote:
> 
> ...
> Manfred and I have both reviewed the patch (or the 2.5.44 version)
> and we both recommend it highly (well, let Manfred speak for himself).
> 

OK, thanks.

So I took a look.  Wish I hadn't :(  The locking rules in there
are outrageously uncommented.  You must be brave people.

What about this code?

void ipc_rcu_free(void* ptr, int size)
{
        struct rcu_ipc_free* arg;

        arg = (struct rcu_ipc_free *) kmalloc(sizeof(*arg), GFP_KERNEL);
        if (arg == NULL)
                return;
        arg->ptr = ptr;
        arg->size = size;
        call_rcu(&arg->rcu_head, ipc_free_callback, arg);
}

Are we sure that it's never called under locks?

And it seems that if the kmalloc fails, we decide to leak some
memory, yes?

If so it would be better to use GFP_ATOMIC there.  Avoids any
locking problems and also increases the chance of the allocation
succeeding.  (With an explanatory comment, naturally :)).

Even better: is it possible to embed the rcu_ipc_free inside the
object-to-be-freed?  Perhaps not?


Stylistically, it is best to not typecast the return value
from kmalloc, btw.  You should never typecast the return
value of anything which returns a void *, because it weakens
your compile-time checking.  Example:

	foo *bar = (foo *)zot();

The compiler will swallow that, regardless of what zot() returns.
Someone could go and change zot() to return a reiserfs_inode *
and you would never know about it.  Whereas:

	foo *bar = zot();

Says to the compiler "zot() must return a bar * or a void *",
which is much tighter checking, yes?
	

There is an insane amount of inlining in the ipc code.  I
couldn't keep my paws off it.

Before:
mnm:/usr/src/25> size ipc/*.o
   text    data     bss     dec     hex filename
  28346     224     192   28762    705a ipc/built-in.o
   7390      20      64    7474    1d32 ipc/msg.o
  11236      16      64   11316    2c34 ipc/sem.o
   8136     160      64    8360    20a8 ipc/shm.o
   1584       0       0    1584     630 ipc/util.o

After:
mnm:/usr/src/25> size ipc/*.o
   text    data     bss     dec     hex filename
  19274     224     192   19690    4cea ipc/built-in.o
   4846      20      64    4930    1342 ipc/msg.o
   7636      16      64    7716    1e24 ipc/sem.o
   4808     160      64    5032    13a8 ipc/shm.o
   1984       0       0    1984     7c0 ipc/util.o



--- 25/ipc/util.h~ipc-akpm	Thu Oct 24 16:03:32 2002
+++ 25-akpm/ipc/util.h	Thu Oct 24 16:08:25 2002
@@ -54,63 +54,11 @@ void* ipc_alloc(int size);
 void ipc_free(void* ptr, int size);
 void ipc_rcu_free(void* arg, int size);
 
-extern inline struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id)
-{
-	struct kern_ipc_perm* out;
-	int lid = id % SEQ_MULTIPLIER;
-	if(lid >= ids->size)
-		return NULL;
-	rmb();
-	out = ids->entries[lid].p;
-	return out;
-}
-
-extern inline struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
-{
-	struct kern_ipc_perm* out;
-	int lid = id % SEQ_MULTIPLIER;
-
-	rcu_read_lock();
-	if(lid >= ids->size) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	rmb();
-	out = ids->entries[lid].p;
-	if(out == NULL) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	spin_lock(&out->lock);
-	
-	/* ipc_rmid() may have already freed the ID while ipc_lock
-	 * was spinning: here verify that the structure is still valid
-	 */
-	if (out->deleted) {
-		spin_unlock(&out->lock);
-		rcu_read_unlock();
-		return NULL;
-	}
-	return out;
-}
-
-extern inline void ipc_unlock(struct kern_ipc_perm* perm)
-{
-	spin_unlock(&perm->lock);
-	rcu_read_unlock();
-}
-
-extern inline int ipc_buildid(struct ipc_ids* ids, int id, int seq)
-{
-	return SEQ_MULTIPLIER*seq + id;
-}
-
-extern inline int ipc_checkid(struct ipc_ids* ids, struct kern_ipc_perm* ipcp, int uid)
-{
-	if(uid/SEQ_MULTIPLIER != ipcp->seq)
-		return 1;
-	return 0;
-}
+struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id);
+struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id);
+void ipc_unlock(struct kern_ipc_perm* perm);
+int ipc_buildid(struct ipc_ids* ids, int id, int seq);
+int ipc_checkid(struct ipc_ids* ids, struct kern_ipc_perm* ipcp, int uid);
 
 void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
 void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
--- 25/ipc/util.c~ipc-akpm	Thu Oct 24 16:07:07 2002
+++ 25-akpm/ipc/util.c	Thu Oct 24 16:07:51 2002
@@ -359,6 +359,61 @@ void ipc64_perm_to_ipc_perm (struct ipc6
 	out->seq	= in->seq;
 }
 
+struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id)
+{
+	struct kern_ipc_perm* out;
+	int lid = id % SEQ_MULTIPLIER;
+	if(lid >= ids->size)
+		return NULL;
+	rmb();
+	out = ids->entries[lid].p;
+	return out;
+}
+
+struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
+{
+	struct kern_ipc_perm* out;
+	int lid = id % SEQ_MULTIPLIER;
+
+	rcu_read_lock();
+	if(lid >= ids->size)
+		goto fail;
+	rmb();
+	out = ids->entries[lid].p;
+	if (out == NULL)
+		goto fail;
+	spin_lock(&out->lock);
+	
+	/* ipc_rmid() may have already freed the ID while ipc_lock
+	 * was spinning: here verify that the structure is still valid
+	 */
+	if (!out->deleted)
+		return out;
+
+	spin_unlock(&out->lock);
+fail:
+	rcu_read_unlock();
+	return NULL;
+}
+
+void ipc_unlock(struct kern_ipc_perm* perm)
+{
+	spin_unlock(&perm->lock);
+	rcu_read_unlock();
+}
+
+int ipc_buildid(struct ipc_ids* ids, int id, int seq)
+{
+	return SEQ_MULTIPLIER*seq + id;
+}
+
+int ipc_checkid(struct ipc_ids* ids, struct kern_ipc_perm* ipcp, int uid)
+{
+	if(uid/SEQ_MULTIPLIER != ipcp->seq)
+		return 1;
+	return 0;
+}
+
 #ifndef __ia64__
 
 /**

.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-24 23:30               ` Andrew Morton
@ 2002-10-24 23:59                 ` Hugh Dickins
  2002-10-25  0:35                   ` [Lse-tech] " Rick Lindsley
  2002-10-25  0:07                 ` mingming cao
                                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 51+ messages in thread
From: Hugh Dickins @ 2002-10-24 23:59 UTC (permalink / raw)
  To: Andrew Morton; +Cc: cmm, manfred, linux-kernel, dipankar, lse-tech

On Thu, 24 Oct 2002, Andrew Morton wrote:
> Hugh Dickins wrote:
> > 
> > ...
> > Manfred and I have both reviewed the patch (or the 2.5.44 version)
> > and we both recommend it highly (well, let Manfred speak for himself).
> 
> OK, thanks.
> 
> So I took a look.  Wish I hadn't :(  The locking rules in there
> are outrageously uncommented.  You must be brave people.

Ah, we all like to criticize the lack of comments in others' code.

> What about this code?
> 
> void ipc_rcu_free(void* ptr, int size)
> {
>         struct rcu_ipc_free* arg;
> 
>         arg = (struct rcu_ipc_free *) kmalloc(sizeof(*arg), GFP_KERNEL);
>         if (arg == NULL)
>                 return;
>         arg->ptr = ptr;
>         arg->size = size;
>         call_rcu(&arg->rcu_head, ipc_free_callback, arg);
> }
> 
> Are we sure that it's never called under locks?

Yes.

> And it seems that if the kmalloc fails, we decide to leak some
> memory, yes?

Yes, but why would it fail?
and what do you think should be the alternative?

> If so it would be better to use GFP_ATOMIC there.  Avoids any
> locking problems and also increases the chance of the allocation
> succeeding.  (With an explanatory comment, naturally :)).

There are no locking doubts here.
GFP_ATOMIC would _reduce_ the chance of the allocation succeeding:
GFP_KERNEL does include the __GFP_WAIT flag, GFP_ATOMIC does not.

> Even better: is it possible to embed the rcu_ipc_free inside the
> object-to-be-freed?  Perhaps not?

It would certainly be possible (I did suggest it as a maybe),
but it's unclear whether it's worthwhile wasting the extra memory
longterm like that.  Mingming chose not to embed, I see no reason
to overrule.

> Stylistically, it is best to not typecast the return value
> from kmalloc, btw.  You should never typecast the return
> value of anything which returns a void *, because it weakens
> your compile-time checking.  Example:
> 
> 	foo *bar = (foo *)zot();
> 
> The compiler will swallow that, regardless of what zot() returns.
> Someone could go and change zot() to return a reiserfs_inode *
> and you would never know about it.  Whereas:
> 
> 	foo *bar = zot();
> 
> Says to the compiler "zot() must return a bar * or a void *",
> which is much tighter checking, yes?

You have too much time on your hands, Andrew :-)

> There is an insane amount of inlining in the ipc code.  I
> couldn't keep my paws off it.

I agree tempting: I thought you might like that in a subsequent patch,
yes?  Mingming was splitting locks, not doing a cleanup of inlines.

Hugh


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-24 23:30               ` Andrew Morton
  2002-10-24 23:59                 ` Hugh Dickins
@ 2002-10-25  0:07                 ` mingming cao
  2002-10-25  0:24                   ` Andrew Morton
  2002-10-25  4:18                 ` Rusty Russell
                                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 51+ messages in thread
From: mingming cao @ 2002-10-25  0:07 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Hugh Dickins, manfred, linux-kernel, dipankar, lse-tech

Andrew Morton wrote:
> 
> What about this code?
> 
> void ipc_rcu_free(void* ptr, int size)
> {
>         struct rcu_ipc_free* arg;
> 
>         arg = (struct rcu_ipc_free *) kmalloc(sizeof(*arg), GFP_KERNEL);
>         if (arg == NULL)
>                 return;
>         arg->ptr = ptr;
>         arg->size = size;
>         call_rcu(&arg->rcu_head, ipc_free_callback, arg);
> }
> 
> Are we sure that it's never called under locks?
Did you see any place where this is called with lock(s) hold? Maybe
there is, but I could not see here.  They are called from the functions
which are used by IPC code only. Inside IPC there is only spin_lock per
ID and sem_undo lock. Both of them are not hold when ipc_rcu_free is
called.

> 
> And it seems that if the kmalloc fails, we decide to leak some
> memory, yes?
>

yes.
 
> If so it would be better to use GFP_ATOMIC there.  Avoids any
> locking problems and also increases the chance of the allocation
> succeeding.  (With an explanatory comment, naturally :)).
>

Good point. I agree GFP_ATOMIC fits better here.
 
> Even better: is it possible to embed the rcu_ipc_free inside the
> object-to-be-freed?  Perhaps not?

Are you saying that have a static RCU header structure in the
object-to-be-freed?  I think it's possible.  It fits well in the rmid
case, where the object to be freed is an kern_ipc_perm structure. But
for the  grow_ary() case, the object to be freed is a array of struct
ipc_id, so it need a little bit more changes there. Maybe add a new
structure ipc_entries, which include the RCU header structure and the
pointer to the entries array.  Then have the ipc_ids->entries point to
ipc_entries.  Just a little concern that this way we added a reference
when looking up the IPC ID from the array.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-25  0:07                 ` mingming cao
@ 2002-10-25  0:24                   ` Andrew Morton
  0 siblings, 0 replies; 51+ messages in thread
From: Andrew Morton @ 2002-10-25  0:24 UTC (permalink / raw)
  To: cmm; +Cc: Hugh Dickins, manfred, linux-kernel, dipankar, lse-tech

mingming cao wrote:
> 
> > Even better: is it possible to embed the rcu_ipc_free inside the
> > object-to-be-freed?  Perhaps not?
> 
> Are you saying that have a static RCU header structure in the
> object-to-be-freed?  I think it's possible.  It fits well in the rmid
> case, where the object to be freed is an kern_ipc_perm structure. But
> for the  grow_ary() case, the object to be freed is a array of struct
> ipc_id, so it need a little bit more changes there. Maybe add a new
> structure ipc_entries, which include the RCU header structure and the
> pointer to the entries array.  Then have the ipc_ids->entries point to
> ipc_entries.  Just a little concern that this way we added a reference
> when looking up the IPC ID from the array.

This is a place where a mempool is appropriate.  The objects have
a "guaranteed to be returned if you wait for long enough" lifecycle.

But Hugh's right here.  The chance of the single-page GFP_KERNEL
allocation failing is tiny; the probability depending upon the
VM-of-the-day.  Let's leave it be.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch
  2002-10-24 23:59                 ` Hugh Dickins
@ 2002-10-25  0:35                   ` Rick Lindsley
  2002-10-25  1:07                     ` Robert Love
  0 siblings, 1 reply; 51+ messages in thread
From: Rick Lindsley @ 2002-10-25  0:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, cmm, manfred, linux-kernel, dipankar, lse-tech

slightly offtopic ...

    > There is an insane amount of inlining in the ipc code.  I
    > couldn't keep my paws off it.

    I agree tempting: I thought you might like that in a subsequent patch,
    yes?  Mingming was splitting locks, not doing a cleanup of inlines.

There was a time when "inline" was a very cool tool because it had been
judged that the overhead of actually calling a function was just too
heinous to contemplate.  From comments in this and other discussions,
is it safe to say that the pendulum has now swung the other way?  I see
a lot of people concerned about code size and apparently returning to
the axiom of "if you use it more than once, make it a function."  Are
we as a community coming around to using inlining only on very tight,
very critical functions?

Rick

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch
  2002-10-24 22:29           ` Andrew Morton
  2002-10-24 22:56             ` Hugh Dickins
  2002-10-24 23:23             ` mingming cao
@ 2002-10-25  0:38             ` Cliff White
  2002-10-31 17:52             ` [Lse-tech] Re: [PATCH]updated ipc lock patch [PERFORMANCE RESULTS] Bill Hartner
  3 siblings, 0 replies; 51+ messages in thread
From: Cliff White @ 2002-10-25  0:38 UTC (permalink / raw)
  To: Andrew Morton; +Cc: cmm, linux-kernel, dipankar, lse-tech, cliffw

> mingming cao wrote:
> > 
> > Hi Andrew,
> > 
> > Here is the updated ipc lock patch:
> 
> Well I can get you a bit of testing and attention, but I'm afraid
> my knowledge of the IPC code is negligible.
> 
> So to be able to commend this change to Linus I'd have to rely on
> assurances from people who _do_ understand IPC (Hugh?) and on lots
> of testing.
> 
> So yes, I'll include it, and would solicit success reports from
> people who are actually exercising that code path, thanks.
> 
> > http://www.osdl.org/projects/dbt1prfrns/results/mingming/index.html
> 
> DBT1 is really interesting, and I'm glad the OSDL team have
> put it together.  If people would only stop sending me patches
> I'd be using it ;)
> 
Thank you very much for that :)

> Could someone please help explain the results?  Comparing, say,
> http://www.osdl.org/projects/dbt1prfrns/results/mingming/run.2cpu.42-mm2.r5/index.html
> and
> http://www.osdl.org/projects/dbt1prfrns/results/mingming/run.18.r5/index.html
> 
> It would appear that 2.5 completely smoked 2.4 on response time,
> yet the overall bogotransactions/sec is significantly lower.
> What should we conclude from this?

Whoa - we ran these 5 times for an average. The 2.5 run you picked was the 
'off' run -
It has the worse results. You will notice on this run, there are a large 
number of errors
which didn't happen on the other runs - this lowered the BT/sec number. Use 
one of the
other 2.5 ones and you'll see something more sensible. ( say, 42-mm2.r3) 
Unfortunately,
on average, 2.4 still beats 2.5 on both response time and BT's

 		         2.5.42-mm2     2.5.42-mm2-ipclock  2.4.18
 Average over 5 runs     85.0 BT           89.8 BT          96.92 BT
 Std Deviation 5 runs    7.4  BT           1.0 BT           2.07 BT
 Average of best 4 runs  88.15 BT          90.2 BT          97.2 BT
 Std Deviation 4 run     2.8 BT            0.5 BT            2.3 BT 
> 
One other place to start comparing is in the system information which is at 
the bottom of the page.
Some points (might be minor) : 
Cpu statistics:
	2.4.18 - cpu %idle averages around 1.5% %system swings between 3-7% %nice 
steady at ~3.6%
	2.5.42-mm2 cpu %idle 0.0 all thru run, %system steady at ~6% % nice up ~5.5
Swap (sar -r) 
	Very slight differences - we consumed ~98% of the memory in both cases, 2.4 
swapped a little
		bit (%28) more than 2.5 (%26) 
We also include profile data for both the load and run phase. (profile=2)

> Also I see:
> 
> 	14.7 minute duration
> and
> 	Time for DBT run 19:36
> 
> What is the 14.7 minutes referring to?
> 
The 14.7 minute time comes from the workload driver log, which are parsed to 
get the
response numbers. The 'Time for' stamps come from the master driver script, 
and include some
of the workload startup and shutdown time. The workload driver waits a bit to 
be sure things are
stable, before the official run data is collected.  The script timestamp waits 
until the run clients are
dead. So there's always a bit of a delta between the two. 

> Also:
> 
> 	2.5: Time for key creation 1:27
> 	2.4: Time for key creation 14:24
> versus:
> 	2.5: Time for table creation 16:48
> 	2.4: Time for table creation 8:58
>  
	
	
This is a Mystery Question - we don't have an answer, we were hoping _you 
would see something :)
Table creation involves sequential inserts of data from a flat file to an 
SAPDB B-tree on a devspace.
Our devspace is a raw device, so we're doing raw io, plus some processing. 
This op is write-intensive
'Key creation' is establishing a foreign key column contraint on various 
tables.  For each table, it examines every row in the table,
looks up (does a B-tree index lookup) the column value in a second table to 
find a specific primary key that matches the
column value in the first table. So again, some I/O, a bit of processing. Key 
creation (foreign key) is read-intensive.
Also interesting is the delta in index creation:
	2.5 Time for index creation 27:58
	2.4 Time for index creation 17:21
Index creation requires a read of the table, a sort, then creation of a B-tree 
index.  Both the index and
table creates build a B-tree for SAP-DB ( both run slower on 2.5 ) - the table 
creation does no sorting.
We also notice that the times for both index and key creation varies a bit 
more across runs with the -mm2 kernel,
as shown by the standard deviation across the runs. 
mingming and 2.4.18 are a bit more consistent. ( we threw out -mm2 run 5 for 
this average, due to the errors)

Results are: average time[std dev] 
Action           2.4.18        2.5.42-mm2     2.5.42-mm2-ipclock
table create 	 8:55 [0:04]   19:03 [2:40]    19.39 [0:50]
index create     17:17 [0:11]  25:19 [5:31]    28:05 [0:02]
key create       14:23 [0:16]  15:21 [6:37]    18:46 [0:17]

Also interesting is -mm2 run2 - foreign key creation took 5:26, the run 
completed with no errors...why so fast, only one time?
 It is an ongoing mystery. We Just Don't Know Why Right Now.
We are working on better data capture of db/run errors, and we'd love to hear 
suggestions
on improving the instrumentation. 


> So it's all rather confusing.  Masses of numbers usually _are_
> confusing.  What really adds tons of value to such an exercise is
> for the person who ran the test to write up some conclusions. 

Yes, agreed. We don't yet know enough to map from test results to an exact 
kernel area.
We just added a database expert to staff (Mary Edie Meredith) so we intend to 
get better.
We'll probably be nagging you a bit, and again we very much appreciate all 
suggestions.

 To
> tell the developers what went well, what went poorly, what areas
> to focus on, etc.  To use your own judgement to tell us what to
> zoom in on.
> 
> Is that something which could be added?
> 
It is something we are working on adding.  
cliffw

> 
> -------------------------------------------------------
> This sf.net email is sponsored by: Influence the future 
> of Java(TM) technology. Join the Java Community 
> Process(SM) (JCP(SM)) program now. 
> http://ads.sourceforge.net/cgi-bin/redirect.pl?sunm0003en
> _______________________________________________
> Lse-tech mailing list
> Lse-tech@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/lse-tech
> 



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch
  2002-10-25  0:35                   ` [Lse-tech] " Rick Lindsley
@ 2002-10-25  1:07                     ` Robert Love
  0 siblings, 0 replies; 51+ messages in thread
From: Robert Love @ 2002-10-25  1:07 UTC (permalink / raw)
  To: Rick Lindsley
  Cc: Andrew Morton, Hugh Dickins, cmm, manfred, linux-kernel, dipankar,
	lse-tech

On Thu, 2002-10-24 at 20:35, Rick Lindsley wrote:

> There was a time when "inline" was a very cool tool because it had been
> judged that the overhead of actually calling a function was just too
> heinous to contemplate.  From comments in this and other discussions,
> is it safe to say that the pendulum has now swung the other way?  I see
> a lot of people concerned about code size and apparently returning to
> the axiom of "if you use it more than once, make it a function."  Are
> we as a community coming around to using inlining only on very tight,
> very critical functions?

I think so, at least Andrew is championing us in that direction.  But I
agree.

It somewhere became the notion if the function is small, it
automatically should be inlined.  I suspect Andrew has even stricter
criteria than me (I think super small functions should be inlined) but
the general "its only a couple of lines" or "it could be a macro" are
not sufficient criterion for inlining.

So, my thoughts on suitable criteria would be:

	- used only once and only ever in that one spot (i.e.
	  it really could be part of the caller, but it was pulled
	  out for cleanliness.  Keep it inline to not have the
	  cleanliness cause a performance degradation (however
	  small)).

	- small functions, where small is so small the function
	  overhead is nearly the same size.  Stuff that might not
	  even do anything but return a permutation of an argument,
	  etc.

	- very very time critical functions in time critical places

So that removes the previous criteria of "the function is N lines or
smaller" where N is some number less than 100 :)

	Robert Love


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-24 23:30               ` Andrew Morton
  2002-10-24 23:59                 ` Hugh Dickins
  2002-10-25  0:07                 ` mingming cao
@ 2002-10-25  4:18                 ` Rusty Russell
  2002-10-25  5:53                   ` mingming cao
  2002-10-25  5:36                 ` Manfred Spraul
  2002-10-25 16:53                 ` Rik van Riel
  4 siblings, 1 reply; 51+ messages in thread
From: Rusty Russell @ 2002-10-25  4:18 UTC (permalink / raw)
  To: Andrew Morton; +Cc: hugh, cmm, manfred, linux-kernel, dipankar, lse-tech

On Thu, 24 Oct 2002 16:30:32 -0700
Andrew Morton <akpm@digeo.com> wrote:

> Hugh Dickins wrote:
> > 
> > ...
> > Manfred and I have both reviewed the patch (or the 2.5.44 version)
> > and we both recommend it highly (well, let Manfred speak for himself).
> > 
> 
> OK, thanks.
> 
> So I took a look.  Wish I hadn't :(  The locking rules in there
> are outrageously uncommented.  You must be brave people.

Agreed.  Here's my brief audit:

>+	int max_id = ids->max_id;
> 
>-	for (id = 0; id <= ids->max_id; id++) {
>+	read_barrier_depends();
>+	for (id = 0; id <= max_id; id++) {

That needs to be a rmb(), not a read_barrier_depends().  And like all
barriers, it *requires* a comment:
	/* We must read max_id before reading any entries */

I can't see the following in the patch posted, but:
> void ipc_rcu_free(void* ptr, int size)
> {
>         struct rcu_ipc_free* arg;
> 
>         arg = (struct rcu_ipc_free *) kmalloc(sizeof(*arg), GFP_KERNEL);
>         if (arg == NULL)
>                 return;
>         arg->ptr = ptr;
>         arg->size = size;
>         call_rcu(&arg->rcu_head, ipc_free_callback, arg);
> }

This is unacceptable crap, sorry.  You *must* allocate the resources
required to free the object *at the time you allocate the object*,
since freeing must not fail.

> Even better: is it possible to embed the rcu_ipc_free inside the
> object-to-be-freed?  Perhaps not?

Yes, this must be done.

Rusty.
-- 
   there are those who do and those who hang on and you don't see too
   many doers quoting their contemporaries.  -- Larry McVoy

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-24 23:30               ` Andrew Morton
                                   ` (2 preceding siblings ...)
  2002-10-25  4:18                 ` Rusty Russell
@ 2002-10-25  5:36                 ` Manfred Spraul
  2002-10-25 16:53                 ` Rik van Riel
  4 siblings, 0 replies; 51+ messages in thread
From: Manfred Spraul @ 2002-10-25  5:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Hugh Dickins, cmm, linux-kernel, dipankar, lse-tech

Andrew Morton wrote:

>Hugh Dickins wrote:
>  
>
>>...
>>Manfred and I have both reviewed the patch (or the 2.5.44 version)
>>and we both recommend it highly (well, let Manfred speak for himself).
>>
>>    
>>
>
>OK, thanks.
>
>So I took a look.  Wish I hadn't :(  The locking rules in there
>are outrageously uncommented.  You must be brave people.
>  
>
Ahm. No idea who wrote the current locking. But the patch is very nice, 
it reduces the lock contention without increasing the number of spinlock 
calls.

--
    Manfred


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-25  4:18                 ` Rusty Russell
@ 2002-10-25  5:53                   ` mingming cao
  2002-10-25  7:27                     ` Rusty Russell
  0 siblings, 1 reply; 51+ messages in thread
From: mingming cao @ 2002-10-25  5:53 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, hugh, manfred, linux-kernel, dipankar, lse-tech

Rusty Russell wrote:
> 
> 
> Here's my brief audit:
> 
> >+      int max_id = ids->max_id;
> >
> >-      for (id = 0; id <= ids->max_id; id++) {
> >+      read_barrier_depends();
> >+      for (id = 0; id <= max_id; id++) {
> 
> That needs to be a rmb(), not a read_barrier_depends().  

Thanks for spending some time reviewing the barriers for me. While I was
thinking the reason why a rmb is needed here, I found that maybe we
don't need a barrier here at all. Since ipc_findkey()(the code above)
and the grow_ary() are both protected by ipc_ids.sem(there missing
document for this), so both the max_id and the the entries array seen by
ipc_findkey should be the latest one.

Also I think it's safe to remove the rmb() in ipc_get() for the same
reason. ipc_get() is only used by shm_get_stat() through shm_get() and
is called with the shm_ids.sem protected. (Maybe ipc_get should be
removed totally?)

> And like all
> barriers, it *requires* a comment:
>         /* We must read max_id before reading any entries */
>
Sure.  I will add such comments on all places where barriers are being
used.  I will do as much as I can to add more comments in the code about
what lock/sem are hold before/after the funtion is called.:-)
 
> I can't see the following in the patch posted, but:
> > void ipc_rcu_free(void* ptr, int size)
> > {
> >         struct rcu_ipc_free* arg;
> >
> >         arg = (struct rcu_ipc_free *) kmalloc(sizeof(*arg), GFP_KERNEL);
> >         if (arg == NULL)
> >                 return;
> >         arg->ptr = ptr;
> >         arg->size = size;
> >         call_rcu(&arg->rcu_head, ipc_free_callback, arg);
> > }
> 
> This is unacceptable crap, sorry.  You *must* allocate the resources
> required to free the object *at the time you allocate the object*,
> since freeing must not fail.
> 
> > Even better: is it possible to embed the rcu_ipc_free inside the
> > object-to-be-freed?  Perhaps not?
> 
> Yes, this must be done.
> 
I thought about embed rcu_ipc_free inside the ipc_ids structure before. 
But there could be a problem if grow_ary() is called again before the
old array associated with the previous grow_ary() has not scheduled to
be freed yet.  I see a need to do that now, as you made very good point.
I will make the changes tomorrow.

Thanks a lot for your comments.

Mingming

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-25  5:53                   ` mingming cao
@ 2002-10-25  7:27                     ` Rusty Russell
  0 siblings, 0 replies; 51+ messages in thread
From: Rusty Russell @ 2002-10-25  7:27 UTC (permalink / raw)
  To: cmm; +Cc: Andrew Morton, hugh, manfred, linux-kernel, dipankar, lse-tech

In message <3DB8DC72.6A08C74F@us.ibm.com> you write:
> > This is unacceptable crap, sorry.  You *must* allocate the resources
> > required to free the object *at the time you allocate the object*,
> > since freeing must not fail.
> > 
> > > Even better: is it possible to embed the rcu_ipc_free inside the
> > > object-to-be-freed?  Perhaps not?
> > 
> > Yes, this must be done.
> > 
> I thought about embed rcu_ipc_free inside the ipc_ids structure before. 
> But there could be a problem if grow_ary() is called again before the
> old array associated with the previous grow_ary() has not scheduled to
> be freed yet.  I see a need to do that now, as you made very good point.
> I will make the changes tomorrow.

You don't need to allocate it in the object, but you *do* need to fail
grow_ary() if you can't allocate it.

I had the same dilemma when I tried to write a generic "kfree_rcu(void
*)" last year: you simply can't do it 8(

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch
  2002-10-24 23:23             ` mingming cao
@ 2002-10-25 14:21               ` Paul Larson
  2002-10-25 17:17                 ` mingming cao
  0 siblings, 1 reply; 51+ messages in thread
From: Paul Larson @ 2002-10-25 14:21 UTC (permalink / raw)
  To: cmm; +Cc: Andrew Morton, Hugh Dickins, manfred, lkml, dipankar, lse-tech

On Thu, 2002-10-24 at 18:23, mingming cao wrote:
> Thanks for your quick feedback.  I did LTP tests on it--it passed(well,
> I saw a failure on shmctl(), but the failure was there since 2.5.43
> kernel).  I will do more stress tests on it soon.
Which shmctl() test is this?  To my knowledge, there are no current
known issues with shmctl tests.  There is however one with sem02 in
semctl() that last I heard has been partially fixed in the kernel and
still needs to be fixed in glibc.  Is that the one you are referring to,
or is there really some other shmctl test in LTP that is failing?

Thanks,
Paul Larson


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-24 23:30               ` Andrew Morton
                                   ` (3 preceding siblings ...)
  2002-10-25  5:36                 ` Manfred Spraul
@ 2002-10-25 16:53                 ` Rik van Riel
  4 siblings, 0 replies; 51+ messages in thread
From: Rik van Riel @ 2002-10-25 16:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, cmm, manfred, linux-kernel, dipankar, lse-tech

On Thu, 24 Oct 2002, Andrew Morton wrote:

> And it seems that if the kmalloc fails, we decide to leak some
> memory, yes?
>
> If so it would be better to use GFP_ATOMIC there.  Avoids any
> locking problems and also increases the chance of the allocation
> succeeding.  (With an explanatory comment, naturally :)).

Actually, under memory load GFP_KERNEL will wait for the
memory to become available, while GFP_ATOMIC will fail.

Using GFP_ATOMIC here will probably increase the risk of
a memory leak.

regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/		http://distro.conectiva.com/
Current spamtrap:  <a href=mailto:"october@surriel.com">october@surriel.com</a>


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch
  2002-10-25 14:21               ` [Lse-tech] " Paul Larson
@ 2002-10-25 17:17                 ` mingming cao
  2002-10-25 18:20                   ` Paul Larson
  0 siblings, 1 reply; 51+ messages in thread
From: mingming cao @ 2002-10-25 17:17 UTC (permalink / raw)
  To: Paul Larson
  Cc: Andrew Morton, Hugh Dickins, manfred, lkml, dipankar, lse-tech

Paul Larson wrote:
> 
> On Thu, 2002-10-24 at 18:23, mingming cao wrote:
> > Thanks for your quick feedback.  I did LTP tests on it--it passed(well,
> > I saw a failure on shmctl(), but the failure was there since 2.5.43
> > kernel).  I will do more stress tests on it soon.
> Which shmctl() test is this?  To my knowledge, there are no current
> known issues with shmctl tests.  There is however one with sem02 in
> semctl() that last I heard has been partially fixed in the kernel and
> still needs to be fixed in glibc.  Is that the one you are referring to,
> or is there really some other shmctl test in LTP that is failing?

Here is the failure I saw on LTP test.  The one failed is 
/ltp-20020807/testcases/kernel/syscalls/ipc/shmctl/shmctl01

<<<test_start>>>
tag=shmctl01 stime=1035475025
cmdline="shmctl01"
contacts=""
analysis=exit
initiation_status="ok"
<<<test_output>>>
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    2  PASS  :  pid, size, # of attaches and mode are correct -
pass #2
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    2  PASS  :  pid, size, # of attaches and mode are correct -
pass #2
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    2  PASS  :  pid, size, # of attaches and mode are correct -
pass #2
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    2  PASS  :  pid, size, # of attaches and mode are correct -
pass #2
shmctl01    3  FAIL  :  # of attaches is incorrect - 0
shmctl01    4  PASS  :  new mode and change time are correct
<<<execution_status>>>
duration=1 termination_type=exited termination_id=1 corefile=no
cutime=0 cstime=0
<<<test_end>>>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
@ 2002-10-25 17:20 Cliff White
  0 siblings, 0 replies; 51+ messages in thread
From: Cliff White @ 2002-10-25 17:20 UTC (permalink / raw)
  To: linux-kernel, lse-tech


> mingming cao wrote:
> > 
> > Hi Andrew,
> > 
> > Here is the updated ipc lock patch:
> 
> Well I can get you a bit of testing and attention, but I'm afraid
> my knowledge of the IPC code is negligible.
> 
> So to be able to commend this change to Linus I'd have to rely on
> assurances from people who _do_ understand IPC (Hugh?) and on lots
> of testing.
> 
> So yes, I'll include it, and would solicit success reports from
> people who are actually exercising that code path, thanks.
> 
> > http://www.osdl.org/projects/dbt1prfrns/results/mingming/index.html
> 
> DBT1 is really interesting, and I'm glad the OSDL team have
> put it together.  If people would only stop sending me patches
> I'd be using it ;)
> 
Thank you very much for that :)

> Could someone please help explain the results?  Comparing, say,
> http://www.osdl.org/projects/dbt1prfrns/results/mingming/run.2cpu.42-mm2.r5/index.html
> and
> http://www.osdl.org/projects/dbt1prfrns/results/mingming/run.18.r5/index.html
> 
> It would appear that 2.5 completely smoked 2.4 on response time,
> yet the overall bogotransactions/sec is significantly lower.
> What should we conclude from this?

Whoa - we ran these 5 times for an average. The 2.5 run you picked was the 
'off' run -
It has the worse results. You will notice on this run, there are a large 
number of errors
which didn't happen on the other runs - this lowered the BT/sec number. Use 
one of the
other 2.5 ones and you'll see something more sensible. ( say, 42-mm2.r3) 
Unfortunately,
on average, 2.4 still beats 2.5 on both response time and BT's

 		         2.5.42-mm2     2.5.42-mm2-ipclock  2.4.18
 Average over 5 runs     85.0 BT           89.8 BT          96.92 BT
 Std Deviation 5 runs    7.4  BT           1.0 BT           2.07 BT
 Average of best 4 runs  88.15 BT          90.2 BT          97.2 BT
 Std Deviation 4 run     2.8 BT            0.5 BT            2.3 BT 
> 
One other place to start comparing is in the system information which is at 
the bottom of the page.
Some points (might be minor) : 
Cpu statistics:
	2.4.18 - cpu %idle averages around 1.5% %system swings between 3-7% %nice 
steady at ~3.6%
	2.5.42-mm2 cpu %idle 0.0 all thru run, %system steady at ~6% % nice up ~5.5
Swap (sar -r) 
	Very slight differences - we consumed ~98% of the memory in both cases, 2.4 
swapped a little
		bit (%28) more than 2.5 (%26) 
We also include profile data for both the load and run phase. (profile=2)

> Also I see:
> 
> 	14.7 minute duration
> and
> 	Time for DBT run 19:36
> 
> What is the 14.7 minutes referring to?
> 
The 14.7 minute time comes from the workload driver log, which are parsed to 
get the
response numbers. The 'Time for' stamps come from the master driver script, 
and include some
of the workload startup and shutdown time. The workload driver waits a bit to 
be sure things are
stable, before the official run data is collected.  The script timestamp waits 
until the run clients are
dead. So there's always a bit of a delta between the two. 

> Also:
> 
> 	2.5: Time for key creation 1:27
> 	2.4: Time for key creation 14:24
> versus:
> 	2.5: Time for table creation 16:48
> 	2.4: Time for table creation 8:58
>  
	
	
This is a Mystery Question - we don't have an answer, we were hoping _you 
would see something :)
Table creation involves sequential inserts of data from a flat file to an 
SAPDB B-tree on a devspace.
Our devspace is a raw device, so we're doing raw io, plus some processing. 
This op is write-intensive
'Key creation' is establishing a foreign key column contraint on various 
tables.  For each table, it examines every row in the table,
looks up (does a B-tree index lookup) the column value in a second table to 
find a specific primary key that matches the
column value in the first table. So again, some I/O, a bit of processing. Key 
creation (foreign key) is read-intensive.
Also interesting is the delta in index creation:
	2.5 Time for index creation 27:58
	2.4 Time for index creation 17:21
Index creation requires a read of the table, a sort, then creation of a B-tree 
index.  Both the index and
table creates build a B-tree for SAP-DB ( both run slower on 2.5 ) - the table 
creation does no sorting.
We also notice that the times for both index and key creation varies a bit 
more across runs with the -mm2 kernel,
as shown by the standard deviation across the runs. 
mingming and 2.4.18 are a bit more consistent. ( we threw out -mm2 run 5 for 
this average, due to the errors)

Results are: average time[std dev] 
Action           2.4.18        2.5.42-mm2     2.5.42-mm2-ipclock
table create 	 8:55 [0:04]   19:03 [2:40]    19.39 [0:50]
index create     17:17 [0:11]  25:19 [5:31]    28:05 [0:02]
key create       14:23 [0:16]  15:21 [6:37]    18:46 [0:17]

Also interesting is -mm2 run2 - foreign key creation took 5:26, the run 
completed with no errors...why so fast, only one time?
 It is an ongoing mystery. We Just Don't Know Why Right Now.
We are working on better data capture of db/run errors, and we'd love to hear 
suggestions
on improving the instrumentation. 


> So it's all rather confusing.  Masses of numbers usually _are_
> confusing.  What really adds tons of value to such an exercise is
> for the person who ran the test to write up some conclusions. 

Yes, agreed. We don't yet know enough to map from test results to an exact 
kernel area.
We just added a database expert to staff (Mary Edie Meredith) so we intend to 
get better.
We'll probably be nagging you a bit, and again we very much appreciate all 
suggestions.

 To
> tell the developers what went well, what went poorly, what areas
> to focus on, etc.  To use your own judgement to tell us what to
> zoom in on.
> 
> Is that something which could be added?
> 
It is something we are working on adding.  
cliffw

> 
> -------------------------------------------------------
> This sf.net email is sponsored by: Influence the future 
> of Java(TM) technology. Join the Java Community 
> Process(SM) (JCP(SM)) program now. 
> http://ads.sourceforge.net/cgi-bin/redirect.pl?sunm0003en
> _______________________________________________
> Lse-tech mailing list
> Lse-tech@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/lse-tech
> 



------- End of Forwarded Message




^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch
  2002-10-25 17:17                 ` mingming cao
@ 2002-10-25 18:20                   ` Paul Larson
  2002-10-25 18:51                     ` mingming cao
  0 siblings, 1 reply; 51+ messages in thread
From: Paul Larson @ 2002-10-25 18:20 UTC (permalink / raw)
  To: cmm; +Cc: Andrew Morton, Hugh Dickins, manfred, lkml, dipankar, lse-tech

On Fri, 2002-10-25 at 12:17, mingming cao wrote:
>
> shmctl01    3  FAIL  :  # of attaches is incorrect - 0
I guess you are running it with -i2?  I just tried shmctl01 -i2 on a
2.5.44 kernel and did not get this error.
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1 shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    2  PASS  :  pid, size, # of attaches and mode are correct -
pass #2
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    3  PASS  :  new mode and change time are correct
shmctl01    4  PASS  :  shared memory appears to be removed
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    2  PASS  :  pid, size, # of attaches and mode are correct -
pass #2
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    3  PASS  :  new mode and change time are correct
shmctl01    4  PASS  :  shared memory appears to be removed

If I can find some time, I'll try to grab your patch and see if I can
reproduce the error on my machine.

-Paul Larson


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch
  2002-10-25 18:20                   ` Paul Larson
@ 2002-10-25 18:51                     ` mingming cao
  2002-10-25 19:06                       ` Paul Larson
  2002-10-25 20:23                       ` Manfred Spraul
  0 siblings, 2 replies; 51+ messages in thread
From: mingming cao @ 2002-10-25 18:51 UTC (permalink / raw)
  To: Paul Larson
  Cc: Andrew Morton, Hugh Dickins, manfred, lkml, dipankar, lse-tech

Paul Larson wrote:
> 
> On Fri, 2002-10-25 at 12:17, mingming cao wrote:
> >
> > shmctl01    3  FAIL  :  # of attaches is incorrect - 0
> I guess you are running it with -i2?
No, I did not use -i2.

What I did is just run ./shmctl01

>  I just tried shmctl01 -i2 on a
> 2.5.44 kernel and did not get this error.
Sorry, Paul.  Could you try 2.5.44-mm4?  I saw the error on clean
2.5.44-mm4(without my patch). And I remember I saw this on 2.5.42-mm2
also. 

Here is what I saw:

[root@elm3b83 shmctl]# ./shmctl01
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    2  PASS  :  pid, size, # of attaches and mode are correct -
pass #2
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    3  FAIL  :  # of attaches is incorrect - 0
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    4  PASS  :  new mode and change time are correct

[root@elm3b83 shmctl]# ./shmctl01 -i2
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    2  PASS  :  pid, size, # of attaches and mode are correct -
pass #2
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    3  FAIL  :  # of attaches is incorrect - 0
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    4  PASS  :  new mode and change time are correct
shmctl01    1  BROK  :  couldn't create the shared memory segment
shmctl01    2  BROK  :  Remaining cases broken
shmctl01    3  BROK  :  Remaining cases broken
shmctl01    4  BROK  :  Remaining cases broken

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch
  2002-10-25 18:51                     ` mingming cao
@ 2002-10-25 19:06                       ` Paul Larson
  2002-10-25 20:14                         ` mingming cao
  2002-10-25 20:23                       ` Manfred Spraul
  1 sibling, 1 reply; 51+ messages in thread
From: Paul Larson @ 2002-10-25 19:06 UTC (permalink / raw)
  To: cmm; +Cc: Andrew Morton, Hugh Dickins, manfred, lkml, dipankar, lse-tech

On Fri, 2002-10-25 at 13:51, mingming cao wrote:
> Paul Larson wrote:
> > 
> > On Fri, 2002-10-25 at 12:17, mingming cao wrote:
> > >
> > > shmctl01    3  FAIL  :  # of attaches is incorrect - 0
> > I guess you are running it with -i2?
> No, I did not use -i2.
Maybe I just read it wrong.

> What I did is just run ./shmctl01
> 
> >  I just tried shmctl01 -i2 on a
> > 2.5.44 kernel and did not get this error.
> Sorry, Paul.  Could you try 2.5.44-mm4?  I saw the error on clean
> 2.5.44-mm4(without my patch). And I remember I saw this on 2.5.42-mm2
> also. 
> 
> Here is what I saw:
I still have my results from testing 2.5.44-mm4, here's a cut and paste
from shmctl01:

<<<test_start>>>
tag=shmctl01 stime=1035486589
cmdline="shmctl01"
contacts=""
analysis=exit
initiation_status="ok"
<<<test_output>>>
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    0  INFO  :  shmdt() failed - 22
shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
pass #1
shmctl01    2  PASS  :  pid, size, # of attaches and mode are correct -
pass #2
shmctl01    3  PASS  :  new mode and change time are correct
shmctl01    4  PASS  :  shared memory appears to be removed
<<<execution_status>>>
duration=1 termination_type=exited termination_id=0 corefile=no
cutime=0 cstime=1
<<<test_end>>>

I havn't seen this test fail before but I'll be happy to do more testing
with your patch to see if I can reproduce it.  You may also want to
consider updating LTP to the newest version.  I'm fairly certain that
shmctl01 hasn't been changed since the version you have, but just to be
consistent you may want to do that.

Thanks,
Paul Larson


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch
  2002-10-25 19:06                       ` Paul Larson
@ 2002-10-25 20:14                         ` mingming cao
  0 siblings, 0 replies; 51+ messages in thread
From: mingming cao @ 2002-10-25 20:14 UTC (permalink / raw)
  To: Paul Larson; +Cc: Andrew Morton, lkml, lse-tech

Paul Larson wrote:
> 
> I havn't seen this test fail before but I'll be happy to do more testing
> with your patch to see if I can reproduce it.  You may also want to
> consider updating LTP to the newest version.  I'm fairly certain that
> shmctl01 hasn't been changed since the version you have, but just to be
> consistent you may want to do that.
> 
Ha! Sorry about the confusion.  I re-install ltp test suites and the
error is gone. My old tests must be dirty.

Mingming

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch
  2002-10-25 18:51                     ` mingming cao
  2002-10-25 19:06                       ` Paul Larson
@ 2002-10-25 20:23                       ` Manfred Spraul
  1 sibling, 0 replies; 51+ messages in thread
From: Manfred Spraul @ 2002-10-25 20:23 UTC (permalink / raw)
  To: cmm; +Cc: Paul Larson, Andrew Morton, Hugh Dickins, lkml, dipankar,
	lse-tech

[-- Attachment #1: Type: text/plain, Size: 918 bytes --]

mingming cao wrote:

>Here is what I saw:
>
>[root@elm3b83 shmctl]# ./shmctl01
>shmctl01    1  PASS  :  pid, size, # of attaches and mode are correct -
>pass #1
>shmctl01    0  INFO  :  shmdt() failed - 22
>shmctl01    0  INFO  :  shmdt() failed - 22
>shmctl01    0  INFO  :  shmdt() failed - 22
>shmctl01    0  INFO  :  shmdt() failed - 22
>
These failures are caused by a bug in the ltp test. See the attached patch.

>shmctl01    2  PASS  :  pid, size, # of attaches and mode are correct -
>pass #2
>shmctl01    0  INFO  :  shmdt() failed - 22
>shmctl01    0  INFO  :  shmdt() failed - 22
>shmctl01    0  INFO  :  shmdt() failed - 22
>shmctl01    0  INFO  :  shmdt() failed - 22
>shmctl01    3  FAIL  :  # of attaches is incorrect - 0
>
This one is odd. The testcase contains races, but they can only increase 
# of attaches.
Could you strace shmctl01?
The testcase with shmat(), then fork() fails.

--
    Manfred

[-- Attachment #2: patch-ltp --]
[-- Type: text/plain, Size: 714 bytes --]

diff -u ltp-orig/testcases/kernel/syscalls/ipc/shmctl/shmctl01.c ltp-20021008/testcases/kernel/syscalls/ipc/shmctl/shmctl01.c
--- ltp-orig/testcases/kernel/syscalls/ipc/shmctl/shmctl01.c	Tue May 21 15:55:56 2002
+++ ltp-20021008/testcases/kernel/syscalls/ipc/shmctl/shmctl01.c	Fri Oct 25 22:14:23 2002
@@ -252,10 +252,12 @@
 
 			if (stat_time == FIRST) {
 				test = set_shmat();
+			} else {
+				test = set_shared;
 			}
 
 			/* do an assignement for fun */
-			(int *)test = i;
+			*(int *)test = i;
 
 			/* pause until we get a signal from stat_cleanup() */
 			rval = pause();
@@ -273,7 +275,7 @@
 		}
 	}
 	/* sleep briefly to ensure correct execution order */
-	usleep(25000);
+	usleep(250000);
 }
 
 /*

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
       [not found] <Pine.LNX.4.44.0210270748560.1704-100000@localhost.localdomain>
@ 2002-10-28  1:06 ` Rusty Russell
  2002-10-28 14:21   ` Hugh Dickins
  2002-10-28 20:00   ` Dipankar Sarma
  0 siblings, 2 replies; 51+ messages in thread
From: Rusty Russell @ 2002-10-28  1:06 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: mingming cao, Andrew Morton, linux-kernel

In message <Pine.LNX.4.44.0210270748560.1704-100000@localhost.localdomain> you 
write:
> On Sun, 27 Oct 2002, Rusty Russell wrote:
> > 
> > You can't do that.  It's the price you pay.  It's nonsensical to fail
> > to destroy an shm or sem.
> 
> Ironic, but not nonsensical.

Yes, nonsensical.  Firstly, it's in violation of the standard to fail
IPC_RMID under random circumstances.  Secondly, failing to clean up is
an unhandlable error, since you're quite possible in the failure path
of the code already.  This is a well known issue.

> > Using a mempool is putting your head in the sand, because it's use is
> > not bounded.  Might as well just ignore kmalloc failures and leak
> > memory, which is *really* dumb, because if we get killed by the
> > oom-killer because we're out of memory, and that results in IPC trying
> > to free.
> 
> Bounded in what sense?  The mempool is dedicated to ipc freeing, it's
> not being drawn on by other kinds of use.  In the OOM-kill case of
> actually getting down to using the reserved pool, each reserved item
> will be returned when RCU (and, in the vfree case, the additional
> scheduled work) has done with it.  Unbounded in that we cannot say
> how many milliseconds that will take, but so what?

Two oom kills.  Three oom kills.  Four oom kills.  Where's the bound
here?

Our allocator behavior for GFP_KERNEL has changed several times.  Are
you sure that it won't *ever* fail under other circomstances?

> Okay (I expect, didn't review it) for just the ids arrays, but too much
> memory waste if we have to allocate for each msq, sema, shm: if there's
> a better solution available.  mempool looks better to us.

It's a hacky, fragile and incorrect solution.  It's completely
tasteless.

There are *three* things this patch does, and it's not clear which
ones helped the dbt1 benchmark (and I can't find the source).  But
let's assume they're all good optimizations (*cough*).

In order to save 12 bytes, you've added dozens of lines of subtle,
fragile, incorrect code.  You honestly think this is a worthwhile
tradeoff?

> > Hope that helps,
> 
> Not yet.  You seem to have had a bad experience with something like
> this (the mempools or the RCU or the combination), and you're warning
> us away without actually telling us what you found.

I *wrote* the RCU interface (though the implementation in 2.5 isn't
mine).  I thought it was pretty clear how it was supposed to be used.
Obviously, I was wrong.

> I suspect that whatever it was that you found, is not relevant to
> this IPC case.

Clear code vs. bad code?  Definitely relevent here.

Patch below is against Mingming's mm4 release.  Compiles, untested.
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

Commentry: There are several cases where ipc_alloc is *always*
followed by ipc_free, so the extra allocation is gratuitous, but these
are temporary allocations and the extra size is not critical.

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.44-mm4-ipc-rcu/include/linux/msg.h working-2.5.44-mm4-ipc-rcu-fix/include/linux/msg.h
--- working-2.5.44-mm4-ipc-rcu/include/linux/msg.h	2002-07-21 17:43:10.000000000 +1000
+++ working-2.5.44-mm4-ipc-rcu-fix/include/linux/msg.h	2002-10-28 11:12:54.000000000 +1100
@@ -2,6 +2,7 @@
 #define _LINUX_MSG_H
 
 #include <linux/ipc.h>
+#include <linux/rcupdate.h>
 
 /* ipcs ctl commands */
 #define MSG_STAT 11
@@ -90,6 +91,8 @@ struct msg_queue {
 	struct list_head q_messages;
 	struct list_head q_receivers;
 	struct list_head q_senders;
+
+	struct rcu_head q_free;		/* Used to free this via rcu */
 };
 
 asmlinkage long sys_msgget (key_t key, int msgflg);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.44-mm4-ipc-rcu/include/linux/sem.h working-2.5.44-mm4-ipc-rcu-fix/include/linux/sem.h
--- working-2.5.44-mm4-ipc-rcu/include/linux/sem.h	2002-06-06 21:38:40.000000000 +1000
+++ working-2.5.44-mm4-ipc-rcu-fix/include/linux/sem.h	2002-10-28 11:24:39.000000000 +1100
@@ -2,6 +2,7 @@
 #define _LINUX_SEM_H
 
 #include <linux/ipc.h>
+#include <linux/rcupdate.h>
 
 /* semop flags */
 #define SEM_UNDO        0x1000  /* undo the operation on exit */
@@ -94,6 +95,7 @@ struct sem_array {
 	struct sem_queue	**sem_pending_last; /* last pending operation */
 	struct sem_undo		*undo;		/* undo requests on this array */
 	unsigned long		sem_nsems;	/* no. of semaphores in array */
+	struct rcu_head		sem_free;	/* used to rcu free array. */
 };
 
 /* One queue for each sleeping process in the system. */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.44-mm4-ipc-rcu/ipc/util.c working-2.5.44-mm4-ipc-rcu-fix/ipc/util.c
--- working-2.5.44-mm4-ipc-rcu/ipc/util.c	2002-10-28 11:08:35.000000000 +1100
+++ working-2.5.44-mm4-ipc-rcu-fix/ipc/util.c	2002-10-28 12:01:09.000000000 +1100
@@ -213,21 +213,49 @@ struct kern_ipc_perm* ipc_rmid(struct ip
 	return p;
 }
 
+struct ipc_rcu_kmalloc
+{
+	struct rcu_head rcu;
+	/* "void *" makes sure alignment of following data is sane. */
+	void *data[0];
+};
+
+struct ipc_rcu_vmalloc
+{
+	struct rcu_head rcu;
+	struct work_struct work;
+	/* "void *" makes sure alignment of following data is sane. */
+	void *data[0];
+};
+
+static inline int use_vmalloc(int size)
+{
+	/* Too big for a single page? */
+	if (sizeof(struct ipc_rcu_kmalloc) + size > PAGE_SIZE)
+		return 1;
+	return 0;
+}
+
 /**
  *	ipc_alloc	-	allocate ipc space
  *	@size: size desired
  *
  *	Allocate memory from the appropriate pools and return a pointer to it.
- *	NULL is returned if the allocation fails
+ *	NULL is returned if the allocation fails.  This can be freed with
+ *	ipc_free (to free immediately) or ipc_rcu_free (to free once safe).
  */
- 
 void* ipc_alloc(int size)
 {
 	void* out;
-	if(size > PAGE_SIZE)
-		out = vmalloc(size);
-	else
-		out = kmalloc(size, GFP_KERNEL);
+	/* We prepend the allocation with the rcu struct, and
+           workqueue if necessary (for vmalloc). */
+	if (use_vmalloc(size)) {
+		out = vmalloc(sizeof(struct ipc_rcu_vmalloc) + size);
+		if (out) out += sizeof(struct ipc_rcu_vmalloc);
+	} else {
+		out = kmalloc(sizeof(struct ipc_rcu_kmalloc)+size, GFP_KERNEL);
+		if (out) out += sizeof(struct ipc_rcu_kmalloc);
+	}
 	return out;
 }
 
@@ -242,48 +270,36 @@ void* ipc_alloc(int size)
  
 void ipc_free(void* ptr, int size)
 {
-	if(size > PAGE_SIZE)
-		vfree(ptr);
+	if (use_vmalloc(size))
+		vfree(ptr - sizeof(struct ipc_rcu_vmalloc));
 	else
-		kfree(ptr);
+		kfree(ptr - sizeof(struct ipc_rcu_kmalloc));
 }
 
 /* 
  * Since RCU callback function is called in bh,
  * we need to defer the vfree to schedule_work
  */
-static void ipc_free_scheduled(void* arg)
+static void ipc_schedule_free(void *arg)
 {
-	struct rcu_ipc_free *a = (struct rcu_ipc_free *)arg;
-	vfree(a->ptr);
-	kfree(a);
-}
+	struct ipc_rcu_vmalloc *free = arg;
 
-static void ipc_free_callback(void* arg)
-{
-	struct rcu_ipc_free *a = (struct rcu_ipc_free *)arg;
-	/* 
-	 * if data is vmalloced, then we need to delay the free
-	 */
-	if (a->size > PAGE_SIZE) {
-		INIT_WORK(&a->work, ipc_free_scheduled, arg);
-		schedule_work(&a->work);
-	} else {
-		kfree(a->ptr);
-		kfree(a);
-	}
+	INIT_WORK(&free->work, vfree, free);
+	schedule_work(&free->work);
 }
 
 void ipc_rcu_free(void* ptr, int size)
 {
-	struct rcu_ipc_free* arg;
-
-	arg = (struct rcu_ipc_free *) kmalloc(sizeof(*arg), GFP_KERNEL);
-	if (arg == NULL)
-		return;
-	arg->ptr = ptr;
-	arg->size = size;
-	call_rcu(&arg->rcu_head, ipc_free_callback, arg);
+	if (use_vmalloc(size)) {
+		struct ipc_rcu_vmalloc *free;
+		free = ptr - sizeof(*free);
+		call_rcu(&free->rcu, ipc_schedule_free, free);
+	} else {
+		struct ipc_rcu_kmalloc *free;
+		free = ptr - sizeof(*free);
+		/* kfree takes a "const void *" so gcc warns.  So we cast. */
+		call_rcu(&free->rcu, (void (*)(void *))kfree, free);
+	}
 }
 
 /**

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
@ 2002-10-28  1:15 Rusty Russell
  2002-10-28  1:35 ` Davide Libenzi
  0 siblings, 1 reply; 51+ messages in thread
From: Rusty Russell @ 2002-10-28  1:15 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: mingming cao, Andrew Morton, linux-kernel

> diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.44-mm4-ipc-rcu/include/linux/msg.h working-2.5.44-mm4-ipc-rcu-fix/include/linux/msg.h
> --- working-2.5.44-mm4-ipc-rcu/include/linux/msg.h	2002-07-21 17:43:10.000000000 +1000
> +++ working-2.5.44-mm4-ipc-rcu-fix/include/linux/msg.h	2002-10-28 11:12:54.000000000 +1100

Oops.  That patch had some fluff in msg.h and sem.h.  Delete those, or
just use this instead (still against Mingming's mm4 "ignore kmalloc
failure" patch):

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.44-mm4-ipc-rcu/ipc/util.c working-2.5.44-mm4-ipc-rcu-fix/ipc/util.c
--- working-2.5.44-mm4-ipc-rcu/ipc/util.c	2002-10-28 11:08:35.000000000 +1100
+++ working-2.5.44-mm4-ipc-rcu-fix/ipc/util.c	2002-10-28 12:01:09.000000000 +1100
@@ -213,21 +213,49 @@ struct kern_ipc_perm* ipc_rmid(struct ip
 	return p;
 }
 
+struct ipc_rcu_kmalloc
+{
+	struct rcu_head rcu;
+	/* "void *" makes sure alignment of following data is sane. */
+	void *data[0];
+};
+
+struct ipc_rcu_vmalloc
+{
+	struct rcu_head rcu;
+	struct work_struct work;
+	/* "void *" makes sure alignment of following data is sane. */
+	void *data[0];
+};
+
+static inline int use_vmalloc(int size)
+{
+	/* Too big for a single page? */
+	if (sizeof(struct ipc_rcu_kmalloc) + size > PAGE_SIZE)
+		return 1;
+	return 0;
+}
+
 /**
  *	ipc_alloc	-	allocate ipc space
  *	@size: size desired
  *
  *	Allocate memory from the appropriate pools and return a pointer to it.
- *	NULL is returned if the allocation fails
+ *	NULL is returned if the allocation fails.  This can be freed with
+ *	ipc_free (to free immediately) or ipc_rcu_free (to free once safe).
  */
- 
 void* ipc_alloc(int size)
 {
 	void* out;
-	if(size > PAGE_SIZE)
-		out = vmalloc(size);
-	else
-		out = kmalloc(size, GFP_KERNEL);
+	/* We prepend the allocation with the rcu struct, and
+           workqueue if necessary (for vmalloc). */
+	if (use_vmalloc(size)) {
+		out = vmalloc(sizeof(struct ipc_rcu_vmalloc) + size);
+		if (out) out += sizeof(struct ipc_rcu_vmalloc);
+	} else {
+		out = kmalloc(sizeof(struct ipc_rcu_kmalloc)+size, GFP_KERNEL);
+		if (out) out += sizeof(struct ipc_rcu_kmalloc);
+	}
 	return out;
 }
 
@@ -242,48 +270,36 @@ void* ipc_alloc(int size)
  
 void ipc_free(void* ptr, int size)
 {
-	if(size > PAGE_SIZE)
-		vfree(ptr);
+	if (use_vmalloc(size))
+		vfree(ptr - sizeof(struct ipc_rcu_vmalloc));
 	else
-		kfree(ptr);
+		kfree(ptr - sizeof(struct ipc_rcu_kmalloc));
 }
 
 /* 
  * Since RCU callback function is called in bh,
  * we need to defer the vfree to schedule_work
  */
-static void ipc_free_scheduled(void* arg)
+static void ipc_schedule_free(void *arg)
 {
-	struct rcu_ipc_free *a = (struct rcu_ipc_free *)arg;
-	vfree(a->ptr);
-	kfree(a);
-}
+	struct ipc_rcu_vmalloc *free = arg;
 
-static void ipc_free_callback(void* arg)
-{
-	struct rcu_ipc_free *a = (struct rcu_ipc_free *)arg;
-	/* 
-	 * if data is vmalloced, then we need to delay the free
-	 */
-	if (a->size > PAGE_SIZE) {
-		INIT_WORK(&a->work, ipc_free_scheduled, arg);
-		schedule_work(&a->work);
-	} else {
-		kfree(a->ptr);
-		kfree(a);
-	}
+	INIT_WORK(&free->work, vfree, free);
+	schedule_work(&free->work);
 }
 
 void ipc_rcu_free(void* ptr, int size)
 {
-	struct rcu_ipc_free* arg;
-
-	arg = (struct rcu_ipc_free *) kmalloc(sizeof(*arg), GFP_KERNEL);
-	if (arg == NULL)
-		return;
-	arg->ptr = ptr;
-	arg->size = size;
-	call_rcu(&arg->rcu_head, ipc_free_callback, arg);
+	if (use_vmalloc(size)) {
+		struct ipc_rcu_vmalloc *free;
+		free = ptr - sizeof(*free);
+		call_rcu(&free->rcu, ipc_schedule_free, free);
+	} else {
+		struct ipc_rcu_kmalloc *free;
+		free = ptr - sizeof(*free);
+		/* kfree takes a "const void *" so gcc warns.  So we cast. */
+		call_rcu(&free->rcu, (void (*)(void *))kfree, free);
+	}
 }
 
 /**

--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28  1:15 Rusty Russell
@ 2002-10-28  1:35 ` Davide Libenzi
  2002-10-28  4:10   ` Rusty Russell
  0 siblings, 1 reply; 51+ messages in thread
From: Davide Libenzi @ 2002-10-28  1:35 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Hugh Dickins, mingming cao, Andrew Morton, linux-kernel

On Mon, 28 Oct 2002, Rusty Russell wrote:

> +struct ipc_rcu_kmalloc
> +{
> +	struct rcu_head rcu;
> +	/* "void *" makes sure alignment of following data is sane. */
> +	void *data[0];
> +};

Rusty, why not using gcc "aligned" keywords instead of black magic :

void *data[0];



- Davide



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28  1:35 ` Davide Libenzi
@ 2002-10-28  4:10   ` Rusty Russell
  2002-10-28 17:08     ` Davide Libenzi
  0 siblings, 1 reply; 51+ messages in thread
From: Rusty Russell @ 2002-10-28  4:10 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Hugh Dickins, mingming cao, Andrew Morton, linux-kernel

In message <Pine.LNX.4.44.0210271734450.7252-100000@blue1.dev.mcafeelabs.com> y
ou write:
> On Mon, 28 Oct 2002, Rusty Russell wrote:
> 
> > +struct ipc_rcu_kmalloc
> > +{
> > +	struct rcu_head rcu;
> > +	/* "void *" makes sure alignment of following data is sane. */
> > +	void *data[0];
> > +};
> 
> Rusty, why not using gcc "aligned" keywords instead of black magic :
> 
> void *data[0];

I think it's clearer *why* it's being done than:

	struct ipc_rcu_kmalloc
	{
		struct rcu_head rcu;
	} __attribute__((aligned(__alignof__(void *))));

And simpler than:

	struct ipc_rcu_kmalloc
	{
		struct rcu_head rcu;
		/* "void *" makes sure alignment of following data is sane. */
		char data[0] __attribute__((aligned(__alignof__(void *))));
	};

Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28  1:06 ` Rusty Russell
@ 2002-10-28 14:21   ` Hugh Dickins
  2002-10-28 21:47     ` Rusty Russell
  2002-10-28 20:00   ` Dipankar Sarma
  1 sibling, 1 reply; 51+ messages in thread
From: Hugh Dickins @ 2002-10-28 14:21 UTC (permalink / raw)
  To: Rusty Russell; +Cc: mingming cao, Andrew Morton, linux-kernel

You may prefer to skip the history and goto Your_patch;

On Mon, 28 Oct 2002, Rusty Russell wrote:
> In message <Pine.LNX.4.44.0210270748560.1704-100000@localhost.localdomain> you 
> write:
> > On Sun, 27 Oct 2002, Rusty Russell wrote:
> > > 
> > > You can't do that.  It's the price you pay.  It's nonsensical to fail
> > > to destroy an shm or sem.
> > 
> > Ironic, but not nonsensical: remember, this would only happen (if we
> > abandoned the mempool method and) the task trying to free is itself
> > being OOM-killed - sometimes, OOM-killing will kill the very task
> > that might have gone on to free even more memory, just a sad fact.

(Since you're bringing our discussion to linux-kernel,
I've restored the full paragraph of my side of the argument above.)

> Yes, nonsensical.  Firstly, it's in violation of the standard to fail
> IPC_RMID under random circumstances.  Secondly, failing to clean up is
> an unhandlable error, since you're quite possible in the failure path
> of the code already.  This is a well known issue.

The task which would have failed was being OOM-killed: it's not even
going to get back to userspace.  It might have been OOM-killed just
before it tried to IPC_RMID, but it happened during, that's all.
I think OOM-killing lies, shall we say, outside the standard?

But that's all irrelevant: we (Andrew tidied patch by Mingming
following recommendation by me of solution by Andrew) added the
mempool so that it will surely succeed, as you insisted, even if
OOM-kill intervenes at the worst moment.

> > > Using a mempool is putting your head in the sand, because it's use is
> > > not bounded.  Might as well just ignore kmalloc failures and leak
> > > memory, which is *really* dumb, because if we get killed by the
> > > oom-killer because we're out of memory, and that results in IPC trying
> > > to free.
> > 
> > Bounded in what sense?  The mempool is dedicated to ipc freeing, it's
> > not being drawn on by other kinds of use.  In the OOM-kill case of
> > actually getting down to using the reserved pool, each reserved item
> > will be returned when RCU (and, in the vfree case, the additional
> > scheduled work) has done with it.  Unbounded in that we cannot say
> > how many milliseconds that will take, but so what?
> 
> Two oom kills.  Three oom kills.  Four oom kills.  Where's the bound
> here?

No bound to the number of possible OOM kills, but what problem is that?
I got excited for a few minutes when I thought you were saying that the
task being OOM-killed would exit holding on to its mempool buffer.  But
that's not so, is it?  We'd have wider, more serious resource problems
with OOM kills if that were so.

> Our allocator behavior for GFP_KERNEL has changed several times.  Are
> you sure that it won't *ever* fail under other circomstances?

Well, I'll be surprised if we change kmalloc(GFP_KERNEL) to fail for
reasons other than memory shortage ("insufficient privilege"? hmm,
we'd change the names before that); though how hard it tries to decide
if there's really a memory shortage certainly changes from one kernel
to another.  But so long as it doesn't fail very often in normal
circumstances, it's okay: the reserved mempool buffers back it up.

> > Okay (I expect, didn't review it) for just the ids arrays, but too much
> > memory waste if we have to allocate for each msq, sema, shm: if there's
> > a better solution available.  mempool looks better to us.
> 
> It's a hacky, fragile and incorrect solution.  It's completely
> tasteless.

I guess it's a step forward that you haven't called it senseless.
Hacky, tasteless?  Maybe, depends on your taste.  I'm inclined to
counter that it's _fashionable_, which holds about as much weight.
But fragile, incorrect?  You've repeatedly failed to argue that.

> In order to save 12 bytes, you've added dozens of lines of subtle,
> fragile, incorrect code.  You honestly think this is a worthwhile
> tradeoff?

I thought mempool was the best way to go, to avoid wasting 80 bytes
per msq, sema, shmseg - I thought it would get ugly (hacky, tasteless)
to avoid the struct work overhead in some cases and not others.

Your_patch shows that it need not be ugly, and reduces the normal
waste to 16 bytes per msq, sema, shmseg.  These are not struct pages,
that amount will often be wasted by cacheline alignment too, so I'm
not going to get into a fight over it.

I think your patch looks quite nice - apart from the subtle hacky
fragile tasteless void *data[0]; but if something like that is
necessary, I'm sure someone can come up with a better there.

If Andrew or Mingming likes it and wants to replace the mempool
solution by yours, I'm not going to object (but if that is done,
remove struct rcu_ipc_free from ipc/util.h, and move its includes
of rcupdate.h and workqueue.h to ipc/util.c).  If they prefer the
mempool method, that's fine with me too.

But I wish you'd introduced your patch as "Here, I think this is
a nicer way of doing it, and doesn't waste as much as you thought:
you don't have to bother with a mempool this way" instead of getting
so mysteriously upset about it, implying some idiot failure in the
mempool method which you've not yet revealed to us.

Hugh

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28  4:10   ` Rusty Russell
@ 2002-10-28 17:08     ` Davide Libenzi
  2002-10-28 22:39       ` Rusty Russell
  0 siblings, 1 reply; 51+ messages in thread
From: Davide Libenzi @ 2002-10-28 17:08 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Hugh Dickins, mingming cao, Andrew Morton,
	Linux Kernel Mailing List

On Mon, 28 Oct 2002, Rusty Russell wrote:

> In message <Pine.LNX.4.44.0210271734450.7252-100000@blue1.dev.mcafeelabs.com> y
> ou write:
> > On Mon, 28 Oct 2002, Rusty Russell wrote:
> >
> > > +struct ipc_rcu_kmalloc
> > > +{
> > > +	struct rcu_head rcu;
> > > +	/* "void *" makes sure alignment of following data is sane. */
> > > +	void *data[0];
> > > +};
> >
> > Rusty, why not using gcc "aligned" keywords instead of black magic :
> >
> > void *data[0];
>
> I think it's clearer *why* it's being done than:
>
> 	struct ipc_rcu_kmalloc
> 	{
> 		struct rcu_head rcu;
> 	} __attribute__((aligned(__alignof__(void *))));

Well, not really Rusty. The above syntax uses documented gcc features
already used inside the kernel, while the fact that void *data[0];
enforces alignment it is not ( to my knowledge ) documented anywhere.
You can also avoid the comment using the aligned syntax ...



- Davide



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28  1:06 ` Rusty Russell
  2002-10-28 14:21   ` Hugh Dickins
@ 2002-10-28 20:00   ` Dipankar Sarma
  2002-10-28 21:41     ` Rusty Russell
  2002-10-28 22:07     ` mingming cao
  1 sibling, 2 replies; 51+ messages in thread
From: Dipankar Sarma @ 2002-10-28 20:00 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Hugh Dickins, Mingming Cao, Andrew Morton, linux-kernel

Hi Rusty,

I am pathologically late in catching up lkml, so if I missed some
context here, I apologize in advance. I have just started looking
at mm6 ipc code and I want to point out a few things.

On Mon, Oct 28, 2002 at 02:20:04AM +0100, Rusty Russell wrote:
> Yes, nonsensical.  Firstly, it's in violation of the standard to fail
> IPC_RMID under random circumstances.  Secondly, failing to clean up is
> an unhandlable error, since you're quite possible in the failure path
> of the code already.  This is a well known issue.

I am not sure how Ming/Hugh's current IPC changes affect IPC_RMID.
It affects only when you are trying to add a new ipc. In fact,
since it is a *add* operation (grow_ary()), it seems ok to fail it if rcu_head
allocation fails. Feel free to correct me if I missed something here.
AFAICS, the rcu stuff doesn't affect any freeing other than the IPC
id array.

> Two oom kills.  Three oom kills.  Four oom kills.  Where's the bound
> here?
> 
> Our allocator behavior for GFP_KERNEL has changed several times.  Are
> you sure that it won't *ever* fail under other circomstances?
> 
> > Okay (I expect, didn't review it) for just the ids arrays, but too much
> > memory waste if we have to allocate for each msq, sema, shm: if there's
> > a better solution available.  mempool looks better to us.
> 
> It's a hacky, fragile and incorrect solution.  It's completely
> tasteless.

Yes, the mempool code is broken, but only because rcu_backup_pool
is created three times, one by each IPC mechanism init :-)

> > Not yet.  You seem to have had a bad experience with something like
> > this (the mempools or the RCU or the combination), and you're warning
> > us away without actually telling us what you found.
> 
> I *wrote* the RCU interface (though the implementation in 2.5 isn't
> mine).  I thought it was pretty clear how it was supposed to be used.
> Obviously, I was wrong.

Yes, we went through this a long time ago and the general model
is to embedd the rcu_head thereby allocating it at the time
of allocation of the RCU protected data. This increases the
probability of recovery from low-memory situation as compared
to having to allocte during freeing.

That said, it seems that Ming/Hugh's patch does allocate
the rcu_head at the time of *growing* the array. It is just
that they allocate it for the freeing array rather than the
allocated array. I don't see how this is semantically different
from clubbing the two allocations other than the fact that
smaller number of allocation calls would likely reduce the
likelyhood of allocation failures.

> Patch below is against Mingming's mm4 release.  Compiles, untested.
> Rusty.

Yes, this is the typical RCU model, except that in this case (IPC),
I am not quite sure if it is in effect that different from what Ming/Hugh
have done.

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28 20:00   ` Dipankar Sarma
@ 2002-10-28 21:41     ` Rusty Russell
  2002-10-29  6:11       ` Dipankar Sarma
  2002-10-28 22:07     ` mingming cao
  1 sibling, 1 reply; 51+ messages in thread
From: Rusty Russell @ 2002-10-28 21:41 UTC (permalink / raw)
  To: dipankar; +Cc: Hugh Dickins, Mingming Cao, Andrew Morton, linux-kernel

In message <20021029013059.A13287@dikhow> you write:
> Hi Rusty,
> 
> I am pathologically late in catching up lkml, so if I missed some
> context here, I apologize in advance. I have just started looking
> at mm6 ipc code and I want to point out a few things.

That's OK, I'm still 1500 behind 8(

	If all current uses are embedded, can we remove the "void
*arg" and reduce the size of struct rcu_head by 25%?  Users can always
embed it in their own struct which has a "void *arg", but if that's
the uncommon case, it'd be nice to slim it a little.

	It'd also be nice to change the double linked list to a single
too: as far as I can tell the only issue is the list_add_tail in
call_rcu(): how important is this ordering?  It can be done by keeping
a head as well as a tail pointer if required.

I'd be happy to prepare a patch, to avoid more complaints of bloat 8)

> That said, it seems that Ming/Hugh's patch does allocate
> the rcu_head at the time of *growing* the array. It is just
> that they allocate it for the freeing array rather than the
> allocated array. I don't see how this is semantically different
> from clubbing the two allocations other than the fact that
> smaller number of allocation calls would likely reduce the
> likelyhood of allocation failures.

We must be looking at different variants of the patch.  This one does:
IPC_RMID -> freeary() -> ipc_rcu_free -> kmalloc.

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28 14:21   ` Hugh Dickins
@ 2002-10-28 21:47     ` Rusty Russell
  2002-10-29  0:26       ` Hugh Dickins
  0 siblings, 1 reply; 51+ messages in thread
From: Rusty Russell @ 2002-10-28 21:47 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: mingming cao, Andrew Morton, linux-kernel

In message <Pine.LNX.4.44.0210281311001.10156-100000@localhost.localdomain> you
 write:
> (Since you're bringing our discussion to linux-kernel,
> I've restored the full paragraph of my side of the argument above.)

Hi Hugh,

	Thanks for your complete reply.  I thought it was my fault
that it fell off lkml, and "corrected" it.  Sorry about that.

> > Two oom kills.  Three oom kills.  Four oom kills.  Where's the bound
> > here?
> 
> No bound to the number of possible OOM kills, but what problem is that?

Sorry, I'm obviously not making myself clear, since I've said this
three times now.

1) The memory is required for one whole RCU period (whether from
   kmalloc or the mempool).  This can be an almost arbitrarily long
   time (I've seen it take a good fraction of a second).

2) This is a problem, because other tasks could be OOM killed during
   that period, and could also try to use this mempool.

3) So, the size of the mempool which guarantees there will be enough?
   It's equal to the number of things you might free, which means
   you might as well allocate them together.

This is the correctness problem with the mempool IPC implementation.

> > Our allocator behavior for GFP_KERNEL has changed several times.  Are
> > you sure that it won't *ever* fail under other circomstances?
> 
> Well, I'll be surprised if we change kmalloc(GFP_KERNEL) to fail for
> reasons other than memory shortage ("insufficient privilege"? hmm,
> we'd change the names before that); though how hard it tries to decide
> if there's really a memory shortage certainly changes from one kernel
> to another.  But so long as it doesn't fail very often in normal
> circumstances, it's okay: the reserved mempool buffers back it up.

Once again, if ever returns NULL other than for tasks being OOM
killed, the problem gets wider.

It would be reasonable for the page allocator one day to say "hmm, at
the rate kswapd is going, it's going to take > 1 minute to fill this
allocation.  Let's fail it immediately instead".

This is the fragility problem with the mempool IPC implementation: you
are relying on the kmalloc implementation details which you are
explicitly not allowed to do (see previous "should kmalloc fail?"
threads).

> Your_patch shows that it need not be ugly, and reduces the normal
> waste to 16 bytes per msq, sema, shmseg.  These are not struct pages,
> that amount will often be wasted by cacheline alignment too, so I'm
> not going to get into a fight over it.

It could be reduced to 12 bytes by cutting out the "arg", and 8 bytes
by making it a single linked list.  I've posted this separately.

> I think your patch looks quite nice - apart from the subtle hacky
> fragile tasteless void *data[0]; but if something like that is
> necessary, I'm sure someone can come up with a better there.

Yes, it's tasteless, but not fragile.  Skipping it would be fragile
(it's unneccessary since struct rcu_head has a pointer in it).

I'm not sure that the alternative is nicer, either, though 8(

> But I wish you'd introduced your patch as "Here, I think this is
> a nicer way of doing it, and doesn't waste as much as you thought:
> you don't have to bother with a mempool this way" instead of getting
> so mysteriously upset about it, implying some idiot failure in the
> mempool method which you've not yet revealed to us.

You're right.  This took far more time than simply producing the patch
myself would have taken.

But despite that, I prefer to take the un-Viro-like approach of
believing that my fellow programmers cleverer than I am, and hence
require only a few subtle pointers when I manage to spot errors.

Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28 20:00   ` Dipankar Sarma
  2002-10-28 21:41     ` Rusty Russell
@ 2002-10-28 22:07     ` mingming cao
  2002-10-29  1:06       ` Rusty Russell
  1 sibling, 1 reply; 51+ messages in thread
From: mingming cao @ 2002-10-28 22:07 UTC (permalink / raw)
  To: dipankar, Rusty Russell; +Cc: Hugh Dickins, Andrew Morton, linux-kernel

Dipankar Sarma wrote:
> 
> On Mon, Oct 28, 2002 at 02:20:04AM +0100, Rusty Russell wrote:
> > Yes, nonsensical.  Firstly, it's in violation of the standard to fail
> > IPC_RMID under random circumstances.  Secondly, failing to clean up is
> > an unhandlable error, since you're quite possible in the failure path
> > of the code already.  This is a well known issue.
> 
> I am not sure how Ming/Hugh's current IPC changes affect IPC_RMID.
> It affects only when you are trying to add a new ipc. In fact,
> since it is a *add* operation (grow_ary()), it seems ok to fail it if rcu_head
> allocation fails. Feel free to correct me if I missed something here.
> AFAICS, the rcu stuff doesn't affect any freeing other than the IPC
> id array.
>

We extended the usage of RCU to IPC_RMID, to prevent ipc_lock()
returning an invalid IPC ID which has been removed by ioc_rmid.
 
> > It's a hacky, fragile and incorrect solution.  It's completely
> > tasteless.
> 
> Yes, the mempool code is broken, but only because rcu_backup_pool
> is created three times, one by each IPC mechanism init :-)
>

That's my bad, thanks for pointing this out. It's easy to fix if we
decide to go with mempool way.
 
> > Patch below is against Mingming's mm4 release.  Compiles, untested.
> > Rusty.
> 
> Yes, this is the typical RCU model, except that in this case (IPC),
> I am not quite sure if it is in effect that different from what Ming/Hugh
> have done.

Rusty's patch looks good to me. I would like to replace the mempool in
IPC with this typical RCU model. Rusty, if you like, I will make a patch
against mm6.  There need some cleanups. One thing is that ipc_alloc()
are called by other places(besides grow_ary()), and they don't need to
the RCU header structure. 

Mingming

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28 17:08     ` Davide Libenzi
@ 2002-10-28 22:39       ` Rusty Russell
  2002-10-28 23:52         ` Davide Libenzi
  0 siblings, 1 reply; 51+ messages in thread
From: Rusty Russell @ 2002-10-28 22:39 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Hugh Dickins, mingming cao, Andrew Morton,
	Linux Kernel Mailing List

In message <Pine.LNX.4.44.0210280906150.966-100000@blue1.dev.mcafeelabs.com> yo
u write:
> On Mon, 28 Oct 2002, Rusty Russell wrote:
> 
> > I think it's clearer *why* it's being done than:
> >
> > 	struct ipc_rcu_kmalloc
> > 	{
> > 		struct rcu_head rcu;
> > 	} __attribute__((aligned(__alignof__(void *))));
> 
> Well, not really Rusty. The above syntax uses documented gcc features
> already used inside the kernel, while the fact that void *data[0];
> enforces alignment it is not ( to my knowledge ) documented anywhere.
> You can also avoid the comment using the aligned syntax ...

A comment is definitely required: you must say *why* you are aligning
the structure (a clearer comment would be better, of course):

	/* We return a pointer after the structure to the unwitting
	   caller: ensure safe alignment. */

The alignment of a structure member of type X is the alignment of type
X: this seems obvious to me.  And "type X data[0];" is the standard
way of representing a variable struct.

Have we picked all the nits yet? 8)
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28 22:39       ` Rusty Russell
@ 2002-10-28 23:52         ` Davide Libenzi
  0 siblings, 0 replies; 51+ messages in thread
From: Davide Libenzi @ 2002-10-28 23:52 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Hugh Dickins, mingming cao, Andrew Morton,
	Linux Kernel Mailing List

On Tue, 29 Oct 2002, Rusty Russell wrote:

> In message <Pine.LNX.4.44.0210280906150.966-100000@blue1.dev.mcafeelabs.com> yo
> u write:
> > On Mon, 28 Oct 2002, Rusty Russell wrote:
> >
> > > I think it's clearer *why* it's being done than:
> > >
> > > 	struct ipc_rcu_kmalloc
> > > 	{
> > > 		struct rcu_head rcu;
> > > 	} __attribute__((aligned(__alignof__(void *))));
> >
> > Well, not really Rusty. The above syntax uses documented gcc features
> > already used inside the kernel, while the fact that void *data[0];
> > enforces alignment it is not ( to my knowledge ) documented anywhere.
> > You can also avoid the comment using the aligned syntax ...
>
> A comment is definitely required: you must say *why* you are aligning
> the structure (a clearer comment would be better, of course):
>
> 	/* We return a pointer after the structure to the unwitting
> 	   caller: ensure safe alignment. */
>
> The alignment of a structure member of type X is the alignment of type
> X: this seems obvious to me.  And "type X data[0];" is the standard
> way of representing a variable struct.
>
> Have we picked all the nits yet? 8)

Rusty, I give you two reason why you should be using the gcc documented
"aligned" features :)

1) using "void *data[0];" and expecting an alignement, being undocumented,
	will force not obvious constraints for the gcc folks

2) I was looking at my latest copy of the ISO standard, section :
	6.7.5.2 "Array declarators" - Point 1)
	where it is explicitly declared that the constant expression that
	declared the size of the array must be greater then zero

Looking at how gcc folks is driving towards standards I will expect them
screaming at you in a near future :)



- Davide



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28 21:47     ` Rusty Russell
@ 2002-10-29  0:26       ` Hugh Dickins
  2002-10-29  2:51         ` Rusty Russell
  0 siblings, 1 reply; 51+ messages in thread
From: Hugh Dickins @ 2002-10-29  0:26 UTC (permalink / raw)
  To: Rusty Russell; +Cc: mingming cao, Andrew Morton, linux-kernel

On Tue, 29 Oct 2002, Rusty Russell wrote:
> > 
> > No bound to the number of possible OOM kills, but what problem is that?
> 
> Sorry, I'm obviously not making myself clear, since I've said this
> three times now.
> 
> 1) The memory is required for one whole RCU period (whether from
>    kmalloc or the mempool).  This can be an almost arbitrarily long
>    time (I've seen it take a good fraction of a second).

That's a very short time compared with an OOMing thrash: no worries there.

> 2) This is a problem, because other tasks could be OOM killed during
>    that period, and could also try to use this mempool.

They'll try to use the mempool, maybe some will be allowed to wait
for their kmalloc(GFP_KERNEL) memory, and others will be PF_MEMDIEd and
proceed to take a reserved mempool buffer, and others will be PF_MEMDIEd
and have to wait for a reserved mempool buffer.  Which will be released
to them in due course.  No worries there.

> 3) So, the size of the mempool which guarantees there will be enough?
>    It's equal to the number of things you might free, which means
>    you might as well allocate them together.

No, they take their turns.  It's sure not as efficient as each getting
a kmalloc'ed buffer immediately, but its failures will be rare.  And
it doesn't matter whether the failures only occur when heading for
OOM-kill or not: we just don't want failure to be the common case.
If kmalloc evolves into something that normally fails half the time,
well, I'd think that'd be called a bug.

> This is the correctness problem with the mempool IPC implementation.

No.  There may be other situations which might need at least
NR_CPUS reserved mempool buffer to avoid deadlock, but that's not
the case here.  Looks like mempool will be superseded as you wish
in the IPC context, fine; but I do think you need to take a look
at mempool_alloc: it's a different beast from what you suppose.

Hugh

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28 22:07     ` mingming cao
@ 2002-10-29  1:06       ` Rusty Russell
  0 siblings, 0 replies; 51+ messages in thread
From: Rusty Russell @ 2002-10-29  1:06 UTC (permalink / raw)
  To: cmm; +Cc: dipankar, Hugh Dickins, Andrew Morton, linux-kernel

In message <3DBDB51B.84F97EC1@us.ibm.com> you write:
> > Yes, this is the typical RCU model, except that in this case (IPC),
> > I am not quite sure if it is in effect that different from what Ming/Hugh
> > have done.
> 
> Rusty's patch looks good to me. I would like to replace the mempool in
> IPC with this typical RCU model. Rusty, if you like, I will make a patch
> against mm6.  There need some cleanups. One thing is that ipc_alloc()
> are called by other places(besides grow_ary()), and they don't need to
> the RCU header structure. 

Yes, I noticed that, but I'm not sure it's worth separating
ipc_alloc() and ipc_rcu_alloc() for a couple of temporary allocations.

Anyway, glad you like the patch,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-29  0:26       ` Hugh Dickins
@ 2002-10-29  2:51         ` Rusty Russell
  0 siblings, 0 replies; 51+ messages in thread
From: Rusty Russell @ 2002-10-29  2:51 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: mingming cao, Andrew Morton, linux-kernel

In message <Pine.LNX.4.44.0210282357450.1315-100000@localhost.localdomain> you 
write:
> > 2) This is a problem, because other tasks could be OOM killed during
> >    that period, and could also try to use this mempool.
> 
> They'll try to use the mempool, maybe some will be allowed to wait
> for their kmalloc(GFP_KERNEL) memory, and others will be PF_MEMDIEd and
> proceed to take a reserved mempool buffer, and others will be PF_MEMDIEd
> and have to wait for a reserved mempool buffer.  Which will be released
> to them in due course.  No worries there.

Oh.

You are (of course) correct.  Thankyou for your patience.  Your
solution is elegant and correct.

Feeling dimwitted,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH]updated ipc lock patch
  2002-10-28 21:41     ` Rusty Russell
@ 2002-10-29  6:11       ` Dipankar Sarma
  0 siblings, 0 replies; 51+ messages in thread
From: Dipankar Sarma @ 2002-10-29  6:11 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Hugh Dickins, Mingming Cao, Andrew Morton, linux-kernel

On Tue, Oct 29, 2002 at 08:41:19AM +1100, Rusty Russell wrote:
> 	If all current uses are embedded, can we remove the "void
> *arg" and reduce the size of struct rcu_head by 25%?  Users can always
> embed it in their own struct which has a "void *arg", but if that's
> the uncommon case, it'd be nice to slim it a little.

All current cases are not embedded, synchronize_kernel() needs
"arg" :-)

> 
> 	It'd also be nice to change the double linked list to a single
> too: as far as I can tell the only issue is the list_add_tail in
> call_rcu(): how important is this ordering?  It can be done by keeping
> a head as well as a tail pointer if required.

I can't see how the ordering of the RCU updates matter, so we can 
trivially change things internally without affecting the interface. 

That said, I disagree about the bloat issue, I don't see a problem there, 
atleast not yet. All the uses that I have seen so far, the additional "prev"
pointer is a very small fraction of the total memory allocated for
the objects. And it is certainly not an issue with IPC - just look
at the values for SHMMNI, SEMMNI etc.

> We must be looking at different variants of the patch.  This one does:
> IPC_RMID -> freeary() -> ipc_rcu_free -> kmalloc.
> 

Grr... I thought Mingming's patch modified only IPC common code
and was looking at the mm6 tree directly. My earlier suggestion
is valid only if RCU head allocation is limited to grow_ary(). Otherwise,
rcu_head should be embedded.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Lse-tech] Re: [PATCH]updated ipc lock patch [PERFORMANCE RESULTS]
  2002-10-24 22:29           ` Andrew Morton
                               ` (2 preceding siblings ...)
  2002-10-25  0:38             ` Cliff White
@ 2002-10-31 17:52             ` Bill Hartner
  3 siblings, 0 replies; 51+ messages in thread
From: Bill Hartner @ 2002-10-31 17:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: cmm, Hugh Dickins, manfred, linux-kernel, dipankar, lse-tech


Andrew Morton wrote:
> 
> mingming cao wrote:
> >
> > Hi Andrew,
> >
> > Here is the updated ipc lock patch:
> 
> 
> So to be able to commend this change to Linus I'd have to rely on
> assurances from people who _do_ understand IPC (Hugh?) and on lots
> of testing.
> 
> So yes, I'll include it, and would solicit success reports from
> people who are actually exercising that code path, thanks.
> 

Andrew,

I tested Mingming's RCU ipc lock patch using a *new* microbenchmark - semopbench.
semopbench was written to test the performance of Mingming's patch.
I also ran a 3 hour stress and it completed successfully.

Explanation of the microbenchmark is below the results.
Here is a link to the microbenchmark source.

http://www-124.ibm.com/developerworks/opensource/linuxperf/semopbench/semopbench.c

SUT : 8-way 700 Mhz PIII

I tested 2.5.44-mm2 and 2.5.44-mm2 + RCU ipc patch

>semopbench -g 64 -s 16 -n 16384 -r > sem.results.out
>readprofile -m /boot/System.map | sort -n +0 -r > sem.profile.out

The metric is seconds / per repetition.  Lower is better.
                    
kernel              run 1     run 2
                    seconds   seconds
==================  =======   =======
2.5.44-mm2          515.1       515.4
2.5.44-mm2+rcu-ipc   46.7        46.7

With Mingming's patch, the test completes 10X faster.

-----

2.4.44-mm2 readprofile shows 70 % of 8 CPUs spinning on .text.lock.sem :

http://www-124.ibm.com/developerworks/opensource/linuxperf/semopbench/sem.profile.1.out

2.5.44-mm2 + Mingming's patch shows that the spin on .text.lock.sem is gone :

http://www-124.ibm.com/developerworks/opensource/linuxperf/semopbench/sem.rcu.profile.1.out

Here is the semopbench results for 2.5.44-mm2 :

http://www-124.ibm.com/developerworks/opensource/linuxperf/semopbench/sem.results.1.out

Here is the semopbench results for 2.5.44-mm2 + Mingming's patch :

http://www-124.ibm.com/developerworks/opensource/linuxperf/semopbench/sem.rcu.results.1.out

-----

Here is some info on how the microbenchmark works :

>semopbench -g 64 -s 16 -n 16384 -r

-g 64 creates 64 sema4 groups

group0
group1
...
group63

-s 16 creates 16 sema4s in each group

group0  - sem0, sem1, ... sem15
group1  - sem0, sem1, ... sem15
...
group63 - sem0, sem1, ... sem15

For each of the 1024 (64*16) sema4s, a process is forked and sleeps on
it's own sema4.  When the test starts, the master process will post the
sema4 for the 1st process in each group.

When the 1st process in each group wakes up it will :

	(a) resets it's own sema4
	(b) post the sema4 for the next process in the group
	(c) waits on his own sema4

-n 16384 runs through each sema4 group in the above manner 16384 times.

semopbench reports :

(1) average microseconds that it takes each process to complete repetitions.
(2) CPU utilization

-d turns on debug printfs
-v turns on per process times.
-r does a readprofile -r , reset of the profile buffer before test starts

Bill Hartner
-- 
IBM Linux Technology Center Performance Team
http://www-124.ibm.com/developerworks/oss/linux
hartner@austin.ibm.com

^ permalink raw reply	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2002-10-31 17:49 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-10-18  0:14 [PATCH]IPC locks breaking down with RCU mingming cao
2002-10-20 13:14 ` Hugh Dickins
2002-10-20 17:27   ` Hugh Dickins
2002-10-21 18:11     ` mingming cao
2002-10-21 19:00       ` Hugh Dickins
2002-10-24 21:49         ` [PATCH]updated ipc lock patch mingming cao
2002-10-24 22:29           ` Andrew Morton
2002-10-24 22:56             ` Hugh Dickins
2002-10-24 23:30               ` Andrew Morton
2002-10-24 23:59                 ` Hugh Dickins
2002-10-25  0:35                   ` [Lse-tech] " Rick Lindsley
2002-10-25  1:07                     ` Robert Love
2002-10-25  0:07                 ` mingming cao
2002-10-25  0:24                   ` Andrew Morton
2002-10-25  4:18                 ` Rusty Russell
2002-10-25  5:53                   ` mingming cao
2002-10-25  7:27                     ` Rusty Russell
2002-10-25  5:36                 ` Manfred Spraul
2002-10-25 16:53                 ` Rik van Riel
2002-10-24 23:23             ` mingming cao
2002-10-25 14:21               ` [Lse-tech] " Paul Larson
2002-10-25 17:17                 ` mingming cao
2002-10-25 18:20                   ` Paul Larson
2002-10-25 18:51                     ` mingming cao
2002-10-25 19:06                       ` Paul Larson
2002-10-25 20:14                         ` mingming cao
2002-10-25 20:23                       ` Manfred Spraul
2002-10-25  0:38             ` Cliff White
2002-10-31 17:52             ` [Lse-tech] Re: [PATCH]updated ipc lock patch [PERFORMANCE RESULTS] Bill Hartner
2002-10-21 19:18       ` [PATCH]IPC locks breaking down with RCU Dipankar Sarma
2002-10-21 19:36         ` Hugh Dickins
2002-10-21 19:41         ` mingming cao
2002-10-21 20:14           ` Dipankar Sarma
2002-10-21 18:07   ` mingming cao
  -- strict thread matches above, loose matches on Subject: below --
2002-10-25 17:20 [PATCH]updated ipc lock patch Cliff White
     [not found] <Pine.LNX.4.44.0210270748560.1704-100000@localhost.localdomain>
2002-10-28  1:06 ` Rusty Russell
2002-10-28 14:21   ` Hugh Dickins
2002-10-28 21:47     ` Rusty Russell
2002-10-29  0:26       ` Hugh Dickins
2002-10-29  2:51         ` Rusty Russell
2002-10-28 20:00   ` Dipankar Sarma
2002-10-28 21:41     ` Rusty Russell
2002-10-29  6:11       ` Dipankar Sarma
2002-10-28 22:07     ` mingming cao
2002-10-29  1:06       ` Rusty Russell
2002-10-28  1:15 Rusty Russell
2002-10-28  1:35 ` Davide Libenzi
2002-10-28  4:10   ` Rusty Russell
2002-10-28 17:08     ` Davide Libenzi
2002-10-28 22:39       ` Rusty Russell
2002-10-28 23:52         ` Davide Libenzi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).