linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: [PATCH]updated ipc lock patch
@ 2002-10-28  1:15 Rusty Russell
  2002-10-28  1:35 ` Davide Libenzi
  0 siblings, 1 reply; 31+ messages in thread
From: Rusty Russell @ 2002-10-28  1:15 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: mingming cao, Andrew Morton, linux-kernel

> diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.44-mm4-ipc-rcu/include/linux/msg.h working-2.5.44-mm4-ipc-rcu-fix/include/linux/msg.h
> --- working-2.5.44-mm4-ipc-rcu/include/linux/msg.h	2002-07-21 17:43:10.000000000 +1000
> +++ working-2.5.44-mm4-ipc-rcu-fix/include/linux/msg.h	2002-10-28 11:12:54.000000000 +1100

Oops.  That patch had some fluff in msg.h and sem.h.  Delete those, or
just use this instead (still against Mingming's mm4 "ignore kmalloc
failure" patch):

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.44-mm4-ipc-rcu/ipc/util.c working-2.5.44-mm4-ipc-rcu-fix/ipc/util.c
--- working-2.5.44-mm4-ipc-rcu/ipc/util.c	2002-10-28 11:08:35.000000000 +1100
+++ working-2.5.44-mm4-ipc-rcu-fix/ipc/util.c	2002-10-28 12:01:09.000000000 +1100
@@ -213,21 +213,49 @@ struct kern_ipc_perm* ipc_rmid(struct ip
 	return p;
 }
 
+struct ipc_rcu_kmalloc
+{
+	struct rcu_head rcu;
+	/* "void *" makes sure alignment of following data is sane. */
+	void *data[0];
+};
+
+struct ipc_rcu_vmalloc
+{
+	struct rcu_head rcu;
+	struct work_struct work;
+	/* "void *" makes sure alignment of following data is sane. */
+	void *data[0];
+};
+
+static inline int use_vmalloc(int size)
+{
+	/* Too big for a single page? */
+	if (sizeof(struct ipc_rcu_kmalloc) + size > PAGE_SIZE)
+		return 1;
+	return 0;
+}
+
 /**
  *	ipc_alloc	-	allocate ipc space
  *	@size: size desired
  *
  *	Allocate memory from the appropriate pools and return a pointer to it.
- *	NULL is returned if the allocation fails
+ *	NULL is returned if the allocation fails.  This can be freed with
+ *	ipc_free (to free immediately) or ipc_rcu_free (to free once safe).
  */
- 
 void* ipc_alloc(int size)
 {
 	void* out;
-	if(size > PAGE_SIZE)
-		out = vmalloc(size);
-	else
-		out = kmalloc(size, GFP_KERNEL);
+	/* We prepend the allocation with the rcu struct, and
+           workqueue if necessary (for vmalloc). */
+	if (use_vmalloc(size)) {
+		out = vmalloc(sizeof(struct ipc_rcu_vmalloc) + size);
+		if (out) out += sizeof(struct ipc_rcu_vmalloc);
+	} else {
+		out = kmalloc(sizeof(struct ipc_rcu_kmalloc)+size, GFP_KERNEL);
+		if (out) out += sizeof(struct ipc_rcu_kmalloc);
+	}
 	return out;
 }
 
@@ -242,48 +270,36 @@ void* ipc_alloc(int size)
  
 void ipc_free(void* ptr, int size)
 {
-	if(size > PAGE_SIZE)
-		vfree(ptr);
+	if (use_vmalloc(size))
+		vfree(ptr - sizeof(struct ipc_rcu_vmalloc));
 	else
-		kfree(ptr);
+		kfree(ptr - sizeof(struct ipc_rcu_kmalloc));
 }
 
 /* 
  * Since RCU callback function is called in bh,
  * we need to defer the vfree to schedule_work
  */
-static void ipc_free_scheduled(void* arg)
+static void ipc_schedule_free(void *arg)
 {
-	struct rcu_ipc_free *a = (struct rcu_ipc_free *)arg;
-	vfree(a->ptr);
-	kfree(a);
-}
+	struct ipc_rcu_vmalloc *free = arg;
 
-static void ipc_free_callback(void* arg)
-{
-	struct rcu_ipc_free *a = (struct rcu_ipc_free *)arg;
-	/* 
-	 * if data is vmalloced, then we need to delay the free
-	 */
-	if (a->size > PAGE_SIZE) {
-		INIT_WORK(&a->work, ipc_free_scheduled, arg);
-		schedule_work(&a->work);
-	} else {
-		kfree(a->ptr);
-		kfree(a);
-	}
+	INIT_WORK(&free->work, vfree, free);
+	schedule_work(&free->work);
 }
 
 void ipc_rcu_free(void* ptr, int size)
 {
-	struct rcu_ipc_free* arg;
-
-	arg = (struct rcu_ipc_free *) kmalloc(sizeof(*arg), GFP_KERNEL);
-	if (arg == NULL)
-		return;
-	arg->ptr = ptr;
-	arg->size = size;
-	call_rcu(&arg->rcu_head, ipc_free_callback, arg);
+	if (use_vmalloc(size)) {
+		struct ipc_rcu_vmalloc *free;
+		free = ptr - sizeof(*free);
+		call_rcu(&free->rcu, ipc_schedule_free, free);
+	} else {
+		struct ipc_rcu_kmalloc *free;
+		free = ptr - sizeof(*free);
+		/* kfree takes a "const void *" so gcc warns.  So we cast. */
+		call_rcu(&free->rcu, (void (*)(void *))kfree, free);
+	}
 }
 
 /**

--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 31+ messages in thread
* Re: [PATCH]updated ipc lock patch
@ 2002-10-25 17:20 Cliff White
  0 siblings, 0 replies; 31+ messages in thread
From: Cliff White @ 2002-10-25 17:20 UTC (permalink / raw)
  To: linux-kernel, lse-tech


> mingming cao wrote:
> > 
> > Hi Andrew,
> > 
> > Here is the updated ipc lock patch:
> 
> Well I can get you a bit of testing and attention, but I'm afraid
> my knowledge of the IPC code is negligible.
> 
> So to be able to commend this change to Linus I'd have to rely on
> assurances from people who _do_ understand IPC (Hugh?) and on lots
> of testing.
> 
> So yes, I'll include it, and would solicit success reports from
> people who are actually exercising that code path, thanks.
> 
> > http://www.osdl.org/projects/dbt1prfrns/results/mingming/index.html
> 
> DBT1 is really interesting, and I'm glad the OSDL team have
> put it together.  If people would only stop sending me patches
> I'd be using it ;)
> 
Thank you very much for that :)

> Could someone please help explain the results?  Comparing, say,
> http://www.osdl.org/projects/dbt1prfrns/results/mingming/run.2cpu.42-mm2.r5/index.html
> and
> http://www.osdl.org/projects/dbt1prfrns/results/mingming/run.18.r5/index.html
> 
> It would appear that 2.5 completely smoked 2.4 on response time,
> yet the overall bogotransactions/sec is significantly lower.
> What should we conclude from this?

Whoa - we ran these 5 times for an average. The 2.5 run you picked was the 
'off' run -
It has the worse results. You will notice on this run, there are a large 
number of errors
which didn't happen on the other runs - this lowered the BT/sec number. Use 
one of the
other 2.5 ones and you'll see something more sensible. ( say, 42-mm2.r3) 
Unfortunately,
on average, 2.4 still beats 2.5 on both response time and BT's

 		         2.5.42-mm2     2.5.42-mm2-ipclock  2.4.18
 Average over 5 runs     85.0 BT           89.8 BT          96.92 BT
 Std Deviation 5 runs    7.4  BT           1.0 BT           2.07 BT
 Average of best 4 runs  88.15 BT          90.2 BT          97.2 BT
 Std Deviation 4 run     2.8 BT            0.5 BT            2.3 BT 
> 
One other place to start comparing is in the system information which is at 
the bottom of the page.
Some points (might be minor) : 
Cpu statistics:
	2.4.18 - cpu %idle averages around 1.5% %system swings between 3-7% %nice 
steady at ~3.6%
	2.5.42-mm2 cpu %idle 0.0 all thru run, %system steady at ~6% % nice up ~5.5
Swap (sar -r) 
	Very slight differences - we consumed ~98% of the memory in both cases, 2.4 
swapped a little
		bit (%28) more than 2.5 (%26) 
We also include profile data for both the load and run phase. (profile=2)

> Also I see:
> 
> 	14.7 minute duration
> and
> 	Time for DBT run 19:36
> 
> What is the 14.7 minutes referring to?
> 
The 14.7 minute time comes from the workload driver log, which are parsed to 
get the
response numbers. The 'Time for' stamps come from the master driver script, 
and include some
of the workload startup and shutdown time. The workload driver waits a bit to 
be sure things are
stable, before the official run data is collected.  The script timestamp waits 
until the run clients are
dead. So there's always a bit of a delta between the two. 

> Also:
> 
> 	2.5: Time for key creation 1:27
> 	2.4: Time for key creation 14:24
> versus:
> 	2.5: Time for table creation 16:48
> 	2.4: Time for table creation 8:58
>  
	
	
This is a Mystery Question - we don't have an answer, we were hoping _you 
would see something :)
Table creation involves sequential inserts of data from a flat file to an 
SAPDB B-tree on a devspace.
Our devspace is a raw device, so we're doing raw io, plus some processing. 
This op is write-intensive
'Key creation' is establishing a foreign key column contraint on various 
tables.  For each table, it examines every row in the table,
looks up (does a B-tree index lookup) the column value in a second table to 
find a specific primary key that matches the
column value in the first table. So again, some I/O, a bit of processing. Key 
creation (foreign key) is read-intensive.
Also interesting is the delta in index creation:
	2.5 Time for index creation 27:58
	2.4 Time for index creation 17:21
Index creation requires a read of the table, a sort, then creation of a B-tree 
index.  Both the index and
table creates build a B-tree for SAP-DB ( both run slower on 2.5 ) - the table 
creation does no sorting.
We also notice that the times for both index and key creation varies a bit 
more across runs with the -mm2 kernel,
as shown by the standard deviation across the runs. 
mingming and 2.4.18 are a bit more consistent. ( we threw out -mm2 run 5 for 
this average, due to the errors)

Results are: average time[std dev] 
Action           2.4.18        2.5.42-mm2     2.5.42-mm2-ipclock
table create 	 8:55 [0:04]   19:03 [2:40]    19.39 [0:50]
index create     17:17 [0:11]  25:19 [5:31]    28:05 [0:02]
key create       14:23 [0:16]  15:21 [6:37]    18:46 [0:17]

Also interesting is -mm2 run2 - foreign key creation took 5:26, the run 
completed with no errors...why so fast, only one time?
 It is an ongoing mystery. We Just Don't Know Why Right Now.
We are working on better data capture of db/run errors, and we'd love to hear 
suggestions
on improving the instrumentation. 


> So it's all rather confusing.  Masses of numbers usually _are_
> confusing.  What really adds tons of value to such an exercise is
> for the person who ran the test to write up some conclusions. 

Yes, agreed. We don't yet know enough to map from test results to an exact 
kernel area.
We just added a database expert to staff (Mary Edie Meredith) so we intend to 
get better.
We'll probably be nagging you a bit, and again we very much appreciate all 
suggestions.

 To
> tell the developers what went well, what went poorly, what areas
> to focus on, etc.  To use your own judgement to tell us what to
> zoom in on.
> 
> Is that something which could be added?
> 
It is something we are working on adding.  
cliffw

> 
> -------------------------------------------------------
> This sf.net email is sponsored by: Influence the future 
> of Java(TM) technology. Join the Java Community 
> Process(SM) (JCP(SM)) program now. 
> http://ads.sourceforge.net/cgi-bin/redirect.pl?sunm0003en
> _______________________________________________
> Lse-tech mailing list
> Lse-tech@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/lse-tech
> 



------- End of Forwarded Message




^ permalink raw reply	[flat|nested] 31+ messages in thread
* Re: [PATCH]IPC locks breaking down with RCU
@ 2002-10-21 19:00 Hugh Dickins
  2002-10-24 21:49 ` [PATCH]updated ipc lock patch mingming cao
  0 siblings, 1 reply; 31+ messages in thread
From: Hugh Dickins @ 2002-10-21 19:00 UTC (permalink / raw)
  To: mingming cao; +Cc: Andrew Morton, linux-kernel, dipankar

On Mon, 21 Oct 2002, mingming cao wrote:
> Hugh Dickins wrote:
> > A much more serious point: we could certainly bring the ipc_rmid
> > and ipc_unlock much closer together; but however close we bring them
> > (unlock implicit within rmid), there will still be a race with one
> > cpu in ipc_lock spinning on out->lock, while we in ipc_rmid null
> > entries[lid].p and unlock and free the structure containing that lock.
> 
> A simple solution I could think of for this problem, moving the per_id
> lock out of the kern_ipc_perm structure, and put it in the ipc_id
> structure. Actually I did this way at the first time,  then I agreed
> with you that moving the per_id lock into there kern_ipc_perm structure
> will help reduce cacheline bouncing.  
> 
> I think that having the per_id lock stay out of the structure it
> protects will easy the job of ipc_rmid(), also will avoid the wrong
> preempt count problem caused by the additional check "if (out)" in
> ipc_unlock() as you mentioned above.
> 
> Is this solution looks good to you? If so, I will update the patch for
> 2.5.44 soon.

Sorry, no, doesn't look good to me.  I do agree that it's an easy
solution to this problem, but there's no point in having gone the RCU
route to avoid cacheline bouncing, if you now reintroduce it all here.

I believe, as I said, that you'll have to go further, applying RCU
to freeing the entries themselves.  I had toyed with the idea of never
freeing entries once allocated, which is a similarly simple solution;
rejected that as in general too wasteful; and concluded that RCU is a
reasonable compromise - it amounts to lazy freeing, I think.

I'm happy to be overruled by someone who understands these cacheline
bounce issues better than we do, but nobody else has spoken up yet.

Hugh


^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2002-10-29  5:59 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <Pine.LNX.4.44.0210270748560.1704-100000@localhost.localdomain>
2002-10-28  1:06 ` [PATCH]updated ipc lock patch Rusty Russell
2002-10-28 14:21   ` Hugh Dickins
2002-10-28 21:47     ` Rusty Russell
2002-10-29  0:03       ` [RFC][PATCH]ipc rcu alloc/free patch - mm6 mingming cao
2002-10-29  0:26       ` [PATCH]updated ipc lock patch Hugh Dickins
2002-10-29  2:51         ` Rusty Russell
2002-10-28 20:00   ` Dipankar Sarma
2002-10-28 21:41     ` Rusty Russell
2002-10-29  6:11       ` Dipankar Sarma
2002-10-28 22:07     ` mingming cao
2002-10-29  1:06       ` Rusty Russell
2002-10-28  1:15 Rusty Russell
2002-10-28  1:35 ` Davide Libenzi
2002-10-28  4:10   ` Rusty Russell
2002-10-28 17:08     ` Davide Libenzi
2002-10-28 22:39       ` Rusty Russell
2002-10-28 23:52         ` Davide Libenzi
  -- strict thread matches above, loose matches on Subject: below --
2002-10-25 17:20 Cliff White
2002-10-21 19:00 [PATCH]IPC locks breaking down with RCU Hugh Dickins
2002-10-24 21:49 ` [PATCH]updated ipc lock patch mingming cao
2002-10-24 22:29   ` Andrew Morton
2002-10-24 22:56     ` Hugh Dickins
2002-10-24 23:30       ` Andrew Morton
2002-10-24 23:59         ` Hugh Dickins
2002-10-25  0:07         ` mingming cao
2002-10-25  0:24           ` Andrew Morton
2002-10-25  4:18         ` Rusty Russell
2002-10-25  5:53           ` mingming cao
2002-10-25  7:27             ` Rusty Russell
2002-10-25  5:36         ` Manfred Spraul
2002-10-25 16:53         ` Rik van Riel
2002-10-24 23:23     ` mingming cao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).