linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] mm: Implement swap prefetching tweaks
@ 2006-03-10  9:54 Con Kolivas
  2006-03-10 22:35 ` Andrew Morton
  0 siblings, 1 reply; 32+ messages in thread
From: Con Kolivas @ 2006-03-10  9:54 UTC (permalink / raw)
  To: linux kernel mailing list; +Cc: ck list, Andrew Morton

The current swap prefetching implementation is far too aggressive to the point
of its cpu and disk access being noticed. This patch addresses that issue.

Andrew please apply this one and keep ignoring the yield patch the way you
rightly already were.

Cheers,
Con
---
Swap prefetch tweaks.

Add watermarks to swap prefetching, and prefetch when free memory is greater
than pages_high * 4 down to pages_high * 3.

Check cpu load and only prefetch when kprefetchd is the only process running.
Testing cpu load of just the cpu that kprefetchd is currently running on is
not enough to ensure that kprefetchd working does not consume resources in a
noticeable way on SMP.

Clear the busy bit only if it is set.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

---
 mm/swap_prefetch.c |  154 ++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 118 insertions(+), 36 deletions(-)

Index: linux-2.6.16-rc5-mm3/mm/swap_prefetch.c
===================================================================
--- linux-2.6.16-rc5-mm3.orig/mm/swap_prefetch.c	2006-03-10 15:29:11.000000000 +1100
+++ linux-2.6.16-rc5-mm3/mm/swap_prefetch.c	2006-03-10 20:36:56.000000000 +1100
@@ -150,21 +150,31 @@ enum trickle_return {
 	TRICKLE_DELAY,
 };
 
+struct node_stats {
+	unsigned long	last_free;
+	/* Free ram after a cycle of prefetching */
+	unsigned long	current_free;
+	/* Free ram on this cycle of checking prefetch_suitable */
+	unsigned long	prefetch_watermark;
+	/* Maximum amount we will prefetch to */
+	unsigned long	highfree[MAX_NR_ZONES];
+	/* The amount of free ram before we start prefetching */
+	unsigned long	lowfree[MAX_NR_ZONES];
+	/* The amount of free ram where we will stop prefetching */
+	unsigned long	*pointfree[MAX_NR_ZONES];
+	/* highfree or lowfree depending on whether we've hit a watermark */
+};
+
 /*
  * prefetch_stats stores the free ram data of each node and this is used to
  * determine if a node is suitable for prefetching into.
  */
-struct prefetch_stats{
-	unsigned long	last_free[MAX_NUMNODES];
-	/* Free ram after a cycle of prefetching */
-	unsigned long	current_free[MAX_NUMNODES];
-	/* Free ram on this cycle of checking prefetch_suitable */
-	unsigned long	prefetch_watermark[MAX_NUMNODES];
-	/* Maximum amount we will prefetch to */
+struct prefetch_stats {
 	nodemask_t	prefetch_nodes;
 	/* Which nodes are currently suited to prefetching */
 	unsigned long	prefetched_pages;
 	/* Total pages we've prefetched on this wakeup of kprefetchd */
+	struct node_stats node[MAX_NUMNODES];
 };
 
 static struct prefetch_stats sp_stat;
@@ -211,7 +221,7 @@ static enum trickle_return trickle_swap_
 	}
 
 	sp_stat.prefetched_pages++;
-	sp_stat.last_free[node]--;
+	sp_stat.node[node].last_free--;
 
 	ret = TRICKLE_SUCCESS;
 out_release:
@@ -229,8 +239,11 @@ static void clear_last_prefetch_free(voi
 	 * update the data to take into account memory hotplug if desired..
 	 */
 	sp_stat.prefetch_nodes = node_online_map;
-	for_each_node_mask(node, sp_stat.prefetch_nodes)
-		sp_stat.last_free[node] = 0;
+	for_each_node_mask(node, sp_stat.prefetch_nodes) {
+		struct node_stats *ns = &sp_stat.node[node];
+
+		ns->last_free = 0;
+	}
 }
 
 static void clear_current_prefetch_free(void)
@@ -238,8 +251,43 @@ static void clear_current_prefetch_free(
 	int node;
 
 	sp_stat.prefetch_nodes = node_online_map;
-	for_each_node_mask(node, sp_stat.prefetch_nodes)
-		sp_stat.current_free[node] = 0;
+	for_each_node_mask(node, sp_stat.prefetch_nodes) {
+		struct node_stats *ns = &sp_stat.node[node];
+
+		ns->current_free = 0;
+	}
+}
+
+/*
+ * This updates the high and low watermarks of amount of free ram in each
+ * node used to start and stop prefetching. We prefetch from pages_high * 4
+ * down to pages_high * 3.
+ */
+static void examine_free_limits(void)
+{
+	struct zone *z;
+
+	for_each_zone(z) {
+		struct node_stats *ns;
+		int idx;
+
+		if (!populated_zone(z))
+			continue;
+
+		ns = &sp_stat.node[z->zone_pgdat->node_id];
+		idx = zone_idx(z);
+		ns->lowfree[idx] = z->pages_high * 3 + z->lowmem_reserve[idx];
+		ns->highfree[idx] = ns->lowfree[idx] + z->pages_high;
+
+		if (z->free_pages > ns->highfree[idx]) {
+			/*
+			 * We've gotten above the high watermark of free pages
+			 * so we can start prefetching till we get to the low
+			 * watermark.
+			 */
+			ns->pointfree[idx] = &ns->lowfree[idx];
+		}
+	}
 }
 
 /*
@@ -247,14 +295,34 @@ static void clear_current_prefetch_free(
  */
 static int prefetch_suitable(void)
 {
-	struct page_state ps;
 	unsigned long limit;
 	struct zone *z;
-	int node, ret = 0;
+	int node, ret = 0, test_pagestate = 0;
 
-	/* Purposefully racy and might return false positive which is ok */
-	if (__test_and_clear_bit(0, &swapped.busy))
+	/* Purposefully racy */
+	if (test_bit(0, &swapped.busy)) {
+		__clear_bit(0, &swapped.busy);
 		goto out;
+	}
+
+	/*
+	 * get_page_state is super expensive so we only perform it every
+	 * SWAP_CLUSTER_MAX prefetched_pages. We also test if we're the only
+	 * task running anywhere. We want to have as little impact on all
+	 * resources (cpu, disk, bus etc). As this iterates over every cpu
+	 * we measure this infrequently.
+	 */
+	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
+		unsigned long cpuload = nr_running();
+
+		if (cpuload > 1)
+			goto out;
+		cpuload += nr_uninterruptible();
+		if (cpuload > 1)
+			goto out;
+
+		test_pagestate = 1;
+	}
 
 	clear_current_prefetch_free();
 
@@ -263,18 +331,29 @@ static int prefetch_suitable(void)
 	 * will occur to prevent ping-ponging between them.
 	 */
 	for_each_zone(z) {
+		struct node_stats *ns;
 		unsigned long free;
+		int idx;
 
 		if (!populated_zone(z))
 			continue;
+
 		node = z->zone_pgdat->node_id;
+		ns = &sp_stat.node[node];
+		idx = zone_idx(z);
 
 		free = z->free_pages;
-		if (z->pages_high * 3 + z->lowmem_reserve[zone_idx(z)] > free) {
+		if (free < *ns->pointfree[idx]) {
+			/*
+			 * Free pages have dropped below the low watermark so
+			 * we won't start prefetching again till we hit the
+			 * high watermark of free pages.
+			 */
+			ns->pointfree[idx] = &ns->highfree[idx];
 			node_clear(node, sp_stat.prefetch_nodes);
 			continue;
 		}
-		sp_stat.current_free[node] += free;
+		ns->current_free += free;
 	}
 
 	/*
@@ -282,28 +361,26 @@ static int prefetch_suitable(void)
 	 * prefetching and clear the nodemask if it is not.
 	 */
 	for_each_node_mask(node, sp_stat.prefetch_nodes) {
+		struct node_stats *ns = &sp_stat.node[node];
+		struct page_state ps;
+
 		/*
 		 * We check to see that pages are not being allocated
 		 * elsewhere at any significant rate implying any
 		 * degree of memory pressure (eg during file reads)
 		 */
-		if (sp_stat.last_free[node]) {
-			if (sp_stat.current_free[node] + SWAP_CLUSTER_MAX <
-				sp_stat.last_free[node]) {
-					sp_stat.last_free[node] =
-						sp_stat.current_free[node];
+		if (ns->last_free) {
+			if (ns->current_free + SWAP_CLUSTER_MAX <
+				ns->last_free) {
+					ns->last_free = ns->current_free;
 					node_clear(node,
 						sp_stat.prefetch_nodes);
 					continue;
 			}
 		} else
-			sp_stat.last_free[node] = sp_stat.current_free[node];
+			ns->last_free = ns->current_free;
 
-		/*
-		 * get_page_state is super expensive so we only perform it
-		 * every SWAP_CLUSTER_MAX prefetched_pages
-		 */
-		if (sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)
+		if (!test_pagestate)
 			continue;
 
 		get_page_state_node(&ps, node);
@@ -324,7 +401,7 @@ static int prefetch_suitable(void)
 		 */
 		limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty +
 			ps.nr_unstable + total_swapcache_pages;
-		if (limit > sp_stat.prefetch_watermark[node]) {
+		if (limit > ns->prefetch_watermark) {
 			node_clear(node, sp_stat.prefetch_nodes);
 			continue;
 		}
@@ -370,6 +447,7 @@ static enum trickle_return trickle_swap(
 	if (!swap_prefetch || laptop_mode)
 		return ret;
 
+	examine_free_limits();
 	entry = NULL;
 
 	for ( ; ; ) {
@@ -459,8 +537,7 @@ static int kprefetchd(void *__unused)
  */
 void __init prepare_swap_prefetch(void)
 {
-	pg_data_t *pgdat;
-	int node;
+	struct zone *zone;
 
 	swapped.cache = kmem_cache_create("swapped_entry",
 		sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
@@ -471,14 +548,19 @@ void __init prepare_swap_prefetch(void)
 	 */
 	swapped.maxcount = nr_free_pagecache_pages() / 3 * 2;
 
-	for_each_online_pgdat(pgdat) {
+	for_each_zone(zone) {
 		unsigned long present;
+		struct node_stats *ns;
+		int idx;
 
-		present = pgdat->node_present_pages;
+		present = zone->present_pages;
 		if (!present)
 			continue;
-		node = pgdat->node_id;
-		sp_stat.prefetch_watermark[node] += present / 3 * 2;
+
+		ns = &sp_stat.node[zone->zone_pgdat->node_id];
+		ns->prefetch_watermark += present / 3 * 2;
+		idx = zone_idx(zone);
+		ns->pointfree[idx] = &ns->highfree[idx];
 	}
 }
 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-10  9:54 [PATCH] mm: Implement swap prefetching tweaks Con Kolivas
@ 2006-03-10 22:35 ` Andrew Morton
  2006-03-10 23:11   ` Peter Williams
  2006-03-11  3:50   ` Con Kolivas
  0 siblings, 2 replies; 32+ messages in thread
From: Andrew Morton @ 2006-03-10 22:35 UTC (permalink / raw)
  To: Con Kolivas; +Cc: linux-kernel, ck

Con Kolivas <kernel@kolivas.org> wrote:
>
> +	/*
> +	 * get_page_state is super expensive so we only perform it every
> +	 * SWAP_CLUSTER_MAX prefetched_pages.

nr_running() is similarly expensive btw.

> 	 * We also test if we're the only
> +	 * task running anywhere. We want to have as little impact on all
> +	 * resources (cpu, disk, bus etc). As this iterates over every cpu
> +	 * we measure this infrequently.
> +	 */
> +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> +		unsigned long cpuload = nr_running();
> +
> +		if (cpuload > 1)
> +			goto out;

Sorry, this is just wrong.  If swap prefetch is useful then it's also
useful if some task happens to be sitting over in the corner calculating
pi.

What's the actual problem here?  Someone's 3d game went blippy?  Why?  How
much?  Are we missing a cond_resched()?

> +		cpuload += nr_uninterruptible();
> +		if (cpuload > 1)
> +			goto out;

Not sure about this either.  


> +		if (ns->last_free) {
> +			if (ns->current_free + SWAP_CLUSTER_MAX <
> +				ns->last_free) {
> +					ns->last_free = ns->current_free;
>  					node_clear(node,
>  						sp_stat.prefetch_nodes);
>  					continue;
>  			}
>  		} else

That has an extra tabstop.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-10 22:35 ` Andrew Morton
@ 2006-03-10 23:11   ` Peter Williams
  2006-03-11  4:18     ` Con Kolivas
  2006-03-11  3:50   ` Con Kolivas
  1 sibling, 1 reply; 32+ messages in thread
From: Peter Williams @ 2006-03-10 23:11 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Con Kolivas, linux-kernel, ck

Andrew Morton wrote:
> Con Kolivas <kernel@kolivas.org> wrote:
> 
>>+	/*
>>+	 * get_page_state is super expensive so we only perform it every
>>+	 * SWAP_CLUSTER_MAX prefetched_pages.
> 
> 
> nr_running() is similarly expensive btw.
> 
> 
>>	 * We also test if we're the only
>>+	 * task running anywhere. We want to have as little impact on all
>>+	 * resources (cpu, disk, bus etc). As this iterates over every cpu
>>+	 * we measure this infrequently.
>>+	 */
>>+	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
>>+		unsigned long cpuload = nr_running();
>>+
>>+		if (cpuload > 1)
>>+			goto out;
> 
> 
> Sorry, this is just wrong.  If swap prefetch is useful then it's also
> useful if some task happens to be sitting over in the corner calculating
> pi.

On SMP systems, something based on the run queues' raw_weighted_load 
fields (comes with smpnice patch) might be more useful than nr_running() 
as it contains information about the priority of the running tasks. 
Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some variation, 
where raw_weighted_load() is the sum of that field for all CPUs) would 
suffice.  It would mean "there's more than the equivalent of one nice==0 
task running" and shouldn't be any more expensive than nr_running(). 
Dividing SCHED_LOAD_SCALE by some number would be an obvious variation 
to try as would taking into account this process's contribution to the 
weighted load.

Also if this was useful there's no real reason that raw_weighted_load 
couldn't be made available on non SMP systems as well as SMP ones.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-10 22:35 ` Andrew Morton
  2006-03-10 23:11   ` Peter Williams
@ 2006-03-11  3:50   ` Con Kolivas
  2006-03-11  5:33     ` Mike Galbraith
  1 sibling, 1 reply; 32+ messages in thread
From: Con Kolivas @ 2006-03-11  3:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, ck

On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> Con Kolivas <kernel@kolivas.org> wrote:
> > +	/*
> > +	 * get_page_state is super expensive so we only perform it every
> > +	 * SWAP_CLUSTER_MAX prefetched_pages.
>
> nr_running() is similarly expensive btw.

Yes which is why I do it just as infrequently as get_page_state.
>
> > 	 * We also test if we're the only
> > +	 * task running anywhere. We want to have as little impact on all
> > +	 * resources (cpu, disk, bus etc). As this iterates over every cpu
> > +	 * we measure this infrequently.
> > +	 */
> > +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > +		unsigned long cpuload = nr_running();
> > +
> > +		if (cpuload > 1)
> > +			goto out;
>
> Sorry, this is just wrong.  If swap prefetch is useful then it's also
> useful if some task happens to be sitting over in the corner calculating
> pi.
>
> What's the actual problem here?  Someone's 3d game went blippy?  Why?  How
> much?  Are we missing a cond_resched()?

No, it's pretty easy to reproduce, kprefetchd sits there in uninterruptible 
sleep with one cpu on SMP pegged at 100% iowait due to it. This tends to have 
noticeable effects everywhere on HT or SMP. On UP the yielding helped it but 
even then it still causes blips. How much? Well to be honest it's noticeable 
a shipload. Running a game, any game, that uses 100% (and most fancy games 
do) causes stuttering on audio, pauses and so on. This is evident on linux 
native games, games under emulators or qemu and so on. That iowait really 
hurts, and tweaking just priority doesn't help it in any way.

With this change it's much more polite and takes a bit longer to complete 
prefetching but is still effective while no longer being noticeable.

> > +		cpuload += nr_uninterruptible();
> > +		if (cpuload > 1)
> > +			goto out;
>
> Not sure about this either.

Same as above. It's the tasks in uninterruptible sleep that cause the most 
harm. I do it sequentially simply because nr_running() is more likely to be 
>1 than the sum total, and I'd prefer not to do nr_uninterruptible() unless 
it's necessary. Both of these are actually done lockless though.

> > +		if (ns->last_free) {
> > +			if (ns->current_free + SWAP_CLUSTER_MAX <
> > +				ns->last_free) {
> > +					ns->last_free = ns->current_free;
> >  					node_clear(node,
> >  						sp_stat.prefetch_nodes);
> >  					continue;
> >  			}
> >  		} else
>
> That has an extra tabstop.

Hrm. 3 years on and I still make basic style mistakes.

Cheers,
Con

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-10 23:11   ` Peter Williams
@ 2006-03-11  4:18     ` Con Kolivas
  2006-03-11  4:28       ` Peter Williams
                         ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Con Kolivas @ 2006-03-11  4:18 UTC (permalink / raw)
  To: Peter Williams; +Cc: Andrew Morton, linux-kernel, ck

On Saturday 11 March 2006 10:11, Peter Williams wrote:
> Andrew Morton wrote:
> > Con Kolivas <kernel@kolivas.org> wrote:
> >>+	/*
> >>+	 * get_page_state is super expensive so we only perform it every
> >>+	 * SWAP_CLUSTER_MAX prefetched_pages.
> >
> > nr_running() is similarly expensive btw.
> >
> >>	 * We also test if we're the only
> >>+	 * task running anywhere. We want to have as little impact on all
> >>+	 * resources (cpu, disk, bus etc). As this iterates over every cpu
> >>+	 * we measure this infrequently.
> >>+	 */
> >>+	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> >>+		unsigned long cpuload = nr_running();
> >>+
> >>+		if (cpuload > 1)
> >>+			goto out;
> >
> > Sorry, this is just wrong.  If swap prefetch is useful then it's also
> > useful if some task happens to be sitting over in the corner calculating
> > pi.
>
> On SMP systems, something based on the run queues' raw_weighted_load
> fields (comes with smpnice patch) might be more useful than nr_running()
> as it contains information about the priority of the running tasks.
> Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some variation,
> where raw_weighted_load() is the sum of that field for all CPUs) would
> suffice.  It would mean "there's more than the equivalent of one nice==0
> task running" and shouldn't be any more expensive than nr_running().
> Dividing SCHED_LOAD_SCALE by some number would be an obvious variation
> to try as would taking into account this process's contribution to the
> weighted load.
>
> Also if this was useful there's no real reason that raw_weighted_load
> couldn't be made available on non SMP systems as well as SMP ones.

That does seem reasonable, but I'm looking at total system load, not per 
runqueue. So a global_weighted_load() function would be required to return 
that. Because despite what anyone seems to want to believe, reading from disk 
hurts. Why it hurts so much I'm not really sure, but it's not a SCSI vs IDE 
with or without DMA issue. It's not about tweaking parameters. It doesn't 
seem to be only about cpu cycles. This is not a mistuned system that it 
happens on. It just plain hurts if we do lots of disk i/o, perhaps it's 
saturating the bus or something. Whatever it is, as much as I'd _like_ swap 
prefetch to just keep working quietly at ultra ultra low priority, the disk 
reads that swap prefetch does are not innocuous so I really do want them to 
only be done when nothing else wants cpu.

Cheers,
Con

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  4:18     ` Con Kolivas
@ 2006-03-11  4:28       ` Peter Williams
  2006-03-11  4:34         ` Con Kolivas
  2006-03-11  5:04       ` [ck] " Radoslaw Szkodzinski
  2006-03-11  5:46       ` Peter Williams
  2 siblings, 1 reply; 32+ messages in thread
From: Peter Williams @ 2006-03-11  4:28 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Andrew Morton, linux-kernel, ck

Con Kolivas wrote:
> On Saturday 11 March 2006 10:11, Peter Williams wrote:
> 
>>Andrew Morton wrote:
>>
>>>Con Kolivas <kernel@kolivas.org> wrote:
>>>
>>>>+	/*
>>>>+	 * get_page_state is super expensive so we only perform it every
>>>>+	 * SWAP_CLUSTER_MAX prefetched_pages.
>>>
>>>nr_running() is similarly expensive btw.
>>>
>>>
>>>>	 * We also test if we're the only
>>>>+	 * task running anywhere. We want to have as little impact on all
>>>>+	 * resources (cpu, disk, bus etc). As this iterates over every cpu
>>>>+	 * we measure this infrequently.
>>>>+	 */
>>>>+	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
>>>>+		unsigned long cpuload = nr_running();
>>>>+
>>>>+		if (cpuload > 1)
>>>>+			goto out;
>>>
>>>Sorry, this is just wrong.  If swap prefetch is useful then it's also
>>>useful if some task happens to be sitting over in the corner calculating
>>>pi.
>>
>>On SMP systems, something based on the run queues' raw_weighted_load
>>fields (comes with smpnice patch) might be more useful than nr_running()
>>as it contains information about the priority of the running tasks.
>>Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some variation,
>>where raw_weighted_load() is the sum of that field for all CPUs) would
>>suffice.  It would mean "there's more than the equivalent of one nice==0
>>task running" and shouldn't be any more expensive than nr_running().
>>Dividing SCHED_LOAD_SCALE by some number would be an obvious variation
>>to try as would taking into account this process's contribution to the
>>weighted load.
>>
>>Also if this was useful there's no real reason that raw_weighted_load
>>couldn't be made available on non SMP systems as well as SMP ones.
> 
> 
> That does seem reasonable, but I'm looking at total system load, not per 
> runqueue. So a global_weighted_load() function would be required to return 
> that.

Yes. That's why I said "something based on".

> Because despite what anyone seems to want to believe, reading from disk 
> hurts. Why it hurts so much I'm not really sure, but it's not a SCSI vs IDE 
> with or without DMA issue. It's not about tweaking parameters. It doesn't 
> seem to be only about cpu cycles. This is not a mistuned system that it 
> happens on. It just plain hurts if we do lots of disk i/o, perhaps it's 
> saturating the bus or something. Whatever it is, as much as I'd _like_ swap 
> prefetch to just keep working quietly at ultra ultra low priority, the disk 
> reads that swap prefetch does are not innocuous so I really do want them to 
> only be done when nothing else wants cpu.

Would you like to try a prototype version of the soft caps patch I'm 
working on to see if it will help?

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  4:28       ` Peter Williams
@ 2006-03-11  4:34         ` Con Kolivas
  2006-03-11  5:34           ` Peter Williams
  0 siblings, 1 reply; 32+ messages in thread
From: Con Kolivas @ 2006-03-11  4:34 UTC (permalink / raw)
  To: Peter Williams; +Cc: Andrew Morton, linux-kernel, ck

On Saturday 11 March 2006 15:28, Peter Williams wrote:
> Con Kolivas wrote:
> > Because despite what anyone seems to want to believe, reading from disk
> > hurts. Why it hurts so much I'm not really sure, but it's not a SCSI vs
> > IDE with or without DMA issue. It's not about tweaking parameters. It
> > doesn't seem to be only about cpu cycles. This is not a mistuned system
> > that it happens on. It just plain hurts if we do lots of disk i/o,
> > perhaps it's saturating the bus or something. Whatever it is, as much as
> > I'd _like_ swap prefetch to just keep working quietly at ultra ultra low
> > priority, the disk reads that swap prefetch does are not innocuous so I
> > really do want them to only be done when nothing else wants cpu.

I didn't make it clear here the things affected are not even doing any I/O of 
their own. It's not about I/O resource allocation. However they are using 
100% cpu and probably doing a lot of gpu bus traffic.

> Would you like to try a prototype version of the soft caps patch I'm
> working on to see if it will help?

What happens if it's using .01% cpu and spends most of its time in 
uninterruptible sleep?

Cheers,
Con

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [ck] Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  4:18     ` Con Kolivas
  2006-03-11  4:28       ` Peter Williams
@ 2006-03-11  5:04       ` Radoslaw Szkodzinski
  2006-03-11  5:21         ` Con Kolivas
  2006-03-11  5:46       ` Peter Williams
  2 siblings, 1 reply; 32+ messages in thread
From: Radoslaw Szkodzinski @ 2006-03-11  5:04 UTC (permalink / raw)
  To: ck; +Cc: Con Kolivas, Peter Williams, Andrew Morton, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2938 bytes --]

On Saturday 11 March 2006 05:18, Con Kolivas wrote yet:
> On Saturday 11 March 2006 10:11, Peter Williams wrote:
> > Andrew Morton wrote:
> > > Con Kolivas <kernel@kolivas.org> wrote:
> > >>	 * We also test if we're the only
> > >>+	 * task running anywhere. We want to have as little impact on all
> > >>+	 * resources (cpu, disk, bus etc). As this iterates over every cpu
> > >>+	 * we measure this infrequently.
> > >>+	 */
> > >>+	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > >>+		unsigned long cpuload = nr_running();
> > >>+
> > >>+		if (cpuload > 1)
> > >>+			goto out;
> > >
> > > Sorry, this is just wrong.  If swap prefetch is useful then it's also
> > > useful if some task happens to be sitting over in the corner
> > > calculating pi.
> >
> > On SMP systems, something based on the run queues' raw_weighted_load
> > fields (comes with smpnice patch) might be more useful than nr_running()
> > as it contains information about the priority of the running tasks.
> > Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some variation,
> > where raw_weighted_load() is the sum of that field for all CPUs) would
> > suffice.  It would mean "there's more than the equivalent of one nice==0
> > task running" and shouldn't be any more expensive than nr_running().
> > Dividing SCHED_LOAD_SCALE by some number would be an obvious variation
> > to try as would taking into account this process's contribution to the
> > weighted load.
> >
> > Also if this was useful there's no real reason that raw_weighted_load
> > couldn't be made available on non SMP systems as well as SMP ones.
>
> That does seem reasonable, but I'm looking at total system load, not per
> runqueue. So a global_weighted_load() function would be required to return
> that. Because despite what anyone seems to want to believe, reading from
> disk hurts. Why it hurts so much I'm not really sure, but it's not a SCSI
> vs IDE with or without DMA issue. It's not about tweaking parameters. It
> doesn't seem to be only about cpu cycles. This is not a mistuned system
> that it happens on. It just plain hurts if we do lots of disk i/o, perhaps
> it's saturating the bus or something. Whatever it is, as much as I'd _like_
> swap prefetch to just keep working quietly at ultra ultra low priority, the
> disk reads that swap prefetch does are not innocuous so I really do want
> them to only be done when nothing else wants cpu.

Wouldn't the change break prefetching if I have 98% CPU time free and not 
100%? Something like an audio player in the background?

It seems that any Seti@home type of calculation would kill it.
In reality, we don't want disk reads when something interactive is running, so 
maybe you'd look at the nice level of the task?
(higher than x = don't count it?)

-- 
GPG Key id:  0xD1F10BA2
Fingerprint: 96E2 304A B9C4 949A 10A0  9105 9543 0453 D1F1 0BA2

AstralStorm

[-- Attachment #2: Type: application/pgp-signature, Size: 191 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [ck] Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  5:04       ` [ck] " Radoslaw Szkodzinski
@ 2006-03-11  5:21         ` Con Kolivas
  0 siblings, 0 replies; 32+ messages in thread
From: Con Kolivas @ 2006-03-11  5:21 UTC (permalink / raw)
  To: Radoslaw Szkodzinski; +Cc: ck, Peter Williams, Andrew Morton, linux-kernel

On Saturday 11 March 2006 16:04, Radoslaw Szkodzinski wrote:
> On Saturday 11 March 2006 05:18, Con Kolivas wrote yet:
> > On Saturday 11 March 2006 10:11, Peter Williams wrote:
> > > Andrew Morton wrote:
> > > > Con Kolivas <kernel@kolivas.org> wrote:
> > > >>	 * We also test if we're the only
> > > >>+	 * task running anywhere. We want to have as little impact on all
> > > >>+	 * resources (cpu, disk, bus etc). As this iterates over every cpu
> > > >>+	 * we measure this infrequently.
> > > >>+	 */
> > > >>+	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > >>+		unsigned long cpuload = nr_running();
> > > >>+
> > > >>+		if (cpuload > 1)
> > > >>+			goto out;
> > > >
> > > > Sorry, this is just wrong.  If swap prefetch is useful then it's also
> > > > useful if some task happens to be sitting over in the corner
> > > > calculating pi.
> > >
> > > On SMP systems, something based on the run queues' raw_weighted_load
> > > fields (comes with smpnice patch) might be more useful than
> > > nr_running() as it contains information about the priority of the
> > > running tasks. Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some
> > > variation, where raw_weighted_load() is the sum of that field for all
> > > CPUs) would suffice.  It would mean "there's more than the equivalent
> > > of one nice==0 task running" and shouldn't be any more expensive than
> > > nr_running(). Dividing SCHED_LOAD_SCALE by some number would be an
> > > obvious variation to try as would taking into account this process's
> > > contribution to the weighted load.
> > >
> > > Also if this was useful there's no real reason that raw_weighted_load
> > > couldn't be made available on non SMP systems as well as SMP ones.
> >
> > That does seem reasonable, but I'm looking at total system load, not per
> > runqueue. So a global_weighted_load() function would be required to
> > return that. Because despite what anyone seems to want to believe,
> > reading from disk hurts. Why it hurts so much I'm not really sure, but
> > it's not a SCSI vs IDE with or without DMA issue. It's not about tweaking
> > parameters. It doesn't seem to be only about cpu cycles. This is not a
> > mistuned system that it happens on. It just plain hurts if we do lots of
> > disk i/o, perhaps it's saturating the bus or something. Whatever it is,
> > as much as I'd _like_ swap prefetch to just keep working quietly at ultra
> > ultra low priority, the disk reads that swap prefetch does are not
> > innocuous so I really do want them to only be done when nothing else
> > wants cpu.
>
> Wouldn't the change break prefetching if I have 98% CPU time free and not
> 100%? Something like an audio player in the background?

That would only intermittently stop prefetching whenever both happened to use 
cpu at exactly the same time (which is the desired effect). So playing audio 
will slow prefetch a little but it will still prefetch.

> It seems that any Seti@home type of calculation would kill it.
> In reality, we don't want disk reads when something interactive is running,
> so maybe you'd look at the nice level of the task?
> (higher than x = don't count it?)

That's what Peter is promoting here. I could use the "weighted load" value to 
determine just that, and keep running prefetch if highly niced tasks are 
running. I am considering adding that in the future. For the moment I 
definitely think opting out of prefetching whenever anything is running is 
the right thing to do.

Cheers,
Con

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  3:50   ` Con Kolivas
@ 2006-03-11  5:33     ` Mike Galbraith
  2006-03-11  5:50       ` Con Kolivas
  0 siblings, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2006-03-11  5:33 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Andrew Morton, linux-kernel, ck

On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > Con Kolivas <kernel@kolivas.org> wrote:
> > > +	/*
> > > +	 * get_page_state is super expensive so we only perform it every
> > > +	 * SWAP_CLUSTER_MAX prefetched_pages.
> >
> > nr_running() is similarly expensive btw.
> 
> Yes which is why I do it just as infrequently as get_page_state.
> >
> > > 	 * We also test if we're the only
> > > +	 * task running anywhere. We want to have as little impact on all
> > > +	 * resources (cpu, disk, bus etc). As this iterates over every cpu
> > > +	 * we measure this infrequently.
> > > +	 */
> > > +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > +		unsigned long cpuload = nr_running();
> > > +
> > > +		if (cpuload > 1)
> > > +			goto out;
> >
> > Sorry, this is just wrong.  If swap prefetch is useful then it's also
> > useful if some task happens to be sitting over in the corner calculating
> > pi.
> >
> > What's the actual problem here?  Someone's 3d game went blippy?  Why?  How
> > much?  Are we missing a cond_resched()?
> 
> No, it's pretty easy to reproduce, kprefetchd sits there in uninterruptible 
> sleep with one cpu on SMP pegged at 100% iowait due to it. This tends to have 
> noticeable effects everywhere on HT or SMP. On UP the yielding helped it but 
> even then it still causes blips. How much? Well to be honest it's noticeable 
> a shipload. Running a game, any game, that uses 100% (and most fancy games 
> do) causes stuttering on audio, pauses and so on. This is evident on linux 
> native games, games under emulators or qemu and so on. That iowait really 
> hurts, and tweaking just priority doesn't help it in any way.

That doesn't really make sense to me.  If a task can trigger audio
dropout and stalls by sleeping, we have a serious problem.  In your
SMP/HT case, I'd start crawling over the load balancing code.  I can't
see how trivial CPU with non-saturated IO can cause dropout in the UP
case either.  Am I missing something?

	-Mike


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  4:34         ` Con Kolivas
@ 2006-03-11  5:34           ` Peter Williams
  0 siblings, 0 replies; 32+ messages in thread
From: Peter Williams @ 2006-03-11  5:34 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Andrew Morton, linux-kernel, ck

Con Kolivas wrote:
> On Saturday 11 March 2006 15:28, Peter Williams wrote:
> 
>>Con Kolivas wrote:
>>
>>>Because despite what anyone seems to want to believe, reading from disk
>>>hurts. Why it hurts so much I'm not really sure, but it's not a SCSI vs
>>>IDE with or without DMA issue. It's not about tweaking parameters. It
>>>doesn't seem to be only about cpu cycles. This is not a mistuned system
>>>that it happens on. It just plain hurts if we do lots of disk i/o,
>>>perhaps it's saturating the bus or something. Whatever it is, as much as
>>>I'd _like_ swap prefetch to just keep working quietly at ultra ultra low
>>>priority, the disk reads that swap prefetch does are not innocuous so I
>>>really do want them to only be done when nothing else wants cpu.
> 
> 
> I didn't make it clear here the things affected are not even doing any I/O of 
> their own. It's not about I/O resource allocation. However they are using 
> 100% cpu and probably doing a lot of gpu bus traffic.
> 
> 
>>Would you like to try a prototype version of the soft caps patch I'm
>>working on to see if it will help?
> 
> 
> What happens if it's using .01% cpu and spends most of its time in 
> uninterruptible sleep?

Probably not much as I have to let tasks with a soft cap of zero get 
some CPU to avoid problems with them holding resource other tasks may 
need and 0.01% is probably as low as I can keep it anyway.

Just to clarify.  At the moment, what I do to a task with a zero soft 
cap is give them a priority one above MAX_PRIO (i.e. 2 higher than any 
other task can have) and make sure they always go on the expired array 
at the end of their time slice.  They also get a load weight of zero to 
prevent them getting a CPU to themselves.  This means that any task that 
becomes runnable on their CPU should preempt them and if they're the 
only task on their CPU it will look idle and waking tasks may be moved 
there if the other CPUs are idle.  This may be enough to stop them 
interfering with your game's tasks.

I'm currently letting them have a time slice determined by their nice in 
an attempt to reduce context switching but this may change as it 
probably allows them to get CPU access when there are non background 
tasks on the expired array.  I'm still thinking about how to prevent 
this and keep context switching low.

Tasks with non zero soft caps go through a different process and (as far 
as possible) tasks without soft caps avoid the capping code.

Peter
PS This is still work in progress.
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  4:18     ` Con Kolivas
  2006-03-11  4:28       ` Peter Williams
  2006-03-11  5:04       ` [ck] " Radoslaw Szkodzinski
@ 2006-03-11  5:46       ` Peter Williams
  2 siblings, 0 replies; 32+ messages in thread
From: Peter Williams @ 2006-03-11  5:46 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Andrew Morton, linux-kernel, ck

Con Kolivas wrote:
> On Saturday 11 March 2006 10:11, Peter Williams wrote:
> 
>>Andrew Morton wrote:
>>
>>>Con Kolivas <kernel@kolivas.org> wrote:
>>>
>>>>+	/*
>>>>+	 * get_page_state is super expensive so we only perform it every
>>>>+	 * SWAP_CLUSTER_MAX prefetched_pages.
>>>
>>>nr_running() is similarly expensive btw.
>>>
>>>
>>>>	 * We also test if we're the only
>>>>+	 * task running anywhere. We want to have as little impact on all
>>>>+	 * resources (cpu, disk, bus etc). As this iterates over every cpu
>>>>+	 * we measure this infrequently.
>>>>+	 */
>>>>+	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
>>>>+		unsigned long cpuload = nr_running();
>>>>+
>>>>+		if (cpuload > 1)
>>>>+			goto out;
>>>
>>>Sorry, this is just wrong.  If swap prefetch is useful then it's also
>>>useful if some task happens to be sitting over in the corner calculating
>>>pi.
>>
>>On SMP systems, something based on the run queues' raw_weighted_load
>>fields (comes with smpnice patch) might be more useful than nr_running()
>>as it contains information about the priority of the running tasks.
>>Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some variation,
>>where raw_weighted_load() is the sum of that field for all CPUs) would
>>suffice.  It would mean "there's more than the equivalent of one nice==0
>>task running" and shouldn't be any more expensive than nr_running().
>>Dividing SCHED_LOAD_SCALE by some number would be an obvious variation
>>to try as would taking into account this process's contribution to the
>>weighted load.
>>
>>Also if this was useful there's no real reason that raw_weighted_load
>>couldn't be made available on non SMP systems as well as SMP ones.
> 
> 
> That does seem reasonable, but I'm looking at total system load, not per 
> runqueue. So a global_weighted_load() function would be required to return 
> that.

Just another thought here.  Any function such as this and nr_running() 
will be highly racy unless you lock all run queues while running it and 
while you perform the action dependent on the result (which I presume 
you don't do).  This means the answer you get back is probably wrong by 
the time you make a decision based on the answer.

So is there any reason that you can't make the decision inside the loop 
iterating over the CPUs on a per CPU basis?  This would remove the 
raciness.  The only thing that I can think of is that you're trying to 
avoid the cost of that loop but you'll wear most of that running 
global_weighted_load() or nr_running() anyway.

> Because despite what anyone seems to want to believe, reading from disk 
> hurts.  Why it hurts so much I'm not really sure, but it's not a SCSI vs IDE
> with or without DMA issue. It's not about tweaking parameters. It doesn't 
> seem to be only about cpu cycles. This is not a mistuned system that it 
> happens on. It just plain hurts if we do lots of disk i/o, perhaps it's 
> saturating the bus or something. Whatever it is, as much as I'd _like_ swap 
> prefetch to just keep working quietly at ultra ultra low priority, the disk 
> reads that swap prefetch does are not innocuous so I really do want them to 
> only be done when nothing else wants cpu.
> 
> Cheers,
> Con


-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  5:33     ` Mike Galbraith
@ 2006-03-11  5:50       ` Con Kolivas
  2006-03-11  5:58         ` Con Kolivas
  2006-03-11  6:00         ` Mike Galbraith
  0 siblings, 2 replies; 32+ messages in thread
From: Con Kolivas @ 2006-03-11  5:50 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Andrew Morton, linux-kernel, ck

On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > Con Kolivas <kernel@kolivas.org> wrote:
> > > > +	/*
> > > > +	 * get_page_state is super expensive so we only perform it every
> > > > +	 * SWAP_CLUSTER_MAX prefetched_pages.
> > >
> > > nr_running() is similarly expensive btw.
> >
> > Yes which is why I do it just as infrequently as get_page_state.
> >
> > > > 	 * We also test if we're the only
> > > > +	 * task running anywhere. We want to have as little impact on all
> > > > +	 * resources (cpu, disk, bus etc). As this iterates over every cpu
> > > > +	 * we measure this infrequently.
> > > > +	 */
> > > > +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > +		unsigned long cpuload = nr_running();
> > > > +
> > > > +		if (cpuload > 1)
> > > > +			goto out;
> > >
> > > Sorry, this is just wrong.  If swap prefetch is useful then it's also
> > > useful if some task happens to be sitting over in the corner
> > > calculating pi.
> > >
> > > What's the actual problem here?  Someone's 3d game went blippy?  Why? 
> > > How much?  Are we missing a cond_resched()?
> >
> > No, it's pretty easy to reproduce, kprefetchd sits there in
> > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due to
> > it. This tends to have noticeable effects everywhere on HT or SMP. On UP
> > the yielding helped it but even then it still causes blips. How much?
> > Well to be honest it's noticeable a shipload. Running a game, any game,
> > that uses 100% (and most fancy games do) causes stuttering on audio,
> > pauses and so on. This is evident on linux native games, games under
> > emulators or qemu and so on. That iowait really hurts, and tweaking just
> > priority doesn't help it in any way.
>
> That doesn't really make sense to me.  If a task can trigger audio
> dropout and stalls by sleeping, we have a serious problem.  In your
> SMP/HT case, I'd start crawling over the load balancing code.  I can't
> see how trivial CPU with non-saturated IO can cause dropout in the UP
> case either.  Am I missing something?

Clearly you, me and everyone else is missing something. I see it with each 
task bound to one cpu with cpu affinity so it's not a balancing issue. Try it 
yourself if you can instead of not believing me. Get a big dd reader 
(virtually no cpu and all io wait sleep) on one cpu and try and play a game 
on the other cpu. It dies rectally.

Cheers,
Con

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  5:50       ` Con Kolivas
@ 2006-03-11  5:58         ` Con Kolivas
  2006-03-11  6:11           ` Mike Galbraith
  2006-03-11  6:00         ` Mike Galbraith
  1 sibling, 1 reply; 32+ messages in thread
From: Con Kolivas @ 2006-03-11  5:58 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Andrew Morton, linux-kernel, ck

On Saturday 11 March 2006 16:50, Con Kolivas wrote:
> On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > Con Kolivas <kernel@kolivas.org> wrote:
> > > > > +	/*
> > > > > +	 * get_page_state is super expensive so we only perform it every
> > > > > +	 * SWAP_CLUSTER_MAX prefetched_pages.
> > > >
> > > > nr_running() is similarly expensive btw.
> > >
> > > Yes which is why I do it just as infrequently as get_page_state.
> > >
> > > > > 	 * We also test if we're the only
> > > > > +	 * task running anywhere. We want to have as little impact on all
> > > > > +	 * resources (cpu, disk, bus etc). As this iterates over every
> > > > > cpu +	 * we measure this infrequently.
> > > > > +	 */
> > > > > +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > +		unsigned long cpuload = nr_running();
> > > > > +
> > > > > +		if (cpuload > 1)
> > > > > +			goto out;
> > > >
> > > > Sorry, this is just wrong.  If swap prefetch is useful then it's also
> > > > useful if some task happens to be sitting over in the corner
> > > > calculating pi.
> > > >
> > > > What's the actual problem here?  Someone's 3d game went blippy?  Why?
> > > > How much?  Are we missing a cond_resched()?
> > >
> > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due to
> > > it. This tends to have noticeable effects everywhere on HT or SMP. On
> > > UP the yielding helped it but even then it still causes blips. How
> > > much? Well to be honest it's noticeable a shipload. Running a game, any
> > > game, that uses 100% (and most fancy games do) causes stuttering on
> > > audio, pauses and so on. This is evident on linux native games, games
> > > under emulators or qemu and so on. That iowait really hurts, and
> > > tweaking just priority doesn't help it in any way.
> >
> > That doesn't really make sense to me.  If a task can trigger audio
> > dropout and stalls by sleeping, we have a serious problem.  In your
> > SMP/HT case, I'd start crawling over the load balancing code.  I can't
> > see how trivial CPU with non-saturated IO can cause dropout in the UP
> > case either.  Am I missing something?
>
> Clearly you, me and everyone else is missing something. I see it with each
> task bound to one cpu with cpu affinity so it's not a balancing issue. Try
> it yourself if you can instead of not believing me. Get a big dd reader
> (virtually no cpu and all io wait sleep) on one cpu and try and play a game
> on the other cpu. It dies rectally.

I happen to have a tool to instrument this as you're probably aware 
(interbench). Here is an old log I found of this:

--- Benchmarking simulated cpu of Gaming in the presence of simulated ---
Load	Latency +/- SD (ms)  Max Latency   % Desired CPU
None	      0 +/- 0              0		 100
Write	   36.5 +/- 103          966		73.3
Read	   17.2 +/- 22.9         244		85.3

Note the max latency being massive and desired cpu dropping. This is on a HT 
machine.

Cheers,
Con

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  5:50       ` Con Kolivas
  2006-03-11  5:58         ` Con Kolivas
@ 2006-03-11  6:00         ` Mike Galbraith
  2006-03-11  6:05           ` Mike Galbraith
  2006-03-11  7:24           ` Con Kolivas
  1 sibling, 2 replies; 32+ messages in thread
From: Mike Galbraith @ 2006-03-11  6:00 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Andrew Morton, linux-kernel, ck

On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
> On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > Con Kolivas <kernel@kolivas.org> wrote:
> > > > > +	/*
> > > > > +	 * get_page_state is super expensive so we only perform it every
> > > > > +	 * SWAP_CLUSTER_MAX prefetched_pages.
> > > >
> > > > nr_running() is similarly expensive btw.
> > >
> > > Yes which is why I do it just as infrequently as get_page_state.
> > >
> > > > > 	 * We also test if we're the only
> > > > > +	 * task running anywhere. We want to have as little impact on all
> > > > > +	 * resources (cpu, disk, bus etc). As this iterates over every cpu
> > > > > +	 * we measure this infrequently.
> > > > > +	 */
> > > > > +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > +		unsigned long cpuload = nr_running();
> > > > > +
> > > > > +		if (cpuload > 1)
> > > > > +			goto out;
> > > >
> > > > Sorry, this is just wrong.  If swap prefetch is useful then it's also
> > > > useful if some task happens to be sitting over in the corner
> > > > calculating pi.
> > > >
> > > > What's the actual problem here?  Someone's 3d game went blippy?  Why? 
> > > > How much?  Are we missing a cond_resched()?
> > >
> > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due to
> > > it. This tends to have noticeable effects everywhere on HT or SMP. On UP
> > > the yielding helped it but even then it still causes blips. How much?
> > > Well to be honest it's noticeable a shipload. Running a game, any game,
> > > that uses 100% (and most fancy games do) causes stuttering on audio,
> > > pauses and so on. This is evident on linux native games, games under
> > > emulators or qemu and so on. That iowait really hurts, and tweaking just
> > > priority doesn't help it in any way.
> >
> > That doesn't really make sense to me.  If a task can trigger audio
> > dropout and stalls by sleeping, we have a serious problem.  In your
> > SMP/HT case, I'd start crawling over the load balancing code.  I can't
> > see how trivial CPU with non-saturated IO can cause dropout in the UP
> > case either.  Am I missing something?
> 
> Clearly you, me and everyone else is missing something. I see it with each 
> task bound to one cpu with cpu affinity so it's not a balancing issue. Try it 
> yourself if you can instead of not believing me. Get a big dd reader 
> (virtually no cpu and all io wait sleep) on one cpu and try and play a game 
> on the other cpu. It dies rectally.

I said it didn't make sense to me, not that I didn't believe you.  If I
had a real SMP box, I would look into it, but all I have is HT.

If you're creating a lot of traffic, I can see it causing problems.  I
was under the impression that you were doing minimal IO and absolutely
trivial CPU.  That's what didn't make sense to me to be clear.

	-Mike


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  6:00         ` Mike Galbraith
@ 2006-03-11  6:05           ` Mike Galbraith
  2006-03-11  7:20             ` Con Kolivas
  2006-03-11  7:24           ` Con Kolivas
  1 sibling, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2006-03-11  6:05 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Andrew Morton, linux-kernel, ck

On Sat, 2006-03-11 at 07:00 +0100, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
> > On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > > Con Kolivas <kernel@kolivas.org> wrote:
> > > > > > +	/*
> > > > > > +	 * get_page_state is super expensive so we only perform it every
> > > > > > +	 * SWAP_CLUSTER_MAX prefetched_pages.
> > > > >
> > > > > nr_running() is similarly expensive btw.
> > > >
> > > > Yes which is why I do it just as infrequently as get_page_state.
> > > >
> > > > > > 	 * We also test if we're the only
> > > > > > +	 * task running anywhere. We want to have as little impact on all
> > > > > > +	 * resources (cpu, disk, bus etc). As this iterates over every cpu
> > > > > > +	 * we measure this infrequently.
> > > > > > +	 */
> > > > > > +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > > +		unsigned long cpuload = nr_running();
> > > > > > +
> > > > > > +		if (cpuload > 1)
> > > > > > +			goto out;
> > > > >
> > > > > Sorry, this is just wrong.  If swap prefetch is useful then it's also
> > > > > useful if some task happens to be sitting over in the corner
> > > > > calculating pi.
> > > > >
> > > > > What's the actual problem here?  Someone's 3d game went blippy?  Why? 
> > > > > How much?  Are we missing a cond_resched()?
> > > >
> > > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due to
> > > > it. This tends to have noticeable effects everywhere on HT or SMP. On UP
> > > > the yielding helped it but even then it still causes blips. How much?
> > > > Well to be honest it's noticeable a shipload. Running a game, any game,
> > > > that uses 100% (and most fancy games do) causes stuttering on audio,
> > > > pauses and so on. This is evident on linux native games, games under
> > > > emulators or qemu and so on. That iowait really hurts, and tweaking just
> > > > priority doesn't help it in any way.
> > >
> > > That doesn't really make sense to me.  If a task can trigger audio
> > > dropout and stalls by sleeping, we have a serious problem.  In your
> > > SMP/HT case, I'd start crawling over the load balancing code.  I can't
> > > see how trivial CPU with non-saturated IO can cause dropout in the UP
> > > case either.  Am I missing something?
> > 
> > Clearly you, me and everyone else is missing something. I see it with each 
> > task bound to one cpu with cpu affinity so it's not a balancing issue. Try it 
> > yourself if you can instead of not believing me. Get a big dd reader 
> > (virtually no cpu and all io wait sleep) on one cpu and try and play a game 
> > on the other cpu. It dies rectally.
> 
> I said it didn't make sense to me, not that I didn't believe you.  If I
> had a real SMP box, I would look into it, but all I have is HT.
> 
> If you're creating a lot of traffic, I can see it causing problems.  I
> was under the impression that you were doing minimal IO and absolutely
> trivial CPU.  That's what didn't make sense to me to be clear.
> 
> 	-Mike

P.S.  If it's hefty IO, it makes sense, and having the ability to do PIO
instead of DMA would probably help.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  5:58         ` Con Kolivas
@ 2006-03-11  6:11           ` Mike Galbraith
  0 siblings, 0 replies; 32+ messages in thread
From: Mike Galbraith @ 2006-03-11  6:11 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Andrew Morton, linux-kernel, ck

On Sat, 2006-03-11 at 16:58 +1100, Con Kolivas wrote:
> On Saturday 11 March 2006 16:50, Con Kolivas wrote:
> > On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > > Con Kolivas <kernel@kolivas.org> wrote:
> > > > > > +	/*
> > > > > > +	 * get_page_state is super expensive so we only perform it every
> > > > > > +	 * SWAP_CLUSTER_MAX prefetched_pages.
> > > > >
> > > > > nr_running() is similarly expensive btw.
> > > >
> > > > Yes which is why I do it just as infrequently as get_page_state.
> > > >
> > > > > > 	 * We also test if we're the only
> > > > > > +	 * task running anywhere. We want to have as little impact on all
> > > > > > +	 * resources (cpu, disk, bus etc). As this iterates over every
> > > > > > cpu +	 * we measure this infrequently.
> > > > > > +	 */
> > > > > > +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > > +		unsigned long cpuload = nr_running();
> > > > > > +
> > > > > > +		if (cpuload > 1)
> > > > > > +			goto out;
> > > > >
> > > > > Sorry, this is just wrong.  If swap prefetch is useful then it's also
> > > > > useful if some task happens to be sitting over in the corner
> > > > > calculating pi.
> > > > >
> > > > > What's the actual problem here?  Someone's 3d game went blippy?  Why?
> > > > > How much?  Are we missing a cond_resched()?
> > > >
> > > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due to
> > > > it. This tends to have noticeable effects everywhere on HT or SMP. On
> > > > UP the yielding helped it but even then it still causes blips. How
> > > > much? Well to be honest it's noticeable a shipload. Running a game, any
> > > > game, that uses 100% (and most fancy games do) causes stuttering on
> > > > audio, pauses and so on. This is evident on linux native games, games
> > > > under emulators or qemu and so on. That iowait really hurts, and
> > > > tweaking just priority doesn't help it in any way.
> > >
> > > That doesn't really make sense to me.  If a task can trigger audio
> > > dropout and stalls by sleeping, we have a serious problem.  In your
> > > SMP/HT case, I'd start crawling over the load balancing code.  I can't
> > > see how trivial CPU with non-saturated IO can cause dropout in the UP
> > > case either.  Am I missing something?
> >
> > Clearly you, me and everyone else is missing something. I see it with each
> > task bound to one cpu with cpu affinity so it's not a balancing issue. Try
> > it yourself if you can instead of not believing me. Get a big dd reader
> > (virtually no cpu and all io wait sleep) on one cpu and try and play a game
> > on the other cpu. It dies rectally.
> 
> I happen to have a tool to instrument this as you're probably aware 
> (interbench). Here is an old log I found of this:

Yeah, I have a copy.  Interpreting the results isn't necessarily easy
though, just as any other benchmark.

> 
> --- Benchmarking simulated cpu of Gaming in the presence of simulated ---
> Load	Latency +/- SD (ms)  Max Latency   % Desired CPU
> None	      0 +/- 0              0		 100
> Write	   36.5 +/- 103          966		73.3
> Read	   17.2 +/- 22.9         244		85.3
> 
> Note the max latency being massive and desired cpu dropping. This is on a HT 
> machine.

I wonder what that would look like with two real CPUs.

	-Mike


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  6:05           ` Mike Galbraith
@ 2006-03-11  7:20             ` Con Kolivas
  2006-03-11  7:44               ` Mike Galbraith
  0 siblings, 1 reply; 32+ messages in thread
From: Con Kolivas @ 2006-03-11  7:20 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Andrew Morton, linux-kernel, ck

On Saturday 11 March 2006 17:05, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 07:00 +0100, Mike Galbraith wrote:
> > On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
> > > On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > > > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > > > Con Kolivas <kernel@kolivas.org> wrote:
> > > > > > > +	/*
> > > > > > > +	 * get_page_state is super expensive so we only perform it
> > > > > > > every +	 * SWAP_CLUSTER_MAX prefetched_pages.
> > > > > >
> > > > > > nr_running() is similarly expensive btw.
> > > > >
> > > > > Yes which is why I do it just as infrequently as get_page_state.
> > > > >
> > > > > > > 	 * We also test if we're the only
> > > > > > > +	 * task running anywhere. We want to have as little impact on
> > > > > > > all +	 * resources (cpu, disk, bus etc). As this iterates over
> > > > > > > every cpu +	 * we measure this infrequently.
> > > > > > > +	 */
> > > > > > > +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > > > +		unsigned long cpuload = nr_running();
> > > > > > > +
> > > > > > > +		if (cpuload > 1)
> > > > > > > +			goto out;
> > > > > >
> > > > > > Sorry, this is just wrong.  If swap prefetch is useful then it's
> > > > > > also useful if some task happens to be sitting over in the corner
> > > > > > calculating pi.
> > > > > >
> > > > > > What's the actual problem here?  Someone's 3d game went blippy? 
> > > > > > Why? How much?  Are we missing a cond_resched()?
> > > > >
> > > > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due
> > > > > to it. This tends to have noticeable effects everywhere on HT or
> > > > > SMP. On UP the yielding helped it but even then it still causes
> > > > > blips. How much? Well to be honest it's noticeable a shipload.
> > > > > Running a game, any game, that uses 100% (and most fancy games do)
> > > > > causes stuttering on audio, pauses and so on. This is evident on
> > > > > linux native games, games under emulators or qemu and so on. That
> > > > > iowait really hurts, and tweaking just priority doesn't help it in
> > > > > any way.
> > > >
> > > > That doesn't really make sense to me.  If a task can trigger audio
> > > > dropout and stalls by sleeping, we have a serious problem.  In your
> > > > SMP/HT case, I'd start crawling over the load balancing code.  I
> > > > can't see how trivial CPU with non-saturated IO can cause dropout in
> > > > the UP case either.  Am I missing something?
> > >
> > > Clearly you, me and everyone else is missing something. I see it with
> > > each task bound to one cpu with cpu affinity so it's not a balancing
> > > issue. Try it yourself if you can instead of not believing me. Get a
> > > big dd reader (virtually no cpu and all io wait sleep) on one cpu and
> > > try and play a game on the other cpu. It dies rectally.
> >
> > I said it didn't make sense to me, not that I didn't believe you.  If I
> > had a real SMP box, I would look into it, but all I have is HT.
> >
> > If you're creating a lot of traffic, I can see it causing problems.  I
> > was under the impression that you were doing minimal IO and absolutely
> > trivial CPU.  That's what didn't make sense to me to be clear.

> P.S.  If it's hefty IO, it makes sense, and having the ability to do PIO
> instead of DMA would probably help.

That would probably be worse, because then it would use much more cpu in the 
form of kernel context time and not be attributed to kprefetchd at all. 
Anyway this is clearly not a workaround (yes I do know you weren't promoting 
it as such).

Cheers,
Con

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  6:00         ` Mike Galbraith
  2006-03-11  6:05           ` Mike Galbraith
@ 2006-03-11  7:24           ` Con Kolivas
  2006-03-11  7:51             ` Mike Galbraith
  1 sibling, 1 reply; 32+ messages in thread
From: Con Kolivas @ 2006-03-11  7:24 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Andrew Morton, linux-kernel, ck

On Saturday 11 March 2006 17:00, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
> > On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > > Con Kolivas <kernel@kolivas.org> wrote:
> > > > > > +	/*
> > > > > > +	 * get_page_state is super expensive so we only perform it
> > > > > > every +	 * SWAP_CLUSTER_MAX prefetched_pages.
> > > > >
> > > > > nr_running() is similarly expensive btw.
> > > >
> > > > Yes which is why I do it just as infrequently as get_page_state.
> > > >
> > > > > > 	 * We also test if we're the only
> > > > > > +	 * task running anywhere. We want to have as little impact on
> > > > > > all +	 * resources (cpu, disk, bus etc). As this iterates over
> > > > > > every cpu +	 * we measure this infrequently.
> > > > > > +	 */
> > > > > > +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > > +		unsigned long cpuload = nr_running();
> > > > > > +
> > > > > > +		if (cpuload > 1)
> > > > > > +			goto out;
> > > > >
> > > > > Sorry, this is just wrong.  If swap prefetch is useful then it's
> > > > > also useful if some task happens to be sitting over in the corner
> > > > > calculating pi.
> > > > >
> > > > > What's the actual problem here?  Someone's 3d game went blippy? 
> > > > > Why? How much?  Are we missing a cond_resched()?
> > > >
> > > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due
> > > > to it. This tends to have noticeable effects everywhere on HT or SMP.
> > > > On UP the yielding helped it but even then it still causes blips. How
> > > > much? Well to be honest it's noticeable a shipload. Running a game,
> > > > any game, that uses 100% (and most fancy games do) causes stuttering
> > > > on audio, pauses and so on. This is evident on linux native games,
> > > > games under emulators or qemu and so on. That iowait really hurts,
> > > > and tweaking just priority doesn't help it in any way.
> > >
> > > That doesn't really make sense to me.  If a task can trigger audio
> > > dropout and stalls by sleeping, we have a serious problem.  In your
> > > SMP/HT case, I'd start crawling over the load balancing code.  I can't
> > > see how trivial CPU with non-saturated IO can cause dropout in the UP
> > > case either.  Am I missing something?
> >
> > Clearly you, me and everyone else is missing something. I see it with
> > each task bound to one cpu with cpu affinity so it's not a balancing
> > issue. Try it yourself if you can instead of not believing me. Get a big
> > dd reader (virtually no cpu and all io wait sleep) on one cpu and try and
> > play a game on the other cpu. It dies rectally.
>
> I said it didn't make sense to me, not that I didn't believe you.  If I
> had a real SMP box, I would look into it, but all I have is HT.

No doubt it would be better on an SMP box. The norm is, however, for all these 
multi-core, multi-threading cpus to be more common than real SMP and they all 
share varying amounts of their resources.

> If you're creating a lot of traffic, I can see it causing problems.  I
> was under the impression that you were doing minimal IO and absolutely
> trivial CPU.  That's what didn't make sense to me to be clear.

A lot of cpu would be easier to handle; it's using absolutely miniscule 
amounts of cpu. The IO is massive though (and seeky in nature), and reading 
from a swap partition seems particularly expensive in this regard.

Cheers,
Con

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  7:20             ` Con Kolivas
@ 2006-03-11  7:44               ` Mike Galbraith
  2006-03-11  8:16                 ` Nick Piggin
  0 siblings, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2006-03-11  7:44 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Andrew Morton, linux-kernel, ck

On Sat, 2006-03-11 at 18:20 +1100, Con Kolivas wrote:
> On Saturday 11 March 2006 17:05, Mike Galbraith wrote:
> > On Sat, 2006-03-11 at 07:00 +0100, Mike Galbraith wrote:
> > > On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
> > > > On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > > > > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > > > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > > > > Con Kolivas <kernel@kolivas.org> wrote:
> > > > > > > > +	/*
> > > > > > > > +	 * get_page_state is super expensive so we only perform it
> > > > > > > > every +	 * SWAP_CLUSTER_MAX prefetched_pages.
> > > > > > >
> > > > > > > nr_running() is similarly expensive btw.
> > > > > >
> > > > > > Yes which is why I do it just as infrequently as get_page_state.
> > > > > >
> > > > > > > > 	 * We also test if we're the only
> > > > > > > > +	 * task running anywhere. We want to have as little impact on
> > > > > > > > all +	 * resources (cpu, disk, bus etc). As this iterates over
> > > > > > > > every cpu +	 * we measure this infrequently.
> > > > > > > > +	 */
> > > > > > > > +	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > > > > +		unsigned long cpuload = nr_running();
> > > > > > > > +
> > > > > > > > +		if (cpuload > 1)
> > > > > > > > +			goto out;
> > > > > > >
> > > > > > > Sorry, this is just wrong.  If swap prefetch is useful then it's
> > > > > > > also useful if some task happens to be sitting over in the corner
> > > > > > > calculating pi.
> > > > > > >
> > > > > > > What's the actual problem here?  Someone's 3d game went blippy? 
> > > > > > > Why? How much?  Are we missing a cond_resched()?
> > > > > >
> > > > > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > > > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due
> > > > > > to it. This tends to have noticeable effects everywhere on HT or
> > > > > > SMP. On UP the yielding helped it but even then it still causes
> > > > > > blips. How much? Well to be honest it's noticeable a shipload.
> > > > > > Running a game, any game, that uses 100% (and most fancy games do)
> > > > > > causes stuttering on audio, pauses and so on. This is evident on
> > > > > > linux native games, games under emulators or qemu and so on. That
> > > > > > iowait really hurts, and tweaking just priority doesn't help it in
> > > > > > any way.
> > > > >
> > > > > That doesn't really make sense to me.  If a task can trigger audio
> > > > > dropout and stalls by sleeping, we have a serious problem.  In your
> > > > > SMP/HT case, I'd start crawling over the load balancing code.  I
> > > > > can't see how trivial CPU with non-saturated IO can cause dropout in
> > > > > the UP case either.  Am I missing something?
> > > >
> > > > Clearly you, me and everyone else is missing something. I see it with
> > > > each task bound to one cpu with cpu affinity so it's not a balancing
> > > > issue. Try it yourself if you can instead of not believing me. Get a
> > > > big dd reader (virtually no cpu and all io wait sleep) on one cpu and
> > > > try and play a game on the other cpu. It dies rectally.
> > >
> > > I said it didn't make sense to me, not that I didn't believe you.  If I
> > > had a real SMP box, I would look into it, but all I have is HT.
> > >
> > > If you're creating a lot of traffic, I can see it causing problems.  I
> > > was under the impression that you were doing minimal IO and absolutely
> > > trivial CPU.  That's what didn't make sense to me to be clear.
> 
> > P.S.  If it's hefty IO, it makes sense, and having the ability to do PIO
> > instead of DMA would probably help.
> 
> That would probably be worse, because then it would use much more cpu in the 
> form of kernel context time and not be attributed to kprefetchd at all. 
> Anyway this is clearly not a workaround (yes I do know you weren't promoting 
> it as such).

Substitute PIO with trickle mode IO, which we don't have, AFAIK.
Point was, if it's hefty IO, the problem is likely DMA, so what you'd
need to do is prevent the IO from being consolidated into mondo blocks
of DMA==bus contention.  Doing that via yield or whatever would be the
wrong approach to the problem.

	-Mike


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  7:24           ` Con Kolivas
@ 2006-03-11  7:51             ` Mike Galbraith
  2006-03-11  8:15               ` Con Kolivas
  2006-03-12  4:54               ` Lee Revell
  0 siblings, 2 replies; 32+ messages in thread
From: Mike Galbraith @ 2006-03-11  7:51 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Andrew Morton, linux-kernel, ck

On Sat, 2006-03-11 at 18:24 +1100, Con Kolivas wrote:
> On Saturday 11 March 2006 17:00, Mike Galbraith wrote:

> > If you're creating a lot of traffic, I can see it causing problems.  I
> > was under the impression that you were doing minimal IO and absolutely
> > trivial CPU.  That's what didn't make sense to me to be clear.
> 
> A lot of cpu would be easier to handle; it's using absolutely miniscule 
> amounts of cpu. The IO is massive though (and seeky in nature), and reading 
> from a swap partition seems particularly expensive in this regard.

There used to be a pages in flight 'restrictor plate' in there that
would have probably helped this situation at least a little.  But in any
case, it sounds like you'll have to find a way to submit the IO in itty
bitty synchronous pieces.

	-Mike


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  7:51             ` Mike Galbraith
@ 2006-03-11  8:15               ` Con Kolivas
  2006-03-12  4:54               ` Lee Revell
  1 sibling, 0 replies; 32+ messages in thread
From: Con Kolivas @ 2006-03-11  8:15 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Andrew Morton, linux-kernel, ck

On Saturday 11 March 2006 18:51, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 18:24 +1100, Con Kolivas wrote:
> > On Saturday 11 March 2006 17:00, Mike Galbraith wrote:
> > > If you're creating a lot of traffic, I can see it causing problems.  I
> > > was under the impression that you were doing minimal IO and absolutely
> > > trivial CPU.  That's what didn't make sense to me to be clear.
> >
> > A lot of cpu would be easier to handle; it's using absolutely miniscule
> > amounts of cpu. The IO is massive though (and seeky in nature), and
> > reading from a swap partition seems particularly expensive in this
> > regard.
>
> There used to be a pages in flight 'restrictor plate' in there that
> would have probably helped this situation at least a little.  But in any
> case, it sounds like you'll have to find a way to submit the IO in itty
> bitty synchronous pieces.

Well the original code used to have an heuristic to decide how much to 
prefetch at a time. It was considered opaque so I removed it. It made the 
amount to prefetch proportional to amount of ram which is wrong of course 
because it should depend more on swap partition read speed vs bus bandwidth 
or something. 

This way of deciding based on cpu load works anyway but yet again seems 
unpopular.

Cheers,
Con

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  7:44               ` Mike Galbraith
@ 2006-03-11  8:16                 ` Nick Piggin
  2006-03-11  8:22                   ` Mike Galbraith
  0 siblings, 1 reply; 32+ messages in thread
From: Nick Piggin @ 2006-03-11  8:16 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Con Kolivas, Andrew Morton, linux-kernel, ck

Mike Galbraith wrote:
> On Sat, 2006-03-11 at 18:20 +1100, Con Kolivas wrote:
>>On Saturday 11 March 2006 17:05, Mike Galbraith wrote:
>>>On Sat, 2006-03-11 at 07:00 +0100, Mike Galbraith wrote:
>>>>On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
>>>>>On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
>>>>>>On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
>>>>>>>On Saturday 11 March 2006 09:35, Andrew Morton wrote:
>>>>>>>>Con Kolivas <kernel@kolivas.org> wrote:

So... you guys ever think about trimming this? Not only would
it be faster to read, you can save the list server about 15MB
worth of email a pop with just a small haircut.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  8:16                 ` Nick Piggin
@ 2006-03-11  8:22                   ` Mike Galbraith
  0 siblings, 0 replies; 32+ messages in thread
From: Mike Galbraith @ 2006-03-11  8:22 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Con Kolivas, Andrew Morton, linux-kernel, ck

On Sat, 2006-03-11 at 19:16 +1100, Nick Piggin wrote:
> Mike Galbraith wrote:
> > On Sat, 2006-03-11 at 18:20 +1100, Con Kolivas wrote:
> >>On Saturday 11 March 2006 17:05, Mike Galbraith wrote:
> >>>On Sat, 2006-03-11 at 07:00 +0100, Mike Galbraith wrote:
> >>>>On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
> >>>>>On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> >>>>>>On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> >>>>>>>On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> >>>>>>>>Con Kolivas <kernel@kolivas.org> wrote:
> 
> So... you guys ever think about trimming this? Not only would
> it be faster to read, you can save the list server about 15MB
> worth of email a pop with just a small haircut.
> 

Sorry, was doing too many things at once to notice.  I think we're about
done yacking anyway.

	-Mike


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-11  7:51             ` Mike Galbraith
  2006-03-11  8:15               ` Con Kolivas
@ 2006-03-12  4:54               ` Lee Revell
  2006-03-12  5:27                 ` Mike Galbraith
  2006-03-14  6:40                 ` Mike Galbraith
  1 sibling, 2 replies; 32+ messages in thread
From: Lee Revell @ 2006-03-12  4:54 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Con Kolivas, Andrew Morton, linux-kernel, ck

On Sat, 2006-03-11 at 08:51 +0100, Mike Galbraith wrote:
> There used to be a pages in flight 'restrictor plate' in there that
> would have probably helped this situation at least a little.  But in
> any case, it sounds like you'll have to find a way to submit the IO in
> itty bitty synchronous pieces. 

echo 64 > /sys/block/hd*/queue/max_sectors_kb

There is basically a straight linear relation between whatever you set
this to and the maximum scheduling latency you see.  It was developed to
solve the exact problem you are describing.

Lee


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-12  4:54               ` Lee Revell
@ 2006-03-12  5:27                 ` Mike Galbraith
  2006-03-12  8:36                   ` Con Kolivas
  2006-03-14  6:40                 ` Mike Galbraith
  1 sibling, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2006-03-12  5:27 UTC (permalink / raw)
  To: Lee Revell; +Cc: Con Kolivas, Andrew Morton, linux-kernel, ck

On Sat, 2006-03-11 at 23:54 -0500, Lee Revell wrote:
> On Sat, 2006-03-11 at 08:51 +0100, Mike Galbraith wrote:
> > There used to be a pages in flight 'restrictor plate' in there that
> > would have probably helped this situation at least a little.  But in
> > any case, it sounds like you'll have to find a way to submit the IO in
> > itty bitty synchronous pieces. 
> 
> echo 64 > /sys/block/hd*/queue/max_sectors_kb
> 
> There is basically a straight linear relation between whatever you set
> this to and the maximum scheduling latency you see.  It was developed to
> solve the exact problem you are describing.

Ah, a very useful bit of information, thanks.

It won't help Con though, because he'll be dealing with every possible
configuration.  I think he's going to have to either submit, wait,
bandwidth limiting sleep, repeat or something clever that does that.
Even with bandwidth restriction though, seek still bites mightily, so I
suspect he's stuck with little trickles of IO started when we'd
otherwise be idle.  We'll see I suppose.

	-Mike


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-12  5:27                 ` Mike Galbraith
@ 2006-03-12  8:36                   ` Con Kolivas
  0 siblings, 0 replies; 32+ messages in thread
From: Con Kolivas @ 2006-03-12  8:36 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Lee Revell, Andrew Morton, linux-kernel, ck

On Sunday 12 March 2006 16:27, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 23:54 -0500, Lee Revell wrote:
> > echo 64 > /sys/block/hd*/queue/max_sectors_kb
> >
> > There is basically a straight linear relation between whatever you set
> > this to and the maximum scheduling latency you see.  It was developed to
> > solve the exact problem you are describing.
>
> Ah, a very useful bit of information, thanks.
>
> It won't help Con though, because he'll be dealing with every possible
> configuration.  I think he's going to have to either submit, wait,
> bandwidth limiting sleep, repeat or something clever that does that.
> Even with bandwidth restriction though, seek still bites mightily, so I
> suspect he's stuck with little trickles of IO started when we'd
> otherwise be idle.  We'll see I suppose.

What I'm doing with that last patch works fine - don't prefetch if anything 
else is running. Prefetching is not a performance critical function and we 
cannot know what tasks are scheduling latency sensitive. With that latest 
patch the most expensive thing is doing nr_running(). Assuming anything is 
running, it only needs to do that once every 5 seconds - and only after 
something is in swap. Furthermore it doesn't do it if swap prefetch is 
disabled with the tunable. I don't think this is an expensive operation in 
that context and certainly avoids any problems with it. 

I could hack in a weighted load variant of it so that prefetch does run when 
only nice 19 tasks are running on top of it so that perhaps low priority 
compiles, distributed computing clients et al don't prevent prefetching from 
happening - I could do this on top of the current patch. I'd like to see that 
last patch go in. Does anyone have another alternative? 

Cheers,
Con

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-12  4:54               ` Lee Revell
  2006-03-12  5:27                 ` Mike Galbraith
@ 2006-03-14  6:40                 ` Mike Galbraith
  2006-03-14  6:50                   ` Lee Revell
  1 sibling, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2006-03-14  6:40 UTC (permalink / raw)
  To: Lee Revell; +Cc: Con Kolivas, Andrew Morton, linux-kernel, ck

On Sat, 2006-03-11 at 23:54 -0500, Lee Revell wrote:
> On Sat, 2006-03-11 at 08:51 +0100, Mike Galbraith wrote:
> > There used to be a pages in flight 'restrictor plate' in there that
> > would have probably helped this situation at least a little.  But in
> > any case, it sounds like you'll have to find a way to submit the IO in
> > itty bitty synchronous pieces. 
> 
> echo 64 > /sys/block/hd*/queue/max_sectors_kb
> 
> There is basically a straight linear relation between whatever you set
> this to and the maximum scheduling latency you see.  It was developed to
> solve the exact problem you are describing.

<head-scratching>

Is it possible that you mean pci latency?  I'm unable to measure any
scheduling latency > 5ms while pushing IO for all my little Barracuda
disk is worth.  I _can_ generate mp3 player audio dropout though,
despite mp3 files living on a separate drive/controller.

	-Mike


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-14  6:40                 ` Mike Galbraith
@ 2006-03-14  6:50                   ` Lee Revell
  2006-03-14  7:06                     ` Mike Galbraith
  2006-03-14  8:05                     ` [ck] " Jens Axboe
  0 siblings, 2 replies; 32+ messages in thread
From: Lee Revell @ 2006-03-14  6:50 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Con Kolivas, Andrew Morton, linux-kernel, ck

On Tue, 2006-03-14 at 07:40 +0100, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 23:54 -0500, Lee Revell wrote:
> > On Sat, 2006-03-11 at 08:51 +0100, Mike Galbraith wrote:
> > > There used to be a pages in flight 'restrictor plate' in there that
> > > would have probably helped this situation at least a little.  But in
> > > any case, it sounds like you'll have to find a way to submit the IO in
> > > itty bitty synchronous pieces. 
> > 
> > echo 64 > /sys/block/hd*/queue/max_sectors_kb
> > 
> > There is basically a straight linear relation between whatever you set
> > this to and the maximum scheduling latency you see.  It was developed to
> > solve the exact problem you are describing.
> 
> <head-scratching>
> 
> Is it possible that you mean pci latency?  I'm unable to measure any
> scheduling latency > 5ms while pushing IO for all my little Barracuda
> disk is worth.

It's only a big problem if LBA48 is in use which allows 32MB of IO to be
in flight at once, this depends on the size of the drive.

What does that value default to?

>   I _can_ generate mp3 player audio dropout though,
> despite mp3 files living on a separate drive/controller.
> 

Does this go away if you run the mp3 player at nice -20?

> 	-Mike
> 
> 


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-14  6:50                   ` Lee Revell
@ 2006-03-14  7:06                     ` Mike Galbraith
  2006-03-14  8:44                       ` Mike Galbraith
  2006-03-14  8:05                     ` [ck] " Jens Axboe
  1 sibling, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2006-03-14  7:06 UTC (permalink / raw)
  To: Lee Revell; +Cc: Con Kolivas, Andrew Morton, linux-kernel, ck

On Tue, 2006-03-14 at 01:50 -0500, Lee Revell wrote:
> On Tue, 2006-03-14 at 07:40 +0100, Mike Galbraith wrote:
> >> > 
> > > echo 64 > /sys/block/hd*/queue/max_sectors_kb
> > > 
> > > There is basically a straight linear relation between whatever you set
> > > this to and the maximum scheduling latency you see.  It was developed to
> > > solve the exact problem you are describing.
> > 
> > <head-scratching>
> > 
> > Is it possible that you mean pci latency?  I'm unable to measure any
> > scheduling latency > 5ms while pushing IO for all my little Barracuda
> > disk is worth.
> 
> It's only a big problem if LBA48 is in use which allows 32MB of IO to be
> in flight at once, this depends on the size of the drive.

This is a 120G drive.

> 
> What does that value default to?

512.

> >   I _can_ generate mp3 player audio dropout though,
> > despite mp3 files living on a separate drive/controller.
> > 
> 
> Does this go away if you run the mp3 player at nice -20?

Nope.

	-Mike


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [ck] Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-14  6:50                   ` Lee Revell
  2006-03-14  7:06                     ` Mike Galbraith
@ 2006-03-14  8:05                     ` Jens Axboe
  1 sibling, 0 replies; 32+ messages in thread
From: Jens Axboe @ 2006-03-14  8:05 UTC (permalink / raw)
  To: Lee Revell; +Cc: Mike Galbraith, Andrew Morton, ck, linux-kernel

On Tue, Mar 14 2006, Lee Revell wrote:
> On Tue, 2006-03-14 at 07:40 +0100, Mike Galbraith wrote:
> > On Sat, 2006-03-11 at 23:54 -0500, Lee Revell wrote:
> > > On Sat, 2006-03-11 at 08:51 +0100, Mike Galbraith wrote:
> > > > There used to be a pages in flight 'restrictor plate' in there that
> > > > would have probably helped this situation at least a little.  But in
> > > > any case, it sounds like you'll have to find a way to submit the IO in
> > > > itty bitty synchronous pieces. 
> > > 
> > > echo 64 > /sys/block/hd*/queue/max_sectors_kb
> > > 
> > > There is basically a straight linear relation between whatever you set
> > > this to and the maximum scheduling latency you see.  It was developed to
> > > solve the exact problem you are describing.
> > 
> > <head-scratching>
> > 
> > Is it possible that you mean pci latency?  I'm unable to measure any
> > scheduling latency > 5ms while pushing IO for all my little Barracuda
> > disk is worth.
> 
> It's only a big problem if LBA48 is in use which allows 32MB of IO to be
> in flight at once, this depends on the size of the drive.
> 
> What does that value default to?

Not quite true. Even if lba48 is active on the drive, we don't allow
more than 1MB per request. And nit picking a little, lba48 doesn't
always depend on the size of the drive, some drives smaller than 2^28
sectors also feature lba48 support.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] mm: Implement swap prefetching tweaks
  2006-03-14  7:06                     ` Mike Galbraith
@ 2006-03-14  8:44                       ` Mike Galbraith
  0 siblings, 0 replies; 32+ messages in thread
From: Mike Galbraith @ 2006-03-14  8:44 UTC (permalink / raw)
  To: Lee Revell; +Cc: Con Kolivas, Andrew Morton, linux-kernel, ck

On Tue, 2006-03-14 at 08:06 +0100, Mike Galbraith wrote:
> On Tue, 2006-03-14 at 01:50 -0500, Lee Revell wrote:

> > Does this go away if you run the mp3 player at nice -20?
> 
> Nope.

But it does go away if I change from amarok to xmms, so amarok is
probably just not buffering quite enough.  OTOH, xmms seems to be picky
in other respects.  During heavy disk IO, it'll gripe about my soundcard
not being ready while switching songs, retry by poking the play button,
and all is fine.  Hohum.

Anyway, seems I can't reproduce the really bad stuff here, so no can
tinker with.


^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2006-03-14  8:43 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-03-10  9:54 [PATCH] mm: Implement swap prefetching tweaks Con Kolivas
2006-03-10 22:35 ` Andrew Morton
2006-03-10 23:11   ` Peter Williams
2006-03-11  4:18     ` Con Kolivas
2006-03-11  4:28       ` Peter Williams
2006-03-11  4:34         ` Con Kolivas
2006-03-11  5:34           ` Peter Williams
2006-03-11  5:04       ` [ck] " Radoslaw Szkodzinski
2006-03-11  5:21         ` Con Kolivas
2006-03-11  5:46       ` Peter Williams
2006-03-11  3:50   ` Con Kolivas
2006-03-11  5:33     ` Mike Galbraith
2006-03-11  5:50       ` Con Kolivas
2006-03-11  5:58         ` Con Kolivas
2006-03-11  6:11           ` Mike Galbraith
2006-03-11  6:00         ` Mike Galbraith
2006-03-11  6:05           ` Mike Galbraith
2006-03-11  7:20             ` Con Kolivas
2006-03-11  7:44               ` Mike Galbraith
2006-03-11  8:16                 ` Nick Piggin
2006-03-11  8:22                   ` Mike Galbraith
2006-03-11  7:24           ` Con Kolivas
2006-03-11  7:51             ` Mike Galbraith
2006-03-11  8:15               ` Con Kolivas
2006-03-12  4:54               ` Lee Revell
2006-03-12  5:27                 ` Mike Galbraith
2006-03-12  8:36                   ` Con Kolivas
2006-03-14  6:40                 ` Mike Galbraith
2006-03-14  6:50                   ` Lee Revell
2006-03-14  7:06                     ` Mike Galbraith
2006-03-14  8:44                       ` Mike Galbraith
2006-03-14  8:05                     ` [ck] " Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).