public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA
@ 2003-09-17 21:31 Jesse Barnes
  2003-09-18 15:16 ` Erich Focht
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: Jesse Barnes @ 2003-09-17 21:31 UTC (permalink / raw)
  To: linux-ia64

Here's an ugly little patch to make build_zonelists use the ACPI SLIT
table on ia64 if it's present.  Comments?  Should we have a generic
Linux distance table that we use for this?  That way people could
populate it at early boot and we could make this code work for all
platforms.

Btw, this patch sits on top of the last discontig patch I posted.

Thanks,
Jesse

diff -Nru a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
--- a/arch/ia64/kernel/acpi.c	Wed Sep 17 14:27:48 2003
+++ b/arch/ia64/kernel/acpi.c	Wed Sep 17 14:27:48 2003
@@ -342,7 +342,7 @@
 /* maps to convert between proximity domain and logical node ID */
 int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS];
 int __initdata nid_to_pxm_map[NR_NODES];
-static struct acpi_table_slit __initdata *slit_table;
+struct acpi_table_slit __initdata *slit_table;
 
 /*
  * ACPI 2.0 SLIT (System Locality Information Table)
diff -Nru a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
--- a/arch/ia64/mm/discontig.c	Wed Sep 17 14:27:48 2003
+++ b/arch/ia64/mm/discontig.c	Wed Sep 17 14:27:48 2003
@@ -249,6 +249,160 @@
 	}
 }
 
+#ifdef CONFIG_ACPI_NUMA
+
+/**
+ * sort_distance_array - sort a single row from the SLIT table
+ * @slit: copy of a row from the SLIT table
+ * @nodes: on exit, the sorted list of node numbers
+ * @size: size of @slit and @nodes
+ *
+ * Sorts the SLIT row by proximity domain, preferring proximity
+ * domains in order of their CPU count, from lowest to highest.
+ */
+static void __init
+sort_distance_array(unsigned int *slit, int *nodes, int size)
+{
+	unsigned int i, j, k, x, y;
+
+	/*
+	 * Initialize the nodes array and weight the SLIT values
+	 */
+	for (i = 0; i < size; i++)
+		nodes[i] = i;
+
+	for (i = 0; i < size - 1; i++) {
+		k = i;
+		
+		for (j = k + 1; j < size; j++) {
+			if (slit[j] < slit[k])
+				k = j;
+		}
+		
+		if (k != i) {
+			x = slit[k]; slit[k] = slit[i]; slit[i] = x;
+			y = nodes[k]; nodes[k] = nodes[i]; nodes[i] = y;
+		}
+	}
+}
+
+/*
+ * Since kmalloc isn't available yet... (even on a big system this
+ * won't be more than a few kilobytes and it'll get freed up later).
+ */
+static int pxm_by_distance[NR_NODES] __initdata;
+static int nodes_by_distance[NR_NODES] __initdata;
+
+/*
+ * build_zonelist_others - append to the zonelist of a given node
+ * @local_node: node whose zonelist we'll append
+ *
+ * Use the ACPI SLIT table to build a pretty good fallback zonelist
+ * for memory allocations.
+ *
+ * We have a number of potential options here, given the fact that
+ * some nodes may have CPUs disabled (and are thus probably under
+ * less allocation pressure than others).
+ *
+ * Should we _always_ allocate first from nodes without CPUs if we can't
+ * get memory on our local node?  How about nodes with only one CPU?
+ * Should they be preferred over nodes with two?  All else being equal,
+ * we want to at least allocate in concentric rings based on distance,
+ * which means we have to trust (and use!) the values in the SLIT table
+ * as a first step.
+ *
+ * A simple SLIT table describing the distances between three nodes:
+ *
+ *       0   1   2
+ *   0   0  10  20
+ *   1  10   0  10
+ *   2  20  10   0
+ *
+ */
+void __init
+build_zonelists(pg_data_t *pgdat)
+{
+	int 		i, j, k, n;
+	pg_data_t	*node;
+	struct zonelist	*zonelist;
+	struct zone	*zone;
+
+	/*
+	 * Copy the SLIT table row corresponding to local_node since
+	 * we don't want to modify the global copy.  We use an int
+	 * array to give us more flexibility to weight certain types
+	 * of nodes (e.g. nodes w/o CPUs).
+	 */
+	for (i = 0; i < numnodes; i++)
+		pxm_by_distance[i] = (int)slit_table->entry[numnodes*(pgdat->node_id)+i];
+
+	sort_distance_array(pxm_by_distance, nodes_by_distance, numnodes);
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+
+		zonelist = pgdat->node_zonelists + i;
+
+		/* find bottom of the list */
+		for (j = 0; zonelist->zones[j]; j++);
+
+		k = ZONE_NORMAL;
+		if (i & __GFP_HIGHMEM)
+			k = ZONE_HIGHMEM;
+		if (i & __GFP_DMA)
+			k = ZONE_DMA;
+
+		/*
+		 * Now we build the zonelist so that it contains the zones
+		 * of all the other nodes based on the sorting.
+		 */
+		for (n = 0 ; n < numnodes; n++) {
+			/*
+			 * Grab the pgdat struct from the next closest
+			 * node.
+			 */
+			node = NODE_DATA(nodes_by_distance[n]);
+
+			/*
+			 * Add the right zone to the end of the zonelist
+			 * of the local node.
+			 */
+			switch (k) {
+			default:
+				BUG();
+				/*
+				 * fallthrough:
+				 */
+			case ZONE_HIGHMEM:
+				zone = node->node_zones + ZONE_HIGHMEM;
+				if (zone->present_pages) {
+#ifndef CONFIG_HIGHMEM
+					BUG();
+#endif
+					zonelist->zones[j++] = zone;
+				}
+			case ZONE_NORMAL:
+				zone = node->node_zones + ZONE_NORMAL;
+				if (zone->present_pages)
+					zonelist->zones[j++] = zone;
+				
+			case ZONE_DMA:
+				zone = node->node_zones + ZONE_DMA;
+				if (zone->present_pages)
+					zonelist->zones[j++] = zone;
+			}
+		}
+		/* zonelist is NULL terminated */
+		zonelist->zones[j++] = NULL;
+	}
+#ifdef DISCONTIG_DEBUG
+	printk("Zonelist for node %d: ", pgdat->node_id);
+	for (i = 0; i < numnodes; i++)
+		printk("%d ", nodes_by_distance[i]);
+	printk("\n");
+#endif
+}
+
+#endif /* CONFIG_ACPI_NUMA */
 
 /*
  * Called early in boot to setup the boot memory allocator, and to
diff -Nru a/include/asm-ia64/acpi.h b/include/asm-ia64/acpi.h
--- a/include/asm-ia64/acpi.h	Wed Sep 17 14:27:48 2003
+++ b/include/asm-ia64/acpi.h	Wed Sep 17 14:27:48 2003
@@ -98,6 +98,7 @@
 #define MAX_PXM_DOMAINS (256)
 extern int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS];
 extern int __initdata nid_to_pxm_map[NR_NODES];
+extern struct acpi_table_slit *slit_table;
 #endif
 
 #endif /*__KERNEL__*/
diff -Nru a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
--- a/include/asm-ia64/pgtable.h	Wed Sep 17 14:27:48 2003
+++ b/include/asm-ia64/pgtable.h	Wed Sep 17 14:27:48 2003
@@ -435,11 +435,17 @@
 #ifdef CONFIG_DISCONTIGMEM
 extern void discontig_mem_init(void);
 extern void call_pernode_memory(unsigned long start, unsigned long end, void *arg);
+
+#ifdef CONFIG_ACPI_NUMA
+#define HAVE_ARCH_BUILD_ZONELISTS
+extern void build_zonelists(pg_data_t *pgdat);
+#endif /* CONFIG_ACPI_NUMA */
+
 #else
 extern unsigned long bootmap_start;
 extern int find_max_pfn(unsigned long start, unsigned long end, void *arg);
 extern int find_bootmap_location(unsigned long start, unsigned long end, void *arg);
-#endif
+#endif /* CONFIG_DISCONTIGMEM */
 
 /*
  * Note: The macros below rely on the fact that MAX_SWAPFILES_SHIFT <= number of
diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c	Wed Sep 17 14:27:48 2003
+++ b/mm/page_alloc.c	Wed Sep 17 14:27:48 2003
@@ -1017,6 +1017,8 @@
 	show_swap_cache_info();
 }
 
+#ifndef HAVE_ARCH_BUILD_ZONELISTS
+
 /*
  * Builds allocation fallback zone lists.
  */
@@ -1083,6 +1085,8 @@
 		zonelist->zones[j++] = NULL;
 	} 
 }
+
+#endif /* HAVE_ARCH_BUILD_ZONELISTS */
 
 void __init build_all_zonelists(void)
 {

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA
  2003-09-17 21:31 [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA Jesse Barnes
@ 2003-09-18 15:16 ` Erich Focht
  2003-09-18 15:38 ` Jesse Barnes
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Erich Focht @ 2003-09-18 15:16 UTC (permalink / raw)
  To: linux-ia64

Hi Jesse,

this kind of patch is a GOOD THING!

I just have an objection regarding the sort order. On my computers
(and on yours maybe too) I have matrices of the form:

10 15 15 15
15 10 15 15
15 15 10 15
15 15 15 10

Now just sorting the distance matrix row by row leads to the following
zonelists:
 for node 1:  1, 2, 3, 4
 for node 2:  2, 1, 3, 4
 for node 3:  3, 1, 2, 4
 for node 4:  4, 1, 2, 3

The first node in the list is fine and we'll get memory from the right
node if it is free. But if not, we'll request memory from the second
node in the zonelist and this will be (in most of the cases) node
1. Which means a pretty bad imbalance.

I'd prefer to see this more in a round-robin way, this would ease
things. The following piece of (ugly) code does this, but expects that
the existing values (in the example: 10 and 15) have been sorted into
the array node_levels[]. 

Just an idea...

Regards,
Erich

/* Distance between two nodes, read from the flattened ACPI SLIT copy.
 * NOTE(review): assumes acpi20_slit is a numnodes*numnodes row-major
 * array — confirm against its definition. Arguments parenthesized to
 * keep expansion safe for expression arguments. */
#define node_distance(from,to) (acpi20_slit[(from) * numnodes + (to)])

/*
 * permute_nodes - build a round-robin node fallback list for @curr
 * @curr:  node whose fallback order is being computed
 * @array: output; filled with node numbers, nearest level first
 *
 * Emits @curr first, then all nodes of each successive distance level
 * (node_levels[] is expected to hold the distinct SLIT distances in
 * ascending order, nr_node_levels entries).  Within a level the scan
 * starts at an offset derived from @curr so that different local nodes
 * interleave their fallback targets instead of all piling onto the
 * lowest-numbered remote node.
 *
 * Fixed versus the posted copy: the archive mangled every equality
 * test into an assignment (`=` for `==`); `dist = a || dist = b` does
 * not even parse, so the intent was clearly `==` throughout.
 */
static void __init
permute_nodes(int curr, int *array)
{
	int lev, perm, node, dist = 0, minown, nodes = 0;

	/* The local node is always the first allocation target. */
	array[nodes++] = curr;
	if (nr_node_levels == 1)
		return;

	/* Find the first node at one of the two nearest distance levels;
	 * its index anchors the rotation offset computed below. */
	for (node = 0; node < numnodes; node++) {
		dist = node_distance(curr, node);
		if (dist == node_levels[1] || dist == node_levels[0])
			break;
	}
	minown = node;
	dist = 0;

	for (lev = 1; lev < nr_node_levels; lev++) {
		if (lev > 1) {
			/* Rotate the starting point within this level so
			 * consecutive local nodes fall back to different
			 * remote nodes (round-robin across callers). */
			for (perm = 1; perm < numnodes; perm++) {
				node = (curr + perm) % numnodes;
				if (node_distance(curr, node) == node_levels[lev])
					break;
			}
			dist = perm + curr - minown;
		}
		/* Append every node whose distance matches this level. */
		for (perm = 0; perm < numnodes; perm++) {
			node = (curr + perm + dist) % numnodes;
			if (node_distance(curr, node) == node_levels[lev])
				array[nodes++] = node;
		}
	}
}



On Wednesday 17 September 2003 23:31, Jesse Barnes wrote:
> Here's a ugly little patch to make build_zonelists use the ACPI SLIT
> table on ia64 if it's present.  Comments?  Should we have a generic
> Linux distance table that we use for this?  That way people could
> populate it at early boot and we could make this code work for all
> platforms.
>
> Btw, this patch sits on top of the last discontig patch I posted.
>
> Thanks,
> Jesse
...
> +sort_distance_array(unsigned int *slit, int *nodes, int size)
> +{
> +	unsigned int i, j, k, x, y;
> +
> +	/*
> +	 * Initialize the nodes array and weight the SLIT values
> +	 */
> +	for (i = 0; i < size; i++)
> +		nodes[i] = i;
> +
> +	for (i = 0; i < size - 1; i++) {
> +		k = i;
> +
> +		for (j = k + 1; j < size; j++) {
> +			if (slit[j] < slit[k])
> +				k = j;
> +		}
> +
> +		if (k != i) {
> +			x = slit[k]; slit[k] = slit[i]; slit[i] = x;
> +			y = nodes[k]; nodes[k] = nodes[i]; nodes[i] = y;
> +		}
> +	}
> +}



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA
  2003-09-17 21:31 [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA Jesse Barnes
  2003-09-18 15:16 ` Erich Focht
@ 2003-09-18 15:38 ` Jesse Barnes
  2003-09-18 16:49 ` Erich Focht
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Jesse Barnes @ 2003-09-18 15:38 UTC (permalink / raw)
  To: linux-ia64

On Thu, Sep 18, 2003 at 05:16:33PM +0200, Erich Focht wrote:
> this kind of patch is a GOOD THING!

Glad you approve! ;)

> I just have an objection regarding the sort order. On my computers
> (and on yours maybe too) I have matrices of the form:
> 
> 10 15 15 15
> 15 10 15 15
> 15 15 10 15
> 15 15 15 10
> 
> Now just sorting the distance matrix row by row leads to the following
> zonelists:
>  for node 1:  1, 2, 3, 4
>  for node 2:  2, 1, 3, 4
>  for node 3:  3, 1, 2, 4
>  for node 4:  4, 1, 2, 3
> 
> The first node in the list is fine and we'll get memory from the right
> node if it is free. But if not, we'll request memory from the second
> node in the zonelist and this will be (in most of the cases) node
> 1. Which means a pretty bad imbalance.

Yeah, that's a good point.  We should fix that.

> I'd prefer to see this more in a round-robin way, this would ease
> things. The following piece of (ugly) code does this, but expects that
> the existing values (in the example: 10 and 15) have been sorted into
> the array node_levels[]. 

Sounds good.

> #define node_distance(from,to) (acpi20_slit[from * numnodes + to])

I should use this in my code, I think it'll make it more readable :).

> static void __init
> permute_nodes(int curr, int *array)
> {
> 	int lev, perm, node, dist=0, minown, nodes=0;
> 
> 	array[nodes++] = curr;
> 	if (nr_node_levels = 1) return;
> 	for (node=0; node < numnodes; node++) {
> 		dist = node_distance(curr,node);
> 		if (dist = node_levels[1] || dist = node_levels[0])
> 			break;
> 	}
> 	minown = node;
> 	dist=0;
> 	for (lev=1; lev < nr_node_levels; lev++) {
> 		if (lev > 1) {
> 			for (perm=1; perm < numnodes; perm++) {
> 				node = (curr + perm) % numnodes;
> 				if (node_distance(curr, node) = node_levels[lev])
> 					break;
> 			}
> 			dist = perm + curr - minown;
> 		}
> 		for (perm=0; perm < numnodes; perm++) {
> 			node = (curr + perm + dist) % numnodes;
> 			if (node_distance(curr, node) = node_levels[lev])
> 				array[nodes++] = node;
> 		}
> 	}
> }

I'll look at integrating something like this into my patch and reposting
it.

Thanks,
Jesse

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA
  2003-09-17 21:31 [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA Jesse Barnes
  2003-09-18 15:16 ` Erich Focht
  2003-09-18 15:38 ` Jesse Barnes
@ 2003-09-18 16:49 ` Erich Focht
  2003-09-18 19:59 ` Jesse Barnes
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Erich Focht @ 2003-09-18 16:49 UTC (permalink / raw)
  To: linux-ia64

On Thursday 18 September 2003 17:38, Jesse Barnes wrote:
> On Thu, Sep 18, 2003 at 05:16:33PM +0200, Erich Focht wrote:
> I'll look at integrating something like this into my patch and reposting
> it.

Great, thanks! (I hoped that you would do the work ;-)

Regards,
Erich



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA
  2003-09-17 21:31 [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA Jesse Barnes
                   ` (2 preceding siblings ...)
  2003-09-18 16:49 ` Erich Focht
@ 2003-09-18 19:59 ` Jesse Barnes
  2003-09-18 20:04 ` Jesse Barnes
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Jesse Barnes @ 2003-09-18 19:59 UTC (permalink / raw)
  To: linux-ia64

On Thu, Sep 18, 2003 at 08:38:32AM -0700, Jesse Barnes wrote:
> On Thu, Sep 18, 2003 at 05:16:33PM +0200, Erich Focht wrote:
> > The first node in the list is fine and we'll get memory from the right
> > node if it is free. But if not, we'll request memory from the second
> > node in the zonelist and this will be (in most of the cases) node
> > 1. Which means a pretty bad imbalance.
> 
> Yeah, that's a good point.  We should fix that.

On second thought, here's the output from a test machine here.  I think
it's working correctly.

...
ACPI 2.0 SLIT locality table:
010 020 022 042
020 010 042 022
022 042 010 020
042 022 020 010
...
Zonelist for node 0: 0 1 2 3
Zonelist for node 1: 1 0 3 2
Zonelist for node 2: 2 3 0 1
Zonelist for node 3: 3 2 1 0
...

So the fallback for nodes 0 and 1 (which are in the same proximity
domain) rotates the distant proximity domain correctly.  Same for nodes
2 and 3.

As opposed to:
...
Zonelist for node 0: 0 1 2 3
Zonelist for node 1: 1 0 2 3
Zonelist for node 2: 2 3 0 1
Zonelist for node 3: 3 2 0 1
...

Which I think you were worried about?

Thanks,
Jesse

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA
  2003-09-17 21:31 [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA Jesse Barnes
                   ` (3 preceding siblings ...)
  2003-09-18 19:59 ` Jesse Barnes
@ 2003-09-18 20:04 ` Jesse Barnes
  2003-09-19 16:56 ` Erich Focht
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Jesse Barnes @ 2003-09-18 20:04 UTC (permalink / raw)
  To: linux-ia64

On Thu, Sep 18, 2003 at 12:59:08PM -0700, Jesse Barnes wrote:
> On Thu, Sep 18, 2003 at 08:38:32AM -0700, Jesse Barnes wrote:
> > On Thu, Sep 18, 2003 at 05:16:33PM +0200, Erich Focht wrote:
> > > The first node in the list is fine and we'll get memory from the right
> > > node if it is free. But if not, we'll request memory from the second
> > > node in the zonelist and this will be (in most of the cases) node
> > > 1. Which means a pretty bad imbalance.
> > 
> > Yeah, that's a good point.  We should fix that.
> 
> On second thought, here's the output from a test machine here.  I think
> it's working correctly.
> 
> ...
> ACPI 2.0 SLIT locality table:
> 010 020 022 042
> 020 010 042 022
> 022 042 010 020
> 042 022 020 010
> ...
> Zonelist for node 0: 0 1 2 3
> Zonelist for node 1: 1 0 3 2
> Zonelist for node 2: 2 3 0 1
> Zonelist for node 3: 3 2 1 0
> ...
> 
> So the fallback for nodes 0 and 1 (which are in the same proximity
> domain) rotates the distant proximity domain correctly.  Same for nodes
> 2 and 3.

On third thought, I think I need to test this on a bigger machine.  I
think our machines have to get pretty big before they have nodes in the
same proximity domain (I was wrong above).   That said, I think we can
fix the problem in the sort_distance_array() routine--when we scan the
slit we should look for duplicate entries following the current node and
increment them each in succession.

Thanks,
Jesse

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA
  2003-09-17 21:31 [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA Jesse Barnes
                   ` (4 preceding siblings ...)
  2003-09-18 20:04 ` Jesse Barnes
@ 2003-09-19 16:56 ` Erich Focht
  2003-09-19 17:07 ` Jesse Barnes
  2003-09-19 17:49 ` Paul Jackson
  7 siblings, 0 replies; 9+ messages in thread
From: Erich Focht @ 2003-09-19 16:56 UTC (permalink / raw)
  To: linux-ia64

On Thursday 18 September 2003 21:59, Jesse Barnes wrote:
> On second thought, here's the output from a test machine here.  I think
> it's working correctly.
>
> ...
> ACPI 2.0 SLIT locality table:
> 010 020 022 042
> 020 010 042 022
> 022 042 010 020
> 042 022 020 010

That unsymmetric??? Interesting... Of course this saves you the
headache of explicit round-robin.

> Which I think you were worried about?

I'm still worried about the "normal" kind of machines with more
symmetric node-distances. TX-7 for example has

10 15 15 15 20 20 20 20
15 10 15 15 20 20 20 20
15 15 10 15 20 20 20 20
15 15 15 10 20 20 20 20
20 20 20 20 10 15 15 15
20 20 20 20 15 10 15 15
20 20 20 20 15 15 10 15
20 20 20 20 15 15 15 10

I'm fine with sorting if we get some sort of round-robin at least over
the first off-diagonal node level. But the code I've sent you was the
best idea I had (in the limited time I could spend for this...).

Regards,
Erich



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA
  2003-09-17 21:31 [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA Jesse Barnes
                   ` (5 preceding siblings ...)
  2003-09-19 16:56 ` Erich Focht
@ 2003-09-19 17:07 ` Jesse Barnes
  2003-09-19 17:49 ` Paul Jackson
  7 siblings, 0 replies; 9+ messages in thread
From: Jesse Barnes @ 2003-09-19 17:07 UTC (permalink / raw)
  To: linux-ia64

On Fri, Sep 19, 2003 at 06:56:21PM +0200, Erich Focht wrote:
> On Thursday 18 September 2003 21:59, Jesse Barnes wrote:
> > ACPI 2.0 SLIT locality table:
> > 010 020 022 042
> > 020 010 042 022
> > 022 042 010 020
> > 042 022 020 010
> 
> That unsymmetric??? Interesting... Of course this saves you the
> headache of explicitely round-robin.

This is just a small system, only 8p, and I was wrong when I said that
we wouldn't have SLIT tables like the one below.  On a larger system
(above 8 nodes) we'll have 4 nodes in each proximity domain, so
round-robin would help us there.  The system above has a topology like
this:

  ------ fast ------
  |    |__v___|    |
  |  3 |      |  2 |
  ------      ------
    | <- slow -> | 
  ------      ------
  |    |______|    |
  |  1 |  ^   |  0 |
  ------ fast ------

Which explains the SLIT.  Of course, on a larger system we'd have the
slow links all connect to a router, which would mean we'd have multiple
nodes in the same domain.

> > Which I think you were worried about?
> 
> I'm still worried about the "normal" kind of machines with more
> symmetric node-distances. TX-7 for example has

Yeah, sorry.  I was smokin' something when I posted that message.  I
really do understand the issue and agree that it's a problem that we
should fix :)

> 10 15 15 15 20 20 20 20
> 15 10 15 15 20 20 20 20
> 15 15 10 15 20 20 20 20
> 15 15 15 10 20 20 20 20
> 20 20 20 20 10 15 15 15
> 20 20 20 20 15 10 15 15
> 20 20 20 20 15 15 10 15
> 20 20 20 20 15 15 15 10

Thanks,
Jesse

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA
  2003-09-17 21:31 [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA Jesse Barnes
                   ` (6 preceding siblings ...)
  2003-09-19 17:07 ` Jesse Barnes
@ 2003-09-19 17:49 ` Paul Jackson
  7 siblings, 0 replies; 9+ messages in thread
From: Paul Jackson @ 2003-09-19 17:49 UTC (permalink / raw)
  To: linux-ia64

Too bad we can't have user level code build the zone lists, and just
have the kernel use whatever zone list it is told to use.  Keep
mechanism in the kernel and policy outside, where possible.

Should we have a zone list per task, or per vma?  I can imagine cases
where an application might want to place some of its memory differently,
perhaps to get some pages near dma hardware or some other sharing user.

The kernel as well, in addition to tasks or vmas, should have lists
controlling memory placement, depending on which cpu is asking.

Essentially nothing is needed at boot, other than the ability to place
a couple of kernel threads on each cpu or node for specific reasons.

Anything fancier could be setup from initscripts, if we presume we
have enough locking (rcu?) in place to allow the rare change of the
kernel's, or some other process's (init pid=1?), zone lists.

Lists of lists, or 2D arrays of numbers, are a pain to get across the
kernel boundary.  Seems we are getting comfortable with getting such
_out_ of the kernel, using pseudo file system apis with small files
having trivial syntax (like a single ascii number).  I'd like to see us
pushing lists and arrays of numbers such as zone lists back _into_ the
kernel this way, writing to pseudo file systems.  Is that becoming an
acceptable mechanism (beats ioctls and the like ...)?

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2003-09-19 17:49 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2003-09-17 21:31 [PATCH] fix build_zonelists for CONFIG_ACPI_NUMA Jesse Barnes
2003-09-18 15:16 ` Erich Focht
2003-09-18 15:38 ` Jesse Barnes
2003-09-18 16:49 ` Erich Focht
2003-09-18 19:59 ` Jesse Barnes
2003-09-18 20:04 ` Jesse Barnes
2003-09-19 16:56 ` Erich Focht
2003-09-19 17:07 ` Jesse Barnes
2003-09-19 17:49 ` Paul Jackson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox