LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/7] Introduce mechanism for registering active regions of memory
From: Mel Gorman @ 2006-04-18 13:00 UTC (permalink / raw)
  To: davej, tony.luck, linux-mm, ak, bob.picco, linux-kernel,
	linuxppc-dev
  Cc: Mel Gorman
In-Reply-To: <20060418130015.28928.10163.sendpatchset@skynet>


This patch defines the structure to represent an active range of page
frames within a node in an architecture independent manner. Architectures
are expected to register active ranges of PFNs using add_active_range(nid,
start_pfn, end_pfn) and call free_area_init_nodes() passing the PFNs of
the end of each zone.


 include/linux/mm.h     |   18 ++
 include/linux/mmzone.h |   15 +
 mm/page_alloc.c        |  391 +++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 399 insertions(+), 25 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc1-clean/include/linux/mm.h linux-2.6.17-rc1-101-add_free_area_init_nodes/include/linux/mm.h
--- linux-2.6.17-rc1-clean/include/linux/mm.h	2006-04-03 04:22:10.000000000 +0100
+++ linux-2.6.17-rc1-101-add_free_area_init_nodes/include/linux/mm.h	2006-04-18 10:17:49.000000000 +0100
@@ -867,6 +867,24 @@ extern void free_area_init(unsigned long
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
 	unsigned long * zones_size, unsigned long zone_start_pfn, 
 	unsigned long *zholes_size);
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+extern void free_area_init_nodes(unsigned long max_dma_pfn,
+					unsigned long max_dma32_pfn,
+					unsigned long max_low_pfn,
+					unsigned long max_high_pfn);
+extern void add_active_range(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
+extern void get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn);
+extern unsigned long find_min_pfn_with_active_regions(void);
+extern unsigned long find_max_pfn_with_active_regions(void);
+extern int early_pfn_to_nid(unsigned long pfn);
+extern void free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn);
+extern void sparse_memory_present_with_active_regions(int nid);
+extern unsigned long absent_pages_in_range(unsigned long start_pfn,
+						unsigned long end_pfn);
+#endif
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
 extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc1-clean/include/linux/mmzone.h linux-2.6.17-rc1-101-add_free_area_init_nodes/include/linux/mmzone.h
--- linux-2.6.17-rc1-clean/include/linux/mmzone.h	2006-04-03 04:22:10.000000000 +0100
+++ linux-2.6.17-rc1-101-add_free_area_init_nodes/include/linux/mmzone.h	2006-04-18 10:17:49.000000000 +0100
@@ -271,6 +271,18 @@ struct zonelist {
 	struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
 };
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * This represents an active range of physical memory. Architectures register
+ * a pfn range using add_active_range() and later initialise the nodes and
+ * free list with free_area_init_nodes()
+ */
+struct node_active_region {
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+	int nid;
+};
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 /*
  * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
@@ -465,7 +477,8 @@ extern struct zone *next_zone(struct zon
 
 #endif
 
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
+	!defined(CONFIG_ARCH_POPULATES_NODE_MAP)
 #define early_pfn_to_nid(nid)  (0UL)
 #endif
 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc1-clean/mm/page_alloc.c linux-2.6.17-rc1-101-add_free_area_init_nodes/mm/page_alloc.c
--- linux-2.6.17-rc1-clean/mm/page_alloc.c	2006-04-03 04:22:10.000000000 +0100
+++ linux-2.6.17-rc1-101-add_free_area_init_nodes/mm/page_alloc.c	2006-04-18 10:17:49.000000000 +0100
@@ -37,6 +37,8 @@
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -84,6 +86,18 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+  #ifdef CONFIG_MAX_ACTIVE_REGIONS
+    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+  #else
+    #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
+  #endif
+
+  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -1743,25 +1757,6 @@ static inline unsigned long wait_table_b
 
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
-static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long realtotalpages, totalpages = 0;
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++)
-		totalpages += zones_size[i];
-	pgdat->node_spanned_pages = totalpages;
-
-	realtotalpages = totalpages;
-	if (zholes_size)
-		for (i = 0; i < MAX_NR_ZONES; i++)
-			realtotalpages -= zholes_size[i];
-	pgdat->node_present_pages = realtotalpages;
-	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
-}
-
-
 /*
  * Initially all pages are reserved - free ones are freed
  * up by free_all_bootmem() once the early boot process is
@@ -2048,6 +2043,221 @@ static __meminit void init_currently_emp
 	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
 }
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/* Note: nid == MAX_NUMNODES returns first region */
+static int __init first_active_region_index_in_nid(int nid)
+{
+	int i;
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+			return i;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+/* Note: nid == MAX_NUMNODES returns next region */
+static int __init next_active_region_index_in_nid(unsigned int index, int nid)
+{
+	for (index = index + 1; early_node_map[index].end_pfn; index++) {
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		unsigned long start_pfn = early_node_map[i].start_pfn;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+
+		if ((start_pfn <= pfn) && (pfn < end_pfn))
+			return early_node_map[i].nid;
+	}
+
+	return -1;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
+#define for_each_active_range_index_in_nid(i, nid) \
+	for (i = first_active_region_index_in_nid(nid); \
+				i != MAX_ACTIVE_REGIONS; \
+				i = next_active_region_index_in_nid(i, nid))
+
+void __init free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid) {
+		unsigned long size_pages = 0;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+		if (early_node_map[i].start_pfn >= max_low_pfn)
+			continue;
+
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
+
+		size_pages = end_pfn - early_node_map[i].start_pfn;
+		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+				PFN_PHYS(early_node_map[i].start_pfn),
+				PFN_PHYS(size_pages));
+	}
+}
+
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid)
+		memory_present(early_node_map[i].nid,
+				early_node_map[i].start_pfn,
+				early_node_map[i].end_pfn);
+}
+
+void __init get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	unsigned int i;
+	*start_pfn = -1UL;
+	*end_pfn = 0;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		if (early_node_map[i].start_pfn < *start_pfn)
+			*start_pfn = early_node_map[i].start_pfn;
+
+		if (early_node_map[i].end_pfn > *end_pfn)
+			*end_pfn = early_node_map[i].end_pfn;
+	}
+
+	if (*start_pfn == -1UL) {
+		printk(KERN_WARNING "Node %u active with no memory\n", nid);
+		*start_pfn = 0;
+	}
+}
+
+unsigned long __init zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	unsigned long node_start_pfn, node_end_pfn;
+	unsigned long zone_start_pfn, zone_end_pfn;
+
+	/* Get the start and end of the node and zone */
+	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+	/* Check that this node has pages within the zone's required range */
+	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+		return 0;
+
+	/* Move the zone boundaries inside the node if necessary */
+	if (zone_end_pfn > node_end_pfn)
+		zone_end_pfn = node_end_pfn;
+	if (zone_start_pfn < node_start_pfn)
+		zone_start_pfn = node_start_pfn;
+
+	/* Return the spanned pages */
+	return zone_end_pfn - zone_start_pfn;
+}
+
+/* Sum the hole pages between nid's active ranges inside the pfn range */
+unsigned long __init __absent_pages_in_range(int nid,
+				unsigned long range_start_pfn,
+				unsigned long range_end_pfn)
+{
+	int i = first_active_region_index_in_nid(nid);
+	unsigned long prev_end_pfn, start_pfn, hole_pages = 0;
+
+	/* No range registered for nid: do not index past early_node_map */
+	if (i == MAX_ACTIVE_REGIONS)
+		return 0;
+	prev_end_pfn = early_node_map[i].start_pfn;
+
+	/* Find all holes for the zone within the node */
+	for (; i != MAX_ACTIVE_REGIONS;
+			i = next_active_region_index_in_nid(i, nid)) {
+
+		/* No need to continue if prev_end_pfn is outside the zone */
+		if (prev_end_pfn >= range_end_pfn)
+			break;
+
+		/* Make sure the end of the zone is not within the hole */
+		start_pfn = early_node_map[i].start_pfn;
+		if (start_pfn > range_end_pfn)
+			start_pfn = range_end_pfn;
+		if (prev_end_pfn < range_start_pfn)
+			prev_end_pfn = range_start_pfn;
+
+		/* Update the hole size count and move on */
+		if (start_pfn > range_start_pfn) {
+			BUG_ON(prev_end_pfn > start_pfn);
+			hole_pages += start_pfn - prev_end_pfn;
+		}
+		prev_end_pfn = early_node_map[i].end_pfn;
+	}
+
+	return hole_pages;
+}
+
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+							unsigned long end_pfn)
+{
+	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+unsigned long __init zone_absent_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	return __absent_pages_in_range(nid,
+				arch_zone_lowest_possible_pfn[zone_type],
+				arch_zone_highest_possible_pfn[zone_type]);
+}
+#else
+static inline unsigned long zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *zones_size)
+{
+	return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+						unsigned long zone_type,
+						unsigned long *zholes_size)
+{
+	if (!zholes_size)
+		return 0;
+
+	return zholes_size[zone_type];
+}
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long realtotalpages, totalpages = 0;
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		totalpages += zone_present_pages_in_node(pgdat->node_id, i,
+								zones_size);
+	}
+	pgdat->node_spanned_pages = totalpages;
+
+	realtotalpages = totalpages;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		realtotalpages -=
+			zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
+	}
+	pgdat->node_present_pages = realtotalpages;
+	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+							realtotalpages);
+}
+
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -2070,10 +2280,9 @@ static void __init free_area_init_core(s
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize;
 
-		realsize = size = zones_size[j];
-		if (zholes_size)
-			realsize -= zholes_size[j];
-
+		size = zone_present_pages_in_node(nid, j, zones_size);
+		realsize = size - zone_absent_pages_in_node(nid, j,
+								zholes_size);
 		if (j < ZONE_HIGHMEM)
 			nr_kernel_pages += realsize;
 		nr_all_pages += realsize;
@@ -2140,13 +2349,147 @@ void __init free_area_init_node(int nid,
 {
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
-	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+	calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);
 
 	free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+						unsigned long end_pfn)
+{
+	unsigned int i;
+
+	/* Merge with existing active regions if possible */
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (early_node_map[i].nid != nid)
+			continue;
+
+		/* Skip if an existing region covers this new one */
+		if (start_pfn >= early_node_map[i].start_pfn &&
+				end_pfn <= early_node_map[i].end_pfn)
+			return;
+
+		/* Merge forward if suitable */
+		if (start_pfn <= early_node_map[i].end_pfn &&
+				end_pfn > early_node_map[i].end_pfn) {
+			early_node_map[i].end_pfn = end_pfn;
+			return;
+		}
+
+		/* Merge backward if suitable */
+		if (start_pfn < early_node_map[i].end_pfn &&
+				end_pfn >= early_node_map[i].start_pfn) {
+			early_node_map[i].start_pfn = start_pfn;
+			return;
+		}
+	}
+
+	/* Leave last entry NULL, we use range.end_pfn to terminate the walk */
+	if (i >= MAX_ACTIVE_REGIONS - 1) {
+		printk(KERN_ERR "Too many memory regions, truncating\n");
+		return;
+	}
+
+	early_node_map[i].nid = nid;
+	early_node_map[i].start_pfn = start_pfn;
+	early_node_map[i].end_pfn = end_pfn;
+}
+
+/* Compare two active node_active_regions */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+	struct node_active_region *arange = (struct node_active_region *)a;
+	struct node_active_region *brange = (struct node_active_region *)b;
+
+	/* Done this way to avoid overflows */
+	if (arange->start_pfn > brange->start_pfn)
+		return 1;
+	if (arange->start_pfn < brange->start_pfn)
+		return -1;
+
+	return 0;
+}
+
+/* sort the node_map by start_pfn */
+static void __init sort_node_map(void)
+{
+	size_t num = 0;
+	while (early_node_map[num].end_pfn)
+		num++;
+
+	sort(early_node_map, num, sizeof(struct node_active_region),
+						cmp_node_active_region, NULL);
+}
+
+/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+unsigned long __init find_min_pfn_for_node(unsigned long nid)
+{
+	int i;
+
+	/* Assuming a sorted map, the first range found has the starting pfn */
+	for_each_active_range_index_in_nid(i, nid)
+		return early_node_map[i].start_pfn;
+
+	/* nid does not exist in early_node_map */
+	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+	return 0;
+}
+
+
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+	return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+/* Return the highest end_pfn of any registered range, 0 if none exist */
+unsigned long __init find_max_pfn_with_active_regions(void)
+{
+	int i;
+	unsigned long max_pfn = 0;
+
+	for (i = 0; early_node_map[i].end_pfn; i++)
+		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+	return max_pfn;
+}
+
+void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
+				unsigned long arch_max_dma32_pfn,
+				unsigned long arch_max_low_pfn,
+				unsigned long arch_max_high_pfn)
+{
+	unsigned long nid;
+	int zone_index;
+
+	/* Record where the zone boundaries are */
+	memset(arch_zone_lowest_possible_pfn, 0,
+				sizeof(arch_zone_lowest_possible_pfn));
+	memset(arch_zone_highest_possible_pfn, 0,
+				sizeof(arch_zone_highest_possible_pfn));
+	arch_zone_lowest_possible_pfn[ZONE_DMA] =
+					find_min_pfn_with_active_regions();
+	arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
+	arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
+	arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
+	arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
+	for (zone_index = 1; zone_index < MAX_NR_ZONES; zone_index++) {
+		arch_zone_lowest_possible_pfn[zone_index] =
+			arch_zone_highest_possible_pfn[zone_index-1];
+	}
+
+	/* Regions in the early_node_map can be in any order */
+	sort_node_map();
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		free_area_init_node(nid, pgdat, NULL,
+				find_min_pfn_for_node(nid), NULL);
+	}
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };

^ permalink raw reply

* [PATCH 0/7] [RFC] Sizing zones and holes in an architecture independent manner V3
From: Mel Gorman @ 2006-04-18 13:00 UTC (permalink / raw)
  To: davej, tony.luck, linuxppc-dev, linux-kernel, bob.picco, ak,
	linux-mm
  Cc: Mel Gorman

This is V3 of the patchset to size zones and memory holes in an
architecture-independent manner. A number of bugs have been fixed in the
IA64 changes and it is now known to boot with the correct zone sizes. In
this release, 98 lines of x86_64 arch-specific code are removed because
it used similar initialisation functions to powerpc. In the last release,
only 34 lines were removed.

Andi, in light of the x86_64 changes since the last release, can you take
another look at the x86_64 changes please? Does this release make a bit
more sense for x86_64 now?

Changelog since V2
o Fix a bug where holes in lower zones get double counted
o Catch the case where a new range is registered that is within an existing range
o Catch the case where a zone boundary is within a hole
o Use the EFI map for registering ranges on x86_64+numa
o On IA64+NUMA, add the active ranges before rounding for granules
o On x86_64, remove e820_hole_size and e820_bootmem_free and use
  arch-independent equivalents
o On x86_64, remove the map walk in e820_end_of_ram()
o Rename memory_present_with_active_regions, name ambiguous
o Add absent_pages_in_range() for arches to call

Changelog since V1
o Correctly convert virtual and physical addresses to PFNs on ia64
o Correctly convert physical addresses to PFN on older ppc 
o When add_active_range() is called with overlapping pfn ranges, merge them
o When a zone boundary occurs within a memory hole, account correctly
o Minor whitespace damage cleanup
o Debugging patch temporarily included

At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate
zone sizes and holes in each architecture is very similar.  Some of this
zone and hole sizing code is difficult to read for no good reason. This
set of patches eliminates the similar-looking architecture-specific code.

The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas
have been discovered, free_area_init_nodes() is called to initialise
the pgdat and zones. The zone sizes and holes are then calculated in an
architecture independent manner.

Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 128 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 150 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 98 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 60 arch-specific LOC removed

At this point, there is a net reduction of 62 lines of code and the
arch-independent code is a lot easier to read in comparison to some of
the arch-specific stuff, particularly in arch/i386/ .

For Patch 6, it was also noted that page_alloc.c has a *lot* of
initialisation code which makes the file harder to read than it needs to
be. Patch 6 creates a new file mem_init.c and moves a lot of initialisation
code from page_alloc.c to it. After the patch is applied, there is still a net
loss of 43 lines.

The patches have been successfully boot tested and verified that the
zones are the correct size on

o x86, flatmem
o x86, NUMAQ
o PPC64, NUMA
o PPC64, CONFIG_NUMA=n
o x86_64, NUMA with SRAT
o x86_64, CONFIG_NUMA=n

There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory
holes but the architecture-independent code accounts the memory as present.
The patches have been compile tested for ia64 for flatmem and sparsemem
configurations. An attempt was made to boot test on an ancient RS/6000
but the vanilla kernel does not boot so I have to investigate there.

The net reduction seems small but the big benefit of this set of patches
is the reduction of 437 lines of architecture-specific code, some of
which is very hairy. There should be a greater net reduction when other
architectures use the same mechanisms for zone and hole sizing but I lack
the hardware to test on.

Comments?

Additional credit;
	Dave Hansen for the initial suggestion and comments on early patches
	Andy Whitcroft for reviewing early versions and catching numerous errors
	Tony Luck and Yasunori Goto for testing and debugging on IA64
	Bob Picco for testing and fixing bugs related to pfn registration

 arch/i386/Kconfig          |    8 
 arch/i386/kernel/setup.c   |   19 
 arch/i386/kernel/srat.c    |   98 ----
 arch/i386/mm/discontig.c   |   59 --
 arch/ia64/Kconfig          |    3 
 arch/ia64/mm/contig.c      |   60 --
 arch/ia64/mm/discontig.c   |   41 -
 arch/ia64/mm/init.c        |   12 
 arch/powerpc/Kconfig       |   13 
 arch/powerpc/mm/mem.c      |   53 --
 arch/powerpc/mm/numa.c     |  157 ------
 arch/ppc/Kconfig           |    3 
 arch/ppc/mm/init.c         |   26 -
 arch/x86_64/Kconfig        |    3 
 arch/x86_64/kernel/e820.c  |  109 +---
 arch/x86_64/kernel/setup.c |    3 
 arch/x86_64/mm/init.c      |   62 --
 arch/x86_64/mm/numa.c      |   18 
 arch/x86_64/mm/srat.c      |    7 
 include/asm-ia64/meminit.h |    1 
 include/asm-x86_64/e820.h  |    5 
 include/asm-x86_64/proto.h |    2 
 include/linux/mm.h         |   18 
 include/linux/mmzone.h     |   15 
 mm/Makefile                |    2 
 mm/mem_init.c              | 1040 +++++++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c            |  678 -----------------------------
 27 files changed, 1236 insertions(+), 1279 deletions(-)

-- 
-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

^ permalink raw reply

* Re: [PATCH 00/05] robust per_cpu allocation for modules
From: Steven Rostedt @ 2006-04-18 12:47 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, linux-mips, linux-ia64, Martin Mares, spyro,
	Joe Taylor, Andi Kleen, linuxppc-dev, paulus, bjornw, Ingo Molnar,
	Ravikiran G Thirumalai, Christoph Lameter, grundler, starvik,
	Linus Torvalds, Thomas Gleixner, rth, Chris Zankel, tony.luck,
	LKML, ralf, Marc Gauthier, lethal, schwidefsky, linux390, davem,
	parisc-linux
In-Reply-To: <44448A60.4040903@yahoo.com.au>

[Removed from CC davidm@hpl.hp.com and benedict.gaster@superh.com
because I keep getting "unknown user" bounces from them]

On Tue, 2006-04-18 at 16:42 +1000, Nick Piggin wrote:
> Steven Rostedt wrote:
> 
> > Understood, but I'm going to start looking in the way Rusty and Arnd
> > suggested with the vmalloc approach. This would allow for saving of
> > memory and dynamic allocation of module memory making it more robust. And
> > all this without that evil extra indirection!
> 
> Remember that this approach could effectively just move the indirection to
> the TLB / page tables (well, I say "moves" because large kernel mappings
> are effectively free compared with 4K mappings).

Yeah, I thought about the paging latencies when it was first mentioned.
And this is something that's going to be very hard to know the impact,
because it will be different on every system.

> 
> So be careful about coding up a large amount of work before unleashing it:
> I doubt you'll be able to find a solution that doesn't involve tradeoffs
> somewhere (but wohoo if you can).
> 

OK, but as I mentioned, this is now more of a side project, so a
month of work is not really going to be a month of work ;)  I'll first
try to get something that just "works" and then post an RFC PATCH set,
to get more ideas.  Since obviously there's a lot of people out there
that know their systems much better than I do ;)

Thanks,

-- Steve

^ permalink raw reply

* kernel 2.6.15: cpm_uart driver broken?
From: David Jander @ 2006-04-18 12:46 UTC (permalink / raw)
  To: linuxppc-embedded


Hi all,

Situation 1: MPC852T with SMC1 as uart/console, and SCC3/SCC4 as additional 
uarts. SMC1 works fine, but SCC3/4 don't. On transmission attempt, both UARTS 
transmit a byte 0x00 instead of what was intended to be transmitted.

Situation 2: The same as above, but console on either SCC3 or SCC4. The uart 
being initialized as console works ok, the other two don't.

I'm pretty sure the following is wrong, but I can't seem to fix it either. 
This seems to apply for both PQ and PQ2 type uarts:
from drivers/serial/cpm_uart/cpm_uart_cpm1.c (line 190):

....
	if (is_con) {
		/* was hostalloc but changed cause it blows away the */
		/* large tlb mapping when pinning the kernel area    */
		mem_addr = (u8 *) cpm_dpram_addr(cpm_dpalloc(memsz, 8));
		dma_addr = 0;
	} else
		mem_addr = dma_alloc_coherent(NULL, memsz, &dma_addr,
					      GFP_KERNEL);

....
	pinfo->dp_addr = dp_offset;
	pinfo->mem_addr = mem_addr;
	pinfo->dma_addr = dma_addr;

	pinfo->rx_buf = mem_addr;
....

AFAICS pinfo->rx_buf is the pointer to a buffer as seen from the CPM's point 
of view, so it should hold a physical address, not a virtual address. It seems 
to me that it should be more like this (lines marked with ** are changed):

....
	if (is_con) {
		/* was hostalloc but changed cause it blows away the */
		/* large tlb mapping when pinning the kernel area    */
		mem_addr = (u8 *) cpm_dpram_addr(cpm_dpalloc(memsz, 8));
**		dma_addr = mem_addr;
	} else
		mem_addr = dma_alloc_coherent(NULL, memsz, &dma_addr,
					      GFP_KERNEL);

....
	pinfo->dp_addr = dp_offset;
	pinfo->mem_addr = mem_addr;
	pinfo->dma_addr = dma_addr;

**	pinfo->rx_buf = dma_addr;
....

This does not work either, but I suspect this is a different problem, because 
if I change dma_alloc_coherent() for something using kmalloc() and then 
dma_addr=virt_to_phys(mem_addr), uarts begin to work, but transmit mixed old 
and new data from the buffers due to the cache getting in the way. At least 
reception seems to work ok then. 
So, why doesn't dma_alloc_coherent() work the way one would expect?
Obviously, after changing "if (is_con)" into "if (1)", all three uarts work 
correctly, but I guess we want to save on DP_RAM usage wherever possible.

What else is wrong here?

Greetings,

-- 
David Jander

^ permalink raw reply

* please pull powerpc-merge.git
From: Paul Mackerras @ 2006-04-18 12:11 UTC (permalink / raw)
  To: torvalds; +Cc: linuxppc-dev

Linus,

Please do a pull from

git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc-merge.git

to get the following powerpc bug-fixes.

Thanks,
Paul.

Anton Blanchard:
      powerpc: Ensure runlatch is off in the idle loop
      powerpc: Avoid __initcall warnings

Jordi Caubet:
      spufs: fix context-switch decrementer code

Linas Vepstas:
      powerpc/pseries: bugfix: balance calls to pci_device_put

Michael Ellerman:
      powerpc: Fix machine detection in prom_init.c

Olaf Hering:
      powerpc32: Set cpu explicitly in kernel compiles

Paul Mackerras:
      powerpc: Fix CHRP booting - needs a define_machine call
      powerpc: Use correct sequence for putting CPU into nap mode

Stephen Rothwell:
      powerpc: iSeries has only 256 IRQs

Vitaly Bordug:
      ppc32: Fix string comparing in platform_notify_map

 arch/powerpc/Kconfig                       |    6 +++
 arch/powerpc/Makefile                      |    4 ++
 arch/powerpc/kernel/Makefile               |    2 -
 arch/powerpc/kernel/asm-offsets.c          |    1 
 arch/powerpc/kernel/entry_32.S             |   35 +++++++--------
 arch/powerpc/kernel/head_64.S              |   49 +++++++++++++++++++++
 arch/powerpc/kernel/idle.c                 |    4 +-
 arch/powerpc/kernel/idle_6xx.S             |   65 +++++++++-------------------
 arch/powerpc/kernel/idle_power4.S          |   10 +++-
 arch/powerpc/kernel/irq.c                  |   36 +++++++++-------
 arch/powerpc/kernel/prom_init.c            |    5 +-
 arch/powerpc/kernel/rtas-proc.c            |    4 +-
 arch/powerpc/platforms/cell/spufs/switch.c |    2 -
 arch/powerpc/platforms/chrp/chrp.h         |    1 
 arch/powerpc/platforms/chrp/pci.c          |    6 ++-
 arch/powerpc/platforms/chrp/setup.c        |   48 +++++++++------------
 arch/powerpc/platforms/iseries/setup.c     |    7 +++
 arch/powerpc/platforms/pseries/eeh.c       |    6 ++-
 arch/powerpc/platforms/pseries/rtasd.c     |    2 -
 arch/ppc/syslib/ppc_sys.c                  |    9 ++--
 include/asm-powerpc/irq.h                  |    7 +++
 include/asm-powerpc/thread_info.h          |    8 +++
 22 files changed, 194 insertions(+), 123 deletions(-)

^ permalink raw reply

* CF card connection to PPC CPUs
From: Andriy Korud @ 2006-04-18 10:35 UTC (permalink / raw)
  To: linuxppc-embedded

Hi,
Does anybody have experience in connecting a CF card to a PPC (especially
44x) CPU directly via EBC bus?
I see how this can be done in TrueIDE mode, but unfortunately our
application requires hot swap capability.

Thanks in advance,
--
Andriy Korud

^ permalink raw reply

* Re: [PATCH] ppc64-soft-reset-fixes
From: Olaf Hering @ 2006-04-18  9:56 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linuxppc-dev, fastboot
In-Reply-To: <20060417231235.36546cb6.akpm@osdl.org>

 On Mon, Apr 17, Andrew Morton wrote:

> David Wilder <dwilder@us.ibm.com> wrote:
> >
> >  - For the crash scenario, when a CPU hangs with interrupts disabled and 
> >  the other CPUs panic or user invoked kdump boot using sysrq-c. In this 
> >  case, the hung CPU can not be stopped and causes the kdump boot not 
> >  successful. This case can be treated as complete system hang and asks 
> >  the user to activate soft-reset if all secondary CPUs are not stopped.
> 
> It breaks `make allmodconfig':
> 
> arch/powerpc/kernel/built-in.o(.toc+0x2a00): In function `PPC64_CACHES':
> : undefined reference to `kexec_crash_image'

This change requires another patch.

It also breaks ppc32, this line is included in misc_32.S:

static inline void crash_kexec_secondary(struct pt_regs *regs) { }

Maybe the whole block should look like this; compile tested on ppc32,
ppc64 and iseries:

---
 include/asm-powerpc/kexec.h |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Index: linux-2.6.16/include/asm-powerpc/kexec.h
===================================================================
--- linux-2.6.16.orig/include/asm-powerpc/kexec.h
+++ linux-2.6.16/include/asm-powerpc/kexec.h
@@ -31,9 +31,8 @@
 #define KEXEC_ARCH KEXEC_ARCH_PPC
 #endif
 
-#ifdef CONFIG_KEXEC
-
 #ifndef __ASSEMBLY__
+#ifdef CONFIG_KEXEC
 #ifdef __powerpc64__
 /*
  * This function is responsible for capturing register states if coming
@@ -124,9 +123,9 @@ extern void default_machine_crash_shutdo
 
 extern void machine_kexec_simple(struct kimage *image);
 extern void crash_kexec_secondary(struct pt_regs *regs);
-#endif /* ! __ASSEMBLY__ */
 #else
 static inline void crash_kexec_secondary(struct pt_regs *regs) { }
 #endif /* CONFIG_KEXEC */
+#endif /* ! __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_KEXEC_H */

^ permalink raw reply

* isochronous mode of USB audio device on MPC5200
From: Sundar @ 2006-04-18  4:42 UTC (permalink / raw)
  To: linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 175 bytes --]

Hi Benny,
            I am also facing the same problem. Can you send me the details about the changes you have done.

Thank you for any help.

with warm regards,
sundar

[-- Attachment #2: Type: text/html, Size: 806 bytes --]

^ permalink raw reply

* Re: [PATCH 00/05] robust per_cpu allocation for modules
From: Nick Piggin @ 2006-04-18  6:42 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andrew Morton, linux-mips, David Mosberger-Tang, linux-ia64,
	Martin Mares, spyro, Joe Taylor, Andi Kleen, linuxppc-dev, paulus,
	benedict.gaster, bjornw, Ingo Molnar, Ravikiran G Thirumalai,
	Christoph Lameter, grundler, starvik, Linus Torvalds,
	Thomas Gleixner, rth, Chris Zankel, tony.luck, LKML, ralf,
	Marc Gauthier, lethal, schwidefsky, linux390, davem, parisc-linux
In-Reply-To: <Pine.LNX.4.58.0604171936040.24264@gandalf.stny.rr.com>

Steven Rostedt wrote:

> Understood, but I'm going to start looking in the way Rusty and Arnd
> suggested with the vmalloc approach. This would allow for saving of
> memory and dynamic allocation of module memory making it more robust. And
> all this without that evil extra indirection!

Remember that this approach could effectively just move the indirection to
the TLB / page tables (well, I say "moves" because large kernel mappings
are effectively free compared with 4K mappings).

So be careful about coding up a large amount of work before unleashing it:
I doubt you'll be able to find a solution that doesn't involve tradeoffs
somewhere (but wohoo if you can).

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply

* Re: 7447A strange problem with MSR:POW (WAS: can't boot 2.6.17-rc1)
From: Benjamin Herrenschmidt @ 2006-04-18  6:37 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Becky Bruce, Michael Schmitz, debian-powerpc, linuxppc-dev list
In-Reply-To: <17476.34830.897083.360572@cargo.ozlabs.ibm.com>

On Tue, 2006-04-18 at 16:32 +1000, Paul Mackerras wrote:
> Benjamin Herrenschmidt writes:
> 
> > The 970 version bloats the exception prolog significantly... I
> 
> Four instructions, in the external and decrementer interrupt entry
> paths - I don't think that's really significant bloat.

Yeah well.. including a load.. ok, I admit that should be fairly hot...
btw, I suppose you took care of having those local flags in some hot
cache line ? :)

> More likely we'll get more situations like Cell where we come in
> through the soft reset vector after sleep.

Yeah.

Ben.

^ permalink raw reply

* Re: 7447A strange problem with MSR:POW (WAS: can't boot 2.6.17-rc1)
From: Paul Mackerras @ 2006-04-18  6:32 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Becky Bruce, Michael Schmitz, debian-powerpc, linuxppc-dev list
In-Reply-To: <1145340034.4705.55.camel@localhost.localdomain>

Benjamin Herrenschmidt writes:

> The 970 version bloats the exception prolog significantly... I

Four instructions, in the external and decrementer interrupt entry
paths - I don't think that's really significant bloat.

> understand now why you were talking about putting the code in the exit
> path on irc ... I don't like it that way.... Also, if you want to keep
> it, maybe use a separate CONFIG_PPC_970STYLE_NAP or something that gets
> selected by platforms that can do it ?

The config option makes sense.

> I suppose a PACA field would be less inefficient but still sucks... the
> exception return to userland code path already accesses thread_info and
> definitely looks like a better place to put it... as long as we never
> have to add dodgy workarounds when getting out of NAP like we do on 6xx.

More likely we'll get more situations like Cell where we come in
through the soft reset vector after sleep.

Paul.

^ permalink raw reply

* Re: [PATCH] ppc64-soft-reset-fixes
From: Andrew Morton @ 2006-04-18  6:12 UTC (permalink / raw)
  To: David Wilder; +Cc: linuxppc-dev, fastboot
In-Reply-To: <4443D738.3040704@us.ibm.com>

David Wilder <dwilder@us.ibm.com> wrote:
>
>  - For the crash scenario, when a CPU hangs with interrupts disabled and 
>  the other CPUs panic or user invoked kdump boot using sysrq-c. In this 
>  case, the hung CPU can not be stopped and causes the kdump boot not 
>  successful. This case can be treated as complete system hang and asks 
>  the user to activate soft-reset if all secondary CPUs are not stopped.

It breaks `make allmodconfig':

arch/powerpc/kernel/built-in.o(.toc+0x2a00): In function `PPC64_CACHES':
: undefined reference to `kexec_crash_image'

^ permalink raw reply

* Re: 7447A strange problem with MSR:POW (WAS: can't boot 2.6.17-rc1)
From: Benjamin Herrenschmidt @ 2006-04-18  6:00 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Becky Bruce, Michael Schmitz, debian-powerpc, linuxppc-dev list
In-Reply-To: <17476.31998.168122.413195@cargo.ozlabs.ibm.com>

On Tue, 2006-04-18 at 15:45 +1000, Paul Mackerras wrote:
> Benjamin Herrenschmidt writes:
> 
> > Looks good to me except that we need the same for ppc64 since the 970
> > theorically has the same problem...
> 
> OK, does this look OK to everyone, before I send it off to Linus?  I
> now use a bit in the thread_info rather than using the HID0 bits
> themselves to indicate that we're napping, since the m[ft]spr might be
> slow.  I added a `local_flags' field to the thread_info struct for
> things that are only changed by the task itself and therefore don't
> need to be accessed atomically.
> 
> This version does the same sort of change for the 970 as for 6xx.

Hrm...

The 970 version bloats the exception prolog significantly... I
understand now why you were talking about putting the code in the exit
path on irc ... I don't like it that way.... Also, if you want to keep
it, maybe use a separate CONFIG_PPC_970STYLE_NAP or something that gets
selected by platforms that can do it ?

I suppose a PACA field would be less inefficient but still sucks... the
exception return to userland code path already accesses thread_info and
definitely looks like a better place to put it... as long as we never
have to add dodgy workarounds when getting out of NAP like we do on 6xx.

Ben.

^ permalink raw reply

* Re: 7447A strange problem with MSR:POW (WAS: can't boot 2.6.17-rc1)
From: Paul Mackerras @ 2006-04-18  5:45 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Becky Bruce, Michael Schmitz, debian-powerpc, linuxppc-dev list
In-Reply-To: <1145048461.4223.38.camel@localhost.localdomain>

Benjamin Herrenschmidt writes:

> Looks good to me except that we need the same for ppc64 since the 970
> theorically has the same problem...

OK, does this look OK to everyone, before I send it off to Linus?  I
now use a bit in the thread_info rather than using the HID0 bits
themselves to indicate that we're napping, since the m[ft]spr might be
slow.  I added a `local_flags' field to the thread_info struct for
things that are only changed by the task itself and therefore don't
need to be accessed atomically.

This version does the same sort of change for the 970 as for 6xx.

Oh, and I also fixed a stupid bug in the 32-bit stack overflow code,
where we put &_end into r11, and then if there was a stack overflow,
saved registers into the stack frame pointed to by r11. :)

Paul.

diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 54b48f3..8f85c5e 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -91,6 +91,7 @@ #endif /* CONFIG_SPE */
 #endif /* CONFIG_PPC64 */
 
 	DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
+	DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
 	DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
 	DEFINE(TI_TASK, offsetof(struct thread_info, task));
 #ifdef CONFIG_PPC32
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index b3a9794..8866fd2 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -128,37 +128,36 @@ #if defined(CONFIG_40x) || defined(CONFI
 	stw	r12,4(r11)
 #endif
 	b	3f
+
 2:	/* if from kernel, check interrupted DOZE/NAP mode and
          * check for stack overflow
          */
+	lwz	r9,THREAD_INFO-THREAD(r12)
+	cmplw	r1,r9			/* if r1 <= current->thread_info */
+	ble-	stack_ovf		/* then the kernel stack overflowed */
+5:
 #ifdef CONFIG_6xx
-	mfspr	r11,SPRN_HID0
-	mtcr	r11
-BEGIN_FTR_SECTION
-	bt-	8,4f			/* Check DOZE */
-END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE)
-BEGIN_FTR_SECTION
-	bt-	9,4f			/* Check NAP */
-END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
+	tophys(r9,r9)			/* check local flags */
+	lwz	r12,TI_LOCAL_FLAGS(r9)
+	mtcrf	0x01,r12
+	bt-	31-TLF_NAPPING,4f
 #endif /* CONFIG_6xx */
 	.globl transfer_to_handler_cont
 transfer_to_handler_cont:
-	lwz	r11,THREAD_INFO-THREAD(r12)
-	cmplw	r1,r11			/* if r1 <= current->thread_info */
-	ble-	stack_ovf		/* then the kernel stack overflowed */
 3:
 	mflr	r9
 	lwz	r11,0(r9)		/* virtual address of handler */
 	lwz	r9,4(r9)		/* where to go when done */
-	FIX_SRR1(r10,r12)
 	mtspr	SPRN_SRR0,r11
 	mtspr	SPRN_SRR1,r10
 	mtlr	r9
 	SYNC
 	RFI				/* jump to handler, enable MMU */
 
-#ifdef CONFIG_6xx	
-4:	b	power_save_6xx_restore
+#ifdef CONFIG_6xx
+4:	rlwinm	r12,r12,0,~_TLF_NAPPING
+	stw	r12,TI_LOCAL_FLAGS(r9)
+	b	power_save_6xx_restore
 #endif
 
 /*
@@ -167,10 +166,10 @@ #endif
  */
 stack_ovf:
 	/* sometimes we use a statically-allocated stack, which is OK. */
-	lis	r11,_end@h
-	ori	r11,r11,_end@l
-	cmplw	r1,r11
-	ble	3b			/* r1 <= &_end is OK */
+	lis	r12,_end@h
+	ori	r12,r12,_end@l
+	cmplw	r1,r12
+	ble	5b			/* r1 <= &_end is OK */
 	SAVE_NVGPRS(r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	lis	r1,init_thread_union@ha
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index a5ae04a..3b500dc 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -381,6 +381,7 @@ #define STD_EXCEPTION_COMMON_LITE(trap, 
 	.globl label##_common;				\
 label##_common:						\
 	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);	\
+	FINISH_NAP;					\
 	DISABLE_INTS;					\
 	bl	.ppc64_runlatch_on;			\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;		\
@@ -388,6 +389,25 @@ label##_common:						\
 	b	.ret_from_except_lite
 
 /*
+ * When the idle code in power4_idle puts the CPU into NAP mode,
+ * it has to do so in a loop, and relies on the external interrupt
+ * and decrementer interrupt entry code to get it out of the loop.
+ * It sets the _TLF_NAPPING bit in current_thread_info()->local_flags
+ * to signal that it is in the loop and needs help to get out.
+ */
+#if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
+#define FINISH_NAP				\
+BEGIN_FTR_SECTION				\
+	clrrdi	r11,r1,THREAD_SHIFT;		\
+	ld	r9,TI_LOCAL_FLAGS(r11);		\
+	andi.	r10,r9,_TLF_NAPPING;		\
+	bnel	power4_fixup_nap;		\
+END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
+#else
+#define FINISH_NAP
+#endif
+
+/*
  * Start of pSeries system interrupt routines
  */
 	. = 0x100
@@ -1034,12 +1054,22 @@ unrecov_slb:
 	.globl hardware_interrupt_entry
 hardware_interrupt_common:
 	EXCEPTION_PROLOG_COMMON(0x500, PACA_EXGEN)
+	FINISH_NAP
 hardware_interrupt_entry:
 	DISABLE_INTS
 	bl	.ppc64_runlatch_on
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.do_IRQ
 	b	.ret_from_except_lite
+
+#if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
+power4_fixup_nap:
+	andc	r9,r9,r10
+	std	r9,TI_LOCAL_FLAGS(r11)
+	ld	r10,_LINK(r1)		/* make idle task do the */
+	std	r10,_NIP(r1)		/* equivalent of a blr */
+	blr
+#endif
 
 	.align	7
 	.globl alignment_common
diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S
index 12a4efb..b45fa0e 100644
--- a/arch/powerpc/kernel/idle_6xx.S
+++ b/arch/powerpc/kernel/idle_6xx.S
@@ -22,8 +22,6 @@ #include <asm/thread_info.h>
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 
-#undef DEBUG
-
 	.text
 
 /*
@@ -109,12 +107,6 @@ BEGIN_FTR_SECTION
 	dcbf	0,r4
 	dcbf	0,r4
 END_FTR_SECTION_IFSET(CPU_FTR_NAP_DISABLE_L2_PR)
-#ifdef DEBUG
-	lis	r6,nap_enter_count@ha
-	lwz	r4,nap_enter_count@l(r6)
-	addi	r4,r4,1
-	stw	r4,nap_enter_count@l(r6)
-#endif	
 2:
 BEGIN_FTR_SECTION
 	/* Go to low speed mode on some 750FX */
@@ -144,48 +136,42 @@ BEGIN_FTR_SECTION
 	DSSALL
 	sync
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+	rlwinm	r9,r1,0,0,31-THREAD_SHIFT	/* current thread_info */
+	lwz	r8,TI_LOCAL_FLAGS(r9)	/* set napping bit */
+	ori	r8,r8,_TLF_NAPPING	/* so when we take an exception */
+	stw	r8,TI_LOCAL_FLAGS(r9)	/* it will return to our caller */
 	mfmsr	r7
 	ori	r7,r7,MSR_EE
 	oris	r7,r7,MSR_POW@h
-	sync
-	isync
+1:	sync
 	mtmsr	r7
 	isync
-	sync
-	blr
-	
+	b	1b
+
 /*
  * Return from NAP/DOZE mode, restore some CPU specific registers,
  * we are called with DR/IR still off and r2 containing physical
- * address of current.
+ * address of current.  R11 points to the exception frame (physical
+ * address).  We have to preserve r10.
  */
 _GLOBAL(power_save_6xx_restore)
-	mfspr	r11,SPRN_HID0
-	rlwinm.	r11,r11,0,10,8	/* Clear NAP & copy NAP bit !state to cr1 EQ */
-	cror	4*cr1+eq,4*cr0+eq,4*cr0+eq
-BEGIN_FTR_SECTION
-	rlwinm	r11,r11,0,9,7	/* Clear DOZE */
-END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE)
-	mtspr	SPRN_HID0, r11
-
-#ifdef DEBUG
-	beq	cr1,1f
-	lis	r11,(nap_return_count-KERNELBASE)@ha
-	lwz	r9,nap_return_count@l(r11)
-	addi	r9,r9,1
-	stw	r9,nap_return_count@l(r11)
-1:
-#endif
-	
-	rlwinm	r9,r1,0,0,18
-	tophys(r9,r9)
-	lwz	r11,TI_CPU(r9)
+	lwz	r9,_LINK(r11)		/* interrupted in ppc6xx_idle: */
+	stw	r9,_NIP(r11)		/* make it do a blr */
+
+#ifdef CONFIG_SMP
+	mfspr	r12,SPRN_SPRG3
+	lwz	r11,TI_CPU(r12)		/* get cpu number * 4 */
 	slwi	r11,r11,2
+#else
+	li	r11,0
+#endif
 	/* Todo make sure all these are in the same page
-	 * and load r22 (@ha part + CPU offset) only once
+	 * and load r11 (@ha part + CPU offset) only once
 	 */
 BEGIN_FTR_SECTION
-	beq	cr1,1f
+	mfspr	r9,SPRN_HID0
+	andis.	r9,r9,HID0_NAP@h
+	beq	1f
 	addis	r9,r11,(nap_save_msscr0-KERNELBASE)@ha
 	lwz	r9,nap_save_msscr0@l(r9)
 	mtspr	SPRN_MSSCR0, r9
@@ -210,10 +196,3 @@ _GLOBAL(nap_save_hid1)
 
 _GLOBAL(powersave_lowspeed)
 	.long	0
-
-#ifdef DEBUG
-_GLOBAL(nap_enter_count)
-	.space	4
-_GLOBAL(nap_return_count)
-	.space	4
-#endif
diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S
index 6dad1c0..d85c7c9 100644
--- a/arch/powerpc/kernel/idle_power4.S
+++ b/arch/powerpc/kernel/idle_power4.S
@@ -35,12 +35,16 @@ BEGIN_FTR_SECTION
 	DSSALL
 	sync
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+	clrrdi	r9,r1,THREAD_SHIFT	/* current thread_info */
+	ld	r8,TI_LOCAL_FLAGS(r9)	/* set napping bit */
+	ori	r8,r8,_TLF_NAPPING	/* so when we take an exception */
+	std	r8,TI_LOCAL_FLAGS(r9)	/* it will return to our caller */
 	mfmsr	r7
 	ori	r7,r7,MSR_EE
 	oris	r7,r7,MSR_POW@h
-	sync
+1:	sync
 	isync
 	mtmsrd	r7
 	isync
-	sync
-	blr
+	b	1b
+
diff --git a/include/asm-powerpc/thread_info.h b/include/asm-powerpc/thread_info.h
index ffc7462..88b553c 100644
--- a/include/asm-powerpc/thread_info.h
+++ b/include/asm-powerpc/thread_info.h
@@ -37,6 +37,8 @@ struct thread_info {
 	int		preempt_count;		/* 0 => preemptable,
 						   <0 => BUG */
 	struct restart_block restart_block;
+	unsigned long	local_flags;		/* private flags for thread */
+
 	/* low level flags - has atomic operations done on it */
 	unsigned long	flags ____cacheline_aligned_in_smp;
 };
@@ -142,6 +144,12 @@ #define _TIF_SYSCALL_T_OR_A	(_TIF_SYSCAL
 #define _TIF_USER_WORK_MASK	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \
 				 _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK)
 #define _TIF_PERSYSCALL_MASK	(_TIF_RESTOREALL|_TIF_NOERROR)
+
+/* Bits in local_flags */
+/* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
+#define TLF_NAPPING		0	/* idle thread enabled NAP mode */
+
+#define _TLF_NAPPING		(1 << TLF_NAPPING)
 
 #endif /* __KERNEL__ */
 

^ permalink raw reply related

* [PATCH] powermac: Fix i2c on keywest based chips
From: Benjamin Herrenschmidt @ 2006-04-18  4:11 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev list

The new i2c implementation for PowerMac has a regression that causes the
hardware to go out of state when probing non-existent devices. While
fixing that, I also found & fixed a couple of other corner cases. This
fixes booting with a pbbuttons version that scans the i2c bus for an LMU
controller among others. Tested on a dual G5 with thermal control (which
has heavy i2c activity) with no problem so far.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Index: linux-work/arch/powerpc/platforms/powermac/low_i2c.c
===================================================================
--- linux-work.orig/arch/powerpc/platforms/powermac/low_i2c.c	2006-04-17 16:08:42.000000000 +1000
+++ linux-work/arch/powerpc/platforms/powermac/low_i2c.c	2006-04-17 16:41:32.000000000 +1000
@@ -231,6 +231,14 @@
 	return isr;
 }
 
+static void kw_i2c_do_stop(struct pmac_i2c_host_kw *host, int result)
+{
+	kw_write_reg(reg_control, KW_I2C_CTL_STOP);
+	host->state = state_stop;
+	host->result = result;
+}
+
+
 static void kw_i2c_handle_interrupt(struct pmac_i2c_host_kw *host, u8 isr)
 {
 	u8 ack;
@@ -246,42 +254,36 @@
 	}
 
 	if (isr == 0) {
+		printk(KERN_WARNING "low_i2c: Timeout in i2c transfer"
+		       " on keywest !\n");
 		if (host->state != state_stop) {
-			DBG_LOW("KW: Timeout !\n");
-			host->result = -EIO;
-			goto stop;
-		}
-		if (host->state == state_stop) {
-			ack = kw_read_reg(reg_status);
-			if (ack & KW_I2C_STAT_BUSY)
-				kw_write_reg(reg_status, 0);
-			host->state = state_idle;
-			kw_write_reg(reg_ier, 0x00);
-			if (!host->polled)
-				complete(&host->complete);
+			kw_i2c_do_stop(host, -EIO);
+			return;
 		}
+		ack = kw_read_reg(reg_status);
+		if (ack & KW_I2C_STAT_BUSY)
+			kw_write_reg(reg_status, 0);
+		host->state = state_idle;
+		kw_write_reg(reg_ier, 0x00);
+		if (!host->polled)
+			complete(&host->complete);
 		return;
 	}
 
 	if (isr & KW_I2C_IRQ_ADDR) {
 		ack = kw_read_reg(reg_status);
 		if (host->state != state_addr) {
-			kw_write_reg(reg_isr, KW_I2C_IRQ_ADDR);
 			WRONG_STATE("KW_I2C_IRQ_ADDR"); 
-			host->result = -EIO;
-			goto stop;
+			kw_i2c_do_stop(host, -EIO);
 		}
 		if ((ack & KW_I2C_STAT_LAST_AAK) == 0) {
-			host->result = -ENODEV;
-			DBG_LOW("KW: NAK on address\n");
+			host->result = -ENXIO;
 			host->state = state_stop;
-			return;
+			DBG_LOW("KW: NAK on address\n");
 		} else {
-			if (host->len == 0) {
-				kw_write_reg(reg_isr, KW_I2C_IRQ_ADDR);
-				goto stop;
-			}
-			if (host->rw) {
+			if (host->len == 0)
+				kw_i2c_do_stop(host, 0);
+			else if (host->rw) {
 				host->state = state_read;
 				if (host->len > 1)
 					kw_write_reg(reg_control,
@@ -308,25 +310,19 @@
 			ack = kw_read_reg(reg_status);
 			if ((ack & KW_I2C_STAT_LAST_AAK) == 0) {
 				DBG_LOW("KW: nack on data write\n");
-				host->result = -EIO;
-				goto stop;
+				host->result = -EFBIG;
+				host->state = state_stop;
 			} else if (host->len) {
 				kw_write_reg(reg_data, *(host->data++));
 				host->len--;
-			} else {
-				kw_write_reg(reg_control, KW_I2C_CTL_STOP);
-				host->state = state_stop;
-				host->result = 0;
-			}
-			kw_write_reg(reg_isr, KW_I2C_IRQ_DATA);
+			} else
+				kw_i2c_do_stop(host, 0);
 		} else {
-			kw_write_reg(reg_isr, KW_I2C_IRQ_DATA);
 			WRONG_STATE("KW_I2C_IRQ_DATA"); 
-			if (host->state != state_stop) {
-				host->result = -EIO;
-				goto stop;
-			}
+			if (host->state != state_stop)
+				kw_i2c_do_stop(host, -EIO);
 		}
+		kw_write_reg(reg_isr, KW_I2C_IRQ_DATA);
 	}
 
 	if (isr & KW_I2C_IRQ_STOP) {
@@ -340,14 +336,10 @@
 			complete(&host->complete);
 	}
 
+	/* Below should only happen in manual mode which we don't use ... */
 	if (isr & KW_I2C_IRQ_START)
 		kw_write_reg(reg_isr, KW_I2C_IRQ_START);
 
-	return;
- stop:
-	kw_write_reg(reg_control, KW_I2C_CTL_STOP);	
-	host->state = state_stop;
-	return;
 }
 
 /* Interrupt handler */
@@ -544,11 +536,11 @@
 		return NULL;
 	}
 
-	/* Make sure IRA is disabled */
+	/* Make sure IRQ is disabled */
 	kw_write_reg(reg_ier, 0);
 
 	/* Request chip interrupt */
-	if (request_irq(host->irq, kw_i2c_irq, SA_SHIRQ, "keywest i2c", host))
+	if (request_irq(host->irq, kw_i2c_irq, 0, "keywest i2c", host))
 		host->irq = NO_IRQ;
 
 	printk(KERN_INFO "KeyWest i2c @0x%08x irq %d %s\n",

^ permalink raw reply

* Re: Xilinx Virtex-2 PRO FPGA ppc 405 on ML310 board
From: Aidan Williams @ 2006-04-18  2:45 UTC (permalink / raw)
  To: Vincent Winstead; +Cc: linuxppc-embedded list
In-Reply-To: <20060417210038.18561.qmail@web52008.mail.yahoo.com>


Vincent Winstead wrote:
> Now, as far as step 5, am I supposed to have a symbolic link that is 
> named linux-2.4.x placed into the uClinux-dist directory?  Because 
> there's already a folder named linux-2.4.x which was in there already 
> when I untarred everything.  At the command prompt in the uClinux-dist 
> directory I entered the following line:
>  
> ln -s ../linuxppc-2.4 linux-2.4.x
>  
> and the result of this operation was to put a symbolic link into my 
> linuxppc-2.4 directory with the name of linux-2.4.x  - is this correct?
>  

First, you'll need to move the existing directory aside using
a command like:

    mv linux-2.4.x linux-2.4.x-dist

and then re-run the ln -s command above.

> Now on to Step 6 problem. 
>   How am I supposed to make use uClinux EDK Board Support Package 1.0 
> files?  I'm not sure how to go about using them in the Xilinx Platform 
> Studio in order to generate the necessary auto-config.in file. 
>  

See the document below for the general approach:

>     Even though it is about the microblaze rather than
>     the PPC, a helpful "getting started" document is:
>     http://www.itee.uq.edu.au/~wu/downloads/uClinux_ready_Microblaze_design.pdf
> 

Look particularly at the section "Software Platform Settings"
on page 29, steps 67,68.

If you are not overly familiar with the EDK, it would
be best to find someone locally who can help walk you
through the process of generating a system.

- aidan


--------------------------------------------------------------------------
This email and any attachments may be confidential. They may contain legally
privileged information or copyright material. You should not read, copy,
use or disclose them without authorisation. If you are not an intended
recipient, please contact us at once by return email and then delete both
messages. We do not accept liability in connection with computer virus,
data corruption, delay, interruption, unauthorised access or unauthorised
amendment. This notice should not be removed.

^ permalink raw reply

* Re: [PATCH 00/05] robust per_cpu allocation for modules
From: Steven Rostedt @ 2006-04-18  1:51 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, linux-mips, David Mosberger-Tang, linux-ia64,
	Martin Mares, spyro, Joe Taylor, Andi Kleen, linuxppc-dev, paulus,
	benedict.gaster, bjornw, Ingo Molnar, Ravikiran G Thirumalai,
	Nick Piggin, grundler, starvik, Linus Torvalds, Thomas Gleixner,
	rth, Chris Zankel, tony.luck, LKML, ralf, Marc Gauthier, lethal,
	schwidefsky, linux390, davem, parisc-linux
In-Reply-To: <Pine.LNX.4.64.0604171647330.31773@schroedinger.engr.sgi.com>

On Mon, 2006-04-17 at 16:48 -0700, Christoph Lameter wrote:
> On Mon, 17 Apr 2006, Steven Rostedt wrote:
> 
> > So now we can focus on a better solution.
> 
> Could you have a look at Kiran's work?
> 
> Maybe one result of your work could be that the existing indirection
> for alloc_percpu could be avoided?

Sure,  I'll spend some time looking at what others have done and see
what I can put together.  I'm also very busy on other stuff at the
moment, so this will be something I do more on the side.  I don't think
there's a rush here, but as I stated in a previous post, I probably won't
have something out for a month or two.

-- Steve

^ permalink raw reply

* Re: [PATCH 00/05] robust per_cpu allocation for modules
From: Christoph Lameter @ 2006-04-17 23:48 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andrew Morton, linux-mips, David Mosberger-Tang, linux-ia64,
	Martin Mares, spyro, Joe Taylor, Andi Kleen, linuxppc-dev, paulus,
	benedict.gaster, bjornw, Ingo Molnar, Ravikiran G Thirumalai,
	Nick Piggin, grundler, starvik, Linus Torvalds, Thomas Gleixner,
	rth, Chris Zankel, tony.luck, LKML, ralf, Marc Gauthier, lethal,
	schwidefsky, linux390, davem, parisc-linux
In-Reply-To: <Pine.LNX.4.58.0604171936040.24264@gandalf.stny.rr.com>

On Mon, 17 Apr 2006, Steven Rostedt wrote:

> So now we can focus on a better solution.

Could you have a look at Kiran's work?

Maybe one result of your work could be that the existing indirection
for alloc_percpu could be avoided?

^ permalink raw reply

* Re: [PATCH 00/05] robust per_cpu allocation for modules
From: Steven Rostedt @ 2006-04-17 23:44 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andrew Morton, linux-mips, David Mosberger-Tang, linux-ia64,
	Martin Mares, spyro, Joe Taylor, Andi Kleen, linuxppc-dev, paulus,
	benedict.gaster, bjornw, Ingo Molnar, Christoph Lameter,
	Nick Piggin, grundler, starvik, Linus Torvalds, Thomas Gleixner,
	rth, Chris Zankel, tony.luck, LKML, ralf, Marc Gauthier, lethal,
	schwidefsky, linux390, davem, parisc-linux
In-Reply-To: <20060417220238.GD3945@localhost.localdomain>


On Mon, 17 Apr 2006, Ravikiran G Thirumalai wrote:

> On Mon, Apr 17, 2006 at 09:55:02AM -0700, Christoph Lameter wrote:
> > On Sat, 15 Apr 2006, Nick Piggin wrote:
> >
> > > If I'm following you correctly, this adds another dependent load
> > > to a per-CPU data access, and from memory that isn't node-affine.
> >
> > I am also concerned about that. Kiran has a patch to avoid allocpercpu
> > having to go through one level of indirection that I guess would no
> > longer work with this scheme.
>
> The alloc_percpu reimplementation would work regardless of changes to
> static per-cpu areas.  But, any extra indirection as was proposed initially
> is bad IMHO.
>

Don't worry, that idea has been shot down more than once ;-)

> >
> > > If so, I think people with SMP and NUMA kernels would care more
> > > about performance and scalability than the few k of memory this
> > > saves.
> >
> > Right.
>
> Me too :)
>

Understood, but I'm going to start looking in the way Rusty and Arnd
suggested with the vmalloc approach. This would allow for saving of
memory and dynamic allocation of module memory making it more robust. And
all this without that evil extra indirection!

So let's put my original patches where they belong, in the bit grave, and
continue on. I lived, I learned and I've been shown the Way (thanks to
all BTW).

So now we can focus on a better solution.

Cheers,

-- Steve

^ permalink raw reply

* Re: [PATCH 00/05] robust per_cpu allocation for modules
From: Ravikiran G Thirumalai @ 2006-04-17 22:02 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, linux-mips, David Mosberger-Tang, linux-ia64,
	Martin Mares, spyro, Joe Taylor, Andi Kleen, linuxppc-dev, paulus,
	benedict.gaster, bjornw, Ingo Molnar, Nick Piggin, grundler,
	Steven Rostedt, starvik, Linus Torvalds, Thomas Gleixner, rth,
	Chris Zankel, tony.luck, LKML, ralf, Marc Gauthier, lethal,
	schwidefsky, linux390, davem, parisc-linux
In-Reply-To: <Pine.LNX.4.64.0604170953390.29732@schroedinger.engr.sgi.com>

On Mon, Apr 17, 2006 at 09:55:02AM -0700, Christoph Lameter wrote:
> On Sat, 15 Apr 2006, Nick Piggin wrote:
> 
> > If I'm following you correctly, this adds another dependent load
> > to a per-CPU data access, and from memory that isn't node-affine.
> 
> I am also concerned about that. Kiran has a patch to avoid allocpercpu
> having to go through one level of indirection that I guess would no 
> longer work with this scheme.

The alloc_percpu reimplementation would work regardless of changes to
static per-cpu areas.  But, any extra indirection as was proposed initially
is bad IMHO. 

>  
> > If so, I think people with SMP and NUMA kernels would care more
> > about performance and scalability than the few k of memory this
> > saves.
> 
> Right.

Me too :)

Kiran

^ permalink raw reply

* Re: PowerBook5,4 -- no sound?
From: Benjamin Herrenschmidt @ 2006-04-17 21:50 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: linuxppc-dev, Johannes Berg
In-Reply-To: <jey7y3dgzw.fsf@sykes.suse.de>

On Mon, 2006-04-17 at 23:33 +0200, Andreas Schwab wrote:
> Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:
> 
> > On Mon, 2006-04-17 at 15:45 +0200, Andreas Schwab wrote:
> >> request_mem_region(80000000 - 80000fff, i2sbus control)
> >> 
> >>       80000000-8007ffff : 0.80000000:mac-io
> >>         8000002c-8000002f : 0.0000004c:fans
> >>         80000030-80000033 : 0.0000004c:fans
> >>         80000034-80000037 : 0.0000004c:fans
> >>         8000004c-8000004f : 0.0000004c:fans
> >>         80000050-8000008a : 0.00000050:gpio
> >
> > Johannes,
> >
> > Look like the bugs in the device-tree I told you about ...
> 
> There is special support for K2-Keylargo in sound/ppc/pmac.c, is it that
> what you mean?

Yes, though it doesn't seem to affect all K2 based models, thus in this
regard, snd-powermac isn't bug free either ... Apple keeps fucking up
their device-tree in all sorts of funny ways.. at least kept, hopefully
this is finished now :)

Ben.

^ permalink raw reply

* Re: PowerBook5,4 -- no sound?
From: Andreas Schwab @ 2006-04-17 21:33 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Johannes Berg
In-Reply-To: <1145307118.3912.11.camel@localhost.localdomain>

Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:

> On Mon, 2006-04-17 at 15:45 +0200, Andreas Schwab wrote:
>> request_mem_region(80000000 - 80000fff, i2sbus control)
>> 
>>       80000000-8007ffff : 0.80000000:mac-io
>>         8000002c-8000002f : 0.0000004c:fans
>>         80000030-80000033 : 0.0000004c:fans
>>         80000034-80000037 : 0.0000004c:fans
>>         8000004c-8000004f : 0.0000004c:fans
>>         80000050-8000008a : 0.00000050:gpio
>
> Johannes,
>
> Look like the bugs in the device-tree I told you about ...

There is special support for K2-Keylargo in sound/ppc/pmac.c, is it that
what you mean?

Andreas.

-- 
Andreas Schwab, SuSE Labs, schwab@suse.de
SuSE Linux Products GmbH, Maxfeldstraße 5, 90409 Nürnberg, Germany
PGP key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply

* Re: [PATCH 00/05] robust per_cpu allocation for modules
From: Ravikiran G Thirumalai @ 2006-04-17 20:06 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Andrew Morton, linux-mips, David Mosberger-Tang, linux-ia64,
	Martin Mares, spyro, Christoph Lameter, Joe Taylor, Andi Kleen,
	linuxppc-dev, Paul Mackerras, benedict.gaster, bjornw,
	Ingo Molnar, Nick Piggin, grundler, rusty, Steven Rostedt,
	starvik, Linus Torvalds, Thomas Gleixner, rth, Chris Zankel,
	tony.luck, LKML, ralf, Marc Gauthier, lethal, schwidefsky,
	linux390, davem, parisc-linux
In-Reply-To: <200604161734.20256.arnd@arndb.de>

On Sun, Apr 16, 2006 at 05:34:18PM +0200, Arnd Bergmann wrote:
> On Sunday 16 April 2006 15:40, Steven Rostedt wrote:
> > I'll think more about this, but maybe someone else has some crazy ideas
> > that can find a solution to this that is both fast and robust.
> 
> Ok, you asked for a crazy idea, you're going to get it ;-)
> 
> You could take a fixed range from the vmalloc area (e.g. 1MB per cpu)
> and use that to remap pages on demand when you need per cpu data.
> 
> #define PER_CPU_BASE 0xe000000000000000UL /* arch dependant */
> #define PER_CPU_SHIFT 0x100000UL
> #define __per_cpu_offset(__cpu) (PER_CPU_BASE + PER_CPU_STRIDE * (__cpu))
> #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
> #define __get_cpu_var(var) per_cpu(var, smp_processor_id())
> 
> This is a lot like the current sparc64 implementation already is.
> 
> The tricky part here is the remapping of pages. You'd need to 
> alloc_pages_node() new pages whenever the already reserved space is
> not enough for the module you want to load and then map_vm_area()
> them into the space reserved for them.
> 
> Advantages of this solution are:
> - no dependant load access for per_cpu()
> - might be flexible enough to implement a faster per_cpu_ptr()
> - can be combined with ia64-style per-cpu remapping

An implementation similar to the one you are mentioning was already proposed
some time back.
http://lwn.net/Articles/119532/
The design was also meant to not restrict/limit per-cpu memory being
allocated from modules.  Maybe it was too early then, and maybe now is the 
right time, going by the interest in this thread :).  IMHO, a new solution
should fix both static and dynamic per-cpu allocators, 
- Avoid possibility of false sharing for dynamically allocated per-CPU data
(with current alloc percpu) 
- work early enough -- if alloc_percpu can work early enough, (we can use
that for counters like slab cachep stats which is currently racy; using 
atomic_t for them would be bad for performance)

An extra dereference in Steven's original proposal is bad (I had done some
measurements earlier).  My implementation had one less reference compared to
static per-cpu allocators, but the performance of both was the same, as
the __per_cpu_offset table is always cache hot.

> 
> Disadvantages are:
> - you can't use huge tlbs for mapping per cpu data like the
>   regular linear mapping -> may be slower on some archs

Yep, we waste a few tlb entries then, which is a bit of a concern, but then we
might be able to use hugetlbs for blocks of per-cpu data and minimize the 
impact.

Thanks,
Kiran

^ permalink raw reply

* Re: Xilinx Virtex-2 PRO FPGA ppc 405 on ML310 board
From: Vincent Winstead @ 2006-04-17 21:00 UTC (permalink / raw)
  To: Aidan Williams; +Cc: linuxppc-embedded list
In-Reply-To: <443D7AF5.6030401@nicta.com.au>

[-- Attachment #1: Type: text/plain, Size: 2219 bytes --]

  OK - I think I'm on a roll now.  I downloaded the necessary files for the uClinux distribution.  Two questions:
   
  1. I'm having a problem with two steps
          Step 5.  In your uClinux-dist directory, create a symbolic link called linux-2.4.x which points to the linuxppc-2.4 directory (linux-2.4.x -> ../linuxppc-2.4). 
          Step 6.  Ensure that you have the latest uClinux EDK Board Support Package installed (version 1.00d or later is required). 
   
  Now, as far as step 5, am I supposed to have a symbolic link that is named linux-2.4.x placed into the uClinux-dist directory?  Because there's already a folder named linux-2.4.x which was in there already when I untarred everything.  At the command prompt in the uClinux-dist directory I entered the following line:
   
  ln -s ../linuxppc-2.4 linux-2.4.x 
   
  and the result of this operation was to put a symbolic link into my linuxppc-2.4 directory with the name of linux-2.4.x  - is this correct?
   
  Now on to Step 6 problem.  
    How am I supposed to make use of the uClinux EDK Board Support Package 1.0 files?  I'm not sure how to go about using them in the Xilinx Platform Studio in order to generate the necessary auto-config.in file.  
   
  -Vincent
   
  
Aidan Williams <aidan@nicta.com.au> wrote:
  Grant Likely wrote:
> If you use 2.4: You need to use rsync to get the linuxppc-2.4 tree
> because I don't think anyone is maintaining .tar.gz of the rsync
> snapshot. Email Tom Rini and ask him. The linuxppc-2.4 tree is
> special because it's one of the trees that actually has the ML300
> patches integrated. Mainline does not.
> 

For 2.4, I've had good success on two different boards
using the v2pro and virtex4 parts with:

http://www.itee.uq.edu.au/~pml/uclinux_powerpc/

The kernel tarball there has support for various xilinx
supplied peripheral cores..

The neat thing about this approach is that there is a
uclinux BSP for the EDK that is used to generate an
auto-config.in file which you then drop directly
into the linux kernel tree.

Even though it is about the microblaze rather than
the PPC, a helpful "getting started" document is:
http://www.itee.uq.edu.au/~wu/downloads/uClinux_ready_Microblaze_design.pdf

- aidan



[-- Attachment #2: Type: text/html, Size: 2985 bytes --]

^ permalink raw reply

* Re: PowerBook5,4 -- no sound?
From: Johannes Berg @ 2006-04-17 20:57 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <1145307118.3912.11.camel@localhost.localdomain>

[-- Attachment #1: Type: text/plain, Size: 592 bytes --]

On Tue, 2006-04-18 at 06:51 +1000, Benjamin Herrenschmidt wrote:
> > 
> > request_mem_region(80000000 - 80000fff, i2sbus control)
> > 
> >       80000000-8007ffff : 0.80000000:mac-io
> >         8000002c-8000002f : 0.0000004c:fans
> >         80000030-80000033 : 0.0000004c:fans
> >         80000034-80000037 : 0.0000004c:fans
> >         8000004c-8000004f : 0.0000004c:fans
> >         80000050-8000008a : 0.00000050:gpio
> 
> Johannes,
> 
> Looks like the bugs in the device-tree I told you about ...

I suppose so. I guess I'll have to go hardcode it then. Ick.

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 793 bytes --]

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox