* [RFC] pcibus_to_node implementation for ia64
@ 2005-05-06 0:12 Christoph Lameter
2005-05-10 23:56 ` colin ngam
` (11 more replies)
0 siblings, 12 replies; 13+ messages in thread
From: Christoph Lameter @ 2005-05-06 0:12 UTC (permalink / raw)
To: linux-ia64
This is a patch against 2.6.11-rc3-mm3. The mm tree contains a patch
x86-x86_64-pcibus_to_node.patch that introduces the ability to determine
the node number for a pci bus id. pcibus_to_node() is used by these
two patches
numa-aware-block-device-control-structure-allocation.patch
numa-aware-block-device-control-structure-allocation-tidy.patch
in order to allocate blockdev control structures on the node that the
device is connected to.
This patch only provides an actual implementation of pcibus_to_node for
Altix not for other subarches since I have no idea how to do that on these
arches. I would appreciate patches that allow the implementation on the
other ia64 subarches.
This is the first time I have modified the machvec.h file and the related
things so I may have missed something.
Index: linux-2.6.11/arch/ia64/sn/kernel/io_init.c
===================================================================
--- linux-2.6.11.orig/arch/ia64/sn/kernel/io_init.c 2005-05-05 10:59:08.000000000 -0700
+++ linux-2.6.11/arch/ia64/sn/kernel/io_init.c 2005-05-05 16:33:19.000000000 -0700
@@ -67,7 +67,7 @@ static struct sn_pcibus_provider sn_pci_
};
/*
- * Retrieve the DMA Flush List given nasid. This list is needed
+ * Retrieve the DMA Flush List given nasid. This list is needed
* to implement the WAR - Flush DMA data on PIO Reads.
*/
static inline uint64_t
@@ -122,7 +122,7 @@ static inline uint64_t sal_get_pcibus_in
* Retrieve the pci device information given the bus and device|function number.
*/
static inline uint64_t
-sal_get_pcidev_info(u64 segment, u64 bus_number, u64 devfn, u64 pci_dev,
+sal_get_pcidev_info(u64 segment, u64 bus_number, u64 devfn, u64 pci_dev,
u64 sn_irq_info)
{
struct ia64_sal_retval ret_stuff;
@@ -131,13 +131,22 @@ sal_get_pcidev_info(u64 segment, u64 bus
SAL_CALL_NOLOCK(ret_stuff,
(u64) SN_SAL_IOIF_GET_PCIDEV_INFO,
- (u64) segment, (u64) bus_number, (u64) devfn,
+ (u64) segment, (u64) bus_number, (u64) devfn,
(u64) pci_dev,
sn_irq_info, 0, 0);
return ret_stuff.v0;
}
/*
+ * Figure out the node on which a given pci bus resides
+ */
+int sn_pcibus_to_node(struct pci_bus *bus)
+{
+ return nasid_to_cnodeid(NASID_GET(SN_PCIBUS_BUSSOFT(bus)->bs_base));
+}
+EXPORT_SYMBOL(sn_pcibus_to_node);
+
+/*
* sn_alloc_pci_sysdata() - This routine allocates a pci controller
* which is expected as the pci_dev and pci_bus sysdata by the Linux
* PCI infrastructure.
Index: linux-2.6.11/include/asm-ia64/topology.h
===================================================================
--- linux-2.6.11.orig/include/asm-ia64/topology.h 2005-05-05 11:01:46.000000000 -0700
+++ linux-2.6.11/include/asm-ia64/topology.h 2005-05-05 16:55:00.000000000 -0700
@@ -29,6 +29,15 @@
#define node_to_cpumask(node) (node_to_cpu_mask[node])
/*
+ * Returns the cpus local to a given PCI device.
+ * pcibus_to_node is defined in asm/pci.h
+ */
+#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
+ CPU_MASK_ALL : \
+ node_to_cpumask(pcibus_to_node(bus)) \
+ )
+
+/*
* Returns the number of the node containing Node 'nid'.
* Not implemented here. Multi-level hierarchies detected with
* the help of node_distance().
Index: linux-2.6.11/arch/ia64/kernel/topology.c
===================================================================
--- linux-2.6.11.orig/arch/ia64/kernel/topology.c 2005-05-05 11:01:24.000000000 -0700
+++ linux-2.6.11/arch/ia64/kernel/topology.c 2005-05-05 16:44:11.000000000 -0700
@@ -31,7 +31,7 @@ static struct ia64_cpu *sysfs_cpus;
int arch_register_cpu(int num)
{
struct node *parent = NULL;
-
+
#ifdef CONFIG_NUMA
parent = &sysfs_nodes[cpu_to_node(num)];
#endif /* CONFIG_NUMA */
@@ -63,6 +63,16 @@ EXPORT_SYMBOL(arch_register_cpu);
EXPORT_SYMBOL(arch_unregister_cpu);
#endif /*CONFIG_HOTPLUG_CPU*/
+int ia64_pcibus_to_node(struct pci_bus *bus)
+{
+ /*
+ * Confess our ignorance about the location of a
+ * pci bus.
+ */
+ return -1;
+}
+EXPORT_SYMBOL(ia64_pcibus_to_node);
+
static int __init topology_init(void)
{
Index: linux-2.6.11/include/asm-ia64/machvec.h
===================================================================
--- linux-2.6.11.orig/include/asm-ia64/machvec.h 2005-03-01 23:38:10.000000000 -0800
+++ linux-2.6.11/include/asm-ia64/machvec.h 2005-05-05 16:40:25.000000000 -0700
@@ -34,6 +34,7 @@ typedef int ia64_mv_pci_legacy_read_t (s
u8 size);
typedef int ia64_mv_pci_legacy_write_t (struct pci_bus *, u16 port, u32 val,
u8 size);
+typedef int ia64_mv_pcibus_to_node(struct pci_bus *);
/* DMA-mapping interface: */
typedef void ia64_mv_dma_init (void);
@@ -131,6 +132,7 @@ extern void machvec_tlb_migrate_finish (
# define platform_pci_get_legacy_mem ia64_mv.pci_get_legacy_mem
# define platform_pci_legacy_read ia64_mv.pci_legacy_read
# define platform_pci_legacy_write ia64_mv.pci_legacy_write
+# define platform_pcibus_to_node ia64_mv.pcibus_to_node
# define platform_inb ia64_mv.inb
# define platform_inw ia64_mv.inw
# define platform_inl ia64_mv.inl
@@ -179,6 +181,7 @@ struct ia64_machine_vector {
ia64_mv_pci_get_legacy_mem_t *pci_get_legacy_mem;
ia64_mv_pci_legacy_read_t *pci_legacy_read;
ia64_mv_pci_legacy_write_t *pci_legacy_write;
+ ia64_mv_pcibus_to_node *pcibus_to_node;
ia64_mv_inb_t *inb;
ia64_mv_inw_t *inw;
ia64_mv_inl_t *inl;
@@ -223,6 +226,7 @@ struct ia64_machine_vector {
platform_pci_get_legacy_mem, \
platform_pci_legacy_read, \
platform_pci_legacy_write, \
+ platform_pcibus_to_node, \
platform_inb, \
platform_inw, \
platform_inl, \
@@ -341,6 +345,9 @@ extern ia64_mv_dma_supported swiotlb_dm
#ifndef platform_pci_legacy_write
# define platform_pci_legacy_write ia64_pci_legacy_write
#endif
+#ifndef platform_pcibus_to_node
+# define platform_pcibus_to_node ia64_pcibus_to_node
+#endif
#ifndef platform_inb
# define platform_inb __ia64_inb
#endif
Index: linux-2.6.11/include/asm-ia64/machvec_sn2.h
===================================================================
--- linux-2.6.11.orig/include/asm-ia64/machvec_sn2.h 2005-05-05 16:42:45.000000000 -0700
+++ linux-2.6.11/include/asm-ia64/machvec_sn2.h 2005-05-05 16:45:23.000000000 -0700
@@ -44,7 +44,7 @@ extern ia64_mv_local_vector_to_irq sn_lo
extern ia64_mv_pci_get_legacy_mem_t sn_pci_get_legacy_mem;
extern ia64_mv_pci_legacy_read_t sn_pci_legacy_read;
extern ia64_mv_pci_legacy_write_t sn_pci_legacy_write;
-extern ia64_mv_pcibus_to_node sn_pcibus_to_node
+extern ia64_mv_pcibus_to_node sn_pcibus_to_node;
extern ia64_mv_inb_t __sn_inb;
extern ia64_mv_inw_t __sn_inw;
extern ia64_mv_inl_t __sn_inl;
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
@ 2005-05-10 23:56 ` colin ngam
2005-05-11 1:34 ` Christoph Lameter
` (10 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: colin ngam @ 2005-05-10 23:56 UTC (permalink / raw)
To: linux-ia64
Christoph Lameter wrote:
Hi Christoph,
I do not have visibility to how these get used - my apology. However, I
do have some comments for your perusal:
..... SNIP .....
>+int sn_pcibus_to_node(struct pci_bus *bus)
>+{
>+ return nasid_to_cnodeid(NASID_GET(SN_PCIBUS_BUSSOFT(bus)->bs_base));
>
>
The cnodeid returned by the above function can be a node id:
1. With memory but no cpus - Headless Nodes.
2. With no memory and no cpus - IO Nodes.
>+}
>+EXPORT_SYMBOL(sn_pcibus_to_node);
>+
>+/*
> * sn_alloc_pci_sysdata() - This routine allocates a pci controller
> * which is expected as the pci_dev and pci_bus sysdata by the Linux
> * PCI infrastructure.
>Index: linux-2.6.11/include/asm-ia64/topology.h
>===================================================================
>--- linux-2.6.11.orig/include/asm-ia64/topology.h 2005-05-05 11:01:46.000000000 -0700
>+++ linux-2.6.11/include/asm-ia64/topology.h 2005-05-05 16:55:00.000000000 -0700
>@@ -29,6 +29,15 @@
> #define node_to_cpumask(node) (node_to_cpu_mask[node])
>
> /*
>+ * Returns the cpus local to a given PCI device.
>+ * pcibus_to_node is defined in asm/pci.h
>+ */
>+#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
>+ CPU_MASK_ALL : \
>+ node_to_cpumask(pcibus_to_node(bus)) \
>+ )
>
>
pcibus_to_cpumask() can return 0 if the node is an ionode - TIO or
Headless Node(node with no CPUs but has memory).
... SNIP ...
Thanks.
colin
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
2005-05-10 23:56 ` colin ngam
@ 2005-05-11 1:34 ` Christoph Lameter
2005-05-11 2:42 ` colin ngam
` (9 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Christoph Lameter @ 2005-05-11 1:34 UTC (permalink / raw)
To: linux-ia64
On Tue, 10 May 2005, colin ngam wrote:
> >+int sn_pcibus_to_node(struct pci_bus *bus)
> >+{
> >+ return nasid_to_cnodeid(NASID_GET(SN_PCIBUS_BUSSOFT(bus)->bs_base));
> >
> >
> The cnodeid returned by the above function can be a node id:
> 1. With memory but no cpus - Headless Nodes.
> 2. With no memory and no cpus - IO Nodes.
How do I make the function return the correct result?
> pcibus_to_cpumask() can return 0 if the node is an ionode - TIO or
> Headless Node(node with no CPUs but has memory).
How can I determine that? A return value of 0 would mean that the block
i/o layer would allocate the control structures on node 0. The correct
result is -1 if there is no node associated with the device.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
2005-05-10 23:56 ` colin ngam
2005-05-11 1:34 ` Christoph Lameter
@ 2005-05-11 2:42 ` colin ngam
2005-05-11 6:51 ` Christoph Lameter
` (8 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: colin ngam @ 2005-05-11 2:42 UTC (permalink / raw)
To: linux-ia64
Christoph Lameter wrote:
Hi Christoph,
>On Tue, 10 May 2005, colin ngam wrote:
>
>
>
>>>+int sn_pcibus_to_node(struct pci_bus *bus)
>>>+{
>>>+ return nasid_to_cnodeid(NASID_GET(SN_PCIBUS_BUSSOFT(bus)->bs_base));
>>>
>>>
>>>
>>>
>>The cnodeid returned by the above function can be a node id:
>> 1. With memory but no cpus - Headless Nodes.
>> 2. With no memory and no cpus - IO Nodes.
>>
>>
>
>How do I make the function return the correct result?
>
>
This is the correct result - with respect to which node the
bus/device/function is directly connected. Either than using this
function in pcibus_to_cpumask(), what other purpose is this routine
targeted? I assume you want the node id for memory placement? If that
is the case it may return the wrong node id. If you expect this node id
to have cpus(to feed the result to get pcibus_to_cpumask()), it may
return the wrong node id.
Depending on what you want, you have to test the node id to see if it
contains memory or if it contains cpus or both.
>
>
>>pcibus_to_cpumask() can return 0 if the node is an ionode - TIO or
>>Headless Node(node with no CPUs but has memory).
>>
>>
>
>How can I determine that? A return value of 0 would mean that the block
>i/o layer would allocate the control structures on node 0. The correct
>result is -1 if there is no node associated with the device.
>
>
There is always a node associated with the device - the issue is that
the node may not have cpu but has memory, or may not have both memory
and cpu. I am a little bit confused about your comment above and the
usage of pcibus_to_cpumask(). Are you using sn_pcibus_to_node() to
target memory placement and pcibus_to_cpumask() to target interrupt?
Thanks.
colin
>-
>To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
` (2 preceding siblings ...)
2005-05-11 2:42 ` colin ngam
@ 2005-05-11 6:51 ` Christoph Lameter
2005-05-11 15:29 ` Jesse Barnes
` (7 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Christoph Lameter @ 2005-05-11 6:51 UTC (permalink / raw)
To: linux-ia64
On Tue, 10 May 2005, colin ngam wrote:
> >How do I make the function return the correct result?
> This is the correct result - with respect to which node the
> bus/device/function is directly connected. Either than using this
> function in pcibus_to_cpumask(), what other purpose is this routine
> targeted? I assume you want the node id for memory placement? If that
> is the case it may return the wrong node id. If you expect this node id
> to have cpus(to feed the result to get pcibus_to_cpumask()), it may
> return the wrong node id.
It needs to return a node id that the system knows about. The zonelist
for the node will take care of memory placement.
> There is always a node associated with the device - the issue is that
> the node may not have cpu but has memory, or may not have both memory
> and cpu. I am a little bit confused about your comment above and the
> usage of pcibus_to_cpumask(). Are you using sn_pcibus_to_node() to
> target memory placement and pcibus_to_cpumask() to target interrupt?
pcibus_to_node is currently only used to target memory placement. It can
return -1 for unknown or any node id that the page allocator knows about.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
` (3 preceding siblings ...)
2005-05-11 6:51 ` Christoph Lameter
@ 2005-05-11 15:29 ` Jesse Barnes
2005-05-11 15:44 ` colin ngam
` (6 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Jesse Barnes @ 2005-05-11 15:29 UTC (permalink / raw)
To: linux-ia64
On Tuesday, May 10, 2005 7:42 pm, colin ngam wrote:
> This is the correct result - with respect to which node the
> bus/device/function is directly connected.
Not really, since the kernel is only aware of nodes (for most purposes)
that actually have memory. If the I/O and memoryless nodes were listed
in the SLIT and SRAT tables, this would be the case (the kernel would
build empty node structures for those nodes), but it's not at the
moment.
> Either than using this
> function in pcibus_to_cpumask(), what other purpose is this routine
> targeted?
You mean 'other than pcibus_to_cpumask'? The idea is to use it for
memory allocation in device drivers, as well as the obvious use in the
actual DMA mapping layer (e.g. pci_alloc_consistent). It could also be
used to simplify the interrupt targetting code a little, iirc, assuming
the above condition was met (that is, that all nodes, regardless of
configuration, had pgdat structures associated with them).
> Depending on what you want, you have to test the node id to see if it
> contains memory or if it contains cpus or both.
That's certainly another way to go--just make the function do a search
to find the closest node with memory (and/or CPUs) all by itself. The
obvious disadvantage is that you'll incur that cost on every function
call unless you build a lookup table at boot time or somesuch.
> There is always a node associated with the device - the issue is that
> the node may not have cpu but has memory, or may not have both memory
> and cpu.
That's where things get confusing, since 'node' is an overloaded term
here. It does *not* return a node that you can pass into
alloc_pages_node for instance, which I think is what Christoph wants.
It does, however, return a node id in the cnodeid sense, but one that
could be useless in most parts of the kernel.
Jesse
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
` (4 preceding siblings ...)
2005-05-11 15:29 ` Jesse Barnes
@ 2005-05-11 15:44 ` colin ngam
2005-05-11 15:49 ` Christoph Lameter
` (5 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: colin ngam @ 2005-05-11 15:44 UTC (permalink / raw)
To: linux-ia64
Jesse Barnes wrote:
Hi Jesse,
>On Tuesday, May 10, 2005 7:42 pm, colin ngam wrote:
>
>
>>This is the correct result - with respect to which node the
>>bus/device/function is directly connected.
>>
>>
>
>Not really, since the kernel is only aware of nodes (for most purposes)
>that actually have memory. If the I/O and memoryless nodes were listed
>in the SLIT and SRAT tables, this would be the case (the kernel would
>build empty node structures for those nodes), but it's not at the
>moment.
>
>
Fine line here .. a valid node id is always returned and that nodeid is
a valid node id for addressing the bus/devices and may be the only node
id you can use to address these buses and devices, except when the IO
Brick is dual ported. It is valid with respect to addressing the
bus/devices but may not be "valid" with respect to "having Memory" or
"having cpus". Depends on what you expect :-) Depends on how you want
to use it :-) Irrespective, headless and memoryless nodes will exist on
the system, therefore you cannot assume that the node id that is used in
addressing the pci bus is actually a node that contains memory or cpus.
>
>
>>Either than using this
>>function in pcibus_to_cpumask(), what other purpose is this routine
>>targeted?
>>
>>
>
>You mean 'other than pcibus_to_cpumask'? The idea is to use it for
>memory allocation in device drivers, as well as the obvious use in the
>actual DMA mapping layer (e.g. pci_alloc_consistent). It could also be
>used to simplify the interrupt targetting code a little, iirc, assuming
>the above condition was met (that is, that all nodes, regardless of
>configuration, had pgdat structures associated with them).
>
>
>
>>Depending on what you want, you have to test the node id to see if it
>>contains memory or if it contains cpus or both.
>>
>>
>
>That's certainly another way to go--just make the function do a search
>to find the closest node with memory (and/or CPUs) all by itself. The
>obvious disadvantage is that you'll incur that cost on every function
>call unless you build a lookup table at boot time or somesuch.
>
>
How often do you do this to worry about latency?
>
>
>>There is always a node associated with the device - the issue is that
>>the node may not have cpu but has memory, or may not have both memory
>>and cpu.
>>
>>
>
>That's where things get confusing, since 'node' is an overloaded term
>here. It does *not* return a node that you can pass into
>alloc_pages_node for instance, which I think is what Christoph wants.
>It does, however, return a node id in the cnodeid sense, but one that
>could be useless in most parts of the kernel.
>
>
Exactly.
Thanks.
colin
>Jesse
>
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
` (5 preceding siblings ...)
2005-05-11 15:44 ` colin ngam
@ 2005-05-11 15:49 ` Christoph Lameter
2005-05-11 15:52 ` colin ngam
` (4 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Christoph Lameter @ 2005-05-11 15:49 UTC (permalink / raw)
To: linux-ia64
On Wed, 11 May 2005, colin ngam wrote:
> >That's certainly another way to go--just make the function do a search
> >to find the closest node with memory (and/or CPUs) all by itself. The
> >obvious disadvantage is that you'll incur that cost on every function
> >call unless you build a lookup table at boot time or somesuch.
This search is performed by alloc_pages using the zonelists.
> How often do you do this to worry about latency?
Currently pcibus_to_node is only used during device initialization to
allocate the control structures on a node.
> >That's where things get confusing, since 'node' is an overloaded term
> >here. It does *not* return a node that you can pass into
> >alloc_pages_node for instance, which I think is what Christoph wants.
> >It does, however, return a node id in the cnodeid sense, but one that
> >could be useless in most parts of the kernel.
> >
> >
> Exactly.
How do I get a node that I can pass to alloc_pages_node?
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
` (6 preceding siblings ...)
2005-05-11 15:49 ` Christoph Lameter
@ 2005-05-11 15:52 ` colin ngam
2005-05-11 15:54 ` Jesse Barnes
` (3 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: colin ngam @ 2005-05-11 15:52 UTC (permalink / raw)
To: linux-ia64
Hi Christoph,
>
>How do I get a node that I can pass to alloc_pages_node?
>
>
Working on it - I have talked to Jack this morning - I am concerned about
future changes that may break current ideas.
Will get back to you ASAP.
Thanks.
colin
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
` (7 preceding siblings ...)
2005-05-11 15:52 ` colin ngam
@ 2005-05-11 15:54 ` Jesse Barnes
2005-05-11 15:58 ` Jesse Barnes
` (2 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Jesse Barnes @ 2005-05-11 15:54 UTC (permalink / raw)
To: linux-ia64
On Wednesday, May 11, 2005 8:44 am, colin ngam wrote:
> Fine line here .. a valid node id is always returned and that nodeid
> is a valid node id for addressing the bus/devices and may be the only
> node id you can use to address these buses and devices, except when
> the IO Brick is dual ported. It is valid with respect to addressing
> the bus/devices but may not be "valid" with respect to "having
> Memory" or "having cpus". Depends on what you expect :-) Depends on
> how you want to use it :-)
But not valid in the sense that you can pass it to any of the kernel
routines that say they take a 'node' argument. IMO, that's a bug in
the implementation of I/O and memoryless nodes in sn2. As Jack and I
discussed last year at OLS (he convinced me of this), a node is any
combination of memory, CPUs, and/or I/O. If I/O nodes (or nodes w/o
memory generally) are special cased, we're breaking that assumption,
making things harder for every caller and user of nodes.
> >That's certainly another way to go--just make the function do a
> > search to find the closest node with memory (and/or CPUs) all by
> > itself. The obvious disadvantage is that you'll incur that cost on
> > every function call unless you build a lookup table at boot time or
> > somesuch.
>
> How often do you do this to worry about latency?
Not sure, but the search could get pretty expensive so it's probably
best to avoid it just to give callers flexibility.
Jesse
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
` (8 preceding siblings ...)
2005-05-11 15:54 ` Jesse Barnes
@ 2005-05-11 15:58 ` Jesse Barnes
2005-05-11 16:05 ` colin ngam
2005-05-11 16:19 ` Jack Steiner
11 siblings, 0 replies; 13+ messages in thread
From: Jesse Barnes @ 2005-05-11 15:58 UTC (permalink / raw)
To: linux-ia64
On Wednesday, May 11, 2005 8:49 am, you wrote:
> On Wed, 11 May 2005, colin ngam wrote:
> > >That's certainly another way to go--just make the function do a
> > > search to find the closest node with memory (and/or CPUs) all by
> > > itself. The obvious disadvantage is that you'll incur that cost
> > > on every function call unless you build a lookup table at boot
> > > time or somesuch.
>
> This search is performed by alloc_pages using the zonelists.
Not right now it isn't. In the snippet you posted you might get back a
special node id, one that corresponds to an I/O node, which doesn't
have a pgdat associated with it, and therefore won't work when passed
to alloc_pages_node (I think it'll panic).
> How do I get a node that I can pass to alloc_pages_node?
Fix the sn2 implementation of I/O nodes to make them more like regular
nodes. Basically,
o put them in the SLIT and SRAT tables
o make sure empty pgdats are created for memoryless nodes (in
discontig.c), ideally on the closest node containing memory.
The first step isn't strictly necessary since you could fake it (you'd
probably have to pull some code from io_init.c out and make it usable
at early boot), but is probably preferable to putting together an ugly
hack.
Jesse
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
` (9 preceding siblings ...)
2005-05-11 15:58 ` Jesse Barnes
@ 2005-05-11 16:05 ` colin ngam
2005-05-11 16:19 ` Jack Steiner
11 siblings, 0 replies; 13+ messages in thread
From: colin ngam @ 2005-05-11 16:05 UTC (permalink / raw)
To: linux-ia64
Jesse Barnes wrote:
>On Wednesday, May 11, 2005 8:44 am, colin ngam wrote:
>
>
>>Fine line here .. a valid node id is always returned and that nodeid
>>is a valid node id for addressing the bus/devices and may be the only
>>node id you can use to address these buses and devices, except when
>>the IO Brick is dual ported. It is valid with respect to addressing
>>the bus/devices but may not be "valid" with respect to "having
>>Memory" or "having cpus". Depends on what you expect :-) Depends on
>>how you want to use it :-)
>>
>>
>
>But not valid in the sense that you can pass it to any of the kernel
>routines that say they take a 'node' argument. IMO, that's a bug in
>the implementation of I/O and memoryless nodes in sn2. As Jack and I
>discussed last year at OLS (he convinced me of this), a node is any
>combination of memory, CPUs, and/or I/O. If I/O nodes (or nodes w/o
>memory generally) are special cased, we're breaking that assumption,
>making things harder for every caller and user of nodes.
>
>
I think we are in agreement here. They should not be special-cased - but
callers and users of a node id have to be cognizant that a node does not
necessarily mean it has cpus, memory or both.
Thanks.
colin
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] pcibus_to_node implementation for ia64
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
` (10 preceding siblings ...)
2005-05-11 16:05 ` colin ngam
@ 2005-05-11 16:19 ` Jack Steiner
11 siblings, 0 replies; 13+ messages in thread
From: Jack Steiner @ 2005-05-11 16:19 UTC (permalink / raw)
To: linux-ia64
On Wed, May 11, 2005 at 08:58:26AM -0700, Jesse Barnes wrote:
> On Wednesday, May 11, 2005 8:49 am, you wrote:
> > On Wed, 11 May 2005, colin ngam wrote:
> > > >That's certainly another way to go--just make the function do a
> > > > search to find the closest node with memory (and/or CPUs) all by
> > > > itself. The obvious disadvantage is that you'll incur that cost
> > > > on every function call unless you build a lookup table at boot
> > > > time or somesuch.
> >
> > This search is performed by alloc_pages using the zonelists.
>
> Not right now it isn't. In the snippet you posted you might get back a
> special node id, one that corresponds to an I/O node, which doesn't
> have a pgdat associated with it, and therefore won't work when passed
> to alloc_pages_node (I think it'll panic).
>
> > How do I get a node that I can pass to alloc_pages_node?
>
> Fix the sn2 implementation of I/O nodes to make them more like regular
> nodes. Basically,
>
> o put them in the SLIT and SRAT tables
> o make sure empty pgdats are created for memoryless nodes (in
> discontig.c), ideally on the closest node containing memory.
You are correct.
Most of the confusion here results from overloading the
term "node". Our (SGI) usage of the term "node" for TIO nodes is
incompatible with "node" as used in the rest of the kernel.
TIO "nodes" are unusual in the sense that they have no cpus or memory, but DO
have the properties of distance & "nearness" to ordinary nodes. For most
purposes, you can think of TIO nodes as ordinary nodes with cpus & memory
removed - only IO remains. (Physically, this is not correct but it's close
enough....).
TIO nodes _should_ be described in the SRAT & SLIT tables.
Currently, there are several problems with treating TIO nodes as ordinary
nodes & describing them in the SRAT/SLIT. AFAICT, ACPI2.0 does not support
an SRAT description for this type of node. Only PROCESSOR & MEMORY SRAT
entries exist.
The second limitation is that ACPI 2.0 limits the size of the SLIT to
256x256. Our largest system currently supports 256 nodes with memory/cpus +
an additional 256 TIO nodes. We would need at least a 512x512 SLIT to describe
our largest system. In ACPI3.0, these limitations are removed (I think).
I think it is time to take a step back & figure out how to correctly
describe TIO nodes & make them fit within the ACPI & kernel structures.
(BTW, does anyone know when ACPI3.0 will be implemented for IA64 ???)
>
> The first step isn't strictly necessary since you could fake it (you'd
> probably have to pull some code from io_init.c out and make it usable
> at early boot), but is probably preferable to putting together an ugly
> hack.
>
> Jesse
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Thanks
Jack Steiner (steiner@sgi.com) 651-683-5302
Principal Engineer SGI - Silicon Graphics, Inc.
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2005-05-11 16:19 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-05-06 0:12 [RFC] pcibus_to_node implementation for ia64 Christoph Lameter
2005-05-10 23:56 ` colin ngam
2005-05-11 1:34 ` Christoph Lameter
2005-05-11 2:42 ` colin ngam
2005-05-11 6:51 ` Christoph Lameter
2005-05-11 15:29 ` Jesse Barnes
2005-05-11 15:44 ` colin ngam
2005-05-11 15:49 ` Christoph Lameter
2005-05-11 15:52 ` colin ngam
2005-05-11 15:54 ` Jesse Barnes
2005-05-11 15:58 ` Jesse Barnes
2005-05-11 16:05 ` colin ngam
2005-05-11 16:19 ` Jack Steiner
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox