* [PATCH 1/5] hugetlb: numafy several functions @ 2008-04-11 23:44 Nishanth Aravamudan 2008-04-11 23:47 ` [RFC][PATCH 2/5] " Nishanth Aravamudan 0 siblings, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-11 23:44 UTC (permalink / raw) To: wli; +Cc: clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin Add node-parameterized helpers for dequeue_huge_page, alloc_fresh_huge_page, adjust_pool_surplus and try_to_free_low. Also have update_and_free_page() take a nid parameter. These changes are necessary to add sysfs attributes to specify the number of static hugepages on NUMA nodes. Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e13a7b2..8faaa16 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -71,6 +71,20 @@ static void enqueue_huge_page(struct page *page) free_huge_pages_node[nid]++; } +static struct page *dequeue_huge_page_node(struct vm_area_struct *vma, + int nid) +{ + struct page *page; + + page = list_entry(hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); + free_huge_pages--; + free_huge_pages_node[nid]--; + if (vma && vma->vm_flags & VM_MAYSHARE) + resv_huge_pages--; + return page; +} + static struct page *dequeue_huge_page(void) { int nid; @@ -78,11 +92,7 @@ static struct page *dequeue_huge_page(void) for (nid = 0; nid < MAX_NUMNODES; ++nid) { if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + page = dequeue_huge_page_node(NULL, nid); break; } } @@ -106,13 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, nid = zone_to_nid(zone); if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; - if (vma && 
vma->vm_flags & VM_MAYSHARE) - resv_huge_pages--; + page = dequeue_huge_page_node(vma, nid); break; } } @@ -120,11 +124,11 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, return page; } -static void update_and_free_page(struct page *page) +static void update_and_free_page(int nid, struct page *page) { int i; nr_huge_pages--; - nr_huge_pages_node[page_to_nid(page)]--; + nr_huge_pages_node[nid]--; for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | @@ -148,7 +152,7 @@ static void free_huge_page(struct page *page) spin_lock(&hugetlb_lock); if (surplus_huge_pages_node[nid]) { - update_and_free_page(page); + update_and_free_page(nid, page); surplus_huge_pages--; surplus_huge_pages_node[nid]--; } else { @@ -164,6 +168,20 @@ static void free_huge_page(struct page *page) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. 
*/ +static int adjust_pool_surplus_node(int delta, int nid) +{ + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 && !surplus_huge_pages_node[nid]) + return 0; + /* Surplus cannot exceed the total number of pages */ + if (delta > 0 && surplus_huge_pages_node[nid] >= + nr_huge_pages_node[nid]) + return 0; + surplus_huge_pages += delta; + surplus_huge_pages_node[nid] += delta; + return 1; +} + static int adjust_pool_surplus(int delta) { static int prev_nid; @@ -175,19 +193,9 @@ static int adjust_pool_surplus(int delta) nid = next_node(nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); - - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && surplus_huge_pages_node[nid] >= - nr_huge_pages_node[nid]) - continue; - - surplus_huge_pages += delta; - surplus_huge_pages_node[nid] += delta; - ret = 1; - break; + ret = adjust_pool_surplus_node(delta, nid); + if (ret == 1) + break; } while (nid != prev_nid); prev_nid = nid; @@ -450,7 +458,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) page = list_entry(hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - update_and_free_page(page); + update_and_free_page(nid, page); free_huge_pages--; free_huge_pages_node[nid]--; surplus_huge_pages--; @@ -556,25 +564,35 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM +static void try_to_free_low_node(unsigned long count, int nid) +{ + struct page *page, *next; + list_for_each_entry_safe(page, next, &hugepage_freelists[nid], lru) { + if (count >= nr_huge_pages_node[nid]) + return; + if (PageHighMem(page)) + continue; + list_del(&page->lru); + update_and_free_page(nid, page); + free_huge_pages--; + free_huge_pages_node[nid]--; + } +} + static void try_to_free_low(unsigned long count) { int i; 
for (i = 0; i < MAX_NUMNODES; ++i) { - struct page *page, *next; - list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { - if (count >= nr_huge_pages) - return; - if (PageHighMem(page)) - continue; - list_del(&page->lru); - update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[page_to_nid(page)]--; - } + if (count >= nr_huge_pages) + return; + try_to_free_low_node(count, i); } } #else +static inline void try_to_free_low_node(unsigned long count, int nid) +{ +} static inline void try_to_free_low(unsigned long count) { } @@ -639,7 +657,7 @@ static unsigned long set_max_huge_pages(unsigned long count) struct page *page = dequeue_huge_page(); if (!page) break; - update_and_free_page(page); + update_and_free_page(page_to_nid(page), page); } while (count < persistent_huge_pages) { if (!adjust_pool_surplus(1)) -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 51+ messages in thread
* [RFC][PATCH 2/5] hugetlb: numafy several functions 2008-04-11 23:44 [PATCH 1/5] hugetlb: numafy several functions Nishanth Aravamudan @ 2008-04-11 23:47 ` Nishanth Aravamudan 2008-04-11 23:47 ` [PATCH 3/5] hugetlb: interleave dequeueing of huge pages Nishanth Aravamudan 2008-04-14 14:52 ` [RFC][PATCH 2/5] hugetlb: numafy several functions Adam Litke 0 siblings, 2 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-11 23:47 UTC (permalink / raw) To: wli; +Cc: clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin Allow specifying the number of hugepages to allocate on a particular node. Our current global sysctl will try its best to put hugepages equally on each node, but that may not always be desired. This allows the admin to control the layout of hugepage allocation at a finer level (while not breaking the existing interface). Add callbacks in the sysfs node registration and unregistration functions into hugetlb to add the nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB. This new interface requires some changes to the nr_hugepages sysctl as well. We update max_huge_pages via a call to set_max_huge_pages() with the value written into the nr_hugepages sysctl, even when only reading. This is not very efficient. More importantly, when nr_hugepages can be altered by other interfaces (per-node sysfs attributes), this side effect of reading can invoke set_max_huge_pages with a value less than nr_hugepages, resulting in hugepages being freed! Rather than relying on set_max_huge_pages() at all in the read-path, update max_huge_pages (which is still the sysctl variable) to the appropriate value on reads (before invoking the generic sysctl handler) and call set_max_huge_pages() on writes (after invoking the generic sysctl handler). Thanks to Dean Luick for finding some bugs in my previous posting of the patch. 
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> --- Same patch as before, but an RFC this time to decide if /sys/devices/system/node is where we want to be putting the pool allocators. As discussed in a separate thread with Nick ("[patch 00/17] multi size, and giatn hugetlb page support, 1GB hugetlb for x86" on linux-mm), perhaps a better location would be /sys/kernel, but then we'd need to replicate a bit of the NUMA layout into /sys/kernel. However, the advantage would be when we put the multiple hugepage pool allocation interfaces in /sys/kernel, all of the hugetlb related interfaces will be in one place (as presumably we'll want per-node control on a per-pool basis!). diff --git a/drivers/base/node.c b/drivers/base/node.c index 4c2caff..96aa493 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -154,6 +154,7 @@ int register_node(struct node *node, int num, struct node *parent) sysdev_create_file(&node->sysdev, &attr_meminfo); sysdev_create_file(&node->sysdev, &attr_numastat); sysdev_create_file(&node->sysdev, &attr_distance); + hugetlb_register_node(node); } return error; } @@ -171,6 +172,7 @@ void unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_meminfo); sysdev_remove_file(&node->sysdev, &attr_numastat); sysdev_remove_file(&node->sysdev, &attr_distance); + hugetlb_unregister_node(node); sysdev_unregister(&node->sysdev); } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a79e80b..ac8c8d9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -6,7 +6,9 @@ #ifdef CONFIG_HUGETLB_PAGE #include <linux/mempolicy.h> +#include <linux/node.h> #include <linux/shm.h> +#include <linux/sysdev.h> #include <asm/tlbflush.h> #include <asm/hugetlb.h> @@ -27,6 +29,13 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); 
+#ifdef CONFIG_NUMA +int hugetlb_register_node(struct node *); +void hugetlb_unregister_node(struct node *); +#else +#define hugetlb_register_node(node) do {} while (0) +#define hugetlb_unregister_node(node) do {} while (0) +#endif unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); @@ -70,6 +79,8 @@ static inline unsigned long hugetlb_total_pages(void) #define unmap_hugepage_range(vma, start, end) BUG() #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 +#define hugetlb_register_node(node) do {} while (0) +#define hugetlb_unregister_node(node) do {} while (0) #define follow_huge_pmd(mm, addr, pmd, write) NULL #define prepare_hugepage_range(addr,len) (-EINVAL) #define pmd_huge(x) 0 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8faaa16..d35b087 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -562,7 +562,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array) return nr; } -#ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low_node(unsigned long count, int nid) { @@ -578,7 +577,14 @@ static void try_to_free_low_node(unsigned long count, int nid) free_huge_pages_node[nid]--; } } +#else +static inline void try_to_free_low_node(unsigned long count, int nid) +{ +} +#endif +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_HIGHMEM static void try_to_free_low(unsigned long count) { int i; @@ -590,18 +596,15 @@ static void try_to_free_low(unsigned long count) } } #else -static inline void try_to_free_low_node(unsigned long count, int nid) -{ -} static inline void try_to_free_low(unsigned long count) { } #endif #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) -static unsigned long set_max_huge_pages(unsigned long count) +static void set_max_huge_pages(unsigned long count) { - unsigned long min_count, ret; + unsigned long min_count; /* * Increase the pool size @@ -664,17 +667,21 @@ static unsigned long 
set_max_huge_pages(unsigned long count) break; } out: - ret = persistent_huge_pages; spin_unlock(&hugetlb_lock); - return ret; } int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { + if (!write) { + spin_lock(&hugetlb_lock); + max_huge_pages = persistent_huge_pages; + spin_unlock(&hugetlb_lock); + } proc_doulongvec_minmax(table, write, file, buffer, length, ppos); - max_huge_pages = set_max_huge_pages(max_huge_pages); + if (write) + set_max_huge_pages(max_huge_pages); return 0; } @@ -729,6 +736,115 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, surplus_huge_pages_node[nid]); } +#ifdef CONFIG_NUMA +static ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *dev, + char *buf) +{ + return sprintf(buf, "%u\n", nr_huge_pages_node[dev->id]); +} + +#define persistent_huge_pages_node(nid) \ + (nr_huge_pages_node[nid] - surplus_huge_pages_node[nid]) +static ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev, + const char *buf, size_t count) +{ + int nid = dev->id; + unsigned long target; + unsigned long free_on_other_nodes; + unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10); + + /* + * Increase the pool size on the node + * First take pages out of surplus state. Then make up the + * remaining difference by allocating fresh huge pages. + * + * We might race with alloc_buddy_huge_page() here and be unable + * to convert a surplus huge page to a normal huge page. That is + * not critical, though, it just means the overall size of the + * pool might be one hugepage larger than it needs to be, but + * within all the constraints specified by the sysctls. 
+ */ + spin_lock(&hugetlb_lock); + while (surplus_huge_pages_node[nid] && + nr_huge_pages_req > persistent_huge_pages_node(nid)) { + if (!adjust_pool_surplus_node(-1, nid)) + break; + } + + while (nr_huge_pages_req > persistent_huge_pages_node(nid)) { + struct page *ret; + /* + * If this allocation races such that we no longer need the + * page, free_huge_page will handle it by freeing the page + * and reducing the surplus. + */ + spin_unlock(&hugetlb_lock); + ret = alloc_fresh_huge_page_node(nid); + spin_lock(&hugetlb_lock); + if (!ret) + goto out; + + } + + if (nr_huge_pages_req >= nr_huge_pages_node[nid]) + goto out; + + /* + * Decrease the pool size + * First return free pages to the buddy allocator (being careful + * to keep enough around to satisfy reservations). Then place + * pages into surplus state as needed so the pool will shrink + * to the desired size as pages become free. + * + * By placing pages into the surplus state independent of the + * overcommit value, we are allowing the surplus pool size to + * exceed overcommit. There are few sane options here. Since + * alloc_buddy_huge_page() is checking the global counter, + * though, we'll note that we're not allowed to exceed surplus + * and won't grow the pool anywhere else. Not until one of the + * sysctls are changed, or the surplus pages go out of use. 
+ */ + free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid]; + if (free_on_other_nodes >= resv_huge_pages) { + /* other nodes can satisfy reserve */ + target = nr_huge_pages_req; + } else { + /* this node needs some free to satisfy reserve */ + target = max((resv_huge_pages - free_on_other_nodes), + nr_huge_pages_req); + } + try_to_free_low_node(nid, target); + while (target < persistent_huge_pages_node(nid)) { + struct page *page = dequeue_huge_page_node(NULL, nid); + if (!page) + break; + update_and_free_page(nid, page); + } + + while (target < persistent_huge_pages_node(nid)) { + if (!adjust_pool_surplus_node(1, nid)) + break; + } +out: + spin_unlock(&hugetlb_lock); + return count; +} + +static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR, + hugetlb_read_nr_hugepages_node, + hugetlb_write_nr_hugepages_node); + +int hugetlb_register_node(struct node *node) +{ + return sysdev_create_file(&node->sysdev, &attr_nr_hugepages); +} + +void hugetlb_unregister_node(struct node *node) +{ + sysdev_remove_file(&node->sysdev, &attr_nr_hugepages); +} +#endif + /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 51+ messages in thread
* [PATCH 3/5] hugetlb: interleave dequeueing of huge pages 2008-04-11 23:47 ` [RFC][PATCH 2/5] " Nishanth Aravamudan @ 2008-04-11 23:47 ` Nishanth Aravamudan 2008-04-11 23:49 ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Nishanth Aravamudan 2008-04-14 14:52 ` [RFC][PATCH 2/5] hugetlb: numafy several functions Adam Litke 1 sibling, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-11 23:47 UTC (permalink / raw) To: wli; +Cc: clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin Currently, when shrinking the hugetlb pool, we free all of the pages on node 0, then all the pages on node 1, etc. With this patch we instead interleave over the nodes with memory. If some particularly node should be cleared first, the to-be-introduced sysfs allocator can be used for finer-grained control. This also helps with keeping the pool balanced as we change the pool at run-time. Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d35b087..18ece9e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -87,15 +87,32 @@ static struct page *dequeue_huge_page_node(struct vm_area_struct *vma, static struct page *dequeue_huge_page(void) { - int nid; struct page *page = NULL; + int start_nid; + int next_nid; + + start_nid = hugetlb_next_nid; + + do { + if (!list_empty(&hugepage_freelists[hugetlb_next_nid])) + page = dequeue_huge_page_node(NULL, hugetlb_next_nid); + /* + * Use a helper variable to find the next node and then + * copy it back to hugetlb_next_nid afterwards: + * otherwise there's a window in which a racer might + * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really + * doesn't matter if occasionally a racer chooses the + * same nid as we do. Move nid forward in the mask even + * if we just successfully allocated a hugepage so that + * the next caller gets hugepages on the next node. 
+ */ + next_nid = next_node(hugetlb_next_nid, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + hugetlb_next_nid = next_nid; + } while (!page && hugetlb_next_nid != start_nid); - for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&hugepage_freelists[nid])) { - page = dequeue_huge_page_node(NULL, nid); - break; - } - } return page; } -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 51+ messages in thread
* [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-11 23:47 ` [PATCH 3/5] hugetlb: interleave dequeueing of huge pages Nishanth Aravamudan @ 2008-04-11 23:49 ` Nishanth Aravamudan 2008-04-11 23:50 ` [RFC][PATCH 5/5] Documentation: update ABI and hugetlbpage.txt for per-node files Nishanth Aravamudan 2008-04-11 23:56 ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Greg KH 0 siblings, 2 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-11 23:49 UTC (permalink / raw) To: wli; +Cc: clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin, gregkh /sys/devices/system/node represents the current NUMA configuration of the machine, but is undocumented in the ABI files. Add bare-bones documentation for these files. Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> --- Greg, is something like this what you'd want? Should I be striving for more detail? Should the file have a preamble indicating none of it exists if !NUMA? diff --git a/Documentation/ABI/testing/sysfs-devices-system-node b/Documentation/ABI/testing/sysfs-devices-system-node new file mode 100644 index 0000000..97d6145 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-system-node @@ -0,0 +1,59 @@ +What: /sys/devices/system/node/has_cpu +Date: October 2007 +Contact: Lee Schermerhorn <Lee.Schermerhonr@hp.com> +Description: + List of nodes which have one ore more CPUs. + +What: /sys/devices/system/node/has_high_memory +Date: October 2007 +Contact: Lee Schermerhorn <Lee.Schermerhorn@hp.com> +Description: + List of nodes which have regular or high memory. This + file will not exist if CONFIG_HIGHMEM is off. + +What: /sys/devices/system/node/has_normal_memory +Date: October 2007 +Contact: Lee Schermerhorn <Lee.Schermerhorn@hp.com> +Description: + List of nodes which have regular memory. + +What: /sys/devices/system/node/online +Date: October 2007 +Contact: Lee Schermerhorn <Lee.Schermerhorn@hp.com> +Description: + List of online nodes. 
+ +What: /sys/devices/system/node/possible +Date: October 2007 +Contact: Lee Schermerhorn <Lee.Schermerhorn@hp.com> +Description: + List of nodes which could go online. + +What: /sys/devices/system/node/<node>/<cpu> +Date: June 2006 +Contact: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> +Description: + Symlink to the sysfs CPU information for each <cpu> on + <node>. + +What: /sys/devices/system/node/<node>/cpumap +Date: +Contact: Christoph Lameter <clameter@sgi.com> +Description: + Hexadecimal mask of which CPUs are on <node>. + +What: /sys/devices/system/node/<node>/meminfo +Date: +Contact: Christoph Lameter <clameter@sgi.com> +Description: + Memory information for <node>. + NOTE: This file violates the sysfs rules for one value + per file. + +What: /sys/devices/system/node/<node>/numastat +Date: +Contact: Christoph Lameter <clameter@sgi.com> +Description: + NUMA statistics for <node>. + NOTE: This file violates the sysfs rules for one value + per file. -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 51+ messages in thread
* [RFC][PATCH 5/5] Documentation: update ABI and hugetlbpage.txt for per-node files 2008-04-11 23:49 ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Nishanth Aravamudan @ 2008-04-11 23:50 ` Nishanth Aravamudan 2008-04-11 23:56 ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Greg KH 1 sibling, 0 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-11 23:50 UTC (permalink / raw) To: wli; +Cc: clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin, gregkh --- This patch will change if we decide to move the per-node interface to another location in sysfs. Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> diff --git a/Documentation/ABI/testing/sysfs-devices-system-node b/Documentation/ABI/testing/sysfs-devices-system-node index 97d6145..5766902 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-node +++ b/Documentation/ABI/testing/sysfs-devices-system-node @@ -57,3 +57,10 @@ Description: NUMA statistics for <node>. NOTE: This file violates the sysfs rules for one value per file. + +What: /sys/devices/system/node/<node>/nr_hugepages +Date: April 2008 +Contact: Nish Aravamudan <nacc@us.ibm.com> +Description: + Interface to allocate (and check) hugepages on <node>. + This file will not exist if CONFIG_HUGETLB_PAGE is off. diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 3102b81..b749607 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -80,6 +80,13 @@ of getting physical contiguous pages is still very high). In either case, adminstrators will want to verify the number of hugepages actually allocated by checking the sysctl or meminfo. +/sys/devices/system/node/nodeX/nr_hugepages allows for finer-grained +control of the hugepage pool on NUMA machines. The functionality is the +same as for nr_hugepages, but the effects are restricted to the node in +question. 
Similarly, administrators will want to verify the number of +hugepages actually allocated or freed by checking the per-node meminfo +or nr_hugepages file. + /proc/sys/vm/nr_overcommit_hugepages indicates how large the pool of hugepages can grow, if more hugepages than /proc/sys/vm/nr_hugepages are requested by applications. echo'ing any non-zero value into this file -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-11 23:49 ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Nishanth Aravamudan 2008-04-11 23:50 ` [RFC][PATCH 5/5] Documentation: update ABI and hugetlbpage.txt for per-node files Nishanth Aravamudan @ 2008-04-11 23:56 ` Greg KH 2008-04-12 0:27 ` Nishanth Aravamudan 2008-04-12 9:41 ` Nick Piggin 1 sibling, 2 replies; 51+ messages in thread From: Greg KH @ 2008-04-11 23:56 UTC (permalink / raw) To: Nishanth Aravamudan Cc: wli, clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote: > /sys/devices/system/node represents the current NUMA configuration of > the machine, but is undocumented in the ABI files. Add bare-bones > documentation for these files. > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> > > --- > Greg, is something like this what you'd want? Yes it is, thanks for doing it. > Should I be striving for more detail? You might want to show what you mean by "list of nodes". But other than that, this is a great start. > Should the file have a preamble indicating none of it exists if !NUMA? Yes, that would be helpful for people who might worry that they do not see these files :) thanks, greg k-h -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-11 23:56 ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Greg KH @ 2008-04-12 0:27 ` Nishanth Aravamudan 2008-04-12 9:41 ` Nick Piggin 1 sibling, 0 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-12 0:27 UTC (permalink / raw) To: Greg KH; +Cc: wli, clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin On 11.04.2008 [16:56:48 -0700], Greg KH wrote: > On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote: > > /sys/devices/system/node represents the current NUMA configuration of > > the machine, but is undocumented in the ABI files. Add bare-bones > > documentation for these files. > > > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> > > > > --- > > Greg, is something like this what you'd want? > > Yes it is, thanks for doing it. Ok, good. > > Should I be striving for more detail? > > You might want to show what you mean by "list of nodes". But other than > that, this is a great start. Yeah, I was thinking for a few of the files, an example output might clarify their use{,fulness}. > > Should the file have a preamble indicating none of it exists if !NUMA? > > Yes, that would be helpful for people who might worry that they do not > see these files :) Ok, I'll make that change in the next version. Thanks, Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-11 23:56 ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Greg KH 2008-04-12 0:27 ` Nishanth Aravamudan @ 2008-04-12 9:41 ` Nick Piggin 2008-04-12 10:26 ` Christoph Lameter 2008-04-13 3:41 ` Greg KH 1 sibling, 2 replies; 51+ messages in thread From: Nick Piggin @ 2008-04-12 9:41 UTC (permalink / raw) To: Greg KH Cc: Nishanth Aravamudan, wli, clameter, agl, luick, Lee.Schermerhorn, linux-mm On Fri, Apr 11, 2008 at 04:56:48PM -0700, Greg KH wrote: > On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote: > > /sys/devices/system/node represents the current NUMA configuration of > > the machine, but is undocumented in the ABI files. Add bare-bones > > documentation for these files. > > > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> > > > > --- > > Greg, is something like this what you'd want? > > Yes it is, thanks for doing it. Can you comment on the aspect of configuring various kernel hugetlb configuration parameters? Specifically, what directory it should go in? IMO it should be /sys/kernel/* /sys/devices/system/etc should be fine e.g. for showing how many pages are available in a given node, or what kinds of TLBs the CPU has, but I would have thought that configuring the kernel's hugetlb settings should be in /sys/kernel. Then again, I can't say I'm up to speed on sysfs policy so the main thing I care about is that it is consistent and correct. Thanks, Nick -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-12 9:41 ` Nick Piggin @ 2008-04-12 10:26 ` Christoph Lameter 2008-04-14 21:09 ` Nishanth Aravamudan 2008-04-13 3:41 ` Greg KH 1 sibling, 1 reply; 51+ messages in thread From: Christoph Lameter @ 2008-04-12 10:26 UTC (permalink / raw) To: Nick Piggin Cc: Greg KH, Nishanth Aravamudan, wli, agl, luick, Lee.Schermerhorn, linux-mm On Sat, 12 Apr 2008, Nick Piggin wrote: > Can you comment on the aspect of configuring various kernel hugetlb > configuration parameters? Specifically, what directory it should go in? > IMO it should be /sys/kernel/* Yes that would be more consistent. However, it will break the tools that now access /sys/devices. Something like /sys/kernel/node/<nodenr>/<numa setting> and /sys/kernel/memory/<global setting> ? -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-12 10:26 ` Christoph Lameter @ 2008-04-14 21:09 ` Nishanth Aravamudan 0 siblings, 0 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-14 21:09 UTC (permalink / raw) To: Christoph Lameter Cc: Nick Piggin, Greg KH, wli, agl, luick, Lee.Schermerhorn, linux-mm On 12.04.2008 [03:26:35 -0700], Christoph Lameter wrote: > On Sat, 12 Apr 2008, Nick Piggin wrote: > > > Can you comment on the aspect of configuring various kernel hugetlb > > configuration parameters? Especifically, what directory it should go in? > > IMO it should be /sys/kernel/* > > Yes that would be more consistent. However, it will break the tools that > now access /sys/devices. Since the ABI was undocumented, do we have any idea what those tools would be? libnuma seems to have some references to sysfs but they result in warnings, not errors, AFAICT (and I will add libnuma as a consumer of the interfaces in question, in my patch to add the ABI documentation). > Something like > > /sys/kernel/node/<nodenr>/<numa setting> Well, right now, the node devices are anchored in the right place, I think, and represent a real non-global property (unlike the /sys/kernel bits). My understanding is that Nick is wondering if /sys/devices/system/node/nodeX/* should be read-only or if kernel-changing attributes should also be placed there? You had a similar question earlier, and we never really resolved it, beyond saying this was the first attempt at adding a tunable in the directory :) > and > > /sys/kernel/memory/<global setting> This is an interesting idea. However, moving the meminfo-like files into this directory would probably require us obeying the sysfs rules (which many of the /sys/devices/system/node files do not!) for one-value-per-file, which would make meminfo lookup non-atomic/less useful? So, what settings are you thinking go there? 
Or am I completely misunderstanding, and are the settings you refer to in both cases strictly hugetlb-related settings? Thanks, Nish -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-12 9:41 ` Nick Piggin 2008-04-12 10:26 ` Christoph Lameter @ 2008-04-13 3:41 ` Greg KH 2008-04-14 21:05 ` Nishanth Aravamudan 1 sibling, 1 reply; 51+ messages in thread From: Greg KH @ 2008-04-13 3:41 UTC (permalink / raw) To: Nick Piggin Cc: Nishanth Aravamudan, wli, clameter, agl, luick, Lee.Schermerhorn, linux-mm On Sat, Apr 12, 2008 at 11:41:18AM +0200, Nick Piggin wrote: > On Fri, Apr 11, 2008 at 04:56:48PM -0700, Greg KH wrote: > > On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote: > > > /sys/devices/system/node represents the current NUMA configuration of > > > the machine, but is undocumented in the ABI files. Add bare-bones > > > documentation for these files. > > > > > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> > > > > > > --- > > > Greg, is something like this what you'd want? > > > > Yes it is, thanks for doing it. > > Can you comment on the aspect of configuring various kernel hugetlb > configuration parameters? Especifically, what directory it should go in? > IMO it should be /sys/kernel/* I don't really know. > /sys/devices/system/etc should be fine eg. for showing how many pages are > available in a given node, or what kinds of TLBs the CPU has, but I would > have thought that configuring the kernel's hugetlb settings should be > in /sys/kernel. /sys/devices/system is for "sysdev" devices, a breed of device structures that are problematic to use, and are on my TODO list to rework. If you need a hugetlb parameter to be tied to a cpu or other system device, then it should go under here. Otherwise, if it is just a "system wide" parameter, then put it in /sys/kernel/ thanks, greg k-h -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-13 3:41 ` Greg KH @ 2008-04-14 21:05 ` Nishanth Aravamudan 2008-04-17 23:16 ` Nishanth Aravamudan 0 siblings, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-14 21:05 UTC (permalink / raw) To: Greg KH; +Cc: Nick Piggin, wli, clameter, agl, luick, Lee.Schermerhorn, linux-mm On 12.04.2008 [20:41:36 -0700], Greg KH wrote: > On Sat, Apr 12, 2008 at 11:41:18AM +0200, Nick Piggin wrote: > > On Fri, Apr 11, 2008 at 04:56:48PM -0700, Greg KH wrote: > > > On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote: > > > > /sys/devices/system/node represents the current NUMA configuration of > > > > the machine, but is undocumented in the ABI files. Add bare-bones > > > > documentation for these files. > > > > > > > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> > > > > > > > > --- > > > > Greg, is something like this what you'd want? > > > > > > Yes it is, thanks for doing it. > > > > Can you comment on the aspect of configuring various kernel hugetlb > > configuration parameters? Especifically, what directory it should go in? > > IMO it should be /sys/kernel/* > > I don't really know. > > > /sys/devices/system/etc should be fine eg. for showing how many pages are > > available in a given node, or what kinds of TLBs the CPU has, but I would > > have thought that configuring the kernel's hugetlb settings should be > > in /sys/kernel. > > /sys/devices/system are for "sysdev" devices, a breed of device > structures that are problimatic to use, and are on my TODO list to > rework. If you need a hugetlb paramter to be tied to a cpu or other > system device, then it should go under here. > > Otherwise, if it is just a "system wide" parameter, then put it in > /sys/kernel/ We have both, and that's kind of where things are being discussed right now. Currently, we have: /proc/sys/vm/nr_hugepages /proc/sys/vm/nr_overcommit_hugepages which are global sysctls. 
My patchset would add: /sys/devices/system/node/nodeX/nr_hugepages to allow for finer-grained control of the hugetlb pool allocation. Nick/Andi's patchset would modify /proc/sys/vm/nr_hugepages to allow specifying the pool sizes for multiple hugepage sizes. To make my patchset and Nick's work well together, I think we'd need a per-node, per-hugepage-size interface in sysfs. I pointed out to Nick that it might be better to make the extended interface (supporting multiple hugepage sizes) be in sysfs altogether, and leave /proc/sys/vm/nr_hugepages alone (as only controlling the default hugepage size). That would leave us with [1]: /sys/kernel/nr_hugepages --> nr_hugepages_2M /sys/kernel/nr_hugepages_2M /sys/kernel/nr_hugepages_1G /sys/kernel/nr_overcommit_hugepages --> nr_overcommit_hugepages_2M /sys/kernel/nr_overcommit_hugepages_2M /sys/kernel/nr_overcommit_hugepages_1G and [2] /sys/devices/system/node/nodeX/nr_hugepages --> nr_hugepages_2M /sys/devices/system/node/nodeX/nr_hugepages_2M /sys/devices/system/node/nodeX/nr_hugepages_1G The questions I see are (with my answers): Is this separation correct? - I believe this puts the globals in one place and the per-nodes in another (both of which are correct) keeping things accurate. The per-node interface would be the first writable attribute in /sys/devices/system/node, though. Is this separation confusing to an administrator? - Similar to the previous question, I think the separation corresponds well to the system's layout. Is there a better way of presenting these attributes? - Nick's alternative was to (I think, please CMIIW) have: /sys/kernel/hugetlb/2M/nr_hugepages /sys/kernel/hugetlb/2M/nr_overcommit_hugepages /sys/kernel/hugetlb/2M/nodeX/nr_hugepages /sys/kernel/hugetlb/2M/nodeX/nr_overcommit_hugepages with perhaps symlinks in /sys/kernel/ or /sys/kernel/hugetlb directly to the default pools. And similar directories/files for 1G pages.
This seems like a lot of duplication of the NUMA layout, but I can see it also being better in that all of the hugetlb-related interface is in one place. [3] Do you see a particular more-sysfs-way here, Greg? Thanks for reading this particularly long e-mail, Nish [1] Nick suggested using directories in /sys/kernel per-hugepage-size, but I'm not sure how they should be named, so I went with the simpler filename-style, to make the point clearer. [2] I have a patch to allow for per-node dynamic pool control, but it's pretty gross. Right now, we let the memory policy enforce where we get hugepages from, presuming we can allocate there. If we had per-node control, we'd need some way to specify a restriction on how many hugepages can be allocated on a particular node down to alloc_pages, or use a round-robin style, which would probably break mempolicies. For now, I've let the patch alone while I try to find a better way. [3] Is there an in-between, perhaps, that we could have the real files in /sys/devices/system/node, but have symlinks, like /sys/kernel/hugetlb/nodeX/nr_hugepages_2M --> /sys/devices/system/node/nodeX/nr_hugepages_2M ? That seems like overkill... -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-14 21:05 ` Nishanth Aravamudan @ 2008-04-17 23:16 ` Nishanth Aravamudan 2008-04-17 23:22 ` Christoph Lameter 0 siblings, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-17 23:16 UTC (permalink / raw) To: Greg KH; +Cc: Nick Piggin, wli, clameter, agl, luick, Lee.Schermerhorn, linux-mm On 14.04.2008 [14:05:06 -0700], Nishanth Aravamudan wrote: > On 12.04.2008 [20:41:36 -0700], Greg KH wrote: > > On Sat, Apr 12, 2008 at 11:41:18AM +0200, Nick Piggin wrote: > > > On Fri, Apr 11, 2008 at 04:56:48PM -0700, Greg KH wrote: > > > > On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote: > > > > > /sys/devices/system/node represents the current NUMA configuration of > > > > > the machine, but is undocumented in the ABI files. Add bare-bones > > > > > documentation for these files. > > > > > > > > > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> > > > > > > > > > > --- > > > > > Greg, is something like this what you'd want? > > > > > > > > Yes it is, thanks for doing it. > > > > > > Can you comment on the aspect of configuring various kernel hugetlb > > > configuration parameters? Especifically, what directory it should go in? > > > IMO it should be /sys/kernel/* > > > > I don't really know. > > > > > /sys/devices/system/etc should be fine eg. for showing how many pages are > > > available in a given node, or what kinds of TLBs the CPU has, but I would > > > have thought that configuring the kernel's hugetlb settings should be > > > in /sys/kernel. > > > > /sys/devices/system are for "sysdev" devices, a breed of device > > structures that are problimatic to use, and are on my TODO list to > > rework. If you need a hugetlb paramter to be tied to a cpu or other > > system device, then it should go under here. 
> > > > Otherwise, if it is just a "system wide" parameter, then put it in > > /sys/kernel/ > > We have both, and that's kind of where things are being discussed right > now. <snip> > Do you see a particular more-sysfs-way here, Greg? So I've received no comments yet? Perhaps I should leave things the way they are (per-node files in /sys/devices/system/node) and add nr_hugepages to /sys/kernel? Do we want to put it in a subdirectory of /sys/kernel? What should the subdir be called? "hugetlb" (refers to the implementation?) or "hugepages"? Do we want nr_hugepages in sysfs to actually be a symlink to the underlying default hugepage size (in my patch, there will be only one, but it allows for future-proofing)? Or I can make it a real file in my patch and the multiple hugepage sizes at run-time patchset (which I'm willing to help with) can change it to a symlink? Thoughts? Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-17 23:16 ` Nishanth Aravamudan @ 2008-04-17 23:22 ` Christoph Lameter 2008-04-17 23:36 ` Nishanth Aravamudan 2008-04-22 5:14 ` Nick Piggin 0 siblings, 2 replies; 51+ messages in thread From: Christoph Lameter @ 2008-04-17 23:22 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On Thu, 17 Apr 2008, Nishanth Aravamudan wrote: > > Do you see a particular more-sysfs-way here, Greg? > > So I've received no comments yet? Perhaps I should leave things the way > they are (per-node files in /sys/devices/system/node) and add > nr_hugepages to /sys/kernel? The strange location of the node directories has always irked me. > > Do we want to put it in a subdirectory of /sys/kernel? What should the > subdir be called? "hugetlb" (refers to the implementation?) or > "hugepages"? How about: /sys/kernel/node<nr>/<node specific setting/status files> ? -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-17 23:22 ` Christoph Lameter @ 2008-04-17 23:36 ` Nishanth Aravamudan 2008-04-17 23:39 ` Christoph Lameter 2008-04-22 5:14 ` Nick Piggin 1 sibling, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-17 23:36 UTC (permalink / raw) To: Christoph Lameter Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On 17.04.2008 [16:22:17 -0700], Christoph Lameter wrote: > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote: > > > > Do you see a particular more-sysfs-way here, Greg? > > > > So I've received no comments yet? Perhaps I should leave things the way > > they are (per-node files in /sys/devices/system/node) and add > > nr_hugepages to /sys/kernel? > > The strange location of the node directories has always irked me. But it's now part of the ABI? We'd have to deprecate the current location and such. I'm ok with that, or maybe duplicating the information for now, while deprecating the old location, but don't want to spend the time doing that if we don't want it to be changed. > > Do we want to put it in a subdirectory of /sys/kernel? What should the > > subdir be called? "hugetlb" (refers to the implementation?) or > > "hugepages"? > > How about: > > /sys/kernel/node<nr>/<node specific setting/status files> ? That seems fine to me. I will work on it. However, as I mentioned in a previous e-mail, the files in /sys/devices/system/node/node<nr>/ already violate the "one value per file" rule in several instances. I'm guessing Greg won't want me moving the files and keeping that violation? Thanks, Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-17 23:36 ` Nishanth Aravamudan @ 2008-04-17 23:39 ` Christoph Lameter 2008-04-18 6:04 ` Nishanth Aravamudan 2008-04-20 2:21 ` Greg KH 0 siblings, 2 replies; 51+ messages in thread From: Christoph Lameter @ 2008-04-17 23:39 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On Thu, 17 Apr 2008, Nishanth Aravamudan wrote: > That seems fine to me. I will work on it. However, as I mentioned in a > previous e-mail, the files in /sys/devices/system/node/node<nr>/ > already violate the "one value per file" rule in several instances. I'm > guessing Greg won't want me moving the files and keeping that violation? That violation is replicated in /proc/meminfo /proc/vmstat etc etc. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-17 23:39 ` Christoph Lameter @ 2008-04-18 6:04 ` Nishanth Aravamudan 2008-04-18 17:27 ` Nishanth Aravamudan 2008-04-20 2:21 ` Greg KH 1 sibling, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-18 6:04 UTC (permalink / raw) To: Christoph Lameter Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On 17.04.2008 [16:39:56 -0700], Christoph Lameter wrote: > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote: > > > That seems fine to me. I will work on it. However, as I mentioned in a > > previous e-mail, the files in /sys/devices/system/node/node<nr>/ > > already violate the "one value per file" rule in several instances. I'm > > guessing Greg won't want me moving the files and keeping that violation? > > That violation is replicated in /proc/meminfo /proc/vmstat etc etc. Right, but /proc doesn't have such a restriction (the "one value per file" rule). I'm not sure how the meminfo, etc. files in sysfs got put in past Greg, but that's how it is :) Thanks, Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-18 6:04 ` Nishanth Aravamudan @ 2008-04-18 17:27 ` Nishanth Aravamudan 2008-04-20 2:24 ` Greg KH 0 siblings, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-18 17:27 UTC (permalink / raw) To: Christoph Lameter Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On 17.04.2008 [23:04:04 -0700], Nishanth Aravamudan wrote: > On 17.04.2008 [16:39:56 -0700], Christoph Lameter wrote: > > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote: > > > > > That seems fine to me. I will work on it. However, as I mentioned in a > > > previous e-mail, the files in /sys/devices/system/node/node<nr>/ > > > already violate the "one value per file" rule in several instances. I'm > > > guessing Greg won't want me moving the files and keeping that violation? > > > > That violation is replicated in /proc/meminfo /proc/vmstat etc etc. > > Right, but /proc doesn't have such a restriction (the "one value per > file" rule). I'm not sure how the meminfo, etc. files in sysfs got put > in past Greg, but that's how it is :) Greg, can you give any insight here? Are we better off leaving the files in question in /sys/devices/system/node/node<nr>/{meminfo,numastat,etc} since they are part of the ABI there and already violate the rules for sysfs? Or can we move them to /sys/kernel and continue to violate the rules? In this case, I don't see any way to provide a "snapshot" of the system's memory information without all the values being in one file? Thanks, Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-18 17:27 ` Nishanth Aravamudan @ 2008-04-20 2:24 ` Greg KH 2008-04-21 16:43 ` Nishanth Aravamudan 0 siblings, 1 reply; 51+ messages in thread From: Greg KH @ 2008-04-20 2:24 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Christoph Lameter, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On Fri, Apr 18, 2008 at 10:27:30AM -0700, Nishanth Aravamudan wrote: > On 17.04.2008 [23:04:04 -0700], Nishanth Aravamudan wrote: > > On 17.04.2008 [16:39:56 -0700], Christoph Lameter wrote: > > > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote: > > > > > > > That seems fine to me. I will work on it. However, as I mentioned in a > > > > previous e-mail, the files in /sys/devices/system/node/node<nr>/ > > > > already violate the "one value per file" rule in several instances. I'm > > > > guessing Greg won't want me moving the files and keeping that violation? > > > > > > That violation is replicated in /proc/meminfo /proc/vmstat etc etc. > > > > Right, but /proc doesn't have such a restriction (the "one value per > > file" rule). I'm not sure how the meminfo, etc. files in sysfs got put > > in past Greg, but that's how it is :) > > Greg, can you give any insight here? Are we better off leaving the files > in question in /sys/devices/system/node/node<nr>/{meminfo,numastat,etc} > since they are part of the ABI there and already violate the rules for > sysfs? Or can we move them to /sys/kernel and continue to violate the > rules? In this case, I don't see any way to provide a "snapshot" of the > system's memory information without all the values being in one file? Yeah, the "snapshot" issue is what allows those values all to be present at once. As for where to place them, are there any tools out there that are expecting the current file locations? If so, can they work if they are in both places? 
If you think they should be moved, I'll defer to your judgement, but it will be a bit harder, as you will be working with "raw" kobjects in that case, not the sysdev structures, which do make things a bit easier for you. sorry for the delay, am traveling... greg k-h -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-20 2:24 ` Greg KH @ 2008-04-21 16:43 ` Nishanth Aravamudan 0 siblings, 0 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-21 16:43 UTC (permalink / raw) To: Greg KH Cc: Christoph Lameter, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On 19.04.2008 [19:24:21 -0700], Greg KH wrote: > On Fri, Apr 18, 2008 at 10:27:30AM -0700, Nishanth Aravamudan wrote: > > On 17.04.2008 [23:04:04 -0700], Nishanth Aravamudan wrote: > > > On 17.04.2008 [16:39:56 -0700], Christoph Lameter wrote: > > > > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote: > > > > > > > > > That seems fine to me. I will work on it. However, as I mentioned in a > > > > > previous e-mail, the files in /sys/devices/system/node/node<nr>/ > > > > > already violate the "one value per file" rule in several instances. I'm > > > > > guessing Greg won't want me moving the files and keeping that violation? > > > > > > > > That violation is replicated in /proc/meminfo /proc/vmstat etc etc. > > > > > > Right, but /proc doesn't have such a restriction (the "one value per > > > file" rule). I'm not sure how the meminfo, etc. files in sysfs got put > > > in past Greg, but that's how it is :) > > > > Greg, can you give any insight here? Are we better off leaving the files > > in question in /sys/devices/system/node/node<nr>/{meminfo,numastat,etc} > > since they are part of the ABI there and already violate the rules for > > sysfs? Or can we move them to /sys/kernel and continue to violate the > > rules? In this case, I don't see any way to provide a "snapshot" of the > > system's memory information without all the values being in one file? > > Yeah, the "snapshot" issue is what allows those values all to be present > at once. > > As for where to place them, are there any tools out there that are > expecting the current file locations? If so, can they work if they are > in both places? 
I believe libnuma uses /sys/devices/system/node for some information. As long as the files are in both places, nothing should be affected, though. And we could deprecate the old files (if we decide to move them) for the longer-term and update the necessary libraries. > If you think they should be moved, I'll defer to your judgement, but > it will be a bit harder, as you will be working with "raw" kobjects in > that case, not the sysdev structures, which do make things a bit > easier for you. Yeah, I noticed that while fiddling around. Still possible, just not as easy. > sorry for the delay, am traveling... No problem, thanks for the input! -Nish -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-17 23:39 ` Christoph Lameter 2008-04-18 6:04 ` Nishanth Aravamudan @ 2008-04-20 2:21 ` Greg KH 2008-04-21 6:06 ` Christoph Lameter 1 sibling, 1 reply; 51+ messages in thread From: Greg KH @ 2008-04-20 2:21 UTC (permalink / raw) To: Christoph Lameter Cc: Nishanth Aravamudan, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On Thu, Apr 17, 2008 at 04:39:56PM -0700, Christoph Lameter wrote: > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote: > > > That seems fine to me. I will work on it. However, as I mentioned in a > > previous e-mail, the files in /sys/devices/system/node/node<nr>/ > > already violate the "one value per file" rule in several instances. I'm > > guessing Greg won't want me moving the files and keeping that violation? > > That violation is replicated in /proc/meminfo /proc/vmstat etc etc. Those are /proc files, not sysfs files :) -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-20 2:21 ` Greg KH @ 2008-04-21 6:06 ` Christoph Lameter 2008-04-21 16:41 ` Nishanth Aravamudan 0 siblings, 1 reply; 51+ messages in thread From: Christoph Lameter @ 2008-04-21 6:06 UTC (permalink / raw) To: Greg KH Cc: Nishanth Aravamudan, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On Sat, 19 Apr 2008, Greg KH wrote: > > That violation is replicated in /proc/meminfo /proc/vmstat etc etc. > > Those are /proc files, not sysfs files :) Hmmm.. Maybe we need to have /proc/node<x>/meminfo etc that replicates the /proc content for each node? Otherwise this cannot be symmetric because the different mount points have different requirements on how the output should look like. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-21 6:06 ` Christoph Lameter @ 2008-04-21 16:41 ` Nishanth Aravamudan 0 siblings, 0 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-21 16:41 UTC (permalink / raw) To: Christoph Lameter Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On 20.04.2008 [23:06:48 -0700], Christoph Lameter wrote: > On Sat, 19 Apr 2008, Greg KH wrote: > > > > That violation is replicated in /proc/meminfo /proc/vmstat etc etc. > > > > Those are /proc files, not sysfs files :) > > Hmmm.. Maybe we need to have /proc/node<x>/meminfo etc that replicates > the /proc content for each node? Otherwise this cannot be symmetric > because the different mount points have different requirements on how > the output should look like. But the memory info has nothing to do with process specific information, which is what "new" /proc files should contain (or maybe I'm mis-remembering). The current location (/sys/devices/system/node) reflects that memory is tied to system devices called "nodes"; I'm not entirely convinced we'd want to change that? Especially, as Greg noted, it's easier to obtain the information we want off a sysdev, rather than the raw kobject. While I understand the desire to maintain sanity for sysfs files, perhaps the meminfo files (and numastat, etc) are just special, in that they only make sense as a collective (the snapshot mentioned earlier in this thread) -- to get a view of the component (memory, NUMA statistics, etc) as a whole. In that sense, perhaps the sysfs notion should be extended to "One logical value per file", where logical is defined as the minimum atomic information needed by the user [1]? Or perhaps sysfs just isn't the best place for this information, I don't know. I don't believe I am the person to make that call. 
Thanks, Nish [1] That would allow files like available_clocksource not to seem like violators of the sysfs rule: $ sudo cat /sys/devices/system/clocksource/clocksource0/available_clocksource hpet acpi_pm pit jiffies tsc -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-17 23:22 ` Christoph Lameter 2008-04-17 23:36 ` Nishanth Aravamudan @ 2008-04-22 5:14 ` Nick Piggin 2008-04-22 16:56 ` Nishanth Aravamudan 1 sibling, 1 reply; 51+ messages in thread From: Nick Piggin @ 2008-04-22 5:14 UTC (permalink / raw) To: Christoph Lameter Cc: Nishanth Aravamudan, Greg KH, wli, agl, luick, Lee.Schermerhorn, linux-mm On Thu, Apr 17, 2008 at 04:22:17PM -0700, Christoph Lameter wrote: > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote: > > > > Do you see a particular more-sysfs-way here, Greg? > > > > So I've received no comments yet? Perhaps I should leave things the way > > they are (per-node files in /sys/devices/system/node) and add > > nr_hugepages to /sys/kernel? > > The strange location of the node directories has always irked me. > > > > Do we want to put it in a subdirectory of /sys/kernel? What should the > > subdir be called? "hugetlb" (refers to the implementation?) or > > "hugepages"? > > How about: > > /sys/kernel/node<nr>/<node specific setting/status files> ? I don't like /sys/kernel/node :P Under /sys/kernel, we should have parameters to set and query various kernel functionality. Control of the kernel software implementation. I think this is pretty well agreed (although there are maybe grey areas I guess) So anyway, underneath that directory, we should have more subdirectories grouping subsystems or similar functionality. We aren't tuning the node, but the hugepages subsystem. /sys/kernel/huge{tlb|pages}/ Under that directory could be global settings as well as per node settings or subdirectories and so on. The layout should be similar to /proc/sys/* IMO. Actually it should be much neater since we have some hindsight, but unfortunately it is looking like it is actually messier ;) Let's really try to put some thought into new sysfs locations. Not just will it work, but is it logical and will it work tomorrow...
-- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-22 5:14 ` Nick Piggin @ 2008-04-22 16:56 ` Nishanth Aravamudan 2008-04-23 1:03 ` Nick Piggin 0 siblings, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-22 16:56 UTC (permalink / raw) To: Nick Piggin Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn, linux-mm On 22.04.2008 [07:14:47 +0200], Nick Piggin wrote: > On Thu, Apr 17, 2008 at 04:22:17PM -0700, Christoph Lameter wrote: > > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote: > > > > > > Do you see a particular more-sysfs-way here, Greg? > > > > > > So I've received no comments yet? Perhaps I should leave things the way > > > they are (per-node files in /sys/devices/system/node) and add > > > nr_hugepages to /sys/kernel? > > > > The strange location of the node directories has always irked me. > > > > > > Do we want to put it in a subdirectory of /sys/kernel? What should the > > > subdir be called? "hugetlb" (refers to the implementation?) or > > > "hugepages"? > > > > How about: > > > > /sys/kernel/node<nr>/<node specific setting/status files> ? > > I don't like /sys/kernel/node :P Neither do I. My reasoning is that it duplicates information available elsewhere -- what Christoph was suggesting, I think, was moving all of the node files there. That seems like it might be outside the scope of our discussion given the files we have now (but becomes intertwined once we start talking about the intersection of hugetlb + NUMA in per-node control). > Under /sys/kernel, we should have parameters to set and query various > kernel functionality. Control of the kernel software implementation. I > think this is pretty well agreed (although there are maybe grey areas > I guess) I am fine with this claim. > So anyway, underneath that directory, we should have more > subdirectories grouping subsystems or sumilar functionality. We aren't > tuning node, but hugepages subsystem. 
> > /sys/kernel/huge{tlb|pages}/ > > Under that directory could be global settings as well as per node > settings or subdirectories and so on. The layout should be similar to > /proc/sys/* IMO. Actually it should be much neater since we have some > hindsight, but unfortunately it is looking like it is actually messier > ;) Well, that's where I start to get a little stymied. It seems odd to me to have some per-node information in one place and some in another, where the two are not even rooted at the same location, beyond both being in sysfs. Perhaps, as I've mentioned elsewhere, we simply have symlinks underneath /sys/kernel/hugepages into /sys/devices/system/node/nodeX ... but the immediate ugliness I see there is either we duplicate the directories, or we symlink the directories and there are now two paths into all the NUMA information, where one (/sys/kernel/hugepages/nodeX) seems like it should only have hugepage information. I'd prefer hugepages to hugetlb, I think, but don't necessarily care one way or the other. > Let's really try to put some thought into new sysfs locations. Not > just will it work, but is it logical and will it work tomorrow... I agree and that's why I keep sending out e-mails about it :) Perhaps I should prototype /sys/kernel/hugepages so we can see how it would look as a first step, and then decide given that layout how we want the per-node information to be presented? Thanks, Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-22 16:56 ` Nishanth Aravamudan @ 2008-04-23 1:03 ` Nick Piggin 2008-04-23 18:32 ` Nishanth Aravamudan 0 siblings, 1 reply; 51+ messages in thread From: Nick Piggin @ 2008-04-23 1:03 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn, linux-mm On Tue, Apr 22, 2008 at 09:56:02AM -0700, Nishanth Aravamudan wrote: > On 22.04.2008 [07:14:47 +0200], Nick Piggin wrote: > > > So anyway, underneath that directory, we should have more > > subdirectories grouping subsystems or sumilar functionality. We aren't > > tuning node, but hugepages subsystem. > > > > /sys/kernel/huge{tlb|pages}/ > > > > Under that directory could be global settings as well as per node > > settings or subdirectories and so on. The layout should be similar to > > /proc/sys/* IMO. Actually it should be much neater since we have some > > hindsight, but unfortunately it is looking like it is actually messier > > ;) > > Well, that's where I start to get a little stymied. It seems odd to me > to have some per-node information in one place and some in another, > where the two are not even rooted at the same location, beyond both > being in sysfs. Why are nodes special? Why wouldn't you also group per-CPU information in one place, for example? Anyway, I'd argue that you wouldn't group either of those things primarily. You would group by functionality first. If you wanted to tweak or view your hugepages parameters, where do you start? /sys/kernel/node is unintuitive; /sys/kernel/hugepages is easy. > Perhaps, as I've mentioned elsewhere, we simply have > symlinks underneath /sys/kernel/hugepages into > /sys/devices/system/node/nodeX ... but the immediate ugliness I see > there is either we duplicate the directories, or we symlink the I don't like the idea of putting kernel implementation parameters in /sys/devices/ (grey area for device drivers, perhaps). 
> directories and there are now to paths into all the NUMA information, > where one (/sys/kernel/hugepages/nodeX) seems like it should only have > hugepage information. But the idea of getting "all NUMA information" from one place just seems wrong to me. Getting all *hardware* NUMA information from one place is fine. But kernel implementation wise I think you are really interested in subsystems *first*. Just to demonstrate how badly "all NUMA information in one place" generalises: you also then need a completely different place to store global information for that subsystem, and a different place again to store per-CPU information. > I'd prefer hugepages to hugetlb, I think, but don't necessarily care one > way or the other. I'm fine with that. > > Let's really try to put some thought into new sysfs locations. Not > > just will it work, but is it logical and will it work tomorrow... > > I agree and that's why I keep sending out e-mails about it :) Perhaps I > should prototype /sys/kernel/hugepages so we can see how it would look > as a first step, and then decide given that layout how we want the > per-node information to be presented? Sure. Thanks, Nick -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-23 1:03 ` Nick Piggin @ 2008-04-23 18:32 ` Nishanth Aravamudan 2008-04-23 19:07 ` Adam Litke 2008-04-24 7:13 ` Nick Piggin 0 siblings, 2 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-23 18:32 UTC (permalink / raw) To: Nick Piggin Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn, linux-mm On 23.04.2008 [03:03:00 +0200], Nick Piggin wrote: > On Tue, Apr 22, 2008 at 09:56:02AM -0700, Nishanth Aravamudan wrote: > > On 22.04.2008 [07:14:47 +0200], Nick Piggin wrote: > > > > > So anyway, underneath that directory, we should have more > > > subdirectories grouping subsystems or sumilar functionality. We aren't > > > tuning node, but hugepages subsystem. > > > > > > /sys/kernel/huge{tlb|pages}/ > > > > > > Under that directory could be global settings as well as per node > > > settings or subdirectories and so on. The layout should be similar to > > > /proc/sys/* IMO. Actually it should be much neater since we have some > > > hindsight, but unfortunately it is looking like it is actually messier > > > ;) > > > > Well, that's where I start to get a little stymied. It seems odd to me > > to have some per-node information in one place and some in another, > > where the two are not even rooted at the same location, beyond both > > being in sysfs. > > Why are nodes special? Why wouldn't you also group per-CPU information > in one place, for example? > > Anyway, I'd argue that you wouldn't group either of those things > primarily. You would group by functionality first. > > If you wanted to tweak or view your hugepages parameters, where do you > start? /sys/kernel/node is unintuitive; /sys/kernel/hugepages is easy. Let's be clear, here. I do *not* agree with Christoph's /sys/kernel/node proposal. 
I was referring simply to how things were laid out now, and that we'd have per-node control of hugepages in /sys/kernel/hugepages and per-node memory information in /sys/devices/system/node. I have been convinced that /sys/kernel/hugepages to control all hugepage functionality is reasonable. My primary concern is making sure the code is clean to move the per-node patches to that location; however, I am going to focus on moving nr_{,overcommit}_hugepages to sysfs first. > > Perhaps, as I've mentioned elsewhere, we simply have symlinks > > underneath /sys/kernel/hugepages into /sys/devices/system/node/nodeX > > ... but the immediate ugliness I see there is either we duplicate > > the directories, or we symlink the > > I don't like the idea of putting kernel implementation parameters in > /sys/devices/ (grey area for device drivers, perhaps). Ack. > > directories and there are now to paths into all the NUMA information, > > where one (/sys/kernel/hugepages/nodeX) seems like it should only have > > hugepage information. > > But the idea of getting "all NUMA information" from one place just > seems wrong to me. Getting all *hardware* NUMA information from one > place is fine. But kernel implementation wise I think you are really > interested in subsystems *first*. Ok. > Just to demonstrate how badly "all NUMA information in one place" > generalises: you also then need a completely different place to store > global information for that subsystem, and a different place again to > store per-CPU information. > > > > I'd prefer hugepages to hugetlb, I think, but don't necessarily care > > one way or the other. > > I'm fine with that. Ok, thanks. > > > Let's really try to put some thought into new sysfs locations. Not > > > just will it work, but is it logical and will it work tomorrow... 
> > > > I agree and that's why I keep sending out e-mails about it :) Perhaps I > > should prototype /sys/kernel/hugepages so we can see how it would look > > as a first step, and then decide given that layout how we want the > > per-node information to be presented? > > Sure. So, I think, we pretty much agree on how things should be: Direct translation of the current sysctl: /sys/kernel/hugepages/nr_hugepages nr_overcommit_hugepages Adding multiple pools: /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size} nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} nr_hugepages_${default_size} nr_overcommit_hugepages_${default_size} nr_hugepages_${other_size1} nr_overcommit_hugepages_${other_size2} Adding per-node control: /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size} nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} nr_hugepages_${default_size} nr_overcommit_hugepages_${default_size} nr_hugepages_${other_size1} nr_overcommit_hugepages_${other_size2} nodeX/nr_hugepages -> nr_hugepages_${default_size} nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} nr_hugepages_${default_size} nr_overcommit_hugepages_${default_size} nr_hugepages_${other_size1} nr_overcommit_hugepages_${other_size2} How does that look? Does anyone have any problems with such an arrangement? Thanks, Nish -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-23 18:32 ` Nishanth Aravamudan @ 2008-04-23 19:07 ` Adam Litke 2008-04-24 7:13 ` Nick Piggin 1 sibling, 0 replies; 51+ messages in thread From: Adam Litke @ 2008-04-23 19:07 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Nick Piggin, Christoph Lameter, Greg KH, wli, luick, Lee.Schermerhorn, linux-mm On Wed, 2008-04-23 at 11:32 -0700, Nishanth Aravamudan wrote: > So, I think, we pretty much agree on how things should be: > > Direct translation of the current sysctl: > > /sys/kernel/hugepages/nr_hugepages > nr_overcommit_hugepages > > Adding multiple pools: > > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size} > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > nr_hugepages_${default_size} > nr_overcommit_hugepages_${default_size} > nr_hugepages_${other_size1} > nr_overcommit_hugepages_${other_size2} > > Adding per-node control: > > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size} > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > nr_hugepages_${default_size} > nr_overcommit_hugepages_${default_size} > nr_hugepages_${other_size1} > nr_overcommit_hugepages_${other_size2} > nodeX/nr_hugepages -> nr_hugepages_${default_size} > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > nr_hugepages_${default_size} > nr_overcommit_hugepages_${default_size} > nr_hugepages_${other_size1} > nr_overcommit_hugepages_${other_size2} > > How does that look? Does anyone have any problems with such an > arrangement? This seems sensible to me. -- Adam Litke - (agl at us.ibm.com) IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-23 18:32 ` Nishanth Aravamudan 2008-04-23 19:07 ` Adam Litke @ 2008-04-24 7:13 ` Nick Piggin 2008-04-24 15:54 ` Nishanth Aravamudan 2008-04-27 3:49 ` [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] Nishanth Aravamudan 1 sibling, 2 replies; 51+ messages in thread From: Nick Piggin @ 2008-04-24 7:13 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn, linux-mm On Wed, Apr 23, 2008 at 11:32:52AM -0700, Nishanth Aravamudan wrote: > > So, I think, we pretty much agree on how things should be: > > Direct translation of the current sysctl: > > /sys/kernel/hugepages/nr_hugepages > nr_overcommit_hugepages > > Adding multiple pools: > > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size} > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > nr_hugepages_${default_size} > nr_overcommit_hugepages_${default_size} > nr_hugepages_${other_size1} > nr_overcommit_hugepages_${other_size2} > > Adding per-node control: > > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size} > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > nr_hugepages_${default_size} > nr_overcommit_hugepages_${default_size} > nr_hugepages_${other_size1} > nr_overcommit_hugepages_${other_size2} > nodeX/nr_hugepages -> nr_hugepages_${default_size} > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > nr_hugepages_${default_size} > nr_overcommit_hugepages_${default_size} > nr_hugepages_${other_size1} > nr_overcommit_hugepages_${other_size2} > > How does that look? Does anyone have any problems with such an > arrangement? Looks pretty good. I would personally lean toward subdirectories for hstates. Pros are that it would be a little easier to navigate from the shell, and maybe more regular to program for. 
You could possibly have hugepages_default symlink as well to one of the directories of your choice. This could be used by apps which do not specify exactly what size they want... I don't know, just ideas. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI 2008-04-24 7:13 ` Nick Piggin @ 2008-04-24 15:54 ` Nishanth Aravamudan 2008-04-27 3:49 ` [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] Nishanth Aravamudan 1 sibling, 0 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-24 15:54 UTC (permalink / raw) To: Nick Piggin Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn, linux-mm On 24.04.2008 [09:13:52 +0200], Nick Piggin wrote: > On Wed, Apr 23, 2008 at 11:32:52AM -0700, Nishanth Aravamudan wrote: > > > > So, I think, we pretty much agree on how things should be: > > > > Direct translation of the current sysctl: > > > > /sys/kernel/hugepages/nr_hugepages > > nr_overcommit_hugepages > > > > Adding multiple pools: > > > > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size} > > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > > nr_hugepages_${default_size} > > nr_overcommit_hugepages_${default_size} > > nr_hugepages_${other_size1} > > nr_overcommit_hugepages_${other_size2} > > > > Adding per-node control: > > > > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size} > > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > > nr_hugepages_${default_size} > > nr_overcommit_hugepages_${default_size} > > nr_hugepages_${other_size1} > > nr_overcommit_hugepages_${other_size2} > > nodeX/nr_hugepages -> nr_hugepages_${default_size} > > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > > nr_hugepages_${default_size} > > nr_overcommit_hugepages_${default_size} > > nr_hugepages_${other_size1} > > nr_overcommit_hugepages_${other_size2} > > > > How does that look? Does anyone have any problems with such an > > arrangement? > > Looks pretty good. I would personally lean toward subdirectories for > hstates. 
Pros are that it would be a little easier to navigate from > the shell, and maybe more regular to program for. That's probably true -- and extracting the pagesize to which things correspond should be simpler too. And perhaps that would allow for easier output of meminfo for the various hugepage sizes? > You could possibly have hugepages_default symlink as well to one of > the directories of your choice. This could be used by apps which do > not specify exactly what size they want... Yep, that would seem sensible...or perhaps even have nr_hugepages in the root directory (and other corresponding files directly symlink into the default_size directory?). > I don't know, just ideas. Thanks for the feedback! -Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-24 7:13 ` Nick Piggin 2008-04-24 15:54 ` Nishanth Aravamudan @ 2008-04-27 3:49 ` Nishanth Aravamudan 2008-04-27 5:10 ` Greg KH 1 sibling, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-27 3:49 UTC (permalink / raw) To: Nick Piggin Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn, linux-mm On 24.04.2008 [09:13:52 +0200], Nick Piggin wrote: > On Wed, Apr 23, 2008 at 11:32:52AM -0700, Nishanth Aravamudan wrote: > > > > So, I think, we pretty much agree on how things should be: > > > > Direct translation of the current sysctl: > > > > /sys/kernel/hugepages/nr_hugepages > > nr_overcommit_hugepages > > > > Adding multiple pools: > > > > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size} > > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > > nr_hugepages_${default_size} > > nr_overcommit_hugepages_${default_size} > > nr_hugepages_${other_size1} > > nr_overcommit_hugepages_${other_size2} > > > > Adding per-node control: > > > > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size} > > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > > nr_hugepages_${default_size} > > nr_overcommit_hugepages_${default_size} > > nr_hugepages_${other_size1} > > nr_overcommit_hugepages_${other_size2} > > nodeX/nr_hugepages -> nr_hugepages_${default_size} > > nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size} > > nr_hugepages_${default_size} > > nr_overcommit_hugepages_${default_size} > > nr_hugepages_${other_size1} > > nr_overcommit_hugepages_${other_size2} > > > > How does that look? Does anyone have any problems with such an > > arrangement? > > Looks pretty good. I would personally lean toward subdirectories for > hstates. Pros are that it would be a little easier to navigate from > the shell, and maybe more regular to program for. 
> > You could possibly have hugepages_default symlink as well to one of > the directories of your choice. This could be used by apps which do > not specify exactly what size they want... > > I don't know, just ideas. So, here's the first cut of the patch. Still very rough, but it builds and I'm running it now: [20:41:34]nacc@arkanoid:/sys/kernel/hugepages$ tree . `-- hugepages-2MB |-- meminfo |-- nr_huge_pages `-- nr_overcommit_huge_pages 1 directory, 3 files [20:41:56]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo HugePages_Total: 0 HugePages_Free: 0 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB [20:42:20]nacc@arkanoid:/sys/kernel/hugepages$ sudo echo 10 > /sys/kernel/hugepages/hugepages-2MB/nr_huge_pages [20:42:57]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/nr_huge_pages 10 [20:43:02]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo HugePages_Total: 10 HugePages_Free: 10 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB [20:43:05]nacc@arkanoid:/sys/kernel/hugepages$ cat /proc/m [20:43:10]nacc@arkanoid:/sys/kernel/hugepages$ grep Huge /proc/meminfo HugePages_Total: 10 HugePages_Free: 10 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB I haven't tested yet with multiple pools, will hopefully get to that Monday. I see one obvious issue, in that I left an underscore in huge_pages :) Will fix. How does the naming seem? I don't like having two memfmt()s but I couldn't think of a good way, beyond perhaps having two strings, one for the magnitude and one for the units, but that seemed gross. A lot of the functions and macros, perhaps all of them, are clones of the ones used for /sys/kernel/slab. Thanks to those authors for that code! Greg, do you see any obvious violations of sysfs rules here? Well, beyond meminfo itself, I guess, but given our previous snapshot discussion, I left it simple and the same, rather than split it up. 
Not-yet-Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> include/linux/hugetlb.h | 9 +- mm/hugetlb.c | 317 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 251 insertions(+), 75 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7aa22e7..cac63bd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -3,6 +3,9 @@ #include <linux/fs.h> #include <linux/shm.h> +#include <linux/mempolicy.h> +#include <asm/tlbflush.h> +#include <asm/hugetlb.h> #ifdef CONFIG_HUGETLBFS struct hugetlbfs_config { @@ -69,10 +72,6 @@ static inline void set_file_hugepages(struct file *file) #ifdef CONFIG_HUGETLB_PAGE -#include <linux/mempolicy.h> -#include <asm/tlbflush.h> -#include <asm/hugetlb.h> - struct ctl_table; static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) @@ -131,6 +130,8 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; + const char *name; + struct kobject kobj; }; void __init huge_add_hstate(unsigned order); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de03a14..c30e45d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -15,6 +15,7 @@ #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/bootmem.h> +#include <linux/sysfs.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -604,9 +605,21 @@ static void __init gather_bootmem_prealloc(void) } } +static __init char *memfmt_nospaces(char *buf, unsigned long n) +{ + if (n >= (1UL << 30)) + sprintf(buf, "%luGB", n >> 30); + else if (n >= (1UL << 20)) + sprintf(buf, "%luMB", n >> 20); + else + sprintf(buf, "%luKB", n >> 10); + return buf; +} + static void __init hugetlb_init_hstate(struct hstate *h) { unsigned long i; + char buf[32]; /* Don't reinitialize lists if they have been already init'ed */ if (!h->hugepage_freelists[0].next) { @@ -624,6 +637,8 @@ static void __init hugetlb_init_hstate(struct hstate *h) break; } 
h->max_huge_pages = i; + h->name = kasprintf(GFP_KERNEL, "hugepages-%s", + memfmt_nospaces(buf, huge_page_size(h))); } static void __init hugetlb_init_hstates(void) @@ -662,77 +677,6 @@ static void __init report_hugepages(void) } } -static int __init hugetlb_init(void) -{ - BUILD_BUG_ON(HPAGE_SHIFT == 0); - - if (!size_to_hstate(HPAGE_SIZE)) { - huge_add_hstate(HUGETLB_PAGE_ORDER); - parsed_hstate->max_huge_pages = default_hstate_resv; - } - - hugetlb_init_hstates(); - - gather_bootmem_prealloc(); - - report_hugepages(); - - return 0; -} -module_init(hugetlb_init); - -/* Should be called on processing a hugepagesz=... option */ -void __init huge_add_hstate(unsigned order) -{ - struct hstate *h; - if (size_to_hstate(PAGE_SIZE << order)) { - printk("hugepagesz= specified twice, ignoring\n"); - return; - } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); - BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT); - h = &hstates[max_hstate++]; - h->order = order; - h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); - hugetlb_init_hstate(h); - parsed_hstate = h; -} - -static int __init hugetlb_setup(char *s) -{ - unsigned long *mhp; - - if (!max_hstate) - mhp = &default_hstate_resv; - else - mhp = &parsed_hstate->max_huge_pages; - - if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; - - /* - * Global state is always initialized later in hugetlb_init. - * But we need to allocate >= MAX_ORDER hstates here early to still - * use the bootmem allocator. 
- */ - if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER) - hugetlb_init_hstate(parsed_hstate); - - return 1; -} -__setup("hugepages=", hugetlb_setup); - -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) @@ -843,6 +787,237 @@ out: return ret; } +#ifdef CONFIG_SYSFS +#define to_hstate_attr(n) container_of(n, struct hstate_attribute, attr) +#define to_hstate(n) container_of(n, struct hstate, kobj) + +struct hstate_attribute { + struct attribute attr; + ssize_t (*show)(struct hstate *h, char *buf); + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); +}; + +#define HSTATE_ATTR_RO(_name) \ + static struct hstate_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct hstate_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t nr_huge_pages_show(struct hstate *h, char *buf) +{ + return sprintf(buf, "%lu\n", h->nr_huge_pages); +} +static ssize_t nr_huge_pages_store(struct hstate *h, const char *buf, size_t count) +{ + int tmp; + + h->max_huge_pages = set_max_huge_pages(h, + simple_strtoul(buf, NULL, 10), &tmp); + max_huge_pages[h - hstates] = h->max_huge_pages; + return count; +} +HSTATE_ATTR(nr_huge_pages); + +static ssize_t nr_overcommit_huge_pages_show(struct hstate *h, char *buf) +{ + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} +static ssize_t nr_overcommit_huge_pages_store(struct hstate *h, const char *buf, size_t count) +{ + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = simple_strtoul(buf, NULL, 10); + sysctl_overcommit_huge_pages[h - hstates] = h->nr_overcommit_huge_pages; + spin_unlock(&hugetlb_lock); + return count; +} +HSTATE_ATTR(nr_overcommit_huge_pages); + +static ssize_t 
meminfo_show(struct hstate *h, char *buf) +{ + return sprintf(buf, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %5lu kB\n", + h->nr_huge_pages, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + huge_page_size(h) / 1024); +} +HSTATE_ATTR_RO(meminfo); + +static struct kset *hstate_kset; + +static struct attribute *hstate_attrs[] = { + &meminfo_attr.attr, + &nr_huge_pages_attr.attr, + &nr_overcommit_huge_pages_attr.attr, +}; + +static struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static ssize_t hstate_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct hstate_attribute *attribute; + struct hstate *h; + int err; + + attribute = to_hstate_attr(attr); + h = to_hstate(kobj); + + if (!attribute->show) + return -EIO; + + err = attribute->show(h, buf); + + return err; +} + +static ssize_t hstate_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct hstate_attribute *attribute; + struct hstate *h; + int err; + + attribute = to_hstate_attr(attr); + h = to_hstate(kobj); + + if (!attribute->store) + return -EIO; + + err = attribute->store(h, buf, len); + + return err; +} + +static struct sysfs_ops hstate_sysfs_ops = { + .show = hstate_attr_show, + .store = hstate_attr_store, +}; + +static struct kobj_type hstate_ktype = { + .sysfs_ops = &hstate_sysfs_ops, +}; + +static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +{ + int err; + h->kobj.kset = hstate_kset; + err = kobject_init_and_add(&h->kobj, &hstate_ktype, NULL, h->name); + if (err) { + kobject_put(&h->kobj); + return err; + } + err = sysfs_create_group(&h->kobj, &hstate_attr_group); + if (err) + return err; + return 0; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hstate_kset = kset_create_and_add("hugepages", NULL, kernel_kobj); + if (!hstate_kset) + return; + + 
for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h); + if (err) + printk(KERN_ERR "Hugetlb: Unable to add hstate %s", h->name); + } +} +#else +static void __init hugetlb_sysfs_init(void) +{ +} +#endif + +static int __init hugetlb_init(void) +{ + BUILD_BUG_ON(HPAGE_SHIFT == 0); + + if (!size_to_hstate(HPAGE_SIZE)) { + huge_add_hstate(HUGETLB_PAGE_ORDER); + parsed_hstate->max_huge_pages = default_hstate_resv; + } + + hugetlb_init_hstates(); + + gather_bootmem_prealloc(); + + report_hugepages(); + + hugetlb_sysfs_init(); + + return 0; +} +module_init(hugetlb_init); + +/* Should be called on processing a hugepagesz=... option */ +void __init huge_add_hstate(unsigned order) +{ + struct hstate *h; + if (size_to_hstate(PAGE_SIZE << order)) { + printk("hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + hugetlb_init_hstate(h); + parsed_hstate = h; +} + +static int __init hugetlb_setup(char *s) +{ + unsigned long *mhp; + + if (!max_hstate) + mhp = &default_hstate_resv; + else + mhp = &parsed_hstate->max_huge_pages; + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate >= MAX_ORDER hstates here early to still + * use the bootmem allocator. 
+ */ + if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER) + hugetlb_init_hstate(parsed_hstate); + + return 1; +} +__setup("hugepages=", hugetlb_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + + int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-27 3:49 ` [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] Nishanth Aravamudan @ 2008-04-27 5:10 ` Greg KH 2008-04-28 17:22 ` Nishanth Aravamudan 2008-04-28 20:31 ` Christoph Lameter 0 siblings, 2 replies; 51+ messages in thread From: Greg KH @ 2008-04-27 5:10 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On Sat, Apr 26, 2008 at 08:49:42PM -0700, Nishanth Aravamudan wrote: > > [20:41:56]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo > HugePages_Total: 0 > HugePages_Free: 0 > HugePages_Rsvd: 0 > HugePages_Surp: 0 > Hugepagesize: 2048 kB > > Greg, do you see any obvious violations of sysfs rules here? Well, beyond > meminfo itself, I guess, but given our previous snapshot discussion, I left it > simple and the same, rather than split it up. Yeah, I don't like that file. Why not just have 5 files, one for each value? There isn't such a need for an immediate snapshot where you can't just read all 5 values from 5 files? Also, why use a "units" here, just always use the lowest unit, and userspace can convert from kB to GB if needed. thanks, greg k-h -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-27 5:10 ` Greg KH @ 2008-04-28 17:22 ` Nishanth Aravamudan 2008-04-28 17:29 ` Greg KH 2008-04-28 20:31 ` Christoph Lameter 1 sibling, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-28 17:22 UTC (permalink / raw) To: Greg KH Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On 26.04.2008 [22:10:29 -0700], Greg KH wrote: > On Sat, Apr 26, 2008 at 08:49:42PM -0700, Nishanth Aravamudan wrote: > > > > [20:41:56]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo > > HugePages_Total: 0 > > HugePages_Free: 0 > > HugePages_Rsvd: 0 > > HugePages_Surp: 0 > > Hugepagesize: 2048 kB > > > > Greg, do you see any obvious violations of sysfs rules here? Well, beyond > > meminfo itself, I guess, but given our previous snapshot discussion, I left it > > simple and the same, rather than split it up. > > Yeah, I don't like that file. Why not just have 5 files, one for each > value? There isn't such a need for an immediate snapshot shere you > can't just read all 5 values from 5 files? Actually, we already have Total in nr_hugepages, so I only needed to add 3 files. The size is implicit in the directory name? > Also, why use a "units" here, just always use the lowest unit, and > userspace can convert from kB to GB if needed. Agreed, so I changed the name of the directory from hugepages-2M to hugepages-2048 for instance. Userspace utilities can pretty-ize it :) Thanks, Nish -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-28 17:22 ` Nishanth Aravamudan @ 2008-04-28 17:29 ` Greg KH 2008-04-29 17:11 ` Nishanth Aravamudan 0 siblings, 1 reply; 51+ messages in thread From: Greg KH @ 2008-04-28 17:29 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On Mon, Apr 28, 2008 at 10:22:39AM -0700, Nishanth Aravamudan wrote: > On 26.04.2008 [22:10:29 -0700], Greg KH wrote: > > On Sat, Apr 26, 2008 at 08:49:42PM -0700, Nishanth Aravamudan wrote: > > > > > > [20:41:56]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo > > > HugePages_Total: 0 > > > HugePages_Free: 0 > > > HugePages_Rsvd: 0 > > > HugePages_Surp: 0 > > > Hugepagesize: 2048 kB > > > > > > Greg, do you see any obvious violations of sysfs rules here? Well, beyond > > > meminfo itself, I guess, but given our previous snapshot discussion, I left it > > > simple and the same, rather than split it up. > > > > Yeah, I don't like that file. Why not just have 5 files, one for each > > value? There isn't such a need for an immediate snapshot shere you > > can't just read all 5 values from 5 files? > > Actually, we already have Total in nr_hugepages, so I only needed to add > 3 files. The size is implicit in the directory name? Ah, good point. > > Also, why use a "units" here, just always use the lowest unit, and > > userspace can convert from kB to GB if needed. > > Agreed, so I changed the name of the directory from > > hugepages-2M > > to > > hugpeages-2048 > > for instance. Userspace utilities can pretty-ize it :) Exactly, that would be much better. thanks, greg k-h -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-28 17:29 ` Greg KH @ 2008-04-29 17:11 ` Nishanth Aravamudan 2008-04-29 17:22 ` Greg KH 0 siblings, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-29 17:11 UTC (permalink / raw) To: Greg KH Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On 28.04.2008 [10:29:51 -0700], Greg KH wrote: > On Mon, Apr 28, 2008 at 10:22:39AM -0700, Nishanth Aravamudan wrote: > > On 26.04.2008 [22:10:29 -0700], Greg KH wrote: > > > On Sat, Apr 26, 2008 at 08:49:42PM -0700, Nishanth Aravamudan wrote: > > > > > > > > [20:41:56]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo > > > > HugePages_Total: 0 > > > > HugePages_Free: 0 > > > > HugePages_Rsvd: 0 > > > > HugePages_Surp: 0 > > > > Hugepagesize: 2048 kB > > > > > > > > Greg, do you see any obvious violations of sysfs rules here? Well, beyond > > > > meminfo itself, I guess, but given our previous snapshot discussion, I left it > > > > simple and the same, rather than split it up. > > > > > > Yeah, I don't like that file. Why not just have 5 files, one for each > > > value? There isn't such a need for an immediate snapshot shere you > > > can't just read all 5 values from 5 files? > > > > Actually, we already have Total in nr_hugepages, so I only needed to add > > 3 files. The size is implicit in the directory name? > > Ah, good point. > > > > Also, why use a "units" here, just always use the lowest unit, and > > > userspace can convert from kB to GB if needed. > > > > Agreed, so I changed the name of the directory from > > > > hugepages-2M > > > > to > > > > hugpeages-2048 > > > > for instance. Userspace utilities can pretty-ize it :) > > Exactly, that would be much better. FWIW, here's the updated patch. Still needs more testing. 
include/linux/hugetlb.h | 9 +- mm/hugetlb.c | 322 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 256 insertions(+), 75 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7aa22e7..cac63bd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -3,6 +3,9 @@ #include <linux/fs.h> #include <linux/shm.h> +#include <linux/mempolicy.h> +#include <asm/tlbflush.h> +#include <asm/hugetlb.h> #ifdef CONFIG_HUGETLBFS struct hugetlbfs_config { @@ -69,10 +72,6 @@ static inline void set_file_hugepages(struct file *file) #ifdef CONFIG_HUGETLB_PAGE -#include <linux/mempolicy.h> -#include <asm/tlbflush.h> -#include <asm/hugetlb.h> - struct ctl_table; static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) @@ -131,6 +130,8 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; + const char *name; + struct kobject kobj; }; void __init huge_add_hstate(unsigned order); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de03a14..1d94a85 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -15,6 +15,7 @@ #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/bootmem.h> +#include <linux/sysfs.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -624,6 +625,8 @@ static void __init hugetlb_init_hstate(struct hstate *h) break; } h->max_huge_pages = i; + h->name = kasprintf(GFP_KERNEL, "hugepages-%lu", + huge_page_size(h) / 1024); } static void __init hugetlb_init_hstates(void) @@ -662,77 +665,6 @@ static void __init report_hugepages(void) } } -static int __init hugetlb_init(void) -{ - BUILD_BUG_ON(HPAGE_SHIFT == 0); - - if (!size_to_hstate(HPAGE_SIZE)) { - huge_add_hstate(HUGETLB_PAGE_ORDER); - parsed_hstate->max_huge_pages = default_hstate_resv; - } - - hugetlb_init_hstates(); - - gather_bootmem_prealloc(); - - report_hugepages(); - - return 0; -} -module_init(hugetlb_init); - -/* Should be called on 
processing a hugepagesz=... option */ -void __init huge_add_hstate(unsigned order) -{ - struct hstate *h; - if (size_to_hstate(PAGE_SIZE << order)) { - printk("hugepagesz= specified twice, ignoring\n"); - return; - } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); - BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT); - h = &hstates[max_hstate++]; - h->order = order; - h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); - hugetlb_init_hstate(h); - parsed_hstate = h; -} - -static int __init hugetlb_setup(char *s) -{ - unsigned long *mhp; - - if (!max_hstate) - mhp = &default_hstate_resv; - else - mhp = &parsed_hstate->max_huge_pages; - - if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; - - /* - * Global state is always initialized later in hugetlb_init. - * But we need to allocate >= MAX_ORDER hstates here early to still - * use the bootmem allocator. - */ - if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER) - hugetlb_init_hstate(parsed_hstate); - - return 1; -} -__setup("hugepages=", hugetlb_setup); - -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) @@ -843,6 +775,254 @@ out: return ret; } +#ifdef CONFIG_SYSFS +#define to_hstate_attr(n) container_of(n, struct hstate_attribute, attr) +#define to_hstate(n) container_of(n, struct hstate, kobj) + +struct hstate_attribute { + struct attribute attr; + ssize_t (*show)(struct hstate *h, char *buf); + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); +}; + +#define HSTATE_ATTR_RO(_name) \ + static struct hstate_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct hstate_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t nr_hugepages_show(struct hstate *h, char *buf) +{ + return sprintf(buf, 
"%lu\n", h->nr_huge_pages); +} +static ssize_t nr_hugepages_store(struct hstate *h, const char *buf, + size_t count) +{ + int tmp, err; + unsigned long input; + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + h->max_huge_pages = set_max_huge_pages(h, input, &tmp); + max_huge_pages[h - hstates] = h->max_huge_pages; + return count; +} +HSTATE_ATTR(nr_hugepages); + +static ssize_t nr_overcommit_hugepages_show(struct hstate *h, char *buf) +{ + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} +static ssize_t nr_overcommit_hugepages_store(struct hstate *h, const char *buf, + size_t count) +{ + unsigned long input; + int err; + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + sysctl_overcommit_huge_pages[h - hstates] = h->nr_overcommit_huge_pages; + spin_unlock(&hugetlb_lock); + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct hstate *h, char *buf) +{ + return sprintf(buf, "%lu\n", h->free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct hstate *h, char *buf) +{ + return sprintf(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct hstate *h, char *buf) +{ + return sprintf(buf, "%lu\n", h->surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static struct kset *hstate_kset; + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, +}; + +static struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static ssize_t hstate_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct hstate_attribute *attribute; + struct hstate *h; + int err; + + attribute = to_hstate_attr(attr); + h = to_hstate(kobj); + + 
if (!attribute->show) + return -EIO; + + err = attribute->show(h, buf); + + return err; +} + +static ssize_t hstate_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct hstate_attribute *attribute; + struct hstate *h; + int err; + + attribute = to_hstate_attr(attr); + h = to_hstate(kobj); + + if (!attribute->store) + return -EIO; + + err = attribute->store(h, buf, len); + + return err; +} + +static struct sysfs_ops hstate_sysfs_ops = { + .show = hstate_attr_show, + .store = hstate_attr_store, +}; + +static struct kobj_type hstate_ktype = { + .sysfs_ops = &hstate_sysfs_ops, +}; + +static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +{ + int err; + h->kobj.kset = hstate_kset; + err = kobject_init_and_add(&h->kobj, &hstate_ktype, NULL, h->name); + if (err) { + kobject_put(&h->kobj); + return err; + } + err = sysfs_create_group(&h->kobj, &hstate_attr_group); + if (err) + return err; + return 0; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hstate_kset = kset_create_and_add("hugepages", NULL, kernel_kobj); + if (!hstate_kset) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h); + if (err) + printk(KERN_ERR "Hugetlb: Unable to add hstate %s", + h->name); + } +} +#else +static void __init hugetlb_sysfs_init(void) +{ +} +#endif + +static int __init hugetlb_init(void) +{ + BUILD_BUG_ON(HPAGE_SHIFT == 0); + + if (!size_to_hstate(HPAGE_SIZE)) { + huge_add_hstate(HUGETLB_PAGE_ORDER); + parsed_hstate->max_huge_pages = default_hstate_resv; + } + + hugetlb_init_hstates(); + + gather_bootmem_prealloc(); + + report_hugepages(); + + hugetlb_sysfs_init(); + + return 0; +} +module_init(hugetlb_init); + +/* Should be called on processing a hugepagesz=... 
option */ +void __init huge_add_hstate(unsigned order) +{ + struct hstate *h; + if (size_to_hstate(PAGE_SIZE << order)) { + printk("hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + hugetlb_init_hstate(h); + parsed_hstate = h; +} + +static int __init hugetlb_setup(char *s) +{ + unsigned long *mhp; + + if (!max_hstate) + mhp = &default_hstate_resv; + else + mhp = &parsed_hstate->max_huge_pages; + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate >= MAX_ORDER hstates here early to still + * use the bootmem allocator. + */ + if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER) + hugetlb_init_hstate(parsed_hstate); + + return 1; +} +__setup("hugepages=", hugetlb_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + + int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-29 17:11 ` Nishanth Aravamudan @ 2008-04-29 17:22 ` Greg KH 2008-04-29 18:14 ` Nishanth Aravamudan 0 siblings, 1 reply; 51+ messages in thread From: Greg KH @ 2008-04-29 17:22 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote: > +struct hstate_attribute { > + struct attribute attr; > + ssize_t (*show)(struct hstate *h, char *buf); > + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); > +}; Do you need your own attribute type with show and store? Can't you just use the "default" kobject attributes? Also, you have no release function for your kobject to be cleaned up, that's a major bug. thanks, greg k-h -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-29 17:22 ` Greg KH @ 2008-04-29 18:14 ` Nishanth Aravamudan 2008-04-29 18:26 ` Greg KH 0 siblings, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-29 18:14 UTC (permalink / raw) To: Greg KH Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On 29.04.2008 [10:22:43 -0700], Greg KH wrote: > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote: > > +struct hstate_attribute { > > + struct attribute attr; > > + ssize_t (*show)(struct hstate *h, char *buf); > > + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); > > +}; > > Do you need your own attribute type with show and store? Can't you just > use the "default" kobject attributes? Hrm, I don't know? Probably. Like I said, I was using the /sys/kernel/slab code as my reference. Can you explain this more? Or just point me to the source/documentation I should read for info. Are you referring to kobj_attr_show/kobj_attr_store? Should I just be using kobj_sysfs_ops, then, most likely? > Also, you have no release function for your kobject to be cleaned up, > that's a major bug. Well, these kobjects never go away? They will be statically initialized at boot-time and then stick around until the kernel goes away. Looking at /sys/kernel/slab's code, again, the release() function there does a kfree() on the containing kmem_cache, but for hugetlb, the hstates are static... If we do move to dynamic allocations ever (or allow adding hugepage sizes at run-time somehow), then perhaps we'll need a release method then? Thanks, Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-29 18:14 ` Nishanth Aravamudan @ 2008-04-29 18:26 ` Greg KH 2008-04-29 23:48 ` Nishanth Aravamudan 2008-04-30 19:19 ` Nishanth Aravamudan 0 siblings, 2 replies; 51+ messages in thread From: Greg KH @ 2008-04-29 18:26 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote: > On 29.04.2008 [10:22:43 -0700], Greg KH wrote: > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote: > > > +struct hstate_attribute { > > > + struct attribute attr; > > > + ssize_t (*show)(struct hstate *h, char *buf); > > > + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); > > > +}; > > > > Do you need your own attribute type with show and store? Can't you just > > use the "default" kobject attributes? > > Hrm, I don't know? Probably. Like I said, I was using the > /sys/kernel/slab code as my reference. Can you explain this more? Or > just point me to the source/documentation I should read for info. Documentation/kobject.txt, with sample examples in samples/kobject/ for you to copy and use. > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be > using kobj_sysfs_ops, then, most likely? See the above examples for more details. > > Also, you have no release function for your kobject to be cleaned up, > > that's a major bug. > > Well, these kobjects never go away? They will be statically initialized > at boot-time and then stick around until the kernel goes away. Looking > at /sys/kernel/slab's code, again, the release() function there does a > kfree() on the containing kmem_cache, but for hugetlb, the hstates are > static... 
If we do move to dynamic allocations ever (or allow adding > hugepage sizes at run-time somehow), then perhaps we'll need a release > method then? Yes you will. Please always create one, what happens when you want to clean them up at shut-down time... thanks, greg k-h -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-29 18:26 ` Greg KH @ 2008-04-29 23:48 ` Nishanth Aravamudan 2008-05-01 3:07 ` Greg KH 2008-04-30 19:19 ` Nishanth Aravamudan 1 sibling, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-29 23:48 UTC (permalink / raw) To: Greg KH Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On 29.04.2008 [11:26:13 -0700], Greg KH wrote: > On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote: > > On 29.04.2008 [10:22:43 -0700], Greg KH wrote: > > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote: > > > > +struct hstate_attribute { > > > > + struct attribute attr; > > > > + ssize_t (*show)(struct hstate *h, char *buf); > > > > + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); > > > > +}; > > > > > > Do you need your own attribute type with show and store? Can't you just > > > use the "default" kobject attributes? > > > > Hrm, I don't know? Probably. Like I said, I was using the > > /sys/kernel/slab code as my reference. Can you explain this more? Or > > just point me to the source/documentation I should read for info. > > Documentation/kobject.txt, with sample examples in samples/kobject/ for > you to copy and use. Great thanks! > > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be > > using kobj_sysfs_ops, then, most likely? > > See the above examples for more details. Will do -- I think we'll need our own store, at least, though, because of locking issues? And I'm guessing if we provide our own store, we're going to need to provide our own show? > > > Also, you have no release function for your kobject to be cleaned up, > > > that's a major bug. > > > > Well, these kobjects never go away? They will be statically initialized > > at boot-time and then stick around until the kernel goes away. 
Looking > > at /sys/kernel/slab's code, again, the release() function there does a > > kfree() on the containing kmem_cache, but for hugetlb, the hstates are > > static... If we do move to dynamic allocations ever (or allow adding > > hugepage sizes at run-time somehow), then perhaps we'll need a release > > method then? > > Yes you will. Please always create one, what happens when you want to > clean them up at shut-down time... Again, I'm not sure what you want me to clean-up? The examples in samples/ are freeing dynamically allocated objects containing the kobject in question -- but /sys/kernel/hugepages only dynamically allocates the kobject itself... Although, I guess I should free the name string since I used kasprintf()... Thanks, Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-29 23:48 ` Nishanth Aravamudan @ 2008-05-01 3:07 ` Greg KH 2008-05-01 18:25 ` Nishanth Aravamudan 0 siblings, 1 reply; 51+ messages in thread From: Greg KH @ 2008-05-01 3:07 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On Tue, Apr 29, 2008 at 04:48:39PM -0700, Nishanth Aravamudan wrote: > On 29.04.2008 [11:26:13 -0700], Greg KH wrote: > > On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote: > > > On 29.04.2008 [10:22:43 -0700], Greg KH wrote: > > > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote: > > > > > +struct hstate_attribute { > > > > > + struct attribute attr; > > > > > + ssize_t (*show)(struct hstate *h, char *buf); > > > > > + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); > > > > > +}; > > > > > > > > Do you need your own attribute type with show and store? Can't you just > > > > use the "default" kobject attributes? > > > > > > Hrm, I don't know? Probably. Like I said, I was using the > > > /sys/kernel/slab code as my reference. Can you explain this more? Or > > > just point me to the source/documentation I should read for info. > > > > Documentation/kobject.txt, with sample examples in samples/kobject/ for > > you to copy and use. > > Great thanks! > > > > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be > > > using kobj_sysfs_ops, then, most likely? > > > > See the above examples for more details. > > Will do -- I think we'll need our own store, at least, though, because > of locking issues? And I'm guessing if we provide our own store, we're > going to need to provide our own show? Yes, but see below... > > > > Also, you have no release function for your kobject to be cleaned up, > > > > that's a major bug. 
> > > > > > Well, these kobjects never go away? They will be statically initialized > > > at boot-time and then stick around until the kernel goes away. Looking > > > at /sys/kernel/slab's code, again, the release() function there does a > > > kfree() on the containing kmem_cache, but for hugetlb, the hstates are > > > static... If we do move to dynamic allocations ever (or allow adding > > > hugepage sizes at run-time somehow), then perhaps we'll need a release > > > method then? > > > > Yes you will. Please always create one, what happens when you want to > > clean them up at shut-down time... > > Again, I'm not sure what you want me to clean-up? The examples in > samples/ are freeing dynamically allocated objects containing the > kobject in question -- but /sys/kernel/hugepages only dynamically > allocates the kobject itself... Although, I guess I should free the name > string since I used kasprintf()... Ugh. Embed a kobject into a structure if you want it to control the lifetime rules of that structure. And that includes tearing it down. If you _only_ want to use a kobject to create some sysfs trees and files, then just use the dynamic kobject functions, as documented. Then you only have a pointer to a kobject, it does not control the lifetime of your structure, you don't have to write your own show/store wrappers, and life is oh so much easier. So you might want to rethink your current patch :) thanks, greg k-h -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-05-01 3:07 ` Greg KH @ 2008-05-01 18:25 ` Nishanth Aravamudan 0 siblings, 0 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-05-01 18:25 UTC (permalink / raw) To: Greg KH Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On 30.04.2008 [20:07:38 -0700], Greg KH wrote: > On Tue, Apr 29, 2008 at 04:48:39PM -0700, Nishanth Aravamudan wrote: > > On 29.04.2008 [11:26:13 -0700], Greg KH wrote: > > > On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote: > > > > On 29.04.2008 [10:22:43 -0700], Greg KH wrote: > > > > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote: > > > > > > +struct hstate_attribute { > > > > > > + struct attribute attr; > > > > > > + ssize_t (*show)(struct hstate *h, char *buf); > > > > > > + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); > > > > > > +}; > > > > > > > > > > Do you need your own attribute type with show and store? Can't you just > > > > > use the "default" kobject attributes? > > > > > > > > Hrm, I don't know? Probably. Like I said, I was using the > > > > /sys/kernel/slab code as my reference. Can you explain this more? Or > > > > just point me to the source/documentation I should read for info. > > > > > > Documentation/kobject.txt, with sample examples in samples/kobject/ for > > > you to copy and use. > > > > Great thanks! > > > > > > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be > > > > using kobj_sysfs_ops, then, most likely? > > > > > > See the above examples for more details. > > > > Will do -- I think we'll need our own store, at least, though, because > > of locking issues? And I'm guessing if we provide our own store, we're > > going to need to provide our own show? > > Yes, but see below... 
> > > > > > Also, you have no release function for your kobject to be cleaned up, > > > > > that's a major bug. > > > > > > > > Well, these kobjects never go away? They will be statically initialized > > > > at boot-time and then stick around until the kernel goes away. Looking > > > > at /sys/kernel/slab's code, again, the release() function there does a > > > > kfree() on the containing kmem_cache, but for hugetlb, the hstates are > > > > static... If we do move to dynamic allocations ever (or allow adding > > > > hugepage sizes at run-time somehow), then perhaps we'll need a release > > > > method then? > > > > > > Yes you will. Please always create one, what happens when you want to > > > clean them up at shut-down time... > > > > Again, I'm not sure what you want me to clean-up? The examples in > > samples/ are freeing dynamically allocated objects containing the > > kobject in question -- but /sys/kernel/hugepages only dynamically > > allocates the kobject itself... Although, I guess I should free the name > > string since I used kasprintf()... > > Ugh. > > Embed a kobject into a structure if you want it to control the > lifetime rules of that structure. And that includes tearing it down. > > If you _only_ want to use a kobject to create some sysfs trees and > files, then just use the dynamic kobject functions, as documented. > Then you only have a pointer to a kobject, it does not control the > lifetime of your structure, you don't have to write your own > show/store wrappers, and life is oh so much more easier. > > So you might want to rethink your current patch :) Ok, I get this now, and have started moving over to it. However, I see a few problems, or have a few questions: 1) I do need my own store() wrapper due to locking, right? We can't change the writable values here without grabbing the hugetlb_lock. And the examples in samples/kobject/kobject-sample.c, at least, do have their own show/store methods (or do you mean something else by wrapper)? 
Oh, maybe you are referring to hstate_attr_store()/hstate_attr_show()? Those no longer exist in this patch... 2) I will need a kobject pointer for each hstate, right? So what I have now is: static struct kobject *hstate_kobj[HUGE_MAX_HSTATE]; and then I use kobject_create_and_add() for each of them. How do I then refer back to which hstate I'm dealing with (because I want to manipulate that hstate's values in the show/store methods) -- would I need to iterate through hstate_kobj until I find the kobject that was passed in and then use that index into hstates() to find the corresponding hstate? I guess unlike in the embedding case, I don't see the link between the structure I'm trying to represent and the kobject... 3) Each hstate is going to have the same set of attributes. Let's say I use sysfs_create_group() on each of the hstate_kobj's array members. Will I then actually need duplicates of the set of attributes so that there is a static set of attributes per-hstate? This directly relates to 2), actually -- if I can get to the hstate from the kobject then I can do that with one set of attributes. Thanks, Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-29 18:26 ` Greg KH 2008-04-29 23:48 ` Nishanth Aravamudan @ 2008-04-30 19:19 ` Nishanth Aravamudan 2008-05-01 3:08 ` Greg KH 1 sibling, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-30 19:19 UTC (permalink / raw) To: Greg KH Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On 29.04.2008 [11:26:13 -0700], Greg KH wrote: > On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote: > > On 29.04.2008 [10:22:43 -0700], Greg KH wrote: > > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote: > > > > +struct hstate_attribute { > > > > + struct attribute attr; > > > > + ssize_t (*show)(struct hstate *h, char *buf); > > > > + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); > > > > +}; > > > > > > Do you need your own attribute type with show and store? Can't you just > > > use the "default" kobject attributes? > > > > Hrm, I don't know? Probably. Like I said, I was using the > > /sys/kernel/slab code as my reference. Can you explain this more? Or > > just point me to the source/documentation I should read for info. > > Documentation/kobject.txt, with sample examples in samples/kobject/ for > you to copy and use. > > > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be > > using kobj_sysfs_ops, then, most likely? > > See the above examples for more details. > > > > Also, you have no release function for your kobject to be cleaned up, > > > that's a major bug. > > > > Well, these kobjects never go away? They will be statically initialized > > at boot-time and then stick around until the kernel goes away. Looking > > at /sys/kernel/slab's code, again, the release() function there does a > > kfree() on the containing kmem_cache, but for hugetlb, the hstates are > > static... 
If we do move to dynamic allocations ever (or allow adding > > hugepage sizes at run-time somehow), then perhaps we'll need a release > > method then? > > Yes you will. Please always create one, what happens when you want to > clean them up at shut-down time... Does this look better? I really appreciate the review, Greg. include/linux/hugetlb.h | 9 +- mm/hugetlb.c | 292 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 226 insertions(+), 75 deletions(-) Still-not-Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7aa22e7..cac63bd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -3,6 +3,9 @@ #include <linux/fs.h> #include <linux/shm.h> +#include <linux/mempolicy.h> +#include <asm/tlbflush.h> +#include <asm/hugetlb.h> #ifdef CONFIG_HUGETLBFS struct hugetlbfs_config { @@ -69,10 +72,6 @@ static inline void set_file_hugepages(struct file *file) #ifdef CONFIG_HUGETLB_PAGE -#include <linux/mempolicy.h> -#include <asm/tlbflush.h> -#include <asm/hugetlb.h> - struct ctl_table; static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) @@ -131,6 +130,8 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; + const char *name; + struct kobject kobj; }; void __init huge_add_hstate(unsigned order); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de03a14..8a40afa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -15,6 +15,7 @@ #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/bootmem.h> +#include <linux/sysfs.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -624,6 +625,8 @@ static void __init hugetlb_init_hstate(struct hstate *h) break; } h->max_huge_pages = i; + h->name = kasprintf(GFP_KERNEL, "hugepages-%lu", + huge_page_size(h) / 1024); } static void __init hugetlb_init_hstates(void) @@ -662,77 +665,6 @@ static void __init 
report_hugepages(void) } } -static int __init hugetlb_init(void) -{ - BUILD_BUG_ON(HPAGE_SHIFT == 0); - - if (!size_to_hstate(HPAGE_SIZE)) { - huge_add_hstate(HUGETLB_PAGE_ORDER); - parsed_hstate->max_huge_pages = default_hstate_resv; - } - - hugetlb_init_hstates(); - - gather_bootmem_prealloc(); - - report_hugepages(); - - return 0; -} -module_init(hugetlb_init); - -/* Should be called on processing a hugepagesz=... option */ -void __init huge_add_hstate(unsigned order) -{ - struct hstate *h; - if (size_to_hstate(PAGE_SIZE << order)) { - printk("hugepagesz= specified twice, ignoring\n"); - return; - } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); - BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT); - h = &hstates[max_hstate++]; - h->order = order; - h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); - hugetlb_init_hstate(h); - parsed_hstate = h; -} - -static int __init hugetlb_setup(char *s) -{ - unsigned long *mhp; - - if (!max_hstate) - mhp = &default_hstate_resv; - else - mhp = &parsed_hstate->max_huge_pages; - - if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; - - /* - * Global state is always initialized later in hugetlb_init. - * But we need to allocate >= MAX_ORDER hstates here early to still - * use the bootmem allocator. 
- */ - if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER) - hugetlb_init_hstate(parsed_hstate); - - return 1; -} -__setup("hugepages=", hugetlb_setup); - -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) @@ -843,6 +775,224 @@ out: return ret; } +#ifdef CONFIG_SYSFS +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = container_of(kobj, struct hstate, kobj); + return sprintf(buf, "%lu\n", h->nr_huge_pages); +} +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int tmp, err; + unsigned long input; + struct hstate *h = container_of(kobj, struct hstate, kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + h->max_huge_pages = set_max_huge_pages(h, input, &tmp); + max_huge_pages[h - hstates] = h->max_huge_pages; + + return count; +} +HSTATE_ATTR(nr_hugepages); + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = container_of(kobj, struct hstate, kobj); + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = container_of(kobj, struct hstate, kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + spin_lock(&hugetlb_lock); + 
h->nr_overcommit_huge_pages = input; + sysctl_overcommit_huge_pages[h - hstates] = h->nr_overcommit_huge_pages; + spin_unlock(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = container_of(kobj, struct hstate, kobj); + return sprintf(buf, "%lu\n", h->free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = container_of(kobj, struct hstate, kobj); + return sprintf(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = container_of(kobj, struct hstate, kobj); + return sprintf(buf, "%lu\n", h->surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static void hstate_release(struct kobject *kobj) +{ + struct hstate *h = container_of(kobj, struct hstate, kobj); + kfree(h->name); +} + +static struct kset *hstate_kset; + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct kobj_type hstate_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = hstate_attrs, + .release = hstate_release, +}; + +static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +{ + int retval; + + h->kobj.kset = hstate_kset; + + retval = kobject_init_and_add(&h->kobj, &hstate_ktype, NULL, h->name); + if (retval) { + kfree(h->name); + return retval; + } + + kobject_uevent(&h->kobj, KOBJ_ADD); + + return 0; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hstate_kset = kset_create_and_add("hugepages", NULL, kernel_kobj); + if (!hstate_kset) + return; + + 
for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h); + if (err) + printk(KERN_ERR "Hugetlb: Unable to add hstate %s", + h->name); + } +} +#else +static void __init hugetlb_sysfs_init(void) +{ +} +#endif + +static int __init hugetlb_init(void) +{ + BUILD_BUG_ON(HPAGE_SHIFT == 0); + + if (!size_to_hstate(HPAGE_SIZE)) { + huge_add_hstate(HUGETLB_PAGE_ORDER); + parsed_hstate->max_huge_pages = default_hstate_resv; + } + + hugetlb_init_hstates(); + + gather_bootmem_prealloc(); + + report_hugepages(); + + hugetlb_sysfs_init(); + + return 0; +} +module_init(hugetlb_init); + +/* Should be called on processing a hugepagesz=... option */ +void __init huge_add_hstate(unsigned order) +{ + struct hstate *h; + if (size_to_hstate(PAGE_SIZE << order)) { + printk("hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + hugetlb_init_hstate(h); + parsed_hstate = h; +} + +static int __init hugetlb_setup(char *s) +{ + unsigned long *mhp; + + if (!max_hstate) + mhp = &default_hstate_resv; + else + mhp = &parsed_hstate->max_huge_pages; + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate >= MAX_ORDER hstates here early to still + * use the bootmem allocator. 
+ */ + if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER) + hugetlb_init_hstate(parsed_hstate); + + return 1; +} +__setup("hugepages=", hugetlb_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + + int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-30 19:19 ` Nishanth Aravamudan @ 2008-05-01 3:08 ` Greg KH 2008-05-02 17:58 ` Nishanth Aravamudan 0 siblings, 1 reply; 51+ messages in thread From: Greg KH @ 2008-05-01 3:08 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On Wed, Apr 30, 2008 at 12:19:41PM -0700, Nishanth Aravamudan wrote: > On 29.04.2008 [11:26:13 -0700], Greg KH wrote: > > On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote: > > > On 29.04.2008 [10:22:43 -0700], Greg KH wrote: > > > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote: > > > > > +struct hstate_attribute { > > > > > + struct attribute attr; > > > > > + ssize_t (*show)(struct hstate *h, char *buf); > > > > > + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); > > > > > +}; > > > > > > > > Do you need your own attribute type with show and store? Can't you just > > > > use the "default" kobject attributes? > > > > > > Hrm, I don't know? Probably. Like I said, I was using the > > > /sys/kernel/slab code as my reference. Can you explain this more? Or > > > just point me to the source/documentation I should read for info. > > > > Documentation/kobject.txt, with sample examples in samples/kobject/ for > > you to copy and use. > > > > > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be > > > using kobj_sysfs_ops, then, most likely? > > > > See the above examples for more details. > > > > > > Also, you have no release function for your kobject to be cleaned up, > > > > that's a major bug. > > > > > > Well, these kobjects never go away? They will be statically initialized > > > at boot-time and then stick around until the kernel goes away. 
Looking > > > at /sys/kernel/slab's code, again, the release() function there does a > > > kfree() on the containing kmem_cache, but for hugetlb, the hstates are > > > static... If we do move to dynamic allocations ever (or allow adding > > > hugepage sizes at run-time somehow), then perhaps we'll need a release > > > method then? > > > > Yes you will. Please always create one, what happens when you want to > > clean them up at shut-down time... > > Does this look better? I really appreciate the review, Greg. See my previous email, you should not embed a kobject into this structure. Just use a pointer to one, it will shrink this patch a lot. thanks, greg k-h -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-05-01 3:08 ` Greg KH @ 2008-05-02 17:58 ` Nishanth Aravamudan 0 siblings, 0 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-05-02 17:58 UTC (permalink / raw) To: Greg KH Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn, linux-mm On 30.04.2008 [20:08:44 -0700], Greg KH wrote: > On Wed, Apr 30, 2008 at 12:19:41PM -0700, Nishanth Aravamudan wrote: > > On 29.04.2008 [11:26:13 -0700], Greg KH wrote: > > > On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote: > > > > On 29.04.2008 [10:22:43 -0700], Greg KH wrote: > > > > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote: > > > > > > +struct hstate_attribute { > > > > > > + struct attribute attr; > > > > > > + ssize_t (*show)(struct hstate *h, char *buf); > > > > > > + ssize_t (*store)(struct hstate *h, const char *buf, size_t count); > > > > > > +}; > > > > > > > > > > Do you need your own attribute type with show and store? Can't you just > > > > > use the "default" kobject attributes? > > > > > > > > Hrm, I don't know? Probably. Like I said, I was using the > > > > /sys/kernel/slab code as my reference. Can you explain this more? Or > > > > just point me to the source/documentation I should read for info. > > > > > > Documentation/kobject.txt, with sample examples in samples/kobject/ for > > > you to copy and use. > > > > > > > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be > > > > using kobj_sysfs_ops, then, most likely? > > > > > > See the above examples for more details. > > > > > > > > Also, you have no release function for your kobject to be cleaned up, > > > > > that's a major bug. > > > > > > > > Well, these kobjects never go away? They will be statically initialized > > > > at boot-time and then stick around until the kernel goes away. 
Looking > > > > at /sys/kernel/slab's code, again, the release() function there does a > > > > kfree() on the containing kmem_cache, but for hugetlb, the hstates are > > > > static... If we do move to dynamic allocations ever (or allow adding > > > > hugepage sizes at run-time somehow), then perhaps we'll need a release > > > > method then? > > > > > > Yes you will. Please always create one, what happens when you want to > > > clean them up at shut-down time... > > > > Does this look better? I really appreciate the review, Greg. > > See my previous email, you should not embed a kobject into this > structure. Just use a pointer to one, it will shrink this patch a lot. Ok, I did that -- and the patch grew (due to adding a helper function to figure out which hstate a kobject corresponds to?). I'm sure I'm doing something stupid. FWIW, this patch does work with Jon's efforts and shows 64k/16m/16g at run-time, all correct and such. commit 164d446024a76b9d785b11141e1b53b330f6ce4d Author: Nishanth Aravamudan <nacc@us.ibm.com> Date: Fri Apr 25 15:34:58 2008 -0700 hugetlb: present information in sysfs Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4fe8d16..4898f32 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -3,6 +3,9 @@ #include <linux/fs.h> #include <linux/shm.h> +#include <linux/mempolicy.h> +#include <asm/tlbflush.h> +#include <asm/hugetlb.h> #ifdef CONFIG_HUGETLBFS struct hugetlbfs_config { @@ -69,10 +72,6 @@ static inline void set_file_hugepages(struct file *file) #ifdef CONFIG_HUGETLB_PAGE -#include <linux/mempolicy.h> -#include <asm/tlbflush.h> -#include <asm/hugetlb.h> - struct ctl_table; static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) @@ -132,6 +131,7 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; + char name[32]; }; struct 
huge_bm_page { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bd07510..c87eeca 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -15,6 +15,7 @@ #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/bootmem.h> +#include <linux/sysfs.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -659,76 +660,6 @@ static void __init report_hugepages(void) } } -static int __init hugetlb_init(void) -{ - BUILD_BUG_ON(HPAGE_SHIFT == 0); - - if (!size_to_hstate(HPAGE_SIZE)) { - huge_add_hstate(HUGETLB_PAGE_ORDER); - parsed_hstate->max_huge_pages = default_hstate_resv; - } - - hugetlb_init_hstates(); - - gather_bootmem_prealloc(); - - report_hugepages(); - - return 0; -} -module_init(hugetlb_init); - -/* Should be called on processing a hugepagesz=... option */ -void __init huge_add_hstate(unsigned order) -{ - struct hstate *h; - if (size_to_hstate(PAGE_SIZE << order)) { - printk("hugepagesz= specified twice, ignoring\n"); - return; - } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); - h = &hstates[max_hstate++]; - h->order = order; - h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); - hugetlb_init_hstate(h); - parsed_hstate = h; -} - -static int __init hugetlb_setup(char *s) -{ - unsigned long *mhp; - - if (!max_hstate) - mhp = &default_hstate_resv; - else - mhp = &parsed_hstate->max_huge_pages; - - if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; - - /* - * Global state is always initialized later in hugetlb_init. - * But we need to allocate >= MAX_ORDER hstates here early to still - * use the bootmem allocator. 
- */ - if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER) - hugetlb_init_hstate(parsed_hstate); - - return 1; -} -__setup("hugepages=", hugetlb_setup); - -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) @@ -839,6 +770,236 @@ out: return ret; } +#ifdef CONFIG_SYSFS +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_hstate(struct kobject *kobj) +{ + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) + return &hstates[i]; + BUG(); + return NULL; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_huge_pages); +} +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int tmp, err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + h->max_huge_pages = set_max_huge_pages(h, input, &tmp); + max_huge_pages[h - hstates] = h->max_huge_pages; + + return count; +} +HSTATE_ATTR(nr_hugepages); + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const 
char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + sysctl_overcommit_huge_pages[h - hstates] = h->nr_overcommit_huge_pages; + spin_unlock(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +{ + int retval; + + hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, hugepages_kobj); + if (!hstate_kobjs[h - hstates]) + return -ENOMEM; + + retval = sysfs_create_group(hstate_kobjs[h - hstates], &hstate_attr_group); + if (retval) + kobject_put(hstate_kobjs[h - hstates]); + + return retval; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", kernel_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) 
{ + err = hugetlb_sysfs_add_hstate(h); + if (err) + printk(KERN_ERR "Hugetlb: Unable to add hstate %s", + h->name); + } +} +#else +static void __init hugetlb_sysfs_init(void) +{ +} +#endif + +static int __init hugetlb_init(void) +{ + BUILD_BUG_ON(HPAGE_SHIFT == 0); + + if (!size_to_hstate(HPAGE_SIZE)) { + huge_add_hstate(HUGETLB_PAGE_ORDER); + parsed_hstate->max_huge_pages = default_hstate_resv; + } + + hugetlb_init_hstates(); + + gather_bootmem_prealloc(); + + report_hugepages(); + + hugetlb_sysfs_init(); + + return 0; +} +module_init(hugetlb_init); + +static void __exit hugetlb_exit(void) +{ + struct hstate *h; + + for_each_hstate(h) + kobject_put(hstate_kobjs[h - hstates]); + + kobject_put(hugepages_kobj); +} +module_exit(hugetlb_exit); + +/* Should be called on processing a hugepagesz=... option */ +void __init huge_add_hstate(unsigned order) +{ + struct hstate *h; + if (size_to_hstate(PAGE_SIZE << order)) { + printk("hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + snprintf(h->name, 32, "hugepages-%lu", huge_page_size(h)/1024); + hugetlb_init_hstate(h); + parsed_hstate = h; +} + +static int __init hugetlb_setup(char *s) +{ + unsigned long *mhp; + + if (!max_hstate) + mhp = &default_hstate_resv; + else + mhp = &parsed_hstate->max_huge_pages; + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate >= MAX_ORDER hstates here early to still + * use the bootmem allocator. 
+ */ + if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER) + hugetlb_init_hstate(parsed_hstate); + + return 1; +} +__setup("hugepages=", hugetlb_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + + int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-27 5:10 ` Greg KH 2008-04-28 17:22 ` Nishanth Aravamudan @ 2008-04-28 20:31 ` Christoph Lameter 2008-04-28 20:52 ` Nishanth Aravamudan 1 sibling, 1 reply; 51+ messages in thread From: Christoph Lameter @ 2008-04-28 20:31 UTC (permalink / raw) To: Greg KH Cc: Nishanth Aravamudan, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On Sat, 26 Apr 2008, Greg KH wrote: > Also, why use a "units" here, just always use the lowest unit, and > userspace can convert from kB to GB if needed. Additional complications will come about because IA64 supports varying hugetlb sizes from 4kb to 1GB. Also we would at some point like to add support for 1TB hugepages (that may depend on the presence of a special device that handles these). -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-28 20:31 ` Christoph Lameter @ 2008-04-28 20:52 ` Nishanth Aravamudan 2008-04-28 21:29 ` Christoph Lameter 0 siblings, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-28 20:52 UTC (permalink / raw) To: Christoph Lameter Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On 28.04.2008 [13:31:00 -0700], Christoph Lameter wrote: > On Sat, 26 Apr 2008, Greg KH wrote: > > > Also, why use a "units" here, just always use the lowest unit, and > > userspace can convert from kB to GB if needed. > > Additional complications will come about because IA64 supports > varying hugetlb sizes from 4kb to 1GB. What "complications" do you mean? It's a small function indeed to convert from the directory name to the corresponding "human-named" size, e.g. hugepages-1048576 to "1 GB". And such a function will probably exist in libhugetlbfs at some point, for applications to use, if they like. A potential problem I do see is for a 32-bit binary running on a 64-bit kernel and is one we've run against for 32-bit binaries with 16G pages available. The 32-bit binary can't actually store the size of the hugepage in an unsigned long, so we have to remember how big of a value we can represent (i.e., max_hugepage_size_in_kb) and check what's obtained from /proc/meminfo against that. Not ideal, for sure. > Also we would at some point like to add support for 1TB hugepages > (that may depend on the presence of a special device that handles > these). I also don't see a limitation here? For 32-bit programs, we'll see 1073741824 and know we can't convert that into a valid value in bytes. More importantly, I think the fact that IA64 supports multiple hugepage sizes is a reason *for* moving to sysfs for this information? 
However, I think we may need to massage the IA64-specific bits of the kernel to actually support multiple hugepage size pools being available at run-time? That is, with the current kernel, we can only support one hugepagesize at run-time, due to VHPT restrictions? Thanks, Nish -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-28 20:52 ` Nishanth Aravamudan @ 2008-04-28 21:29 ` Christoph Lameter 2008-04-29 16:43 ` Nishanth Aravamudan 0 siblings, 1 reply; 51+ messages in thread From: Christoph Lameter @ 2008-04-28 21:29 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On Mon, 28 Apr 2008, Nishanth Aravamudan wrote: > More importantly, I think the fact that IA64 supports multiple hugepage > sizes is a reason *for* moving to sysfs for this information? However, I > think we may need to massage the IA64-specific bits of the kernel to > actually support multiple hugepage size pools being available at > run-time? That is, with the current kernel, we can only support one > hugepagesize at run-time, due to VHPT restrictions? We'd love to have multiple huge page pools available but the current rigid region setup limits us to one size. Switching off the VHPT or doing some tricks with the tlb fault handler, or freeing up an unused region (region 0?) could get us there. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-28 21:29 ` Christoph Lameter @ 2008-04-29 16:43 ` Nishanth Aravamudan 2008-04-29 17:01 ` Christoph Lameter 0 siblings, 1 reply; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-29 16:43 UTC (permalink / raw) To: Christoph Lameter Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On 28.04.2008 [14:29:02 -0700], Christoph Lameter wrote: > On Mon, 28 Apr 2008, Nishanth Aravamudan wrote: > > > More importantly, I think the fact that IA64 supports multiple hugepage > > sizes is a reason *for* moving to sysfs for this information? However, I > > think we may need to massage the IA64-specific bits of the kernel to > > actually support multiple hugepage size pools being available at > > run-time? That is, with the current kernel, we can only support one > > hugepagesize at run-time, due to VHPT restrictions? > > We'd love to have multiple huge page pools available but the current > rigid region setup limits us to one size. Switching off the VHPT or > doing some tricks with the tlb fault handler, or freeing up an unused > region (region 0?) could get us there. Ok, that was my impression. So on IA64, without further kernel modifications, we will always only have one hugepage size visible in /proc/meminfo and /sys/kernel/hugepages? Thanks, Nish -- Nishanth Aravamudan <nacc@us.ibm.com> IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] 2008-04-29 16:43 ` Nishanth Aravamudan @ 2008-04-29 17:01 ` Christoph Lameter 0 siblings, 0 replies; 51+ messages in thread From: Christoph Lameter @ 2008-04-29 17:01 UTC (permalink / raw) To: Nishanth Aravamudan Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm On Tue, 29 Apr 2008, Nishanth Aravamudan wrote: > Ok, that was my impression. So on IA64, without further kernel > modifications, we will always only have one hugepage size visible in > /proc/meminfo and /sys/kernel/hugepages? I am not aware of any work in progress. So yes. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 2/5] hugetlb: numafy several functions 2008-04-11 23:47 ` [RFC][PATCH 2/5] " Nishanth Aravamudan 2008-04-11 23:47 ` [PATCH 3/5] hugetlb: interleave dequeueing of huge pages Nishanth Aravamudan @ 2008-04-14 14:52 ` Adam Litke 2008-04-14 21:10 ` Nishanth Aravamudan 1 sibling, 1 reply; 51+ messages in thread From: Adam Litke @ 2008-04-14 14:52 UTC (permalink / raw) To: Nishanth Aravamudan Cc: wli, clameter, luick, Lee.Schermerhorn, linux-mm, npiggin On Fri, 2008-04-11 at 16:47 -0700, Nishanth Aravamudan wrote: > +#define persistent_huge_pages_node(nid) \ > + (nr_huge_pages_node[nid] - surplus_huge_pages_node[nid]) > +static ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev, > + const char *buf, size_t count) > +{ > + int nid = dev->id; > + unsigned long target; > + unsigned long free_on_other_nodes; > + unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10); > + > + /* > + * Increase the pool size on the node > + * First take pages out of surplus state. Then make up the > + * remaining difference by allocating fresh huge pages. > + * > + * We might race with alloc_buddy_huge_page() here and be unable > + * to convert a surplus huge page to a normal huge page. That is > + * not critical, though, it just means the overall size of the > + * pool might be one hugepage larger than it needs to be, but > + * within all the constraints specified by the sysctls. > + */ > + spin_lock(&hugetlb_lock); > + while (surplus_huge_pages_node[nid] && > + nr_huge_pages_req > persistent_huge_pages_node(nid)) { > + if (!adjust_pool_surplus_node(-1, nid)) > + break; > + } > + > + while (nr_huge_pages_req > persistent_huge_pages_node(nid)) { > + struct page *ret; > + /* > + * If this allocation races such that we no longer need the > + * page, free_huge_page will handle it by freeing the page > + * and reducing the surplus. 
> + */ > + spin_unlock(&hugetlb_lock); > + ret = alloc_fresh_huge_page_node(nid); > + spin_lock(&hugetlb_lock); > + if (!ret) > + goto out; > + > + } > + > + if (nr_huge_pages_req >= nr_huge_pages_node[nid]) > + goto out; > + > + /* > + * Decrease the pool size > + * First return free pages to the buddy allocator (being careful > + * to keep enough around to satisfy reservations). Then place > + * pages into surplus state as needed so the pool will shrink > + * to the desired size as pages become free. > + * > + * By placing pages into the surplus state independent of the > + * overcommit value, we are allowing the surplus pool size to > + * exceed overcommit. There are few sane options here. Since > + * alloc_buddy_huge_page() is checking the global counter, > + * though, we'll note that we're not allowed to exceed surplus > + * and won't grow the pool anywhere else. Not until one of the > + * sysctls are changed, or the surplus pages go out of use. > + */ > + free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid]; > + if (free_on_other_nodes >= resv_huge_pages) { > + /* other nodes can satisfy reserve */ > + target = nr_huge_pages_req; > + } else { > + /* this node needs some free to satisfy reserve */ > + target = max((resv_huge_pages - free_on_other_nodes), > + nr_huge_pages_req); > + } > + try_to_free_low_node(nid, target); > + while (target < persistent_huge_pages_node(nid)) { > + struct page *page = dequeue_huge_page_node(NULL, nid); > + if (!page) > + break; > + update_and_free_page(nid, page); > + } > + > + while (target < persistent_huge_pages_node(nid)) { > + if (!adjust_pool_surplus_node(1, nid)) > + break; > + } > +out: > + spin_unlock(&hugetlb_lock); > + return count; > +} Hmm, this function looks very familiar ;) Is there any way we can consolidate it with set_max_huge_pages()? Perhaps the new node helpers from the beginning of this series will help? 
-- Adam Litke - (agl at us.ibm.com) IBM Linux Technology Center -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [RFC][PATCH 2/5] hugetlb: numafy several functions 2008-04-14 14:52 ` [RFC][PATCH 2/5] hugetlb: numafy several functions Adam Litke @ 2008-04-14 21:10 ` Nishanth Aravamudan 0 siblings, 0 replies; 51+ messages in thread From: Nishanth Aravamudan @ 2008-04-14 21:10 UTC (permalink / raw) To: Adam Litke; +Cc: wli, clameter, luick, Lee.Schermerhorn, linux-mm, npiggin On 14.04.2008 [09:52:50 -0500], Adam Litke wrote: > > On Fri, 2008-04-11 at 16:47 -0700, Nishanth Aravamudan wrote: > > +#define persistent_huge_pages_node(nid) \ > > + (nr_huge_pages_node[nid] - surplus_huge_pages_node[nid]) > > +static ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev, > > + const char *buf, size_t count) > > +{ > > + int nid = dev->id; > > + unsigned long target; > > + unsigned long free_on_other_nodes; > > + unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10); > > + > > + /* > > + * Increase the pool size on the node > > + * First take pages out of surplus state. Then make up the > > + * remaining difference by allocating fresh huge pages. > > + * > > + * We might race with alloc_buddy_huge_page() here and be unable > > + * to convert a surplus huge page to a normal huge page. That is > > + * not critical, though, it just means the overall size of the > > + * pool might be one hugepage larger than it needs to be, but > > + * within all the constraints specified by the sysctls. > > + */ > > + spin_lock(&hugetlb_lock); > > + while (surplus_huge_pages_node[nid] && > > + nr_huge_pages_req > persistent_huge_pages_node(nid)) { > > + if (!adjust_pool_surplus_node(-1, nid)) > > + break; > > + } > > + > > + while (nr_huge_pages_req > persistent_huge_pages_node(nid)) { > > + struct page *ret; > > + /* > > + * If this allocation races such that we no longer need the > > + * page, free_huge_page will handle it by freeing the page > > + * and reducing the surplus. 
> > + */ > > + spin_unlock(&hugetlb_lock); > > + ret = alloc_fresh_huge_page_node(nid); > > + spin_lock(&hugetlb_lock); > > + if (!ret) > > + goto out; > > + > > + } > > + > > + if (nr_huge_pages_req >= nr_huge_pages_node[nid]) > > + goto out; > > + > > + /* > > + * Decrease the pool size > > + * First return free pages to the buddy allocator (being careful > > + * to keep enough around to satisfy reservations). Then place > > + * pages into surplus state as needed so the pool will shrink > > + * to the desired size as pages become free. > > + * > > + * By placing pages into the surplus state independent of the > > + * overcommit value, we are allowing the surplus pool size to > > + * exceed overcommit. There are few sane options here. Since > > + * alloc_buddy_huge_page() is checking the global counter, > > + * though, we'll note that we're not allowed to exceed surplus > > + * and won't grow the pool anywhere else. Not until one of the > > + * sysctls are changed, or the surplus pages go out of use. > > + */ > > + free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid]; > > + if (free_on_other_nodes >= resv_huge_pages) { > > + /* other nodes can satisfy reserve */ > > + target = nr_huge_pages_req; > > + } else { > > + /* this node needs some free to satisfy reserve */ > > + target = max((resv_huge_pages - free_on_other_nodes), > > + nr_huge_pages_req); > > + } > > + try_to_free_low_node(nid, target); > > + while (target < persistent_huge_pages_node(nid)) { > > + struct page *page = dequeue_huge_page_node(NULL, nid); > > + if (!page) > > + break; > > + update_and_free_page(nid, page); > > + } > > + > > + while (target < persistent_huge_pages_node(nid)) { > > + if (!adjust_pool_surplus_node(1, nid)) > > + break; > > + } > > +out: > > + spin_unlock(&hugetlb_lock); > > + return count; > > +} > > Hmm, this function looks very familiar ;) Is there any way we can > consolidate it with set_max_huge_pages()? 
Perhaps the new node helpers > from the beginning of this series will help? A good idea. I think I was more worried about getting something wrong if I did that in my first cut after the dynamic pool was merged and just hadn't recombined. I will work on it for the next version, once we've come up with a consensus on the interface's location. Thanks for the review, Nish -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 51+ messages in thread
end of thread, other threads:[~2008-05-02 17:58 UTC | newest] Thread overview: 51+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2008-04-11 23:44 [PATCH 1/5] hugetlb: numafy several functions Nishanth Aravamudan 2008-04-11 23:47 ` [RFC][PATCH 2/5] " Nishanth Aravamudan 2008-04-11 23:47 ` [PATCH 3/5] hugetlb: interleave dequeueing of huge pages Nishanth Aravamudan 2008-04-11 23:49 ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Nishanth Aravamudan 2008-04-11 23:50 ` [RFC][PATCH 5/5] Documentation: update ABI and hugetlbpage.txt for per-node files Nishanth Aravamudan 2008-04-11 23:56 ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Greg KH 2008-04-12 0:27 ` Nishanth Aravamudan 2008-04-12 9:41 ` Nick Piggin 2008-04-12 10:26 ` Christoph Lameter 2008-04-14 21:09 ` Nishanth Aravamudan 2008-04-13 3:41 ` Greg KH 2008-04-14 21:05 ` Nishanth Aravamudan 2008-04-17 23:16 ` Nishanth Aravamudan 2008-04-17 23:22 ` Christoph Lameter 2008-04-17 23:36 ` Nishanth Aravamudan 2008-04-17 23:39 ` Christoph Lameter 2008-04-18 6:04 ` Nishanth Aravamudan 2008-04-18 17:27 ` Nishanth Aravamudan 2008-04-20 2:24 ` Greg KH 2008-04-21 16:43 ` Nishanth Aravamudan 2008-04-20 2:21 ` Greg KH 2008-04-21 6:06 ` Christoph Lameter 2008-04-21 16:41 ` Nishanth Aravamudan 2008-04-22 5:14 ` Nick Piggin 2008-04-22 16:56 ` Nishanth Aravamudan 2008-04-23 1:03 ` Nick Piggin 2008-04-23 18:32 ` Nishanth Aravamudan 2008-04-23 19:07 ` Adam Litke 2008-04-24 7:13 ` Nick Piggin 2008-04-24 15:54 ` Nishanth Aravamudan 2008-04-27 3:49 ` [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] Nishanth Aravamudan 2008-04-27 5:10 ` Greg KH 2008-04-28 17:22 ` Nishanth Aravamudan 2008-04-28 17:29 ` Greg KH 2008-04-29 17:11 ` Nishanth Aravamudan 2008-04-29 17:22 ` Greg KH 2008-04-29 18:14 ` Nishanth Aravamudan 2008-04-29 18:26 ` Greg KH 2008-04-29 23:48 ` Nishanth Aravamudan 
2008-05-01 3:07 ` Greg KH 2008-05-01 18:25 ` Nishanth Aravamudan 2008-04-30 19:19 ` Nishanth Aravamudan 2008-05-01 3:08 ` Greg KH 2008-05-02 17:58 ` Nishanth Aravamudan 2008-04-28 20:31 ` Christoph Lameter 2008-04-28 20:52 ` Nishanth Aravamudan 2008-04-28 21:29 ` Christoph Lameter 2008-04-29 16:43 ` Nishanth Aravamudan 2008-04-29 17:01 ` Christoph Lameter 2008-04-14 14:52 ` [RFC][PATCH 2/5] hugetlb: numafy several functions Adam Litke 2008-04-14 21:10 ` Nishanth Aravamudan
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).