[PATCH 1/5] hugetlb: numafy several functions

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/5] hugetlb: numafy several functions
@ 2008-04-11 23:44 Nishanth Aravamudan
  2008-04-11 23:47 ` [RFC][PATCH 2/5] " Nishanth Aravamudan
  0 siblings, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-11 23:44 UTC (permalink / raw)
  To: wli; +Cc: clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin

Add node-parameterized helpers for dequeue_huge_page,
alloc_fresh_huge_page, adjust_pool_surplus and try_to_free_low. Also
have update_and_free_page() take a nid parameter. These changes are
necessary to add sysfs attributes to specify the number of static
hugepages on NUMA nodes.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e13a7b2..8faaa16 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -71,6 +71,20 @@ static void enqueue_huge_page(struct page *page)
 	free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(struct vm_area_struct *vma,
+								int nid)
+{
+	struct page *page;
+
+	page = list_entry(hugepage_freelists[nid].next, struct page, lru);
+	list_del(&page->lru);
+	free_huge_pages--;
+	free_huge_pages_node[nid]--;
+	if (vma && vma->vm_flags & VM_MAYSHARE)
+		resv_huge_pages--;
+	return page;
+}
+
 static struct page *dequeue_huge_page(void)
 {
 	int nid;
@@ -78,11 +92,7 @@ static struct page *dequeue_huge_page(void)
 
 	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
 		if (!list_empty(&hugepage_freelists[nid])) {
-			page = list_entry(hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			free_huge_pages--;
-			free_huge_pages_node[nid]--;
+			page = dequeue_huge_page_node(NULL, nid);
 			break;
 		}
 	}
@@ -106,13 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 		nid = zone_to_nid(zone);
 		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
 		    !list_empty(&hugepage_freelists[nid])) {
-			page = list_entry(hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			free_huge_pages--;
-			free_huge_pages_node[nid]--;
-			if (vma && vma->vm_flags & VM_MAYSHARE)
-				resv_huge_pages--;
+			page = dequeue_huge_page_node(vma, nid);
 			break;
 		}
 	}
@@ -120,11 +124,11 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 	return page;
 }
 
-static void update_and_free_page(struct page *page)
+static void update_and_free_page(int nid, struct page *page)
 {
 	int i;
 	nr_huge_pages--;
-	nr_huge_pages_node[page_to_nid(page)]--;
+	nr_huge_pages_node[nid]--;
 	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -148,7 +152,7 @@ static void free_huge_page(struct page *page)
 
 	spin_lock(&hugetlb_lock);
 	if (surplus_huge_pages_node[nid]) {
-		update_and_free_page(page);
+		update_and_free_page(nid, page);
 		surplus_huge_pages--;
 		surplus_huge_pages_node[nid]--;
 	} else {
@@ -164,6 +168,20 @@ static void free_huge_page(struct page *page)
  * balanced by operating on them in a round-robin fashion.
  * Returns 1 if an adjustment was made.
  */
+static int adjust_pool_surplus_node(int delta, int nid)
+{
+	/* To shrink on this node, there must be a surplus page */
+	if (delta < 0 && !surplus_huge_pages_node[nid])
+		return 0;
+	/* Surplus cannot exceed the total number of pages */
+	if (delta > 0 && surplus_huge_pages_node[nid] >=
+					nr_huge_pages_node[nid])
+		return 0;
+	surplus_huge_pages += delta;
+	surplus_huge_pages_node[nid] += delta;
+	return 1;
+}
+
 static int adjust_pool_surplus(int delta)
 {
 	static int prev_nid;
@@ -175,19 +193,9 @@ static int adjust_pool_surplus(int delta)
 		nid = next_node(nid, node_online_map);
 		if (nid == MAX_NUMNODES)
 			nid = first_node(node_online_map);
-
-		/* To shrink on this node, there must be a surplus page */
-		if (delta < 0 && !surplus_huge_pages_node[nid])
-			continue;
-		/* Surplus cannot exceed the total number of pages */
-		if (delta > 0 && surplus_huge_pages_node[nid] >=
-						nr_huge_pages_node[nid])
-			continue;
-
-		surplus_huge_pages += delta;
-		surplus_huge_pages_node[nid] += delta;
-		ret = 1;
-		break;
+		ret = adjust_pool_surplus_node(delta, nid);
+		if (ret == 1)
+			break;
 	} while (nid != prev_nid);
 
 	prev_nid = nid;
@@ -450,7 +458,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 			page = list_entry(hugepage_freelists[nid].next,
 					  struct page, lru);
 			list_del(&page->lru);
-			update_and_free_page(page);
+			update_and_free_page(nid, page);
 			free_huge_pages--;
 			free_huge_pages_node[nid]--;
 			surplus_huge_pages--;
@@ -556,25 +564,35 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 
 #ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
+static void try_to_free_low_node(unsigned long count, int nid)
+{
+	struct page *page, *next;
+	list_for_each_entry_safe(page, next, &hugepage_freelists[nid], lru) {
+		if (count >= nr_huge_pages_node[nid])
+			return;
+		if (PageHighMem(page))
+			continue;
+		list_del(&page->lru);
+		update_and_free_page(nid, page);
+		free_huge_pages--;
+		free_huge_pages_node[nid]--;
+	}
+}
+
 static void try_to_free_low(unsigned long count)
 {
 	int i;
 
 	for (i = 0; i < MAX_NUMNODES; ++i) {
-		struct page *page, *next;
-		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
-			if (count >= nr_huge_pages)
-				return;
-			if (PageHighMem(page))
-				continue;
-			list_del(&page->lru);
-			update_and_free_page(page);
-			free_huge_pages--;
-			free_huge_pages_node[page_to_nid(page)]--;
-		}
+		if (count >= nr_huge_pages)
+			return;
+		try_to_free_low_node(count, i);
 	}
 }
 #else
+static inline void try_to_free_low_node(unsigned long count, int nid)
+{
+}
 static inline void try_to_free_low(unsigned long count)
 {
 }
@@ -639,7 +657,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 		struct page *page = dequeue_huge_page();
 		if (!page)
 			break;
-		update_and_free_page(page);
+		update_and_free_page(page_to_nid(page), page);
 	}
 	while (count < persistent_huge_pages) {
 		if (!adjust_pool_surplus(1))

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [RFC][PATCH 2/5] hugetlb: numafy several functions
  2008-04-11 23:44 [PATCH 1/5] hugetlb: numafy several functions Nishanth Aravamudan
@ 2008-04-11 23:47 ` Nishanth Aravamudan
  2008-04-11 23:47   ` [PATCH 3/5] hugetlb: interleave dequeueing of huge pages Nishanth Aravamudan
  2008-04-14 14:52   ` [RFC][PATCH 2/5] hugetlb: numafy several functions Adam Litke
  0 siblings, 2 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-11 23:47 UTC (permalink / raw)
  To: wli; +Cc: clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin

Allow specifying the number of hugepages to allocate on a particular
node. Our current global sysctl will try its best to put hugepages
equally on each node, but that may not always be desired. This allows
the admin to control the layout of hugepage allocation at a finer level
(while not breaking the existing interface).  Add callbacks in the sysfs
node registration and unregistration functions into hugetlb to add the
nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.

This new interface requires some changes to the nr_hugepages sysctl as
well. We update max_huge_pages via a call to set_max_huge_pages() with
the value written into the nr_hugepages sysctl, even when only reading.
This is not very efficient. More importantly when nr_hugepages can be
altered by other interfaces (per-node sysfs attributes), this side
effect of reading can invoke set_max_huge_pages with a value less than
nr_hugepages, resulting in hugepages being freed! Rather than relying on
set_max_huge_pages() at all in the read-path, update max_huge_pages
(which is still the sysctl variable) to the appropriate value on reads
(before invoking the generic sysctl handler) and call
set_max_huge_pages() on writes (after invoking the generic sysctl
handler).

Thanks to Dean Luick for finding some bugs in my previous posting of the
patch.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

---
Same patch as before, but an RFC this time to decide if
/sys/devices/system/node is where we want to be putting the pool
allocators. As discussed in a separate thread with Nick ("[patch 00/17]
multi size, and giant hugetlb page support, 1GB hugetlb for x86" on
linux-mm), perhaps a better location would be /sys/kernel, but then we'd
need to replicate a bit of the NUMA layout into /sys/kernel. However,
the advantage would be when we put the multiple hugepage pool
allocation interfaces in /sys/kernel, all of the hugetlb related
interfaces will be in one place (as presumably we'll want per-node
control on a per-pool basis!).

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 4c2caff..96aa493 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -154,6 +154,7 @@ int register_node(struct node *node, int num, struct node *parent)
 		sysdev_create_file(&node->sysdev, &attr_meminfo);
 		sysdev_create_file(&node->sysdev, &attr_numastat);
 		sysdev_create_file(&node->sysdev, &attr_distance);
+		hugetlb_register_node(node);
 	}
 	return error;
 }
@@ -171,6 +172,7 @@ void unregister_node(struct node *node)
 	sysdev_remove_file(&node->sysdev, &attr_meminfo);
 	sysdev_remove_file(&node->sysdev, &attr_numastat);
 	sysdev_remove_file(&node->sysdev, &attr_distance);
+	hugetlb_unregister_node(node);
 
 	sysdev_unregister(&node->sysdev);
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a79e80b..ac8c8d9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -6,7 +6,9 @@
 #ifdef CONFIG_HUGETLB_PAGE
 
 #include <linux/mempolicy.h>
+#include <linux/node.h>
 #include <linux/shm.h>
+#include <linux/sysdev.h>
 #include <asm/tlbflush.h>
 #include <asm/hugetlb.h>
 
@@ -27,6 +29,13 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
+#ifdef CONFIG_NUMA
+int hugetlb_register_node(struct node *);
+void hugetlb_unregister_node(struct node *);
+#else
+#define hugetlb_register_node(node)		do {} while (0)
+#define hugetlb_unregister_node(node)		do {} while (0)
+#endif
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
@@ -70,6 +79,8 @@ static inline unsigned long hugetlb_total_pages(void)
 #define unmap_hugepage_range(vma, start, end)	BUG()
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
+#define hugetlb_register_node(node)		do {} while (0)
+#define hugetlb_unregister_node(node)		do {} while (0)
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
 #define prepare_hugepage_range(addr,len)	(-EINVAL)
 #define pmd_huge(x)	0
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8faaa16..d35b087 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -562,7 +562,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 	return nr;
 }
 
-#ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low_node(unsigned long count, int nid)
 {
@@ -578,7 +577,14 @@ static void try_to_free_low_node(unsigned long count, int nid)
 		free_huge_pages_node[nid]--;
 	}
 }
+#else
+static inline void try_to_free_low_node(unsigned long count, int nid)
+{
+}
+#endif
 
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_HIGHMEM
 static void try_to_free_low(unsigned long count)
 {
 	int i;
@@ -590,18 +596,15 @@ static void try_to_free_low(unsigned long count)
 	}
 }
 #else
-static inline void try_to_free_low_node(unsigned long count, int nid)
-{
-}
 static inline void try_to_free_low(unsigned long count)
 {
 }
 #endif
 
 #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
-static unsigned long set_max_huge_pages(unsigned long count)
+static void set_max_huge_pages(unsigned long count)
 {
-	unsigned long min_count, ret;
+	unsigned long min_count;
 
 	/*
 	 * Increase the pool size
@@ -664,17 +667,21 @@ static unsigned long set_max_huge_pages(unsigned long count)
 			break;
 	}
 out:
-	ret = persistent_huge_pages;
 	spin_unlock(&hugetlb_lock);
-	return ret;
 }
 
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 			   struct file *file, void __user *buffer,
 			   size_t *length, loff_t *ppos)
 {
+	if (!write) {
+		spin_lock(&hugetlb_lock);
+		max_huge_pages = persistent_huge_pages;
+		spin_unlock(&hugetlb_lock);
+	}
 	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
-	max_huge_pages = set_max_huge_pages(max_huge_pages);
+	if (write)
+		set_max_huge_pages(max_huge_pages);
 	return 0;
 }
 
@@ -729,6 +736,115 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 		nid, surplus_huge_pages_node[nid]);
 }
 
+#ifdef CONFIG_NUMA
+static ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *dev,
+							char *buf)
+{
+	return sprintf(buf, "%u\n", nr_huge_pages_node[dev->id]);
+}
+
+#define persistent_huge_pages_node(nid)	\
+		(nr_huge_pages_node[nid] - surplus_huge_pages_node[nid])
+static ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev,
+					const char *buf, size_t count)
+{
+	int nid = dev->id;
+	unsigned long target;
+	unsigned long free_on_other_nodes;
+	unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10);
+
+	/*
+	 * Increase the pool size on the node
+	 * First take pages out of surplus state.  Then make up the
+	 * remaining difference by allocating fresh huge pages.
+	 *
+	 * We might race with alloc_buddy_huge_page() here and be unable
+	 * to convert a surplus huge page to a normal huge page. That is
+	 * not critical, though, it just means the overall size of the
+	 * pool might be one hugepage larger than it needs to be, but
+	 * within all the constraints specified by the sysctls.
+	 */
+	spin_lock(&hugetlb_lock);
+	while (surplus_huge_pages_node[nid] &&
+		nr_huge_pages_req > persistent_huge_pages_node(nid)) {
+		if (!adjust_pool_surplus_node(-1, nid))
+			break;
+	}
+
+	while (nr_huge_pages_req > persistent_huge_pages_node(nid)) {
+		struct page *ret;
+		/*
+		 * If this allocation races such that we no longer need the
+		 * page, free_huge_page will handle it by freeing the page
+		 * and reducing the surplus.
+		 */
+		spin_unlock(&hugetlb_lock);
+		ret = alloc_fresh_huge_page_node(nid);
+		spin_lock(&hugetlb_lock);
+		if (!ret)
+			goto out;
+
+	}
+
+	if (nr_huge_pages_req >= nr_huge_pages_node[nid])
+		goto out;
+
+	/*
+	 * Decrease the pool size
+	 * First return free pages to the buddy allocator (being careful
+	 * to keep enough around to satisfy reservations).  Then place
+	 * pages into surplus state as needed so the pool will shrink
+	 * to the desired size as pages become free.
+	 *
+	 * By placing pages into the surplus state independent of the
+	 * overcommit value, we are allowing the surplus pool size to
+	 * exceed overcommit. There are few sane options here. Since
+	 * alloc_buddy_huge_page() is checking the global counter,
+	 * though, we'll note that we're not allowed to exceed surplus
+	 * and won't grow the pool anywhere else. Not until one of the
+	 * sysctls are changed, or the surplus pages go out of use.
+	 */
+	free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid];
+	if (free_on_other_nodes >= resv_huge_pages) {
+		/* other nodes can satisfy reserve */
+		target = nr_huge_pages_req;
+	} else {
+		/* this node needs some free to satisfy reserve */
+		target = max((resv_huge_pages - free_on_other_nodes),
+						nr_huge_pages_req);
+	}
+	try_to_free_low_node(nid, target);
+	while (target < persistent_huge_pages_node(nid)) {
+		struct page *page = dequeue_huge_page_node(NULL, nid);
+		if (!page)
+			break;
+		update_and_free_page(nid, page);
+	}
+
+	while (target < persistent_huge_pages_node(nid)) {
+		if (!adjust_pool_surplus_node(1, nid))
+			break;
+	}
+out:
+	spin_unlock(&hugetlb_lock);
+	return count;
+}
+
+static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
+			hugetlb_read_nr_hugepages_node,
+			hugetlb_write_nr_hugepages_node);
+
+int hugetlb_register_node(struct node *node)
+{
+	return sysdev_create_file(&node->sysdev, &attr_nr_hugepages);
+}
+
+void hugetlb_unregister_node(struct node *node)
+{
+	sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);
+}
+#endif
+
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 3/5] hugetlb: interleave dequeueing of huge pages
  2008-04-11 23:47 ` [RFC][PATCH 2/5] " Nishanth Aravamudan
@ 2008-04-11 23:47   ` Nishanth Aravamudan
  2008-04-11 23:49     ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Nishanth Aravamudan
  2008-04-14 14:52   ` [RFC][PATCH 2/5] hugetlb: numafy several functions Adam Litke
  1 sibling, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-11 23:47 UTC (permalink / raw)
  To: wli; +Cc: clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin

Currently, when shrinking the hugetlb pool, we free all of the pages on
node 0, then all the pages on node 1, etc. With this patch we instead
interleave over the nodes with memory. If some particular node should
be cleared first, the to-be-introduced sysfs allocator can be used for
finer-grained control. This also helps with keeping the pool balanced as
we change the pool at run-time.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d35b087..18ece9e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -87,15 +87,32 @@ static struct page *dequeue_huge_page_node(struct vm_area_struct *vma,
 
 static struct page *dequeue_huge_page(void)
 {
-	int nid;
 	struct page *page = NULL;
+	int start_nid;
+	int next_nid;
+
+	start_nid = hugetlb_next_nid;
+
+	do {
+		if (!list_empty(&hugepage_freelists[hugetlb_next_nid]))
+			page = dequeue_huge_page_node(NULL, hugetlb_next_nid);
+		/*
+		 * Use a helper variable to find the next node and then
+		 * copy it back to hugetlb_next_nid afterwards:
+		 * otherwise there's a window in which a racer might
+		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+		 * But we don't need to use a spin_lock here: it really
+		 * doesn't matter if occasionally a racer chooses the
+		 * same nid as we do.  Move nid forward in the mask even
+		 * if we just successfully allocated a hugepage so that
+		 * the next caller gets hugepages on the next node.
+		 */
+		next_nid = next_node(hugetlb_next_nid, node_online_map);
+		if (next_nid == MAX_NUMNODES)
+			next_nid = first_node(node_online_map);
+		hugetlb_next_nid = next_nid;
+	} while (!page && hugetlb_next_nid != start_nid);
 
-	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
-		if (!list_empty(&hugepage_freelists[nid])) {
-			page = dequeue_huge_page_node(NULL, nid);
-			break;
-		}
-	}
 	return page;
 }
 

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-11 23:47   ` [PATCH 3/5] hugetlb: interleave dequeueing of huge pages Nishanth Aravamudan
@ 2008-04-11 23:49     ` Nishanth Aravamudan
  2008-04-11 23:50       ` [RFC][PATCH 5/5] Documentation: update ABI and hugetlbpage.txt for per-node files Nishanth Aravamudan
  2008-04-11 23:56       ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Greg KH
  0 siblings, 2 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-11 23:49 UTC (permalink / raw)
  To: wli; +Cc: clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin, gregkh

/sys/devices/system/node represents the current NUMA configuration of
the machine, but is undocumented in the ABI files. Add bare-bones
documentation for these files.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

---
Greg, is something like this what you'd want? Should I be striving for
more detail? Should the file have a preamble indicating none of it
exists if !NUMA?

diff --git a/Documentation/ABI/testing/sysfs-devices-system-node b/Documentation/ABI/testing/sysfs-devices-system-node
new file mode 100644
index 0000000..97d6145
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-system-node
@@ -0,0 +1,59 @@
+What:		/sys/devices/system/node/has_cpu
+Date:		October 2007
+Contact:	Lee Schermerhorn <Lee.Schermerhorn@hp.com>
+Description:
+		List of nodes which have one or more CPUs.
+
+What:		/sys/devices/system/node/has_high_memory
+Date:		October 2007
+Contact:	Lee Schermerhorn <Lee.Schermerhorn@hp.com>
+Description:
+		List of nodes which have regular or high memory. This
+		file will not exist if CONFIG_HIGHMEM is off.
+
+What:		/sys/devices/system/node/has_normal_memory
+Date:		October 2007
+Contact:	Lee Schermerhorn <Lee.Schermerhorn@hp.com>
+Description:
+		List of nodes which have regular memory.
+
+What:		/sys/devices/system/node/online
+Date:		October 2007
+Contact:	Lee Schermerhorn <Lee.Schermerhorn@hp.com>
+Description:
+		List of online nodes.
+
+What:		/sys/devices/system/node/possible
+Date:		October 2007
+Contact:	Lee Schermerhorn <Lee.Schermerhorn@hp.com>
+Description:
+		List of nodes which could go online.
+
+What:		/sys/devices/system/node/<node>/<cpu>
+Date:		June 2006
+Contact:	Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Description:
+		Symlink to the sysfs CPU information for each <cpu> on
+		<node>.
+
+What:		/sys/devices/system/node/<node>/cpumap
+Date:
+Contact:	Christoph Lameter <clameter@sgi.com>
+Description:
+		Hexadecimal mask of which CPUs are on <node>.
+
+What:		/sys/devices/system/node/<node>/meminfo
+Date:
+Contact:	Christoph Lameter <clameter@sgi.com>
+Description:
+		Memory information for <node>.
+		NOTE: This file violates the sysfs rules for one value
+		per file.
+
+What:		/sys/devices/system/node/<node>/numastat
+Date:
+Contact:	Christoph Lameter <clameter@sgi.com>
+Description:
+		NUMA statistics for <node>.
+		NOTE: This file violates the sysfs rules for one value
+		per file.

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [RFC][PATCH 5/5] Documentation: update ABI and hugetlbpage.txt for per-node files
  2008-04-11 23:49     ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Nishanth Aravamudan
@ 2008-04-11 23:50       ` Nishanth Aravamudan
  2008-04-11 23:56       ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Greg KH
  1 sibling, 0 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-11 23:50 UTC (permalink / raw)
  To: wli; +Cc: clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin, gregkh

---
This patch will change if we decide to move the per-node interface to
another location in sysfs.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/Documentation/ABI/testing/sysfs-devices-system-node b/Documentation/ABI/testing/sysfs-devices-system-node
index 97d6145..5766902 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-node
+++ b/Documentation/ABI/testing/sysfs-devices-system-node
@@ -57,3 +57,10 @@ Description:
 		NUMA statistics for <node>.
 		NOTE: This file violates the sysfs rules for one value
 		per file.
+
+What:		/sys/devices/system/node/<node>/nr_hugepages
+Date:		April 2008
+Contact:	Nish Aravamudan <nacc@us.ibm.com>
+Description:
+		Interface to allocate (and check) hugepages on <node>.
+		This file will not exist if CONFIG_HUGETLB_PAGE is off.
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index 3102b81..b749607 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -80,6 +80,13 @@ of getting physical contiguous pages is still very high). In either
 case, adminstrators will want to verify the number of hugepages actually
 allocated by checking the sysctl or meminfo.
 
+/sys/devices/system/node/nodeX/nr_hugepages allows for finer-grained
+control of the hugepage pool on NUMA machines. The functionality is the
+same as for nr_hugepages, but the effects are restricted to the node in
+question. Similarly, administrators will want to verify the number of
+hugepages actually allocated or freed by checking the per-node meminfo
+or nr_hugepages file.
+
 /proc/sys/vm/nr_overcommit_hugepages indicates how large the pool of
 hugepages can grow, if more hugepages than /proc/sys/vm/nr_hugepages are
 requested by applications. echo'ing any non-zero value into this file

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-11 23:49     ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Nishanth Aravamudan
  2008-04-11 23:50       ` [RFC][PATCH 5/5] Documentation: update ABI and hugetlbpage.txt for per-node files Nishanth Aravamudan
@ 2008-04-11 23:56       ` Greg KH
  2008-04-12  0:27         ` Nishanth Aravamudan
  2008-04-12  9:41         ` Nick Piggin
  1 sibling, 2 replies; 51+ messages in thread
From: Greg KH @ 2008-04-11 23:56 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: wli, clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin

On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote:
> /sys/devices/system/node represents the current NUMA configuration of
> the machine, but is undocumented in the ABI files. Add bare-bones
> documentation for these files.
> 
> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> 
> ---
> Greg, is something like this what you'd want?

Yes it is, thanks for doing it.

> Should I be striving for more detail?

You might want to show what you mean by "list of nodes".  But other than
that, this is a great start.

> Should the file have a preamble indicating none of it exists if !NUMA?

Yes, that would be helpful for people who might worry that they do not
see these files :)

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-11 23:56       ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Greg KH
@ 2008-04-12  0:27         ` Nishanth Aravamudan
  2008-04-12  9:41         ` Nick Piggin
  1 sibling, 0 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-12  0:27 UTC (permalink / raw)
  To: Greg KH; +Cc: wli, clameter, agl, luick, Lee.Schermerhorn, linux-mm, npiggin

On 11.04.2008 [16:56:48 -0700], Greg KH wrote:
> On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote:
> > /sys/devices/system/node represents the current NUMA configuration of
> > the machine, but is undocumented in the ABI files. Add bare-bones
> > documentation for these files.
> > 
> > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > 
> > ---
> > Greg, is something like this what you'd want?
> 
> Yes it is, thanks for doing it.

Ok, good.

> > Should I be striving for more detail?
> 
> You might want to show what you mean by "list of nodes".  But other than
> that, this is a great start.

Yeah, I was thinking for a few of the files, an example output might
clarify their use{,fulness}.

> > Should the file have a preamble indicating none of it exists if !NUMA?
> 
> Yes, that would be helpful for people who might worry that they do not
> see these files :)

Ok, I'll make that change in the next version.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-11 23:56       ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Greg KH
  2008-04-12  0:27         ` Nishanth Aravamudan
@ 2008-04-12  9:41         ` Nick Piggin
  2008-04-12 10:26           ` Christoph Lameter
  2008-04-13  3:41           ` Greg KH
  1 sibling, 2 replies; 51+ messages in thread
From: Nick Piggin @ 2008-04-12  9:41 UTC (permalink / raw)
  To: Greg KH
  Cc: Nishanth Aravamudan, wli, clameter, agl, luick, Lee.Schermerhorn,
	linux-mm

On Fri, Apr 11, 2008 at 04:56:48PM -0700, Greg KH wrote:
> On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote:
> > /sys/devices/system/node represents the current NUMA configuration of
> > the machine, but is undocumented in the ABI files. Add bare-bones
> > documentation for these files.
> > 
> > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > 
> > ---
> > Greg, is something like this what you'd want?
> 
> Yes it is, thanks for doing it.

Can you comment on the aspect of configuring various kernel hugetlb 
configuration parameters? Specifically, what directory it should go in?
IMO it should be /sys/kernel/*

/sys/devices/system/etc should be fine eg. for showing how many pages are
available in a given node, or what kinds of TLBs the CPU has, but I would
have thought that configuring the kernel's hugetlb settings should be
in /sys/kernel.

Then again, I can't say I'm up to speed on sysfs policy so the main thing
I care about is that it is consistent and correct.

Thanks,
Nick

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-12  9:41         ` Nick Piggin
@ 2008-04-12 10:26           ` Christoph Lameter
  2008-04-14 21:09             ` Nishanth Aravamudan
  2008-04-13  3:41           ` Greg KH
  1 sibling, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2008-04-12 10:26 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Greg KH, Nishanth Aravamudan, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Sat, 12 Apr 2008, Nick Piggin wrote:

> Can you comment on the aspect of configuring various kernel hugetlb 
> configuration parameters? Especifically, what directory it should go in?
> IMO it should be /sys/kernel/*

Yes that would be more consistent. However, it will break the tools that 
now access /sys/devices.

Something like

/sys/kernel/node/<nodenr>/<numa setting>

and

/sys/kernel/memory/<global setting>

?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-12 10:26           ` Christoph Lameter
@ 2008-04-14 21:09             ` Nishanth Aravamudan
  0 siblings, 0 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-14 21:09 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Greg KH, wli, agl, luick, Lee.Schermerhorn, linux-mm

On 12.04.2008 [03:26:35 -0700], Christoph Lameter wrote:
> On Sat, 12 Apr 2008, Nick Piggin wrote:
> 
> > Can you comment on the aspect of configuring various kernel hugetlb 
> > configuration parameters? Especifically, what directory it should go in?
> > IMO it should be /sys/kernel/*
> 
> Yes that would be more consistent. However, it will break the tools that 
> now access /sys/devices.

Since the ABI was undocumented, do we have any idea what those tools
would be? libnuma seems to have some references to sysfs but they result
in warnings, not errors, AFAICT (and I will add libnuma as a consumer of
the interfaces in question, in my patch to add the ABI documentation).

> Something like
> 
> /sys/kernel/node/<nodenr>/<numa setting>

Well, right now, the node devices are anchored in the right place, I
think, and represent a real non-global property (unlike the /sys/kernel
bits). My understanding is that Nick is wondering if
/sys/devices/system/node/nodeX/* should be read-only or if
kernel-changing attributes should also be placed there? You had a
similar question earlier, and we never really resolved it, beyond saying
this was the first attempt at adding a tunable in the directory :)

> and
> 
> /sys/kernel/memory/<global setting>

This is an interesting idea. However, moving the meminfo-like files into
this directory would probably require us obeying the sysfs rules (which
many of the /sys/devices/system/node files do not!) for
one-value-per-file, which would make meminfo lookup non-atomic/less
useful? So, what settings are you thinking go there?

Or, am I completely misunderstanding, and are the settings you refer to in
both cases strictly hugetlb-related settings?

Thanks,
Nish

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-12  9:41         ` Nick Piggin
  2008-04-12 10:26           ` Christoph Lameter
@ 2008-04-13  3:41           ` Greg KH
  2008-04-14 21:05             ` Nishanth Aravamudan
  1 sibling, 1 reply; 51+ messages in thread
From: Greg KH @ 2008-04-13  3:41 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Nishanth Aravamudan, wli, clameter, agl, luick, Lee.Schermerhorn,
	linux-mm

On Sat, Apr 12, 2008 at 11:41:18AM +0200, Nick Piggin wrote:
> On Fri, Apr 11, 2008 at 04:56:48PM -0700, Greg KH wrote:
> > On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote:
> > > /sys/devices/system/node represents the current NUMA configuration of
> > > the machine, but is undocumented in the ABI files. Add bare-bones
> > > documentation for these files.
> > > 
> > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > > 
> > > ---
> > > Greg, is something like this what you'd want?
> > 
> > Yes it is, thanks for doing it.
> 
> Can you comment on the aspect of configuring various kernel hugetlb 
> configuration parameters? Especifically, what directory it should go in?
> IMO it should be /sys/kernel/*

I don't really know.

> /sys/devices/system/etc should be fine eg. for showing how many pages are
> available in a given node, or what kinds of TLBs the CPU has, but I would
> have thought that configuring the kernel's hugetlb settings should be
> in /sys/kernel.

/sys/devices/system is for "sysdev" devices, a breed of device
structures that are problematic to use, and are on my TODO list to
rework.  If you need a hugetlb parameter to be tied to a cpu or other
system device, then it should go under here.

Otherwise, if it is just a "system wide" parameter, then put it in
/sys/kernel/

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-13  3:41           ` Greg KH
@ 2008-04-14 21:05             ` Nishanth Aravamudan
  2008-04-17 23:16               ` Nishanth Aravamudan
  0 siblings, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-14 21:05 UTC (permalink / raw)
  To: Greg KH; +Cc: Nick Piggin, wli, clameter, agl, luick, Lee.Schermerhorn,
	linux-mm

On 12.04.2008 [20:41:36 -0700], Greg KH wrote:
> On Sat, Apr 12, 2008 at 11:41:18AM +0200, Nick Piggin wrote:
> > On Fri, Apr 11, 2008 at 04:56:48PM -0700, Greg KH wrote:
> > > On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote:
> > > > /sys/devices/system/node represents the current NUMA configuration of
> > > > the machine, but is undocumented in the ABI files. Add bare-bones
> > > > documentation for these files.
> > > > 
> > > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > > > 
> > > > ---
> > > > Greg, is something like this what you'd want?
> > > 
> > > Yes it is, thanks for doing it.
> > 
> > Can you comment on the aspect of configuring various kernel hugetlb 
> > configuration parameters? Especifically, what directory it should go in?
> > IMO it should be /sys/kernel/*
> 
> I don't really know.
> 
> > /sys/devices/system/etc should be fine eg. for showing how many pages are
> > available in a given node, or what kinds of TLBs the CPU has, but I would
> > have thought that configuring the kernel's hugetlb settings should be
> > in /sys/kernel.
> 
> /sys/devices/system are for "sysdev" devices, a breed of device
> structures that are problimatic to use, and are on my TODO list to
> rework.  If you need a hugetlb paramter to be tied to a cpu or other
> system device, then it should go under here.
> 
> Otherwise, if it is just a "system wide" parameter, then put it in
> /sys/kernel/

We have both, and that's kind of where things are being discussed right
now.

Currently, we have:

/proc/sys/vm/nr_hugepages
/proc/sys/vm/nr_overcommit_hugepages

which are global sysctls.

My patchset would add:

/sys/devices/system/node/nodeX/nr_hugepages

to allow for finer-grained control of the hugetlb pool allocation.

Nick/Andi's patchset would modify /proc/sys/vm/nr_hugepages to allow
specifying the pool sizes for multiple hugepage sizes.

To make my patchset and Nick's work well together, I think we'd need a
per-node, per-hugepage-size interface in sysfs. I pointed out to Nick
that it might be better to make the extended interface (supporting
multiple hugepage sizes) be in sysfs altogether, and leave
/proc/sys/vm/nr_hugepages alone (as only controlling the default
hugepage size).

That would leave us with [1]:

/sys/kernel/nr_hugepages --> nr_hugepages_2M
/sys/kernel/nr_hugepages_2M
/sys/kernel/nr_hugepages_1G
/sys/kernel/nr_overcommit_hugepages --> nr_overcommit_hugepages_2M
/sys/kernel/nr_overcommit_hugepages_2M
/sys/kernel/nr_overcommit_hugepages_1G

and [2]

/sys/devices/system/node/nodeX/nr_hugepages --> nr_hugepages_2M
/sys/devices/system/node/nodeX/nr_hugepages_2M
/sys/devices/system/node/nodeX/nr_hugepages_1G

The questions I see are (with my answers):

Is this separation correct?

	- I believe this puts the globals in one place and the per-nodes
	  in another (both of which are correct) keeping things
	  accurate. The per-node interface would be the first writable
	  attribute in /sys/devices/system/node, though.

Is this separation confusing to an administrator?

	- Similar to the previous question, I think the separation
	  corresponds well to the system's layout.

Is there a better way of presenting these attributes?

	- Nick's alternative was to (I think, please CMIIW) have:

	/sys/kernel/hugetlb/2M/nr_hugepages
	/sys/kernel/hugetlb/2M/nr_overcommit_hugepages
	/sys/kernel/hugetlb/2M/nodeX/nr_hugepages
	/sys/kernel/hugetlb/2M/nodeX/nr_overcommit_hugepages

	with perhaps symlinks in /sys/kernel/ or /sys/kernel/hugetlb
	directly to the default pools. And similar directories/files for
	1G pages. This seems like a lot of duplication of the NUMA
	layout, but I can see it also being better in that all of the
	hugetlb-related interface is in one place. [3]

Do you see a particular more-sysfs-way here, Greg?

Thanks for reading this particularly long e-mail,
Nish

[1] Nick suggested using directories in /sys/kernel per-hugepage-size,
but I'm not sure how they should be named, so I went with the simpler
filename-style, to make the point clearer.

[2] I have a patch to allow for per-node dynamic pool control, but it's
pretty gross. Right now, we let the memory policy enforce where we get
hugepages from, presuming we can allocate there. If we had per-node
control, we'd need some way to specify a restriction on how many
hugepages can be allocated on a particular node down to alloc_pages, or
use a round-robin style, which would probably break mempolicies. For
now, I've let the patch alone while I try to find a better way.

[3] Is there an in-between, perhaps, that we could have the real files
in /sys/devices/system/node, but have symlinks, like
/sys/kernel/hugetlb/nodeX/nr_hugepages_2M -->
/sys/devices/system/node/nodeX/nr_hugepages_2M ? That seems like
overkill...

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-14 21:05             ` Nishanth Aravamudan
@ 2008-04-17 23:16               ` Nishanth Aravamudan
  2008-04-17 23:22                 ` Christoph Lameter
  0 siblings, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-17 23:16 UTC (permalink / raw)
  To: Greg KH; +Cc: Nick Piggin, wli, clameter, agl, luick, Lee.Schermerhorn,
	linux-mm

On 14.04.2008 [14:05:06 -0700], Nishanth Aravamudan wrote:
> On 12.04.2008 [20:41:36 -0700], Greg KH wrote:
> > On Sat, Apr 12, 2008 at 11:41:18AM +0200, Nick Piggin wrote:
> > > On Fri, Apr 11, 2008 at 04:56:48PM -0700, Greg KH wrote:
> > > > On Fri, Apr 11, 2008 at 04:49:13PM -0700, Nishanth Aravamudan wrote:
> > > > > /sys/devices/system/node represents the current NUMA configuration of
> > > > > the machine, but is undocumented in the ABI files. Add bare-bones
> > > > > documentation for these files.
> > > > > 
> > > > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > > > > 
> > > > > ---
> > > > > Greg, is something like this what you'd want?
> > > > 
> > > > Yes it is, thanks for doing it.
> > > 
> > > Can you comment on the aspect of configuring various kernel hugetlb 
> > > configuration parameters? Especifically, what directory it should go in?
> > > IMO it should be /sys/kernel/*
> > 
> > I don't really know.
> > 
> > > /sys/devices/system/etc should be fine eg. for showing how many pages are
> > > available in a given node, or what kinds of TLBs the CPU has, but I would
> > > have thought that configuring the kernel's hugetlb settings should be
> > > in /sys/kernel.
> > 
> > /sys/devices/system are for "sysdev" devices, a breed of device
> > structures that are problimatic to use, and are on my TODO list to
> > rework.  If you need a hugetlb paramter to be tied to a cpu or other
> > system device, then it should go under here.
> > 
> > Otherwise, if it is just a "system wide" parameter, then put it in
> > /sys/kernel/
> 
> We have both, and that's kind of where things are being discussed right
> now.

<snip>

> Do you see a particular more-sysfs-way here, Greg?

So I've received no comments yet? Perhaps I should leave things the way
they are (per-node files in /sys/devices/system/node) and add
nr_hugepages to /sys/kernel?

Do we want to put it in a subdirectory of /sys/kernel? What should the
subdir be called? "hugetlb" (refers to the implementation?) or
"hugepages"?

Do we want nr_hugepages in sysfs to actually be a symlink to the
underlying default hugepage size (in my patch, there will be only one,
but it allows for future-proofing)? Or I can make it a real file in my
patch and the multiple hugepage sizes at run-time patchset (which I'm
willing to help with) can change it to a symlink?

Thoughts?
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-17 23:16               ` Nishanth Aravamudan
@ 2008-04-17 23:22                 ` Christoph Lameter
  2008-04-17 23:36                   ` Nishanth Aravamudan
  2008-04-22  5:14                   ` Nick Piggin
  0 siblings, 2 replies; 51+ messages in thread
From: Christoph Lameter @ 2008-04-17 23:22 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm

On Thu, 17 Apr 2008, Nishanth Aravamudan wrote:

> > Do you see a particular more-sysfs-way here, Greg?
> 
> So I've received no comments yet? Perhaps I should leave things the way
> they are (per-node files in /sys/devices/system/node) and add
> nr_hugepages to /sys/kernel?

The strange location of the node directories has always irked me.
> 
> Do we want to put it in a subdirectory of /sys/kernel? What should the
> subdir be called? "hugetlb" (refers to the implementation?) or
> "hugepages"?

How about:

/sys/kernel/node<nr>/<node specific setting/status files> ?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-17 23:22                 ` Christoph Lameter
@ 2008-04-17 23:36                   ` Nishanth Aravamudan
  2008-04-17 23:39                     ` Christoph Lameter
  2008-04-22  5:14                   ` Nick Piggin
  1 sibling, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-17 23:36 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm

On 17.04.2008 [16:22:17 -0700], Christoph Lameter wrote:
> On Thu, 17 Apr 2008, Nishanth Aravamudan wrote:
> 
> > > Do you see a particular more-sysfs-way here, Greg?
> > 
> > So I've received no comments yet? Perhaps I should leave things the way
> > they are (per-node files in /sys/devices/system/node) and add
> > nr_hugepages to /sys/kernel?
> 
> The strange location of the node directories has always irked me.

But it's now part of the ABI? We'd have to deprecate the current
location and such. I'm ok with that, or maybe duplicating the
information for now, while deprecating the old location, but don't want
to spend the time doing that if we don't want it to be changed.

> > Do we want to put it in a subdirectory of /sys/kernel? What should the
> > subdir be called? "hugetlb" (refers to the implementation?) or
> > "hugepages"?
> 
> How about:
> 
> /sys/kernel/node<nr>/<node specific setting/status files> ?

That seems fine to me. I will work on it. However, as I mentioned in a
previous e-mail, the files in /sys/devices/system/node/node<nr>/
already violate the "one value per file" rule in several instances. I'm
guessing Greg won't want me moving the files and keeping that violation?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-17 23:36                   ` Nishanth Aravamudan
@ 2008-04-17 23:39                     ` Christoph Lameter
  2008-04-18  6:04                       ` Nishanth Aravamudan
  2008-04-20  2:21                       ` Greg KH
  0 siblings, 2 replies; 51+ messages in thread
From: Christoph Lameter @ 2008-04-17 23:39 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm

On Thu, 17 Apr 2008, Nishanth Aravamudan wrote:

> That seems fine to me. I will work on it. However, as I mentioned in a
> previous e-mail, the files in /sys/devices/system/node/node<nr>/
> already violate the "one value per file" rule in several instances. I'm
> guessing Greg won't want me moving the files and keeping that violation?

That violation is replicated in /proc/meminfo /proc/vmstat etc etc.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-17 23:39                     ` Christoph Lameter
@ 2008-04-18  6:04                       ` Nishanth Aravamudan
  2008-04-18 17:27                         ` Nishanth Aravamudan
  2008-04-20  2:21                       ` Greg KH
  1 sibling, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-18  6:04 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm

On 17.04.2008 [16:39:56 -0700], Christoph Lameter wrote:
> On Thu, 17 Apr 2008, Nishanth Aravamudan wrote:
> 
> > That seems fine to me. I will work on it. However, as I mentioned in a
> > previous e-mail, the files in /sys/devices/system/node/node<nr>/
> > already violate the "one value per file" rule in several instances. I'm
> > guessing Greg won't want me moving the files and keeping that violation?
> 
> That violation is replicated in /proc/meminfo /proc/vmstat etc etc.

Right, but /proc doesn't have such a restriction (the "one value per
file" rule). I'm not sure how the meminfo, etc. files in sysfs got put
in past Greg, but that's how it is :)

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-18  6:04                       ` Nishanth Aravamudan
@ 2008-04-18 17:27                         ` Nishanth Aravamudan
  2008-04-20  2:24                           ` Greg KH
  0 siblings, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-18 17:27 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm

On 17.04.2008 [23:04:04 -0700], Nishanth Aravamudan wrote:
> On 17.04.2008 [16:39:56 -0700], Christoph Lameter wrote:
> > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote:
> > 
> > > That seems fine to me. I will work on it. However, as I mentioned in a
> > > previous e-mail, the files in /sys/devices/system/node/node<nr>/
> > > already violate the "one value per file" rule in several instances. I'm
> > > guessing Greg won't want me moving the files and keeping that violation?
> > 
> > That violation is replicated in /proc/meminfo /proc/vmstat etc etc.
> 
> Right, but /proc doesn't have such a restriction (the "one value per
> file" rule). I'm not sure how the meminfo, etc. files in sysfs got put
> in past Greg, but that's how it is :)

Greg, can you give any insight here? Are we better off leaving the files
in question in /sys/devices/system/node/node<nr>/{meminfo,numastat,etc}
since they are part of the ABI there and already violate the rules for
sysfs? Or can we move them to /sys/kernel and continue to violate the
rules? In this case, I don't see any way to provide a "snapshot" of the
system's memory information without all the values being in one file?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-18 17:27                         ` Nishanth Aravamudan
@ 2008-04-20  2:24                           ` Greg KH
  2008-04-21 16:43                             ` Nishanth Aravamudan
  0 siblings, 1 reply; 51+ messages in thread
From: Greg KH @ 2008-04-20  2:24 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Christoph Lameter, Nick Piggin, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Fri, Apr 18, 2008 at 10:27:30AM -0700, Nishanth Aravamudan wrote:
> On 17.04.2008 [23:04:04 -0700], Nishanth Aravamudan wrote:
> > On 17.04.2008 [16:39:56 -0700], Christoph Lameter wrote:
> > > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote:
> > > 
> > > > That seems fine to me. I will work on it. However, as I mentioned in a
> > > > previous e-mail, the files in /sys/devices/system/node/node<nr>/
> > > > already violate the "one value per file" rule in several instances. I'm
> > > > guessing Greg won't want me moving the files and keeping that violation?
> > > 
> > > That violation is replicated in /proc/meminfo /proc/vmstat etc etc.
> > 
> > Right, but /proc doesn't have such a restriction (the "one value per
> > file" rule). I'm not sure how the meminfo, etc. files in sysfs got put
> > in past Greg, but that's how it is :)
> 
> Greg, can you give any insight here? Are we better off leaving the files
> in question in /sys/devices/system/node/node<nr>/{meminfo,numastat,etc}
> since they are part of the ABI there and already violate the rules for
> sysfs? Or can we move them to /sys/kernel and continue to violate the
> rules? In this case, I don't see any way to provide a "snapshot" of the
> system's memory information without all the values being in one file?

Yeah, the "snapshot" issue is what allows those values all to be present
at once.

As for where to place them, are there any tools out there that are
expecting the current file locations?  If so, can they work if they are
in both places?

If you think they should be moved, I'll defer to your judgement, but it
will be a bit harder, as you will be working with "raw" kobjects in that
case, not the sysdev structures, which do make things a bit easier for
you.

sorry for the delay, am traveling...

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-20  2:24                           ` Greg KH
@ 2008-04-21 16:43                             ` Nishanth Aravamudan
  0 siblings, 0 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-21 16:43 UTC (permalink / raw)
  To: Greg KH
  Cc: Christoph Lameter, Nick Piggin, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 19.04.2008 [19:24:21 -0700], Greg KH wrote:
> On Fri, Apr 18, 2008 at 10:27:30AM -0700, Nishanth Aravamudan wrote:
> > On 17.04.2008 [23:04:04 -0700], Nishanth Aravamudan wrote:
> > > On 17.04.2008 [16:39:56 -0700], Christoph Lameter wrote:
> > > > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote:
> > > > 
> > > > > That seems fine to me. I will work on it. However, as I mentioned in a
> > > > > previous e-mail, the files in /sys/devices/system/node/node<nr>/
> > > > > already violate the "one value per file" rule in several instances. I'm
> > > > > guessing Greg won't want me moving the files and keeping that violation?
> > > > 
> > > > That violation is replicated in /proc/meminfo /proc/vmstat etc etc.
> > > 
> > > Right, but /proc doesn't have such a restriction (the "one value per
> > > file" rule). I'm not sure how the meminfo, etc. files in sysfs got put
> > > in past Greg, but that's how it is :)
> > 
> > Greg, can you give any insight here? Are we better off leaving the files
> > in question in /sys/devices/system/node/node<nr>/{meminfo,numastat,etc}
> > since they are part of the ABI there and already violate the rules for
> > sysfs? Or can we move them to /sys/kernel and continue to violate the
> > rules? In this case, I don't see any way to provide a "snapshot" of the
> > system's memory information without all the values being in one file?
> 
> Yeah, the "snapshot" issue is what allows those values all to be present
> at once.
> 
> As for where to place them, are there any tools out there that are
> expecting the current file locations?  If so, can they work if they are
> in both places?

I believe libnuma uses /sys/devices/system/node for some information. As
long as the files are in both places, nothing should be affected,
though. And we could deprecate the old files (if we decide to move them)
for the longer-term and update the necessary libraries.

> If you think they should be moved, I'll defer to your judgement, but
> it will be a bit harder, as you will be working with "raw" kobjects in
> that case, not the sysdev structures, which do make things a bit
> easier for you.

Yeah, I noticed that while fiddling around. Still possible, just not as
easy.

> sorry for the delay, am traveling...

No problem, thanks for the input!

-Nish

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-17 23:39                     ` Christoph Lameter
  2008-04-18  6:04                       ` Nishanth Aravamudan
@ 2008-04-20  2:21                       ` Greg KH
  2008-04-21  6:06                         ` Christoph Lameter
  1 sibling, 1 reply; 51+ messages in thread
From: Greg KH @ 2008-04-20  2:21 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nishanth Aravamudan, Nick Piggin, wli, agl, luick,
	Lee.Schermerhorn, linux-mm

On Thu, Apr 17, 2008 at 04:39:56PM -0700, Christoph Lameter wrote:
> On Thu, 17 Apr 2008, Nishanth Aravamudan wrote:
> 
> > That seems fine to me. I will work on it. However, as I mentioned in a
> > previous e-mail, the files in /sys/devices/system/node/node<nr>/
> > already violate the "one value per file" rule in several instances. I'm
> > guessing Greg won't want me moving the files and keeping that violation?
> 
> That violation is replicated in /proc/meminfo /proc/vmstat etc etc.

Those are /proc files, not sysfs files :)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-20  2:21                       ` Greg KH
@ 2008-04-21  6:06                         ` Christoph Lameter
  2008-04-21 16:41                           ` Nishanth Aravamudan
  0 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2008-04-21  6:06 UTC (permalink / raw)
  To: Greg KH
  Cc: Nishanth Aravamudan, Nick Piggin, wli, agl, luick,
	Lee.Schermerhorn, linux-mm

On Sat, 19 Apr 2008, Greg KH wrote:

> > That violation is replicated in /proc/meminfo /proc/vmstat etc etc.
> 
> Those are /proc files, not sysfs files :)

Hmmm.. Maybe we need to have /proc/node<x>/meminfo etc that replicates the 
/proc content for each node? Otherwise this cannot be symmetric because 
the different mount points have different requirements on what the output 
should look like.
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-21  6:06                         ` Christoph Lameter
@ 2008-04-21 16:41                           ` Nishanth Aravamudan
  0 siblings, 0 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-21 16:41 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm

On 20.04.2008 [23:06:48 -0700], Christoph Lameter wrote:
> On Sat, 19 Apr 2008, Greg KH wrote:
> 
> > > That violation is replicated in /proc/meminfo /proc/vmstat etc etc.
> > 
> > Those are /proc files, not sysfs files :)
> 
> Hmmm.. Maybe we need to have /proc/node<x>/meminfo etc that replicates
> the /proc content for each node? Otherwise this cannot be symmetric
> because the different mount points have different requirements on how
> the output should look like.

But the memory info has nothing to do with process specific information,
which is what "new" /proc files should contain (or maybe I'm
mis-remembering).

The current location (/sys/devices/system/node) reflects that memory is
tied to system devices called "nodes"; I'm not entirely convinced we'd
want to change that?  Especially, as Greg noted, it's easier to obtain
the information we want off a sysdev, rather than the raw kobject.

While I understand the desire to maintain sanity for sysfs files,
perhaps the meminfo files (and numastat, etc) are just special, in that
they only make sense as a collective (the snapshot mentioned earlier in
this thread) -- to get a view of the component (memory, NUMA statistics,
etc) as a whole.

In that sense, perhaps the sysfs notion should be extended to "One
logical value per file", where logical is defined as the minimum atomic
information needed by the user [1]? Or perhaps sysfs just isn't the best
place for this information, I don't know. I don't believe I am the
person to make that call.

Thanks,
Nish

[1] That would allow files like available_clocksource not to seem like
violators of the sysfs rule:

$ sudo cat /sys/devices/system/clocksource/clocksource0/available_clocksource
hpet acpi_pm pit jiffies tsc

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-17 23:22                 ` Christoph Lameter
  2008-04-17 23:36                   ` Nishanth Aravamudan
@ 2008-04-22  5:14                   ` Nick Piggin
  2008-04-22 16:56                     ` Nishanth Aravamudan
  1 sibling, 1 reply; 51+ messages in thread
From: Nick Piggin @ 2008-04-22  5:14 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nishanth Aravamudan, Greg KH, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Thu, Apr 17, 2008 at 04:22:17PM -0700, Christoph Lameter wrote:
> On Thu, 17 Apr 2008, Nishanth Aravamudan wrote:
> 
> > > Do you see a particular more-sysfs-way here, Greg?
> > 
> > So I've received no comments yet? Perhaps I should leave things the way
> > they are (per-node files in /sys/devices/system/node) and add
> > nr_hugepages to /sys/kernel?
> 
> The strange location of the node directories has always irked me.
> > 
> > Do we want to put it in a subdirectory of /sys/kernel? What should the
> > subdir be called? "hugetlb" (refers to the implementation?) or
> > "hugepages"?
> 
> How about:
> 
> /sys/kernel/node<nr>/<node specific setting/status files> ?

I don't like /sys/kernel/node :P

Under /sys/kernel, we should have parameters to set and query various
kernel functionality. Control of the kernel software implementation. I
think this is pretty well agreed (although there are maybe grey areas I
guess)

So anyway, underneath that directory, we should have more subdirectories
grouping subsystems or similar functionality. We aren't tuning node,
but hugepages subsystem.

/sys/kernel/huge{tlb|pages}/

Under that directory could be global settings as well as per node settings
or subdirectories and so on. The layout should be similar to /proc/sys/*
IMO. Actually it should be much neater since we have some hindsight, but
unfortunately it is looking like it is actually messier ;)

Let's really try to put some thought into new sysfs locations. Not just
will it work, but is it logical and will it work tomorrow...

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-22  5:14                   ` Nick Piggin
@ 2008-04-22 16:56                     ` Nishanth Aravamudan
  2008-04-23  1:03                       ` Nick Piggin
  0 siblings, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-22 16:56 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 22.04.2008 [07:14:47 +0200], Nick Piggin wrote:
> On Thu, Apr 17, 2008 at 04:22:17PM -0700, Christoph Lameter wrote:
> > On Thu, 17 Apr 2008, Nishanth Aravamudan wrote:
> > 
> > > > Do you see a particular more-sysfs-way here, Greg?
> > > 
> > > So I've received no comments yet? Perhaps I should leave things the way
> > > they are (per-node files in /sys/devices/system/node) and add
> > > nr_hugepages to /sys/kernel?
> > 
> > The strange location of the node directories has always irked me.
> > > 
> > > Do we want to put it in a subdirectory of /sys/kernel? What should the
> > > subdir be called? "hugetlb" (refers to the implementation?) or
> > > "hugepages"?
> > 
> > How about:
> > 
> > /sys/kernel/node<nr>/<node specific setting/status files> ?
> 
> I don't like /sys/kernel/node :P

Neither do I. My reasoning is that it duplicates information available
elsewhere -- what Christoph was suggesting, I think, was moving all of
the node files there. That seems like it might be outside the scope of
our discussion given the files we have now (but becomes intertwined once
we start talking about the intersection of hugetlb + NUMA in per-node
control).

> Under /sys/kernel, we should have parameters to set and query various
> kernel functionality. Control of the kernel software implementation. I
> think this is pretty well agreed (although there are maybe grey areas
> I guess)

I am fine with this claim.

> So anyway, underneath that directory, we should have more
> subdirectories grouping subsystems or similar functionality. We aren't
> tuning node, but hugepages subsystem.
> 
> /sys/kernel/huge{tlb|pages}/
> 
> Under that directory could be global settings as well as per node
> settings or subdirectories and so on. The layout should be similar to
> /proc/sys/* IMO. Actually it should be much neater since we have some
> hindsight, but unfortunately it is looking like it is actually messier
> ;)

Well, that's where I start to get a little stymied. It seems odd to me
to have some per-node information in one place and some in another,
where the two are not even rooted at the same location, beyond both
being in sysfs. Perhaps, as I've mentioned elsewhere, we simply have
symlinks underneath /sys/kernel/hugepages into
/sys/devices/system/node/nodeX ... but the immediate ugliness I see
there is either we duplicate the directories, or we symlink the
directories and there are now two paths into all the NUMA information,
where one (/sys/kernel/hugepages/nodeX) seems like it should only have
hugepage information.

I'd prefer hugepages to hugetlb, I think, but don't necessarily care one
way or the other.

> Let's really try to put some thought into new sysfs locations. Not
> just will it work, but is it logical and will it work tomorrow...

I agree and that's why I keep sending out e-mails about it :) Perhaps I
should prototype /sys/kernel/hugepages so we can see how it would look
as a first step, and then decide given that layout how we want the
per-node information to be presented?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-22 16:56                     ` Nishanth Aravamudan
@ 2008-04-23  1:03                       ` Nick Piggin
  2008-04-23 18:32                         ` Nishanth Aravamudan
  0 siblings, 1 reply; 51+ messages in thread
From: Nick Piggin @ 2008-04-23  1:03 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Tue, Apr 22, 2008 at 09:56:02AM -0700, Nishanth Aravamudan wrote:
> On 22.04.2008 [07:14:47 +0200], Nick Piggin wrote:
> 
> > So anyway, underneath that directory, we should have more
> > subdirectories grouping subsystems or similar functionality. We aren't
> > tuning node, but hugepages subsystem.
> > 
> > /sys/kernel/huge{tlb|pages}/
> > 
> > Under that directory could be global settings as well as per node
> > settings or subdirectories and so on. The layout should be similar to
> > /proc/sys/* IMO. Actually it should be much neater since we have some
> > hindsight, but unfortunately it is looking like it is actually messier
> > ;)
> 
> Well, that's where I start to get a little stymied. It seems odd to me
> to have some per-node information in one place and some in another,
> where the two are not even rooted at the same location, beyond both
> being in sysfs.

Why are nodes special? Why wouldn't you also group per-CPU information in
one place, for example?

Anyway, I'd argue that you wouldn't group either of those things primarily.
You would group by functionality first.

If you wanted to tweak or view your hugepages parameters, where do you
start? /sys/kernel/node is unintuitive; /sys/kernel/hugepages is easy.


> Perhaps, as I've mentioned elsewhere, we simply have
> symlinks underneath /sys/kernel/hugepages into
> /sys/devices/system/node/nodeX ... but the immediate ugliness I see
> there is either we duplicate the directories, or we symlink the

I don't like the idea of putting kernel implementation parameters in
/sys/devices/ (grey area for device drivers, perhaps).


> directories and there are now two paths into all the NUMA information,
> where one (/sys/kernel/hugepages/nodeX) seems like it should only have
> hugepage information.

But the idea of getting "all NUMA information" from one place just seems
wrong to me. Getting all *hardware* NUMA information from one place is
fine. But kernel implementation wise I think you are really interested in
subsystems *first*.

Just to demonstrate how badly "all NUMA information in one place"
generalises: you also then need a completely different place to store
global information for that subsystem, and a different place again to
store per-CPU information.

 
> I'd prefer hugepages to hugetlb, I think, but don't necessarily care one
> way or the other.

I'm fine with that. 


> > Let's really try to put some thought into new sysfs locations. Not
> > just will it work, but is it logical and will it work tomorrow...
> 
> I agree and that's why I keep sending out e-mails about it :) Perhaps I
> should prototype /sys/kernel/hugepages so we can see how it would look
> as a first step, and then decide given that layout how we want the
> per-node information to be presented?

Sure.

Thanks,
Nick

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-23  1:03                       ` Nick Piggin
@ 2008-04-23 18:32                         ` Nishanth Aravamudan
  2008-04-23 19:07                           ` Adam Litke
  2008-04-24  7:13                           ` Nick Piggin
  0 siblings, 2 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-23 18:32 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 23.04.2008 [03:03:00 +0200], Nick Piggin wrote:
> On Tue, Apr 22, 2008 at 09:56:02AM -0700, Nishanth Aravamudan wrote:
> > On 22.04.2008 [07:14:47 +0200], Nick Piggin wrote:
> > 
> > > So anyway, underneath that directory, we should have more
> > > subdirectories grouping subsystems or similar functionality. We aren't
> > > tuning node, but hugepages subsystem.
> > > 
> > > /sys/kernel/huge{tlb|pages}/
> > > 
> > > Under that directory could be global settings as well as per node
> > > settings or subdirectories and so on. The layout should be similar to
> > > /proc/sys/* IMO. Actually it should be much neater since we have some
> > > hindsight, but unfortunately it is looking like it is actually messier
> > > ;)
> > 
> > Well, that's where I start to get a little stymied. It seems odd to me
> > to have some per-node information in one place and some in another,
> > where the two are not even rooted at the same location, beyond both
> > being in sysfs.
> 
> Why are nodes special? Why wouldn't you also group per-CPU information
> in one place, for example?
> 
> Anyway, I'd argue that you wouldn't group either of those things
> primarily.  You would group by functionality first.
> 
> If you wanted to tweak or view your hugepages parameters, where do you
> start? /sys/kernel/node is unintuitive; /sys/kernel/hugepages is easy.

Let's be clear, here. I do *not* agree with Christoph's /sys/kernel/node
proposal. I was referring simply to how things were laid out now, and
that we'd have per-node control of hugepages in /sys/kernel/hugepages
and per-node memory information in /sys/devices/system/node.

I have been convinced that /sys/kernel/hugepages to control all hugepage
functionality is reasonable. My primary concern is making sure the code
is clean to move the per-node patches to that location; however, I am
going to focus on moving nr_{,overcommit}_hugepages to sysfs first.

> > Perhaps, as I've mentioned elsewhere, we simply have symlinks
> > underneath /sys/kernel/hugepages into /sys/devices/system/node/nodeX
> > ... but the immediate ugliness I see there is either we duplicate
> > the directories, or we symlink the
> 
> I don't like the idea of putting kernel implementation parameters in
> /sys/devices/ (grey area for device drivers, perhaps).

Ack.

> > directories and there are now two paths into all the NUMA information,
> > where one (/sys/kernel/hugepages/nodeX) seems like it should only have
> > hugepage information.
> 
> But the idea of getting "all NUMA information" from one place just
> seems wrong to me. Getting all *hardware* NUMA information from one
> place is fine. But kernel implementation wise I think you are really
> interested in subsystems *first*.

Ok.

> Just to demonstrate how badly "all NUMA information in one place"
> generalises: you also then need a completely different place to store
> global information for that subsystem, and a different place again to
> store per-CPU information.
> 
> 
> > I'd prefer hugepages to hugetlb, I think, but don't necessarily care
> > one way or the other.
> 
> I'm fine with that. 

Ok, thanks.

> > > Let's really try to put some thought into new sysfs locations. Not
> > > just will it work, but is it logical and will it work tomorrow...
> > 
> > I agree and that's why I keep sending out e-mails about it :) Perhaps I
> > should prototype /sys/kernel/hugepages so we can see how it would look
> > as a first step, and then decide given that layout how we want the
> > per-node information to be presented?
> 
> Sure.

So, I think, we pretty much agree on how things should be:

Direct translation of the current sysctl:

/sys/kernel/hugepages/nr_hugepages
                      nr_overcommit_hugepages

Adding multiple pools:

/sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size}
                      nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
                      nr_hugepages_${default_size}
                      nr_overcommit_hugepages_${default_size}
                      nr_hugepages_${other_size1}
                      nr_overcommit_hugepages_${other_size2}

Adding per-node control:

/sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size}
                      nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
                      nr_hugepages_${default_size}
                      nr_overcommit_hugepages_${default_size}
                      nr_hugepages_${other_size1}
                      nr_overcommit_hugepages_${other_size2}
                      nodeX/nr_hugepages -> nr_hugepages_${default_size}
                            nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
                            nr_hugepages_${default_size}
                            nr_overcommit_hugepages_${default_size}
                            nr_hugepages_${other_size1}
                            nr_overcommit_hugepages_${other_size2}

How does that look? Does anyone have any problems with such an
arrangement?

Thanks,
Nish

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-23 18:32                         ` Nishanth Aravamudan
@ 2008-04-23 19:07                           ` Adam Litke
  2008-04-24  7:13                           ` Nick Piggin
  1 sibling, 0 replies; 51+ messages in thread
From: Adam Litke @ 2008-04-23 19:07 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Nick Piggin, Christoph Lameter, Greg KH, wli, luick,
	Lee.Schermerhorn, linux-mm

On Wed, 2008-04-23 at 11:32 -0700, Nishanth Aravamudan wrote:
> So, I think, we pretty much agree on how things should be:
> 
> Direct translation of the current sysctl:
> 
> /sys/kernel/hugepages/nr_hugepages
>                       nr_overcommit_hugepages
> 
> Adding multiple pools:
> 
> /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size}
>                       nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
>                       nr_hugepages_${default_size}
>                       nr_overcommit_hugepages_${default_size}
>                       nr_hugepages_${other_size1}
>                       nr_overcommit_hugepages_${other_size2}
> 
> Adding per-node control:
> 
> /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size}
>                       nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
>                       nr_hugepages_${default_size}
>                       nr_overcommit_hugepages_${default_size}
>                       nr_hugepages_${other_size1}
>                       nr_overcommit_hugepages_${other_size2}
>                       nodeX/nr_hugepages -> nr_hugepages_${default_size}
>                             nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
>                             nr_hugepages_${default_size}
>                             nr_overcommit_hugepages_${default_size}
>                             nr_hugepages_${other_size1}
>                             nr_overcommit_hugepages_${other_size2}
> 
> How does that look? Does anyone have any problems with such an
> arrangement?

This seems sensible to me.  

-- 
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-23 18:32                         ` Nishanth Aravamudan
  2008-04-23 19:07                           ` Adam Litke
@ 2008-04-24  7:13                           ` Nick Piggin
  2008-04-24 15:54                             ` Nishanth Aravamudan
  2008-04-27  3:49                             ` [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] Nishanth Aravamudan
  1 sibling, 2 replies; 51+ messages in thread
From: Nick Piggin @ 2008-04-24  7:13 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Wed, Apr 23, 2008 at 11:32:52AM -0700, Nishanth Aravamudan wrote:
> 
> So, I think, we pretty much agree on how things should be:
> 
> Direct translation of the current sysctl:
> 
> /sys/kernel/hugepages/nr_hugepages
>                       nr_overcommit_hugepages
> 
> Adding multiple pools:
> 
> /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size}
>                       nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
>                       nr_hugepages_${default_size}
>                       nr_overcommit_hugepages_${default_size}
>                       nr_hugepages_${other_size1}
>                       nr_overcommit_hugepages_${other_size2}
> 
> Adding per-node control:
> 
> /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size}
>                       nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
>                       nr_hugepages_${default_size}
>                       nr_overcommit_hugepages_${default_size}
>                       nr_hugepages_${other_size1}
>                       nr_overcommit_hugepages_${other_size2}
>                       nodeX/nr_hugepages -> nr_hugepages_${default_size}
>                             nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
>                             nr_hugepages_${default_size}
>                             nr_overcommit_hugepages_${default_size}
>                             nr_hugepages_${other_size1}
>                             nr_overcommit_hugepages_${other_size2}
> 
> How does that look? Does anyone have any problems with such an
> arrangement?

Looks pretty good. I would personally lean toward subdirectories for
hstates. Pros are that it would be a little easier to navigate from
the shell, and maybe more regular to program for.

You could possibly have hugepages_default symlink as well to one of
the directories of your choice. This could be used by apps which do
not specify exactly what size they want...

I don't know, just ideas.

 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI
  2008-04-24  7:13                           ` Nick Piggin
@ 2008-04-24 15:54                             ` Nishanth Aravamudan
  2008-04-27  3:49                             ` [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] Nishanth Aravamudan
  1 sibling, 0 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-24 15:54 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 24.04.2008 [09:13:52 +0200], Nick Piggin wrote:
> On Wed, Apr 23, 2008 at 11:32:52AM -0700, Nishanth Aravamudan wrote:
> > 
> > So, I think, we pretty much agree on how things should be:
> > 
> > Direct translation of the current sysctl:
> > 
> > /sys/kernel/hugepages/nr_hugepages
> >                       nr_overcommit_hugepages
> > 
> > Adding multiple pools:
> > 
> > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size}
> >                       nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
> >                       nr_hugepages_${default_size}
> >                       nr_overcommit_hugepages_${default_size}
> >                       nr_hugepages_${other_size1}
> >                       nr_overcommit_hugepages_${other_size2}
> > 
> > Adding per-node control:
> > 
> > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size}
> >                       nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
> >                       nr_hugepages_${default_size}
> >                       nr_overcommit_hugepages_${default_size}
> >                       nr_hugepages_${other_size1}
> >                       nr_overcommit_hugepages_${other_size2}
> >                       nodeX/nr_hugepages -> nr_hugepages_${default_size}
> >                             nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
> >                             nr_hugepages_${default_size}
> >                             nr_overcommit_hugepages_${default_size}
> >                             nr_hugepages_${other_size1}
> >                             nr_overcommit_hugepages_${other_size2}
> > 
> > How does that look? Does anyone have any problems with such an
> > arrangement?
> 
> Looks pretty good. I would personally lean toward subdirectories for
> hstates. Pros are that it would be a little easier to navigate from
> the shell, and maybe more regular to program for.

That's probably true -- and extracting the pagesize to which things
correspond should be simpler too. And perhaps that would allow for
easier output of meminfo for the various hugepage sizes?

> You could possibly have hugepages_default symlink as well to one of
> the directories of your choice. This could be used by apps which do
> not specify exactly what size they want...

Yep, that would seem sensible...or perhaps even have nr_hugepages in the
root directory (and other corresponding files directly symlink into the
default_size directory?).

> I don't know, just ideas.

Thanks for the feedback!

-Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-24  7:13                           ` Nick Piggin
  2008-04-24 15:54                             ` Nishanth Aravamudan
@ 2008-04-27  3:49                             ` Nishanth Aravamudan
  2008-04-27  5:10                               ` Greg KH
  1 sibling, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-27  3:49 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Greg KH, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 24.04.2008 [09:13:52 +0200], Nick Piggin wrote:
> On Wed, Apr 23, 2008 at 11:32:52AM -0700, Nishanth Aravamudan wrote:
> > 
> > So, I think, we pretty much agree on how things should be:
> > 
> > Direct translation of the current sysctl:
> > 
> > /sys/kernel/hugepages/nr_hugepages
> >                       nr_overcommit_hugepages
> > 
> > Adding multiple pools:
> > 
> > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size}
> >                       nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
> >                       nr_hugepages_${default_size}
> >                       nr_overcommit_hugepages_${default_size}
> >                       nr_hugepages_${other_size1}
> >                       nr_overcommit_hugepages_${other_size2}
> > 
> > Adding per-node control:
> > 
> > /sys/kernel/hugepages/nr_hugepages -> nr_hugepages_${default_size}
> >                       nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
> >                       nr_hugepages_${default_size}
> >                       nr_overcommit_hugepages_${default_size}
> >                       nr_hugepages_${other_size1}
> >                       nr_overcommit_hugepages_${other_size2}
> >                       nodeX/nr_hugepages -> nr_hugepages_${default_size}
> >                             nr_overcommit_hugepages -> nr_overcommit_hugepages_${default_size}
> >                             nr_hugepages_${default_size}
> >                             nr_overcommit_hugepages_${default_size}
> >                             nr_hugepages_${other_size1}
> >                             nr_overcommit_hugepages_${other_size2}
> > 
> > How does that look? Does anyone have any problems with such an
> > arrangement?
> 
> Looks pretty good. I would personally lean toward subdirectories for
> hstates. Pros are that it would be a little easier to navigate from
> the shell, and maybe more regular to program for.
> 
> You could possibly have hugepages_default symlink as well to one of
> the directories of your choice. This could be used by apps which do
> not specify exactly what size they want...
> 
> I don't know, just ideas.

So, here's the first cut of the patch. Still very rough, but it builds
and I'm running it now:

[20:41:34]nacc@arkanoid:/sys/kernel/hugepages$ tree
.
`-- hugepages-2MB
    |-- meminfo
    |-- nr_huge_pages
    `-- nr_overcommit_huge_pages

1 directory, 3 files

[20:41:56]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo
HugePages_Total:     0
HugePages_Free:      0
HugePages_Rsvd:      0
HugePages_Surp:      0
Hugepagesize:     2048 kB

[20:42:20]nacc@arkanoid:/sys/kernel/hugepages$ sudo echo 10 > /sys/kernel/hugepages/hugepages-2MB/nr_huge_pages 
[20:42:57]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/nr_huge_pages 
10
[20:43:02]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo 
HugePages_Total:    10
HugePages_Free:     10
HugePages_Rsvd:      0
HugePages_Surp:      0
Hugepagesize:     2048 kB
[20:43:05]nacc@arkanoid:/sys/kernel/hugepages$ cat /proc/m
[20:43:10]nacc@arkanoid:/sys/kernel/hugepages$ grep Huge /proc/meminfo 
HugePages_Total:    10
HugePages_Free:     10
HugePages_Rsvd:      0
HugePages_Surp:      0
Hugepagesize:     2048 kB

I haven't tested yet with multiple pools, will hopefully get to that Monday. I
see one obvious issue, in that I left an underscore in huge_pages :) Will fix.
How does the naming seem? I don't like having two memfmt()s but I couldn't
think of a good way, beyond perhaps having two strings, one for the magnitude
and one for the units, but that seemed gross.

A lot of the functions and macros, perhaps all of them, are clones of the ones
used for /sys/kernel/slab. Thanks to those authors for that code!

Greg, do you see any obvious violations of sysfs rules here? Well, beyond
meminfo itself, I guess, but given our previous snapshot discussion, I left it
simple and the same, rather than split it up.

Not-yet-Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

 include/linux/hugetlb.h |    9 +-
 mm/hugetlb.c            |  317 ++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 251 insertions(+), 75 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7aa22e7..cac63bd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -3,6 +3,9 @@
 
 #include <linux/fs.h>
 #include <linux/shm.h>
+#include <linux/mempolicy.h>
+#include <asm/tlbflush.h>
+#include <asm/hugetlb.h>
 
 #ifdef CONFIG_HUGETLBFS
 struct hugetlbfs_config {
@@ -69,10 +72,6 @@ static inline void set_file_hugepages(struct file *file)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-#include <linux/mempolicy.h>
-#include <asm/tlbflush.h>
-#include <asm/hugetlb.h>
-
 struct ctl_table;
 
 static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
@@ -131,6 +130,8 @@ struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+	const char *name;
+	struct kobject kobj;
 };
 
 void __init huge_add_hstate(unsigned order);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index de03a14..c30e45d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -15,6 +15,7 @@
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
+#include <linux/sysfs.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -604,9 +605,21 @@ static void __init gather_bootmem_prealloc(void)
 	}
 }
 
+static __init char *memfmt_nospaces(char *buf, unsigned long n)
+{
+	if (n >= (1UL << 30))
+		sprintf(buf, "%luGB", n >> 30);
+	else if (n >= (1UL << 20))
+		sprintf(buf, "%luMB", n >> 20);
+	else
+		sprintf(buf, "%luKB", n >> 10);
+	return buf;
+}
+
 static void __init hugetlb_init_hstate(struct hstate *h)
 {
 	unsigned long i;
+	char buf[32];
 
 	/* Don't reinitialize lists if they have been already init'ed */
 	if (!h->hugepage_freelists[0].next) {
@@ -624,6 +637,8 @@ static void __init hugetlb_init_hstate(struct hstate *h)
 			break;
 	}
 	h->max_huge_pages = i;
+	h->name = kasprintf(GFP_KERNEL, "hugepages-%s",
+				memfmt_nospaces(buf, huge_page_size(h)));
 }
 
 static void __init hugetlb_init_hstates(void)
@@ -662,77 +677,6 @@ static void __init report_hugepages(void)
         }
 }
 
-static int __init hugetlb_init(void)
-{
-	BUILD_BUG_ON(HPAGE_SHIFT == 0);
-
-	if (!size_to_hstate(HPAGE_SIZE)) {
-		huge_add_hstate(HUGETLB_PAGE_ORDER);
-		parsed_hstate->max_huge_pages = default_hstate_resv;
-	}
-
-	hugetlb_init_hstates();
-
-	gather_bootmem_prealloc();
-
-	report_hugepages();
-
-	return 0;
-}
-module_init(hugetlb_init);
-
-/* Should be called on processing a hugepagesz=... option */
-void __init huge_add_hstate(unsigned order)
-{
-	struct hstate *h;
-	if (size_to_hstate(PAGE_SIZE << order)) {
-		printk("hugepagesz= specified twice, ignoring\n");
-		return;
-	}
-	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
-	BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT);
-	h = &hstates[max_hstate++];
-	h->order = order;
-	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
-	hugetlb_init_hstate(h);
-	parsed_hstate = h;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	unsigned long *mhp;
-
-	if (!max_hstate)
-		mhp = &default_hstate_resv;
-	else
-		mhp = &parsed_hstate->max_huge_pages;
-
-	if (sscanf(s, "%lu", mhp) <= 0)
-		*mhp = 0;
-
-	/*
-	 * Global state is always initialized later in hugetlb_init.
-	 * But we need to allocate >= MAX_ORDER hstates here early to still
-	 * use the bootmem allocator.
-	 */
-	if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER)
-		hugetlb_init_hstate(parsed_hstate);
-
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static unsigned int cpuset_mems_nr(unsigned int *array)
-{
-	int node;
-	unsigned int nr = 0;
-
-	for_each_node_mask(node, cpuset_current_mems_allowed)
-		nr += array[node];
-
-	return nr;
-}
-
 #ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(struct hstate *h, unsigned long count)
@@ -843,6 +787,237 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_SYSFS
+#define to_hstate_attr(n) container_of(n, struct hstate_attribute, attr)
+#define to_hstate(n) container_of(n, struct hstate, kobj)
+
+struct hstate_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct hstate *h, char *buf);
+	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
+};
+
+#define HSTATE_ATTR_RO(_name) \
+	static struct hstate_attribute _name##_attr = __ATTR_RO(_name)
+
+#define HSTATE_ATTR(_name) \
+	static struct hstate_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t nr_huge_pages_show(struct hstate *h, char *buf)
+{
+	return sprintf(buf, "%lu\n", h->nr_huge_pages);
+}
+static ssize_t nr_huge_pages_store(struct hstate *h, const char *buf, size_t count)
+{
+	int tmp;
+
+	h->max_huge_pages = set_max_huge_pages(h,
+					simple_strtoul(buf, NULL, 10), &tmp);
+	max_huge_pages[h - hstates] = h->max_huge_pages;
+	return count;
+}
+HSTATE_ATTR(nr_huge_pages);
+
+static ssize_t nr_overcommit_huge_pages_show(struct hstate *h, char *buf)
+{
+	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
+}
+static ssize_t nr_overcommit_huge_pages_store(struct hstate *h, const char *buf, size_t count)
+{
+	spin_lock(&hugetlb_lock);
+	h->nr_overcommit_huge_pages = simple_strtoul(buf, NULL, 10);
+	sysctl_overcommit_huge_pages[h - hstates] = h->nr_overcommit_huge_pages;
+	spin_unlock(&hugetlb_lock);
+	return count;
+}
+HSTATE_ATTR(nr_overcommit_huge_pages);
+
+static ssize_t meminfo_show(struct hstate *h, char *buf)
+{
+	return sprintf(buf,
+			"HugePages_Total: %5lu\n"
+			"HugePages_Free:  %5lu\n"
+			"HugePages_Rsvd:  %5lu\n"
+			"HugePages_Surp:  %5lu\n"
+			"Hugepagesize:    %5lu kB\n",
+			h->nr_huge_pages,
+			h->free_huge_pages,
+			h->resv_huge_pages,
+			h->surplus_huge_pages,
+			huge_page_size(h) / 1024);
+}
+HSTATE_ATTR_RO(meminfo);
+
+static struct kset *hstate_kset;
+
+static struct attribute *hstate_attrs[] = {
+	&meminfo_attr.attr,
+	&nr_huge_pages_attr.attr,
+	&nr_overcommit_huge_pages_attr.attr,
+};
+
+static struct attribute_group hstate_attr_group = {
+	.attrs = hstate_attrs,
+};
+
+static ssize_t hstate_attr_show(struct kobject *kobj,
+					struct attribute *attr,
+					char *buf)
+{
+	struct hstate_attribute *attribute;
+	struct hstate *h;
+	int err;
+
+	attribute = to_hstate_attr(attr);
+	h = to_hstate(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	err = attribute->show(h, buf);
+
+	return err;
+}
+
+static ssize_t hstate_attr_store(struct kobject *kobj,
+					struct attribute *attr,
+					const char *buf, size_t len)
+{
+	struct hstate_attribute *attribute;
+	struct hstate *h;
+	int err;
+
+	attribute = to_hstate_attr(attr);
+	h = to_hstate(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	err = attribute->store(h, buf, len);
+
+	return err;
+}
+
+static struct sysfs_ops hstate_sysfs_ops = {
+	.show = hstate_attr_show,
+	.store = hstate_attr_store,
+};
+
+static struct kobj_type hstate_ktype = {
+	.sysfs_ops = &hstate_sysfs_ops,
+};
+
+static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
+{
+	int err;
+	h->kobj.kset = hstate_kset;
+	err = kobject_init_and_add(&h->kobj, &hstate_ktype, NULL, h->name);
+	if (err) {
+		kobject_put(&h->kobj);
+		return err;
+	}
+	err = sysfs_create_group(&h->kobj, &hstate_attr_group);
+	if (err)
+		return err;
+	return 0;
+}
+
+static void __init hugetlb_sysfs_init(void)
+{
+	struct hstate *h;
+	int err;
+
+	hstate_kset = kset_create_and_add("hugepages", NULL, kernel_kobj);
+	if (!hstate_kset)
+		return;
+
+	for_each_hstate(h) {
+		err = hugetlb_sysfs_add_hstate(h);
+		if (err)
+			printk(KERN_ERR "Hugetlb: Unable to add hstate %s", h->name);
+	}
+}
+#else
+static void __init hugetlb_sysfs_init(void)
+{
+}
+#endif
+
+static int __init hugetlb_init(void)
+{
+	BUILD_BUG_ON(HPAGE_SHIFT == 0);
+
+	if (!size_to_hstate(HPAGE_SIZE)) {
+		huge_add_hstate(HUGETLB_PAGE_ORDER);
+		parsed_hstate->max_huge_pages = default_hstate_resv;
+	}
+
+	hugetlb_init_hstates();
+
+	gather_bootmem_prealloc();
+
+	report_hugepages();
+
+	hugetlb_sysfs_init();
+
+	return 0;
+}
+module_init(hugetlb_init);
+
+/* Should be called on processing a hugepagesz=... option */
+void __init huge_add_hstate(unsigned order)
+{
+	struct hstate *h;
+	if (size_to_hstate(PAGE_SIZE << order)) {
+		printk("hugepagesz= specified twice, ignoring\n");
+		return;
+	}
+	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+	BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT);
+	h = &hstates[max_hstate++];
+	h->order = order;
+	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+	hugetlb_init_hstate(h);
+	parsed_hstate = h;
+}
+
+static int __init hugetlb_setup(char *s)
+{
+	unsigned long *mhp;
+
+	if (!max_hstate)
+		mhp = &default_hstate_resv;
+	else
+		mhp = &parsed_hstate->max_huge_pages;
+
+	if (sscanf(s, "%lu", mhp) <= 0)
+		*mhp = 0;
+
+	/*
+	 * Global state is always initialized later in hugetlb_init.
+	 * But we need to allocate >= MAX_ORDER hstates here early to still
+	 * use the bootmem allocator.
+	 */
+	if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER)
+		hugetlb_init_hstate(parsed_hstate);
+
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+	int node;
+	unsigned int nr = 0;
+
+	for_each_node_mask(node, cpuset_current_mems_allowed)
+		nr += array[node];
+
+	return nr;
+}
+
+
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 			   struct file *file, void __user *buffer,
 			   size_t *length, loff_t *ppos)

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-27  3:49                             ` [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] Nishanth Aravamudan
@ 2008-04-27  5:10                               ` Greg KH
  2008-04-28 17:22                                 ` Nishanth Aravamudan
  2008-04-28 20:31                                 ` Christoph Lameter
  0 siblings, 2 replies; 51+ messages in thread
From: Greg KH @ 2008-04-27  5:10 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Sat, Apr 26, 2008 at 08:49:42PM -0700, Nishanth Aravamudan wrote:
> 
> [20:41:56]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo
> HugePages_Total:     0
> HugePages_Free:      0
> HugePages_Rsvd:      0
> HugePages_Surp:      0
> Hugepagesize:     2048 kB
> 
> Greg, do you see any obvious violations of sysfs rules here? Well, beyond
> meminfo itself, I guess, but given our previous snapshot discussion, I left it
> simple and the same, rather than split it up.

Yeah, I don't like that file.  Why not just have 5 files, one for each
value?  There isn't such a need for an immediate snapshot where you
can't just read all 5 values from 5 files?

Also, why use a "units" here, just always use the lowest unit, and
userspace can convert from kB to GB if needed.

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-27  5:10                               ` Greg KH
@ 2008-04-28 17:22                                 ` Nishanth Aravamudan
  2008-04-28 17:29                                   ` Greg KH
  2008-04-28 20:31                                 ` Christoph Lameter
  1 sibling, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-28 17:22 UTC (permalink / raw)
  To: Greg KH
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 26.04.2008 [22:10:29 -0700], Greg KH wrote:
> On Sat, Apr 26, 2008 at 08:49:42PM -0700, Nishanth Aravamudan wrote:
> > 
> > [20:41:56]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo
> > HugePages_Total:     0
> > HugePages_Free:      0
> > HugePages_Rsvd:      0
> > HugePages_Surp:      0
> > Hugepagesize:     2048 kB
> > 
> > Greg, do you see any obvious violations of sysfs rules here? Well, beyond
> > meminfo itself, I guess, but given our previous snapshot discussion, I left it
> > simple and the same, rather than split it up.
> 
> Yeah, I don't like that file.  Why not just have 5 files, one for each
> > value?  There isn't such a need for an immediate snapshot where you
> can't just read all 5 values from 5 files?

Actually, we already have Total in nr_hugepages, so I only needed to add
3 files. The size is implicit in the directory name?

> Also, why use a "units" here, just always use the lowest unit, and
> userspace can convert from kB to GB if needed.

Agreed, so I changed the name of the directory from

hugepages-2M

to

hugepages-2048

for instance. Userspace utilities can pretty-ize it :)

Thanks,
Nish

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-28 17:22                                 ` Nishanth Aravamudan
@ 2008-04-28 17:29                                   ` Greg KH
  2008-04-29 17:11                                     ` Nishanth Aravamudan
  0 siblings, 1 reply; 51+ messages in thread
From: Greg KH @ 2008-04-28 17:29 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Mon, Apr 28, 2008 at 10:22:39AM -0700, Nishanth Aravamudan wrote:
> On 26.04.2008 [22:10:29 -0700], Greg KH wrote:
> > On Sat, Apr 26, 2008 at 08:49:42PM -0700, Nishanth Aravamudan wrote:
> > > 
> > > [20:41:56]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo
> > > HugePages_Total:     0
> > > HugePages_Free:      0
> > > HugePages_Rsvd:      0
> > > HugePages_Surp:      0
> > > Hugepagesize:     2048 kB
> > > 
> > > Greg, do you see any obvious violations of sysfs rules here? Well, beyond
> > > meminfo itself, I guess, but given our previous snapshot discussion, I left it
> > > simple and the same, rather than split it up.
> > 
> > Yeah, I don't like that file.  Why not just have 5 files, one for each
> > > value?  There isn't such a need for an immediate snapshot where you
> > can't just read all 5 values from 5 files?
> 
> Actually, we already have Total in nr_hugepages, so I only needed to add
> 3 files. The size is implicit in the directory name?

Ah, good point.

> > Also, why use a "units" here, just always use the lowest unit, and
> > userspace can convert from kB to GB if needed.
> 
> Agreed, so I changed the name of the directory from
> 
> hugepages-2M
> 
> to
> 
> > hugepages-2048
> 
> for instance. Userspace utilities can pretty-ize it :)

Exactly, that would be much better.

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-28 17:29                                   ` Greg KH
@ 2008-04-29 17:11                                     ` Nishanth Aravamudan
  2008-04-29 17:22                                       ` Greg KH
  0 siblings, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-29 17:11 UTC (permalink / raw)
  To: Greg KH
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 28.04.2008 [10:29:51 -0700], Greg KH wrote:
> On Mon, Apr 28, 2008 at 10:22:39AM -0700, Nishanth Aravamudan wrote:
> > On 26.04.2008 [22:10:29 -0700], Greg KH wrote:
> > > On Sat, Apr 26, 2008 at 08:49:42PM -0700, Nishanth Aravamudan wrote:
> > > > 
> > > > [20:41:56]nacc@arkanoid:/sys/kernel/hugepages$ cat /sys/kernel/hugepages/hugepages-2MB/meminfo
> > > > HugePages_Total:     0
> > > > HugePages_Free:      0
> > > > HugePages_Rsvd:      0
> > > > HugePages_Surp:      0
> > > > Hugepagesize:     2048 kB
> > > > 
> > > > Greg, do you see any obvious violations of sysfs rules here? Well, beyond
> > > > meminfo itself, I guess, but given our previous snapshot discussion, I left it
> > > > simple and the same, rather than split it up.
> > > 
> > > Yeah, I don't like that file.  Why not just have 5 files, one for each
> > > > value?  There isn't such a need for an immediate snapshot where you
> > > can't just read all 5 values from 5 files?
> > 
> > Actually, we already have Total in nr_hugepages, so I only needed to add
> > 3 files. The size is implicit in the directory name?
> 
> Ah, good point.
> 
> > > Also, why use a "units" here, just always use the lowest unit, and
> > > userspace can convert from kB to GB if needed.
> > 
> > Agreed, so I changed the name of the directory from
> > 
> > hugepages-2M
> > 
> > to
> > 
> > > hugepages-2048
> > 
> > for instance. Userspace utilities can pretty-ize it :)
> 
> Exactly, that would be much better.

FWIW, here's the updated patch. Still needs more testing.

 include/linux/hugetlb.h |    9 +-
 mm/hugetlb.c            |  322 ++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 256 insertions(+), 75 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7aa22e7..cac63bd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -3,6 +3,9 @@
 
 #include <linux/fs.h>
 #include <linux/shm.h>
+#include <linux/mempolicy.h>
+#include <asm/tlbflush.h>
+#include <asm/hugetlb.h>
 
 #ifdef CONFIG_HUGETLBFS
 struct hugetlbfs_config {
@@ -69,10 +72,6 @@ static inline void set_file_hugepages(struct file *file)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-#include <linux/mempolicy.h>
-#include <asm/tlbflush.h>
-#include <asm/hugetlb.h>
-
 struct ctl_table;
 
 static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
@@ -131,6 +130,8 @@ struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+	const char *name;
+	struct kobject kobj;
 };
 
 void __init huge_add_hstate(unsigned order);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index de03a14..1d94a85 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -15,6 +15,7 @@
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
+#include <linux/sysfs.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -624,6 +625,8 @@ static void __init hugetlb_init_hstate(struct hstate *h)
 			break;
 	}
 	h->max_huge_pages = i;
+	h->name = kasprintf(GFP_KERNEL, "hugepages-%lu",
+						huge_page_size(h) / 1024);
 }
 
 static void __init hugetlb_init_hstates(void)
@@ -662,77 +665,6 @@ static void __init report_hugepages(void)
         }
 }
 
-static int __init hugetlb_init(void)
-{
-	BUILD_BUG_ON(HPAGE_SHIFT == 0);
-
-	if (!size_to_hstate(HPAGE_SIZE)) {
-		huge_add_hstate(HUGETLB_PAGE_ORDER);
-		parsed_hstate->max_huge_pages = default_hstate_resv;
-	}
-
-	hugetlb_init_hstates();
-
-	gather_bootmem_prealloc();
-
-	report_hugepages();
-
-	return 0;
-}
-module_init(hugetlb_init);
-
-/* Should be called on processing a hugepagesz=... option */
-void __init huge_add_hstate(unsigned order)
-{
-	struct hstate *h;
-	if (size_to_hstate(PAGE_SIZE << order)) {
-		printk("hugepagesz= specified twice, ignoring\n");
-		return;
-	}
-	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
-	BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT);
-	h = &hstates[max_hstate++];
-	h->order = order;
-	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
-	hugetlb_init_hstate(h);
-	parsed_hstate = h;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	unsigned long *mhp;
-
-	if (!max_hstate)
-		mhp = &default_hstate_resv;
-	else
-		mhp = &parsed_hstate->max_huge_pages;
-
-	if (sscanf(s, "%lu", mhp) <= 0)
-		*mhp = 0;
-
-	/*
-	 * Global state is always initialized later in hugetlb_init.
-	 * But we need to allocate >= MAX_ORDER hstates here early to still
-	 * use the bootmem allocator.
-	 */
-	if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER)
-		hugetlb_init_hstate(parsed_hstate);
-
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static unsigned int cpuset_mems_nr(unsigned int *array)
-{
-	int node;
-	unsigned int nr = 0;
-
-	for_each_node_mask(node, cpuset_current_mems_allowed)
-		nr += array[node];
-
-	return nr;
-}
-
 #ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(struct hstate *h, unsigned long count)
@@ -843,6 +775,254 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_SYSFS
+#define to_hstate_attr(n) container_of(n, struct hstate_attribute, attr)
+#define to_hstate(n) container_of(n, struct hstate, kobj)
+
+struct hstate_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct hstate *h, char *buf);
+	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
+};
+
+#define HSTATE_ATTR_RO(_name) \
+	static struct hstate_attribute _name##_attr = __ATTR_RO(_name)
+
+#define HSTATE_ATTR(_name) \
+	static struct hstate_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t nr_hugepages_show(struct hstate *h, char *buf)
+{
+	return sprintf(buf, "%lu\n", h->nr_huge_pages);
+}
+static ssize_t nr_hugepages_store(struct hstate *h, const char *buf,
+								size_t count)
+{
+	int tmp, err;
+	unsigned long input;
+	err = strict_strtoul(buf, 10, &input);
+	if (err)
+		return 0;
+
+	h->max_huge_pages = set_max_huge_pages(h, input, &tmp);
+	max_huge_pages[h - hstates] = h->max_huge_pages;
+	return count;
+}
+HSTATE_ATTR(nr_hugepages);
+
+static ssize_t nr_overcommit_hugepages_show(struct hstate *h, char *buf)
+{
+	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
+}
+static ssize_t nr_overcommit_hugepages_store(struct hstate *h, const char *buf,
+								size_t count)
+{
+	unsigned long input;
+	int err;
+
+	err = strict_strtoul(buf, 10, &input);
+	if (err)
+		return 0;
+
+	spin_lock(&hugetlb_lock);
+	h->nr_overcommit_huge_pages = input;
+	sysctl_overcommit_huge_pages[h - hstates] = h->nr_overcommit_huge_pages;
+	spin_unlock(&hugetlb_lock);
+	return count;
+}
+HSTATE_ATTR(nr_overcommit_hugepages);
+
+static ssize_t free_hugepages_show(struct hstate *h, char *buf)
+{
+	return sprintf(buf, "%lu\n", h->free_huge_pages);
+}
+HSTATE_ATTR_RO(free_hugepages);
+
+static ssize_t resv_hugepages_show(struct hstate *h, char *buf)
+{
+	return sprintf(buf, "%lu\n", h->resv_huge_pages);
+}
+HSTATE_ATTR_RO(resv_hugepages);
+
+static ssize_t surplus_hugepages_show(struct hstate *h, char *buf)
+{
+	return sprintf(buf, "%lu\n", h->surplus_huge_pages);
+}
+HSTATE_ATTR_RO(surplus_hugepages);
+
+static struct kset *hstate_kset;
+
+static struct attribute *hstate_attrs[] = {
+	&nr_hugepages_attr.attr,
+	&nr_overcommit_hugepages_attr.attr,
+	&free_hugepages_attr.attr,
+	&resv_hugepages_attr.attr,
+	&surplus_hugepages_attr.attr,
+};
+
+static struct attribute_group hstate_attr_group = {
+	.attrs = hstate_attrs,
+};
+
+static ssize_t hstate_attr_show(struct kobject *kobj,
+					struct attribute *attr,
+					char *buf)
+{
+	struct hstate_attribute *attribute;
+	struct hstate *h;
+	int err;
+
+	attribute = to_hstate_attr(attr);
+	h = to_hstate(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	err = attribute->show(h, buf);
+
+	return err;
+}
+
+static ssize_t hstate_attr_store(struct kobject *kobj,
+					struct attribute *attr,
+					const char *buf, size_t len)
+{
+	struct hstate_attribute *attribute;
+	struct hstate *h;
+	int err;
+
+	attribute = to_hstate_attr(attr);
+	h = to_hstate(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	err = attribute->store(h, buf, len);
+
+	return err;
+}
+
+static struct sysfs_ops hstate_sysfs_ops = {
+	.show = hstate_attr_show,
+	.store = hstate_attr_store,
+};
+
+static struct kobj_type hstate_ktype = {
+	.sysfs_ops = &hstate_sysfs_ops,
+};
+
+static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
+{
+	int err;
+	h->kobj.kset = hstate_kset;
+	err = kobject_init_and_add(&h->kobj, &hstate_ktype, NULL, h->name);
+	if (err) {
+		kobject_put(&h->kobj);
+		return err;
+	}
+	err = sysfs_create_group(&h->kobj, &hstate_attr_group);
+	if (err)
+		return err;
+	return 0;
+}
+
+static void __init hugetlb_sysfs_init(void)
+{
+	struct hstate *h;
+	int err;
+
+	hstate_kset = kset_create_and_add("hugepages", NULL, kernel_kobj);
+	if (!hstate_kset)
+		return;
+
+	for_each_hstate(h) {
+		err = hugetlb_sysfs_add_hstate(h);
+		if (err)
+			printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
+								h->name);
+	}
+}
+#else
+static void __init hugetlb_sysfs_init(void)
+{
+}
+#endif
+
+static int __init hugetlb_init(void)
+{
+	BUILD_BUG_ON(HPAGE_SHIFT == 0);
+
+	if (!size_to_hstate(HPAGE_SIZE)) {
+		huge_add_hstate(HUGETLB_PAGE_ORDER);
+		parsed_hstate->max_huge_pages = default_hstate_resv;
+	}
+
+	hugetlb_init_hstates();
+
+	gather_bootmem_prealloc();
+
+	report_hugepages();
+
+	hugetlb_sysfs_init();
+
+	return 0;
+}
+module_init(hugetlb_init);
+
+/* Should be called on processing a hugepagesz=... option */
+void __init huge_add_hstate(unsigned order)
+{
+	struct hstate *h;
+	if (size_to_hstate(PAGE_SIZE << order)) {
+		printk("hugepagesz= specified twice, ignoring\n");
+		return;
+	}
+	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+	BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT);
+	h = &hstates[max_hstate++];
+	h->order = order;
+	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+	hugetlb_init_hstate(h);
+	parsed_hstate = h;
+}
+
+static int __init hugetlb_setup(char *s)
+{
+	unsigned long *mhp;
+
+	if (!max_hstate)
+		mhp = &default_hstate_resv;
+	else
+		mhp = &parsed_hstate->max_huge_pages;
+
+	if (sscanf(s, "%lu", mhp) <= 0)
+		*mhp = 0;
+
+	/*
+	 * Global state is always initialized later in hugetlb_init.
+	 * But we need to allocate >= MAX_ORDER hstates here early to still
+	 * use the bootmem allocator.
+	 */
+	if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER)
+		hugetlb_init_hstate(parsed_hstate);
+
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+	int node;
+	unsigned int nr = 0;
+
+	for_each_node_mask(node, cpuset_current_mems_allowed)
+		nr += array[node];
+
+	return nr;
+}
+
+
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 			   struct file *file, void __user *buffer,
 			   size_t *length, loff_t *ppos)

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-29 17:11                                     ` Nishanth Aravamudan
@ 2008-04-29 17:22                                       ` Greg KH
  2008-04-29 18:14                                         ` Nishanth Aravamudan
  0 siblings, 1 reply; 51+ messages in thread
From: Greg KH @ 2008-04-29 17:22 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote:
> +struct hstate_attribute {
> +	struct attribute attr;
> +	ssize_t (*show)(struct hstate *h, char *buf);
> +	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
> +};

Do you need your own attribute type with show and store?  Can't you just
use the "default" kobject attributes?

Also, you have no release function for your kobject to be cleaned up,
that's a major bug.

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-29 17:22                                       ` Greg KH
@ 2008-04-29 18:14                                         ` Nishanth Aravamudan
  2008-04-29 18:26                                           ` Greg KH
  0 siblings, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-29 18:14 UTC (permalink / raw)
  To: Greg KH
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 29.04.2008 [10:22:43 -0700], Greg KH wrote:
> On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote:
> > +struct hstate_attribute {
> > +	struct attribute attr;
> > +	ssize_t (*show)(struct hstate *h, char *buf);
> > +	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
> > +};
> 
> Do you need your own attribute type with show and store?  Can't you just
> use the "default" kobject attributes?

Hrm, I don't know? Probably. Like I said, I was using the
/sys/kernel/slab code as my reference. Can you explain this more? Or
just point me to the source/documentation I should read for info. Are
you referring to kobj_attr_show/kobj_attr_store? Should I just be using
kobj_sysfs_ops, then, most likely?

> Also, you have no release function for your kobject to be cleaned up,
> that's a major bug.

Well, these kobjects never go away? They will be statically initialized
at boot-time and then stick around until the kernel goes away. Looking
at /sys/kernel/slab's code, again, the release() function there does a
kfree() on the containing kmem_cache, but for hugetlb, the hstates are
static... If we do move to dynamic allocations ever (or allow adding
hugepage sizes at run-time somehow), then perhaps we'll need a release
method then?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-29 18:14                                         ` Nishanth Aravamudan
@ 2008-04-29 18:26                                           ` Greg KH
  2008-04-29 23:48                                             ` Nishanth Aravamudan
  2008-04-30 19:19                                             ` Nishanth Aravamudan
  0 siblings, 2 replies; 51+ messages in thread
From: Greg KH @ 2008-04-29 18:26 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote:
> On 29.04.2008 [10:22:43 -0700], Greg KH wrote:
> > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote:
> > > +struct hstate_attribute {
> > > +	struct attribute attr;
> > > +	ssize_t (*show)(struct hstate *h, char *buf);
> > > +	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
> > > +};
> > 
> > Do you need your own attribute type with show and store?  Can't you just
> > use the "default" kobject attributes?
> 
> Hrm, I don't know? Probably. Like I said, I was using the
> /sys/kernel/slab code as my reference. Can you explain this more? Or
> just point me to the source/documentation I should read for info.

Documentation/kobject.txt, with sample examples in samples/kobject/ for
you to copy and use.

> Are you referring to kobj_attr_show/kobj_attr_store? Should I just be
> using kobj_sysfs_ops, then, most likely?

See the above examples for more details.

> > Also, you have no release function for your kobject to be cleaned up,
> > that's a major bug.
> 
> Well, these kobjects never go away? They will be statically initialized
> at boot-time and then stick around until the kernel goes away. Looking
> at /sys/kernel/slab's code, again, the release() function there does a
> kfree() on the containing kmem_cache, but for hugetlb, the hstates are
> static... If we do move to dynamic allocations ever (or allow adding
> hugepage sizes at run-time somehow), then perhaps we'll need a release
> method then?

Yes you will.  Please always create one, what happens when you want to
clean them up at shut-down time...

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-29 18:26                                           ` Greg KH
@ 2008-04-29 23:48                                             ` Nishanth Aravamudan
  2008-05-01  3:07                                               ` Greg KH
  2008-04-30 19:19                                             ` Nishanth Aravamudan
  1 sibling, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-29 23:48 UTC (permalink / raw)
  To: Greg KH
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 29.04.2008 [11:26:13 -0700], Greg KH wrote:
> On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote:
> > On 29.04.2008 [10:22:43 -0700], Greg KH wrote:
> > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote:
> > > > +struct hstate_attribute {
> > > > +	struct attribute attr;
> > > > +	ssize_t (*show)(struct hstate *h, char *buf);
> > > > +	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
> > > > +};
> > > 
> > > Do you need your own attribute type with show and store?  Can't you just
> > > use the "default" kobject attributes?
> > 
> > Hrm, I don't know? Probably. Like I said, I was using the
> > /sys/kernel/slab code as my reference. Can you explain this more? Or
> > just point me to the source/documentation I should read for info.
> 
> Documentation/kobject.txt, with sample examples in samples/kobject/ for
> you to copy and use.

Great thanks!

> > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be
> > using kobj_sysfs_ops, then, most likely?
> 
> See the above examples for more details.

Will do -- I think we'll need our own store, at least, though, because
of locking issues? And I'm guessing if we provide our own store, we're
going to need to provide our own show?

> > > Also, you have no release function for your kobject to be cleaned up,
> > > that's a major bug.
> > 
> > Well, these kobjects never go away? They will be statically initialized
> > at boot-time and then stick around until the kernel goes away. Looking
> > at /sys/kernel/slab's code, again, the release() function there does a
> > kfree() on the containing kmem_cache, but for hugetlb, the hstates are
> > static... If we do move to dynamic allocations ever (or allow adding
> > hugepage sizes at run-time somehow), then perhaps we'll need a release
> > method then?
> 
> Yes you will.  Please always create one, what happens when you want to
> clean them up at shut-down time...

Again, I'm not sure what you want me to clean-up? The examples in
samples/ are freeing dynamically allocated objects containing the
kobject in question -- but /sys/kernel/hugepages only dynamically
allocates the kobject itself... Although, I guess I should free the name
string since I used kasprintf()...

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-29 23:48                                             ` Nishanth Aravamudan
@ 2008-05-01  3:07                                               ` Greg KH
  2008-05-01 18:25                                                 ` Nishanth Aravamudan
  0 siblings, 1 reply; 51+ messages in thread
From: Greg KH @ 2008-05-01  3:07 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Tue, Apr 29, 2008 at 04:48:39PM -0700, Nishanth Aravamudan wrote:
> On 29.04.2008 [11:26:13 -0700], Greg KH wrote:
> > On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote:
> > > On 29.04.2008 [10:22:43 -0700], Greg KH wrote:
> > > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote:
> > > > > +struct hstate_attribute {
> > > > > +	struct attribute attr;
> > > > > +	ssize_t (*show)(struct hstate *h, char *buf);
> > > > > +	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
> > > > > +};
> > > > 
> > > > Do you need your own attribute type with show and store?  Can't you just
> > > > use the "default" kobject attributes?
> > > 
> > > Hrm, I don't know? Probably. Like I said, I was using the
> > > /sys/kernel/slab code as my reference. Can you explain this more? Or
> > > just point me to the source/documentation I should read for info.
> > 
> > Documentation/kobject.txt, with sample examples in samples/kobject/ for
> > you to copy and use.
> 
> Great thanks!
> 
> > > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be
> > > using kobj_sysfs_ops, then, most likely?
> > 
> > See the above examples for more details.
> 
> Will do -- I think we'll need our own store, at least, though, because
> of locking issues? And I'm guessing if we provide our own store, we're
> going to need to provide our own show?

Yes, but see below...

> > > > Also, you have no release function for your kobject to be cleaned up,
> > > > that's a major bug.
> > > 
> > > Well, these kobjects never go away? They will be statically initialized
> > > at boot-time and then stick around until the kernel goes away. Looking
> > > at /sys/kernel/slab's code, again, the release() function there does a
> > > kfree() on the containing kmem_cache, but for hugetlb, the hstates are
> > > static... If we do move to dynamic allocations ever (or allow adding
> > > hugepage sizes at run-time somehow), then perhaps we'll need a release
> > > method then?
> > 
> > Yes you will.  Please always create one, what happens when you want to
> > clean them up at shut-down time...
> 
> Again, I'm not sure what you want me to clean-up? The examples in
> samples/ are freeing dynamically allocated objects containing the
> kobject in question -- but /sys/kernel/hugepages only dynamically
> allocates the kobject itself... Although, I guess I should free the name
> string since I used kasprintf()...

Ugh.

Embed a kobject into a structure if you want it to control the lifetime
rules of that structure.  And that includes tearing it down.

If you _only_ want to use a kobject to create some sysfs trees and
files, then just use the dynamic kobject functions, as documented.  Then
you only have a pointer to a kobject, it does not control the lifetime
of your structure, you don't have to write your own show/store wrappers,
and life is oh so much easier.

So you might want to rethink your current patch :)

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-05-01  3:07                                               ` Greg KH
@ 2008-05-01 18:25                                                 ` Nishanth Aravamudan
  0 siblings, 0 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-05-01 18:25 UTC (permalink / raw)
  To: Greg KH
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 30.04.2008 [20:07:38 -0700], Greg KH wrote:
> On Tue, Apr 29, 2008 at 04:48:39PM -0700, Nishanth Aravamudan wrote:
> > On 29.04.2008 [11:26:13 -0700], Greg KH wrote:
> > > On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote:
> > > > On 29.04.2008 [10:22:43 -0700], Greg KH wrote:
> > > > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote:
> > > > > > +struct hstate_attribute {
> > > > > > +	struct attribute attr;
> > > > > > +	ssize_t (*show)(struct hstate *h, char *buf);
> > > > > > +	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
> > > > > > +};
> > > > > 
> > > > > Do you need your own attribute type with show and store?  Can't you just
> > > > > use the "default" kobject attributes?
> > > > 
> > > > Hrm, I don't know? Probably. Like I said, I was using the
> > > > /sys/kernel/slab code as my reference. Can you explain this more? Or
> > > > just point me to the source/documentation I should read for info.
> > > 
> > > Documentation/kobject.txt, with sample examples in samples/kobject/ for
> > > you to copy and use.
> > 
> > Great thanks!
> > 
> > > > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be
> > > > using kobj_sysfs_ops, then, most likely?
> > > 
> > > See the above examples for more details.
> > 
> > Will do -- I think we'll need our own store, at least, though, because
> > of locking issues? And I'm guessing if we provide our own store, we're
> > going to need to provide our own show?
> 
> Yes, but see below...
> 
> > > > > Also, you have no release function for your kobject to be cleaned up,
> > > > > that's a major bug.
> > > > 
> > > > Well, these kobjects never go away? They will be statically initialized
> > > > at boot-time and then stick around until the kernel goes away. Looking
> > > > at /sys/kernel/slab's code, again, the release() function there does a
> > > > kfree() on the containing kmem_cache, but for hugetlb, the hstates are
> > > > static... If we do move to dynamic allocations ever (or allow adding
> > > > hugepage sizes at run-time somehow), then perhaps we'll need a release
> > > > method then?
> > > 
> > > Yes you will.  Please always create one, what happens when you want to
> > > clean them up at shut-down time...
> > 
> > Again, I'm not sure what you want me to clean-up? The examples in
> > samples/ are freeing dynamically allocated objects containing the
> > kobject in question -- but /sys/kernel/hugepages only dynamically
> > allocates the kobject itself... Although, I guess I should free the name
> > string since I used kasprintf()...
> 
> Ugh.
> 
> Embed a kobject into a structure if you want it to control the
> lifetime rules of that structure.  And that includes tearing it down.
> 
> If you _only_ want to use a kobject to create some sysfs trees and
> files, then just use the dynamic kobject functions, as documented.
> Then you only have a pointer to a kobject, it does not control the
> lifetime of your structure, you don't have to write your own
> show/store wrappers, and life is oh so much easier.
> 
> So you might want to rethink your current patch :)

Ok, I get this now, and have started moving over to it. However, I see a
few problems, or have a few questions:

1) I do need my own store() wrapper due to locking, right? We can't
change the writable values here without grabbing the hugetlb_lock. And
the examples in samples/kobject/kobject-sample.c, at least, do have
their own show/store methods (or do you mean something else by wrapper)?
Oh, maybe you are referring to hstate_attr_store()/hstate_attr_show()?
Those no longer exist in this patch...

2) I will need a kobject pointer for each hstate, right? So what I have
now is:

static struct kobject *hstate_kobj[HUGE_MAX_HSTATE];

and then I use kobject_create_and_add() for each of them. How do I then
refer back to which hstate I'm dealing with (because I want to
manipulate that hstate's values in the show/store methods) -- would I
need to iterate through hstate_kobj until I find the kobject that was
passed in and then use that index into hstates[] to find the
corresponding hstate? I guess unlike in the embedding case, I don't see
the link between the structure I'm trying to represent and the
kobject...

3) Each hstate is going to have the same set of attributes. Let's say I
use sysfs_create_group() on each of the hstate_kobj's array members.
Will I then actually need duplicates of the set of attributes so that
there is a static set of attributes per-hstate? This directly relates to
2), actually -- if I can get to the hstate from the kobject then I can
do that with one set of attributes.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-29 18:26                                           ` Greg KH
  2008-04-29 23:48                                             ` Nishanth Aravamudan
@ 2008-04-30 19:19                                             ` Nishanth Aravamudan
  2008-05-01  3:08                                               ` Greg KH
  1 sibling, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-30 19:19 UTC (permalink / raw)
  To: Greg KH
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 29.04.2008 [11:26:13 -0700], Greg KH wrote:
> On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote:
> > On 29.04.2008 [10:22:43 -0700], Greg KH wrote:
> > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote:
> > > > +struct hstate_attribute {
> > > > +	struct attribute attr;
> > > > +	ssize_t (*show)(struct hstate *h, char *buf);
> > > > +	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
> > > > +};
> > > 
> > > Do you need your own attribute type with show and store?  Can't you just
> > > use the "default" kobject attributes?
> > 
> > Hrm, I don't know? Probably. Like I said, I was using the
> > /sys/kernel/slab code as my reference. Can you explain this more? Or
> > just point me to the source/documentation I should read for info.
> 
> Documentation/kobject.txt, with sample examples in samples/kobject/ for
> you to copy and use.
> 
> > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be
> > using kobj_sysfs_ops, then, most likely?
> 
> See the above examples for more details.
> 
> > > Also, you have no release function for your kobject to be cleaned up,
> > > that's a major bug.
> > 
> > Well, these kobjects never go away? They will be statically initialized
> > at boot-time and then stick around until the kernel goes away. Looking
> > at /sys/kernel/slab's code, again, the release() function there does a
> > kfree() on the containing kmem_cache, but for hugetlb, the hstates are
> > static... If we do move to dynamic allocations ever (or allow adding
> > hugepage sizes at run-time somehow), then perhaps we'll need a release
> > method then?
> 
> Yes you will.  Please always create one, what happens when you want to
> clean them up at shut-down time...

Does this look better? I really appreciate the review, Greg.

 include/linux/hugetlb.h |    9 +-
 mm/hugetlb.c            |  292 +++++++++++++++++++++++++++++++++++------------
 2 files changed, 226 insertions(+), 75 deletions(-)

Still-not-Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7aa22e7..cac63bd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -3,6 +3,9 @@
 
 #include <linux/fs.h>
 #include <linux/shm.h>
+#include <linux/mempolicy.h>
+#include <asm/tlbflush.h>
+#include <asm/hugetlb.h>
 
 #ifdef CONFIG_HUGETLBFS
 struct hugetlbfs_config {
@@ -69,10 +72,6 @@ static inline void set_file_hugepages(struct file *file)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-#include <linux/mempolicy.h>
-#include <asm/tlbflush.h>
-#include <asm/hugetlb.h>
-
 struct ctl_table;
 
 static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
@@ -131,6 +130,8 @@ struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+	const char *name;
+	struct kobject kobj;
 };
 
 void __init huge_add_hstate(unsigned order);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index de03a14..8a40afa 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -15,6 +15,7 @@
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
+#include <linux/sysfs.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -624,6 +625,8 @@ static void __init hugetlb_init_hstate(struct hstate *h)
 			break;
 	}
 	h->max_huge_pages = i;
+	h->name = kasprintf(GFP_KERNEL, "hugepages-%lu",
+						huge_page_size(h) / 1024);
 }
 
 static void __init hugetlb_init_hstates(void)
@@ -662,77 +665,6 @@ static void __init report_hugepages(void)
         }
 }
 
-static int __init hugetlb_init(void)
-{
-	BUILD_BUG_ON(HPAGE_SHIFT == 0);
-
-	if (!size_to_hstate(HPAGE_SIZE)) {
-		huge_add_hstate(HUGETLB_PAGE_ORDER);
-		parsed_hstate->max_huge_pages = default_hstate_resv;
-	}
-
-	hugetlb_init_hstates();
-
-	gather_bootmem_prealloc();
-
-	report_hugepages();
-
-	return 0;
-}
-module_init(hugetlb_init);
-
-/* Should be called on processing a hugepagesz=... option */
-void __init huge_add_hstate(unsigned order)
-{
-	struct hstate *h;
-	if (size_to_hstate(PAGE_SIZE << order)) {
-		printk("hugepagesz= specified twice, ignoring\n");
-		return;
-	}
-	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
-	BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT);
-	h = &hstates[max_hstate++];
-	h->order = order;
-	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
-	hugetlb_init_hstate(h);
-	parsed_hstate = h;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	unsigned long *mhp;
-
-	if (!max_hstate)
-		mhp = &default_hstate_resv;
-	else
-		mhp = &parsed_hstate->max_huge_pages;
-
-	if (sscanf(s, "%lu", mhp) <= 0)
-		*mhp = 0;
-
-	/*
-	 * Global state is always initialized later in hugetlb_init.
-	 * But we need to allocate >= MAX_ORDER hstates here early to still
-	 * use the bootmem allocator.
-	 */
-	if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER)
-		hugetlb_init_hstate(parsed_hstate);
-
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static unsigned int cpuset_mems_nr(unsigned int *array)
-{
-	int node;
-	unsigned int nr = 0;
-
-	for_each_node_mask(node, cpuset_current_mems_allowed)
-		nr += array[node];
-
-	return nr;
-}
-
 #ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(struct hstate *h, unsigned long count)
@@ -843,6 +775,224 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_SYSFS
+#define HSTATE_ATTR_RO(_name) \
+	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define HSTATE_ATTR(_name) \
+	static struct kobj_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	struct hstate *h = container_of(kobj, struct hstate, kobj);
+	return sprintf(buf, "%lu\n", h->nr_huge_pages);
+}
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int tmp, err;
+	unsigned long input;
+	struct hstate *h = container_of(kobj, struct hstate, kobj);
+
+	err = strict_strtoul(buf, 10, &input);
+	if (err)
+		return 0;
+
+	h->max_huge_pages = set_max_huge_pages(h, input, &tmp);
+	max_huge_pages[h - hstates] = h->max_huge_pages;
+
+	return count;
+}
+HSTATE_ATTR(nr_hugepages);
+
+static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	struct hstate *h = container_of(kobj, struct hstate, kobj);
+	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
+}
+static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int err;
+	unsigned long input;
+	struct hstate *h = container_of(kobj, struct hstate, kobj);
+
+	err = strict_strtoul(buf, 10, &input);
+	if (err)
+		return 0;
+
+	spin_lock(&hugetlb_lock);
+	h->nr_overcommit_huge_pages = input;
+	sysctl_overcommit_huge_pages[h - hstates] = h->nr_overcommit_huge_pages;
+	spin_unlock(&hugetlb_lock);
+
+	return count;
+}
+HSTATE_ATTR(nr_overcommit_hugepages);
+
+static ssize_t free_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	struct hstate *h = container_of(kobj, struct hstate, kobj);
+	return sprintf(buf, "%lu\n", h->free_huge_pages);
+}
+HSTATE_ATTR_RO(free_hugepages);
+
+static ssize_t resv_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	struct hstate *h = container_of(kobj, struct hstate, kobj);
+	return sprintf(buf, "%lu\n", h->resv_huge_pages);
+}
+HSTATE_ATTR_RO(resv_hugepages);
+
+static ssize_t surplus_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	struct hstate *h = container_of(kobj, struct hstate, kobj);
+	return sprintf(buf, "%lu\n", h->surplus_huge_pages);
+}
+HSTATE_ATTR_RO(surplus_hugepages);
+
+static void hstate_release(struct kobject *kobj)
+{
+	struct hstate *h = container_of(kobj, struct hstate, kobj);
+	kfree(h->name);
+}
+
+static struct kset *hstate_kset;
+
+static struct attribute *hstate_attrs[] = {
+	&nr_hugepages_attr.attr,
+	&nr_overcommit_hugepages_attr.attr,
+	&free_hugepages_attr.attr,
+	&resv_hugepages_attr.attr,
+	&surplus_hugepages_attr.attr,
+	NULL,
+};
+
+static struct kobj_type hstate_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.default_attrs = hstate_attrs,
+	.release = hstate_release,
+};
+
+static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
+{
+	int retval;
+
+	h->kobj.kset = hstate_kset;
+
+	retval = kobject_init_and_add(&h->kobj, &hstate_ktype, NULL, h->name);
+	if (retval) {
+		kfree(h->name);
+		return retval;
+	}
+
+	kobject_uevent(&h->kobj, KOBJ_ADD);
+
+	return 0;
+}
+
+static void __init hugetlb_sysfs_init(void)
+{
+	struct hstate *h;
+	int err;
+
+	hstate_kset = kset_create_and_add("hugepages", NULL, kernel_kobj);
+	if (!hstate_kset)
+		return;
+
+	for_each_hstate(h) {
+		err = hugetlb_sysfs_add_hstate(h);
+		if (err)
+			printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
+								h->name);
+	}
+}
+#else
+static void __init hugetlb_sysfs_init(void)
+{
+}
+#endif
+
+static int __init hugetlb_init(void)
+{
+	BUILD_BUG_ON(HPAGE_SHIFT == 0);
+
+	if (!size_to_hstate(HPAGE_SIZE)) {
+		huge_add_hstate(HUGETLB_PAGE_ORDER);
+		parsed_hstate->max_huge_pages = default_hstate_resv;
+	}
+
+	hugetlb_init_hstates();
+
+	gather_bootmem_prealloc();
+
+	report_hugepages();
+
+	hugetlb_sysfs_init();
+
+	return 0;
+}
+module_init(hugetlb_init);
+
+/* Should be called on processing a hugepagesz=... option */
+void __init huge_add_hstate(unsigned order)
+{
+	struct hstate *h;
+	if (size_to_hstate(PAGE_SIZE << order)) {
+		printk("hugepagesz= specified twice, ignoring\n");
+		return;
+	}
+	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+	BUG_ON(order < HPAGE_SHIFT - PAGE_SHIFT);
+	h = &hstates[max_hstate++];
+	h->order = order;
+	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+	hugetlb_init_hstate(h);
+	parsed_hstate = h;
+}
+
+static int __init hugetlb_setup(char *s)
+{
+	unsigned long *mhp;
+
+	if (!max_hstate)
+		mhp = &default_hstate_resv;
+	else
+		mhp = &parsed_hstate->max_huge_pages;
+
+	if (sscanf(s, "%lu", mhp) <= 0)
+		*mhp = 0;
+
+	/*
+	 * Global state is always initialized later in hugetlb_init.
+	 * But we need to allocate >= MAX_ORDER hstates here early to still
+	 * use the bootmem allocator.
+	 */
+	if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER)
+		hugetlb_init_hstate(parsed_hstate);
+
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+	int node;
+	unsigned int nr = 0;
+
+	for_each_node_mask(node, cpuset_current_mems_allowed)
+		nr += array[node];
+
+	return nr;
+}
+
+
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 			   struct file *file, void __user *buffer,
 			   size_t *length, loff_t *ppos)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-30 19:19                                             ` Nishanth Aravamudan
@ 2008-05-01  3:08                                               ` Greg KH
  2008-05-02 17:58                                                 ` Nishanth Aravamudan
  0 siblings, 1 reply; 51+ messages in thread
From: Greg KH @ 2008-05-01  3:08 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On Wed, Apr 30, 2008 at 12:19:41PM -0700, Nishanth Aravamudan wrote:
> On 29.04.2008 [11:26:13 -0700], Greg KH wrote:
> > On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote:
> > > On 29.04.2008 [10:22:43 -0700], Greg KH wrote:
> > > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote:
> > > > > +struct hstate_attribute {
> > > > > +	struct attribute attr;
> > > > > +	ssize_t (*show)(struct hstate *h, char *buf);
> > > > > +	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
> > > > > +};
> > > > 
> > > > Do you need your own attribute type with show and store?  Can't you just
> > > > use the "default" kobject attributes?
> > > 
> > > Hrm, I don't know? Probably. Like I said, I was using the
> > > /sys/kernel/slab code as my reference. Can you explain this more? Or
> > > just point me to the source/documentation I should read for info.
> > 
> > Documentation/kobject.txt, with sample examples in samples/kobject/ for
> > you to copy and use.
> > 
> > > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be
> > > using kobj_sysfs_ops, then, most likely?
> > 
> > See the above examples for more details.
> > 
> > > > Also, you have no release function for your kobject to be cleaned up,
> > > > that's a major bug.
> > > 
> > > Well, these kobjects never go away? They will be statically initialized
> > > at boot-time and then stick around until the kernel goes away. Looking
> > > at /sys/kernel/slab's code, again, the release() function there does a
> > > kfree() on the containing kmem_cache, but for hugetlb, the hstates are
> > > static... If we do move to dynamic allocations ever (or allow adding
> > > hugepage sizes at run-time somehow), then perhaps we'll need a release
> > > method then?
> > 
> > Yes you will.  Please always create one, what happens when you want to
> > clean them up at shut-down time...
> 
> Does this look better? I really appreciate the review, Greg.

See my previous email, you should not embed a kobject into this
structure.  Just use a pointer to one, it will shrink this patch a lot.

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-05-01  3:08                                               ` Greg KH
@ 2008-05-02 17:58                                                 ` Nishanth Aravamudan
  0 siblings, 0 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-05-02 17:58 UTC (permalink / raw)
  To: Greg KH
  Cc: Nick Piggin, Christoph Lameter, wli, agl, luick, Lee.Schermerhorn,
	linux-mm

On 30.04.2008 [20:08:44 -0700], Greg KH wrote:
> On Wed, Apr 30, 2008 at 12:19:41PM -0700, Nishanth Aravamudan wrote:
> > On 29.04.2008 [11:26:13 -0700], Greg KH wrote:
> > > On Tue, Apr 29, 2008 at 11:14:15AM -0700, Nishanth Aravamudan wrote:
> > > > On 29.04.2008 [10:22:43 -0700], Greg KH wrote:
> > > > > On Tue, Apr 29, 2008 at 10:11:15AM -0700, Nishanth Aravamudan wrote:
> > > > > > +struct hstate_attribute {
> > > > > > +	struct attribute attr;
> > > > > > +	ssize_t (*show)(struct hstate *h, char *buf);
> > > > > > +	ssize_t (*store)(struct hstate *h, const char *buf, size_t count);
> > > > > > +};
> > > > > 
> > > > > Do you need your own attribute type with show and store?  Can't you just
> > > > > use the "default" kobject attributes?
> > > > 
> > > > Hrm, I don't know? Probably. Like I said, I was using the
> > > > /sys/kernel/slab code as my reference. Can you explain this more? Or
> > > > just point me to the source/documentation I should read for info.
> > > 
> > > Documentation/kobject.txt, with sample examples in samples/kobject/ for
> > > you to copy and use.
> > > 
> > > > Are you referring to kobj_attr_show/kobj_attr_store? Should I just be
> > > > using kobj_sysfs_ops, then, most likely?
> > > 
> > > See the above examples for more details.
> > > 
> > > > > Also, you have no release function for your kobject to be cleaned up,
> > > > > that's a major bug.
> > > > 
> > > > Well, these kobjects never go away? They will be statically initialized
> > > > at boot-time and then stick around until the kernel goes away. Looking
> > > > at /sys/kernel/slab's code, again, the release() function there does a
> > > > kfree() on the containing kmem_cache, but for hugetlb, the hstates are
> > > > static... If we do move to dynamic allocations ever (or allow adding
> > > > hugepage sizes at run-time somehow), then perhaps we'll need a release
> > > > method then?
> > > 
> > > Yes you will.  Please always create one, what happens when you want to
> > > clean them up at shut-down time...
> > 
> > Does this look better? I really appreciate the review, Greg.
> 
> See my previous email, you should not embed a kobject into this
> structure.  Just use a pointer to one, it will shrink this patch a lot.

Ok, I did that -- and the patch grew (due to adding a helper
function to figure out which hstate a kobject corresponds to?). I'm
sure I'm doing something stupid.

FWIW, this patch does work with Jon's efforts and shows 64k/16m/16g at
run-time, all correct and such.

commit 164d446024a76b9d785b11141e1b53b330f6ce4d
Author: Nishanth Aravamudan <nacc@us.ibm.com>
Date:   Fri Apr 25 15:34:58 2008 -0700

    hugetlb: present information in sysfs
    
    Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4fe8d16..4898f32 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -3,6 +3,9 @@
 
 #include <linux/fs.h>
 #include <linux/shm.h>
+#include <linux/mempolicy.h>
+#include <asm/tlbflush.h>
+#include <asm/hugetlb.h>
 
 #ifdef CONFIG_HUGETLBFS
 struct hugetlbfs_config {
@@ -69,10 +72,6 @@ static inline void set_file_hugepages(struct file *file)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-#include <linux/mempolicy.h>
-#include <asm/tlbflush.h>
-#include <asm/hugetlb.h>
-
 struct ctl_table;
 
 static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
@@ -132,6 +131,7 @@ struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+	char name[32];
 };
 
 struct huge_bm_page {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bd07510..c87eeca 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -15,6 +15,7 @@
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
+#include <linux/sysfs.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -659,76 +660,6 @@ static void __init report_hugepages(void)
         }
 }
 
-static int __init hugetlb_init(void)
-{
-	BUILD_BUG_ON(HPAGE_SHIFT == 0);
-
-	if (!size_to_hstate(HPAGE_SIZE)) {
-		huge_add_hstate(HUGETLB_PAGE_ORDER);
-		parsed_hstate->max_huge_pages = default_hstate_resv;
-	}
-
-	hugetlb_init_hstates();
-
-	gather_bootmem_prealloc();
-
-	report_hugepages();
-
-	return 0;
-}
-module_init(hugetlb_init);
-
-/* Should be called on processing a hugepagesz=... option */
-void __init huge_add_hstate(unsigned order)
-{
-	struct hstate *h;
-	if (size_to_hstate(PAGE_SIZE << order)) {
-		printk("hugepagesz= specified twice, ignoring\n");
-		return;
-	}
-	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
-	h = &hstates[max_hstate++];
-	h->order = order;
-	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
-	hugetlb_init_hstate(h);
-	parsed_hstate = h;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	unsigned long *mhp;
-
-	if (!max_hstate)
-		mhp = &default_hstate_resv;
-	else
-		mhp = &parsed_hstate->max_huge_pages;
-
-	if (sscanf(s, "%lu", mhp) <= 0)
-		*mhp = 0;
-
-	/*
-	 * Global state is always initialized later in hugetlb_init.
-	 * But we need to allocate >= MAX_ORDER hstates here early to still
-	 * use the bootmem allocator.
-	 */
-	if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER)
-		hugetlb_init_hstate(parsed_hstate);
-
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static unsigned int cpuset_mems_nr(unsigned int *array)
-{
-	int node;
-	unsigned int nr = 0;
-
-	for_each_node_mask(node, cpuset_current_mems_allowed)
-		nr += array[node];
-
-	return nr;
-}
-
 #ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(struct hstate *h, unsigned long count)
@@ -839,6 +770,236 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_SYSFS
+#define HSTATE_ATTR_RO(_name) \
+	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define HSTATE_ATTR(_name) \
+	static struct kobj_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static struct kobject *hugepages_kobj;
+static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+
+static struct hstate *kobj_to_hstate(struct kobject *kobj)
+{
+	int i;
+	for (i = 0; i < HUGE_MAX_HSTATE; i++)
+		if (hstate_kobjs[i] == kobj)
+			return &hstates[i];
+	BUG();
+	return NULL;
+}
+
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	struct hstate *h = kobj_to_hstate(kobj);
+	return sprintf(buf, "%lu\n", h->nr_huge_pages);
+}
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int tmp, err;
+	unsigned long input;
+	struct hstate *h = kobj_to_hstate(kobj);
+
+	err = strict_strtoul(buf, 10, &input);
+	if (err)
+		return 0;
+
+	h->max_huge_pages = set_max_huge_pages(h, input, &tmp);
+	max_huge_pages[h - hstates] = h->max_huge_pages;
+
+	return count;
+}
+HSTATE_ATTR(nr_hugepages);
+
+static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	struct hstate *h = kobj_to_hstate(kobj);
+	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
+}
+static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int err;
+	unsigned long input;
+	struct hstate *h = kobj_to_hstate(kobj);
+
+	err = strict_strtoul(buf, 10, &input);
+	if (err)
+		return 0;
+
+	spin_lock(&hugetlb_lock);
+	h->nr_overcommit_huge_pages = input;
+	sysctl_overcommit_huge_pages[h - hstates] = h->nr_overcommit_huge_pages;
+	spin_unlock(&hugetlb_lock);
+
+	return count;
+}
+HSTATE_ATTR(nr_overcommit_hugepages);
+
+static ssize_t free_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	struct hstate *h = kobj_to_hstate(kobj);
+	return sprintf(buf, "%lu\n", h->free_huge_pages);
+}
+HSTATE_ATTR_RO(free_hugepages);
+
+static ssize_t resv_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	struct hstate *h = kobj_to_hstate(kobj);
+	return sprintf(buf, "%lu\n", h->resv_huge_pages);
+}
+HSTATE_ATTR_RO(resv_hugepages);
+
+static ssize_t surplus_hugepages_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	struct hstate *h = kobj_to_hstate(kobj);
+	return sprintf(buf, "%lu\n", h->surplus_huge_pages);
+}
+HSTATE_ATTR_RO(surplus_hugepages);
+
+static struct attribute *hstate_attrs[] = {
+	&nr_hugepages_attr.attr,
+	&nr_overcommit_hugepages_attr.attr,
+	&free_hugepages_attr.attr,
+	&resv_hugepages_attr.attr,
+	&surplus_hugepages_attr.attr,
+	NULL,
+};
+
+static struct attribute_group hstate_attr_group = {
+	.attrs = hstate_attrs,
+};
+
+static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
+{
+	int retval;
+
+	hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, hugepages_kobj);
+	if (!hstate_kobjs[h - hstates])
+		return -ENOMEM;
+
+	retval = sysfs_create_group(hstate_kobjs[h - hstates], &hstate_attr_group);
+	if (retval)
+		kobject_put(hstate_kobjs[h - hstates]);
+
+	return retval;
+}
+
+static void __init hugetlb_sysfs_init(void)
+{
+	struct hstate *h;
+	int err;
+
+	hugepages_kobj = kobject_create_and_add("hugepages", kernel_kobj);
+	if (!hugepages_kobj)
+		return;
+
+	for_each_hstate(h) {
+		err = hugetlb_sysfs_add_hstate(h);
+		if (err)
+			printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
+								h->name);
+	}
+}
+#else
+static void __init hugetlb_sysfs_init(void)
+{
+}
+#endif
+
+static int __init hugetlb_init(void)
+{
+	BUILD_BUG_ON(HPAGE_SHIFT == 0);
+
+	if (!size_to_hstate(HPAGE_SIZE)) {
+		huge_add_hstate(HUGETLB_PAGE_ORDER);
+		parsed_hstate->max_huge_pages = default_hstate_resv;
+	}
+
+	hugetlb_init_hstates();
+
+	gather_bootmem_prealloc();
+
+	report_hugepages();
+
+	hugetlb_sysfs_init();
+
+	return 0;
+}
+module_init(hugetlb_init);
+
+static void __exit hugetlb_exit(void)
+{
+	struct hstate *h;
+
+	for_each_hstate(h)
+		kobject_put(hstate_kobjs[h - hstates]);
+
+	kobject_put(hugepages_kobj);
+}
+module_exit(hugetlb_exit);
+
+/* Should be called on processing a hugepagesz=... option */
+void __init huge_add_hstate(unsigned order)
+{
+	struct hstate *h;
+	if (size_to_hstate(PAGE_SIZE << order)) {
+		printk("hugepagesz= specified twice, ignoring\n");
+		return;
+	}
+	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+	h = &hstates[max_hstate++];
+	h->order = order;
+	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+	snprintf(h->name, 32, "hugepages-%lu", huge_page_size(h)/1024);
+	hugetlb_init_hstate(h);
+	parsed_hstate = h;
+}
+
+static int __init hugetlb_setup(char *s)
+{
+	unsigned long *mhp;
+
+	if (!max_hstate)
+		mhp = &default_hstate_resv;
+	else
+		mhp = &parsed_hstate->max_huge_pages;
+
+	if (sscanf(s, "%lu", mhp) <= 0)
+		*mhp = 0;
+
+	/*
+	 * Global state is always initialized later in hugetlb_init.
+	 * But we need to allocate >= MAX_ORDER hstates here early to still
+	 * use the bootmem allocator.
+	 */
+	if (max_hstate > 0 && parsed_hstate->order >= MAX_ORDER)
+		hugetlb_init_hstate(parsed_hstate);
+
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+	int node;
+	unsigned int nr = 0;
+
+	for_each_node_mask(node, cpuset_current_mems_allowed)
+		nr += array[node];
+
+	return nr;
+}
+
+
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 			   struct file *file, void __user *buffer,
 			   size_t *length, loff_t *ppos)

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-27  5:10                               ` Greg KH
  2008-04-28 17:22                                 ` Nishanth Aravamudan
@ 2008-04-28 20:31                                 ` Christoph Lameter
  2008-04-28 20:52                                   ` Nishanth Aravamudan
  1 sibling, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2008-04-28 20:31 UTC (permalink / raw)
  To: Greg KH
  Cc: Nishanth Aravamudan, Nick Piggin, wli, agl, luick,
	Lee.Schermerhorn, linux-mm

On Sat, 26 Apr 2008, Greg KH wrote:

> Also, why use a "units" here, just always use the lowest unit, and
> userspace can convert from kB to GB if needed.

Additional complications will come about because IA64 supports 
varying hugetlb sizes from 4kb to 1GB.

Also we would at some point like to add support for 1TB hugepages (that 
may depend on the presence of a special device that handles these).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-28 20:31                                 ` Christoph Lameter
@ 2008-04-28 20:52                                   ` Nishanth Aravamudan
  2008-04-28 21:29                                     ` Christoph Lameter
  0 siblings, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-28 20:52 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm

On 28.04.2008 [13:31:00 -0700], Christoph Lameter wrote:
> On Sat, 26 Apr 2008, Greg KH wrote:
> 
> > Also, why use a "units" here, just always use the lowest unit, and
> > userspace can convert from kB to GB if needed.
> 
> Additional complications will come about because IA64 supports 
> varying hugetlb sizes from 4kb to 1GB.

What "complications" do you mean? It's a small function indeed to
convert from the directory name to the corresponding "human-named" size,
e.g. hugepages-1048576 to "1 GB". And such a function will probably
exist in libhugetlbfs at some point, for applications to use, if they
like.

A potential problem I do see is for a 32-bit binary running on a 64-bit
kernel, and it is one we've already run up against for 32-bit binaries
with 16G pages available. The 32-bit binary can't actually store the size
of the hugepage in an unsigned long, so we have to remember how big a
value we can represent (i.e., max_hugepage_size_in_kb) and check what's
obtained from /proc/meminfo against that. Not ideal, for sure.

> Also we would at some point like to add support for 1TB hugepages
> (that may depend on the presence of a special device that handles
> these).

I also don't see a limitation here? For 32-bit programs, we'll see
1073741824 and know we can't convert that into a valid value in bytes.

More importantly, I think the fact that IA64 supports multiple hugepage
sizes is a reason *for* moving to sysfs for this information? However, I
think we may need to massage the IA64-specific bits of the kernel to
actually support multiple hugepage size pools being available at
run-time? That is, with the current kernel, we can only support one
hugepagesize at run-time, due to VHPT restrictions?

Thanks,
Nish

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-28 20:52                                   ` Nishanth Aravamudan
@ 2008-04-28 21:29                                     ` Christoph Lameter
  2008-04-29 16:43                                       ` Nishanth Aravamudan
  0 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2008-04-28 21:29 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm

On Mon, 28 Apr 2008, Nishanth Aravamudan wrote:

> More importantly, I think the fact that IA64 supports multiple hugepage
> sizes is a reason *for* moving to sysfs for this information? However, I
> think we may need to massage the IA64-specific bits of the kernel to
> actually support multiple hugepage size pools being available at
> run-time? That is, with the current kernel, we can only support one
> hugepagesize at run-time, due to VHPT restrictions?

We'd love to have multiple huge page pools available but the current rigid 
region setup limits us to one size. Switching off the VHPT or doing some 
tricks with the tlb fault handler, or freeing up an unused region (region 
0?) could get us there.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-28 21:29                                     ` Christoph Lameter
@ 2008-04-29 16:43                                       ` Nishanth Aravamudan
  2008-04-29 17:01                                         ` Christoph Lameter
  0 siblings, 1 reply; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-29 16:43 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm

On 28.04.2008 [14:29:02 -0700], Christoph Lameter wrote:
> On Mon, 28 Apr 2008, Nishanth Aravamudan wrote:
> 
> > More importantly, I think the fact that IA64 supports multiple hugepage
> > sizes is a reason *for* moving to sysfs for this information? However, I
> > think we may need to massage the IA64-specific bits of the kernel to
> > actually support multiple hugepage size pools being available at
> > run-time? That is, with the current kernel, we can only support one
> > hugepagesize at run-time, due to VHPT restrictions?
> 
> We'd love to have multiple huge page pools available but the current
> rigid region setup limits us to one size. Switching off the VHPT or
> doing some tricks with the tlb fault handler, or freeing up an unused
> region (region 0?) could get us there.

Ok, that was my impression. So on IA64, without further kernel
modifications, we will always only have one hugepage size visible in
/proc/meminfo and /sys/kernel/hugepages?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI]
  2008-04-29 16:43                                       ` Nishanth Aravamudan
@ 2008-04-29 17:01                                         ` Christoph Lameter
  0 siblings, 0 replies; 51+ messages in thread
From: Christoph Lameter @ 2008-04-29 17:01 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Greg KH, Nick Piggin, wli, agl, luick, Lee.Schermerhorn, linux-mm

On Tue, 29 Apr 2008, Nishanth Aravamudan wrote:

> Ok, that was my impression. So on IA64, without further kernel
> modifications, we will always only have one hugepage size visible in
> /proc/meminfo and /sys/kernel/hugepages?

I am not aware of any work in progress. So yes.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 2/5] hugetlb: numafy several functions
  2008-04-11 23:47 ` [RFC][PATCH 2/5] " Nishanth Aravamudan
  2008-04-11 23:47   ` [PATCH 3/5] hugetlb: interleave dequeueing of huge pages Nishanth Aravamudan
@ 2008-04-14 14:52   ` Adam Litke
  2008-04-14 21:10     ` Nishanth Aravamudan
  1 sibling, 1 reply; 51+ messages in thread
From: Adam Litke @ 2008-04-14 14:52 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: wli, clameter, luick, Lee.Schermerhorn, linux-mm, npiggin

On Fri, 2008-04-11 at 16:47 -0700, Nishanth Aravamudan wrote:
> +#define persistent_huge_pages_node(nid)	\
> +		(nr_huge_pages_node[nid] - surplus_huge_pages_node[nid])
> +static ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev,
> +					const char *buf, size_t count)
> +{
> +	int nid = dev->id;
> +	unsigned long target;
> +	unsigned long free_on_other_nodes;
> +	unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10);
> +
> +	/*
> +	 * Increase the pool size on the node
> +	 * First take pages out of surplus state.  Then make up the
> +	 * remaining difference by allocating fresh huge pages.
> +	 *
> +	 * We might race with alloc_buddy_huge_page() here and be unable
> +	 * to convert a surplus huge page to a normal huge page. That is
> +	 * not critical, though, it just means the overall size of the
> +	 * pool might be one hugepage larger than it needs to be, but
> +	 * within all the constraints specified by the sysctls.
> +	 */
> +	spin_lock(&hugetlb_lock);
> +	while (surplus_huge_pages_node[nid] &&
> +		nr_huge_pages_req > persistent_huge_pages_node(nid)) {
> +		if (!adjust_pool_surplus_node(-1, nid))
> +			break;
> +	}
> +
> +	while (nr_huge_pages_req > persistent_huge_pages_node(nid)) {
> +		struct page *ret;
> +		/*
> +		 * If this allocation races such that we no longer need the
> +		 * page, free_huge_page will handle it by freeing the page
> +		 * and reducing the surplus.
> +		 */
> +		spin_unlock(&hugetlb_lock);
> +		ret = alloc_fresh_huge_page_node(nid);
> +		spin_lock(&hugetlb_lock);
> +		if (!ret)
> +			goto out;
> +
> +	}
> +
> +	if (nr_huge_pages_req >= nr_huge_pages_node[nid])
> +		goto out;
> +
> +	/*
> +	 * Decrease the pool size
> +	 * First return free pages to the buddy allocator (being careful
> +	 * to keep enough around to satisfy reservations).  Then place
> +	 * pages into surplus state as needed so the pool will shrink
> +	 * to the desired size as pages become free.
> +	 *
> +	 * By placing pages into the surplus state independent of the
> +	 * overcommit value, we are allowing the surplus pool size to
> +	 * exceed overcommit. There are few sane options here. Since
> +	 * alloc_buddy_huge_page() is checking the global counter,
> +	 * though, we'll note that we're not allowed to exceed surplus
> +	 * and won't grow the pool anywhere else. Not until one of the
> +	 * sysctls are changed, or the surplus pages go out of use.
> +	 */
> +	free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid];
> +	if (free_on_other_nodes >= resv_huge_pages) {
> +		/* other nodes can satisfy reserve */
> +		target = nr_huge_pages_req;
> +	} else {
> +		/* this node needs some free to satisfy reserve */
> +		target = max((resv_huge_pages - free_on_other_nodes),
> +						nr_huge_pages_req);
> +	}
> +	try_to_free_low_node(nid, target);
> +	while (target < persistent_huge_pages_node(nid)) {
> +		struct page *page = dequeue_huge_page_node(NULL, nid);
> +		if (!page)
> +			break;
> +		update_and_free_page(nid, page);
> +	}
> +
> +	while (target < persistent_huge_pages_node(nid)) {
> +		if (!adjust_pool_surplus_node(1, nid))
> +			break;
> +	}
> +out:
> +	spin_unlock(&hugetlb_lock);
> +	return count;
> +}

Hmm, this function looks very familiar ;)  Is there any way we can
consolidate it with set_max_huge_pages()?  Perhaps the new node helpers
from the beginning of this series will help?

-- 
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [RFC][PATCH 2/5] hugetlb: numafy several functions
  2008-04-14 14:52   ` [RFC][PATCH 2/5] hugetlb: numafy several functions Adam Litke
@ 2008-04-14 21:10     ` Nishanth Aravamudan
  0 siblings, 0 replies; 51+ messages in thread
From: Nishanth Aravamudan @ 2008-04-14 21:10 UTC (permalink / raw)
  To: Adam Litke; +Cc: wli, clameter, luick, Lee.Schermerhorn, linux-mm, npiggin

On 14.04.2008 [09:52:50 -0500], Adam Litke wrote:
> 
> On Fri, 2008-04-11 at 16:47 -0700, Nishanth Aravamudan wrote:
> > +#define persistent_huge_pages_node(nid)	\
> > +		(nr_huge_pages_node[nid] - surplus_huge_pages_node[nid])
> > +static ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev,
> > +					const char *buf, size_t count)
> > +{
> > +	int nid = dev->id;
> > +	unsigned long target;
> > +	unsigned long free_on_other_nodes;
> > +	unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10);
> > +
> > +	/*
> > +	 * Increase the pool size on the node
> > +	 * First take pages out of surplus state.  Then make up the
> > +	 * remaining difference by allocating fresh huge pages.
> > +	 *
> > +	 * We might race with alloc_buddy_huge_page() here and be unable
> > +	 * to convert a surplus huge page to a normal huge page. That is
> > +	 * not critical, though, it just means the overall size of the
> > +	 * pool might be one hugepage larger than it needs to be, but
> > +	 * within all the constraints specified by the sysctls.
> > +	 */
> > +	spin_lock(&hugetlb_lock);
> > +	while (surplus_huge_pages_node[nid] &&
> > +		nr_huge_pages_req > persistent_huge_pages_node(nid)) {
> > +		if (!adjust_pool_surplus_node(-1, nid))
> > +			break;
> > +	}
> > +
> > +	while (nr_huge_pages_req > persistent_huge_pages_node(nid)) {
> > +		struct page *ret;
> > +		/*
> > +		 * If this allocation races such that we no longer need the
> > +		 * page, free_huge_page will handle it by freeing the page
> > +		 * and reducing the surplus.
> > +		 */
> > +		spin_unlock(&hugetlb_lock);
> > +		ret = alloc_fresh_huge_page_node(nid);
> > +		spin_lock(&hugetlb_lock);
> > +		if (!ret)
> > +			goto out;
> > +
> > +	}
> > +
> > +	if (nr_huge_pages_req >= nr_huge_pages_node[nid])
> > +		goto out;
> > +
> > +	/*
> > +	 * Decrease the pool size
> > +	 * First return free pages to the buddy allocator (being careful
> > +	 * to keep enough around to satisfy reservations).  Then place
> > +	 * pages into surplus state as needed so the pool will shrink
> > +	 * to the desired size as pages become free.
> > +	 *
> > +	 * By placing pages into the surplus state independent of the
> > +	 * overcommit value, we are allowing the surplus pool size to
> > +	 * exceed overcommit. There are few sane options here. Since
> > +	 * alloc_buddy_huge_page() is checking the global counter,
> > +	 * though, we'll note that we're not allowed to exceed surplus
> > +	 * and won't grow the pool anywhere else. Not until one of the
> > +	 * sysctls are changed, or the surplus pages go out of use.
> > +	 */
> > +	free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid];
> > +	if (free_on_other_nodes >= resv_huge_pages) {
> > +		/* other nodes can satisfy reserve */
> > +		target = nr_huge_pages_req;
> > +	} else {
> > +		/* this node needs some free to satisfy reserve */
> > +		target = max((resv_huge_pages - free_on_other_nodes),
> > +						nr_huge_pages_req);
> > +	}
> > +	try_to_free_low_node(nid, target);
> > +	while (target < persistent_huge_pages_node(nid)) {
> > +		struct page *page = dequeue_huge_page_node(NULL, nid);
> > +		if (!page)
> > +			break;
> > +		update_and_free_page(nid, page);
> > +	}
> > +
> > +	while (target < persistent_huge_pages_node(nid)) {
> > +		if (!adjust_pool_surplus_node(1, nid))
> > +			break;
> > +	}
> > +out:
> > +	spin_unlock(&hugetlb_lock);
> > +	return count;
> > +}
> 
> Hmm, this function looks very familiar ;)  Is there any way we can
> consolidate it with set_max_huge_pages()?  Perhaps the new node helpers
> from the beginning of this series will help?

A good idea. I think I was more worried about getting something wrong if
I did that in my first cut after the dynamic pool was merged and just
hadn't recombined. I will work on it for the next version, once we've
come up with a consensus on the interface's location.

Thanks for the review,
Nish

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2008-05-02 17:58 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-04-11 23:44 [PATCH 1/5] hugetlb: numafy several functions Nishanth Aravamudan
2008-04-11 23:47 ` [RFC][PATCH 2/5] " Nishanth Aravamudan
2008-04-11 23:47   ` [PATCH 3/5] hugetlb: interleave dequeueing of huge pages Nishanth Aravamudan
2008-04-11 23:49     ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Nishanth Aravamudan
2008-04-11 23:50       ` [RFC][PATCH 5/5] Documentation: update ABI and hugetlbpage.txt for per-node files Nishanth Aravamudan
2008-04-11 23:56       ` [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI Greg KH
2008-04-12  0:27         ` Nishanth Aravamudan
2008-04-12  9:41         ` Nick Piggin
2008-04-12 10:26           ` Christoph Lameter
2008-04-14 21:09             ` Nishanth Aravamudan
2008-04-13  3:41           ` Greg KH
2008-04-14 21:05             ` Nishanth Aravamudan
2008-04-17 23:16               ` Nishanth Aravamudan
2008-04-17 23:22                 ` Christoph Lameter
2008-04-17 23:36                   ` Nishanth Aravamudan
2008-04-17 23:39                     ` Christoph Lameter
2008-04-18  6:04                       ` Nishanth Aravamudan
2008-04-18 17:27                         ` Nishanth Aravamudan
2008-04-20  2:24                           ` Greg KH
2008-04-21 16:43                             ` Nishanth Aravamudan
2008-04-20  2:21                       ` Greg KH
2008-04-21  6:06                         ` Christoph Lameter
2008-04-21 16:41                           ` Nishanth Aravamudan
2008-04-22  5:14                   ` Nick Piggin
2008-04-22 16:56                     ` Nishanth Aravamudan
2008-04-23  1:03                       ` Nick Piggin
2008-04-23 18:32                         ` Nishanth Aravamudan
2008-04-23 19:07                           ` Adam Litke
2008-04-24  7:13                           ` Nick Piggin
2008-04-24 15:54                             ` Nishanth Aravamudan
2008-04-27  3:49                             ` [RFC][PATCH] hugetlb: add information and interface in sysfs [Was Re: [RFC][PATCH 4/5] Documentation: add node files to sysfs ABI] Nishanth Aravamudan
2008-04-27  5:10                               ` Greg KH
2008-04-28 17:22                                 ` Nishanth Aravamudan
2008-04-28 17:29                                   ` Greg KH
2008-04-29 17:11                                     ` Nishanth Aravamudan
2008-04-29 17:22                                       ` Greg KH
2008-04-29 18:14                                         ` Nishanth Aravamudan
2008-04-29 18:26                                           ` Greg KH
2008-04-29 23:48                                             ` Nishanth Aravamudan
2008-05-01  3:07                                               ` Greg KH
2008-05-01 18:25                                                 ` Nishanth Aravamudan
2008-04-30 19:19                                             ` Nishanth Aravamudan
2008-05-01  3:08                                               ` Greg KH
2008-05-02 17:58                                                 ` Nishanth Aravamudan
2008-04-28 20:31                                 ` Christoph Lameter
2008-04-28 20:52                                   ` Nishanth Aravamudan
2008-04-28 21:29                                     ` Christoph Lameter
2008-04-29 16:43                                       ` Nishanth Aravamudan
2008-04-29 17:01                                         ` Christoph Lameter
2008-04-14 14:52   ` [RFC][PATCH 2/5] hugetlb: numafy several functions Adam Litke
2008-04-14 21:10     ` Nishanth Aravamudan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).