From: Matthew Dobson <colpatch@us.ibm.com>
To: linux-kernel@vger.kernel.org,
Linus Torvalds <torvalds@transmeta.com>,
Michael Hohnbaum <hohnbaum@us.ibm.com>,
Martin Bligh <mjbligh@us.ibm.com>,
Andrew Morton <akpm@zip.com.au>,
lse-tech <lse-tech@lists.sourceforge.net>,
Andrea Arcangeli <andrea@suse.de>
Subject: [patch] Memory Binding API v0.2
Date: Thu, 25 Jul 2002 15:40:34 -0700 [thread overview]
Message-ID: <3D407E62.8080707@us.ibm.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 538 bytes --]
Here is the latest version of the Mem Binding API. It's a follow-up to the
patch posted a week or so ago. It incorporates some changes, and should be a
bit more efficient, readable, and functional. Bigger, better, faster, eh? It
needs to be patched on top of the Simple binding API that I posted a minute ago..
This API as well as the Simple Binding were both originally part of a NUMA
Binding API discussed at length on LSE-Tech, and met with approval there. I
hope that these can be included in the mainline.
Cheers!
-Matt
[-- Attachment #2: mem_api-v0.2-2.5.28.patch --]
[-- Type: text/plain, Size: 11165 bytes --]
diff -Nur linux-2.5.27-vanilla/arch/i386/config.in linux-2.5.27-api/arch/i386/config.in
--- linux-2.5.27-vanilla/arch/i386/config.in Wed Jul 24 17:33:41 2002
+++ linux-2.5.27-api/arch/i386/config.in Wed Jul 24 17:38:34 2002
@@ -168,6 +168,8 @@
bool 'Multiquad NUMA system' CONFIG_X86_NUMAQ
if [ "$CONFIG_X86_NUMAQ" = y ]; then
define_bool CONFIG_MULTIQUAD y
+ bool 'Memory Binding API Support' CONFIG_MEMBIND
+ bool 'NUMA Memory Allocation Support' CONFIG_NUMA
fi
fi
diff -Nur linux-2.5.27-vanilla/arch/i386/kernel/entry.S linux-2.5.27-api/arch/i386/kernel/entry.S
--- linux-2.5.27-vanilla/arch/i386/kernel/entry.S Wed Jul 24 17:33:41 2002
+++ linux-2.5.27-api/arch/i386/kernel/entry.S Wed Jul 24 17:38:34 2002
@@ -754,6 +754,8 @@
.long sys_sched_setaffinity
.long sys_sched_getaffinity
.long sys_check_topology
+ .long sys_mem_setbinding
+ .long sys_mem_getbinding
.rept NR_syscalls-(.-sys_call_table)/4
.long sys_ni_syscall
diff -Nur linux-2.5.27-vanilla/include/asm-i386/unistd.h linux-2.5.27-api/include/asm-i386/unistd.h
--- linux-2.5.27-vanilla/include/asm-i386/unistd.h Wed Jul 24 17:33:41 2002
+++ linux-2.5.27-api/include/asm-i386/unistd.h Wed Jul 24 17:38:34 2002
@@ -248,6 +248,8 @@
#define __NR_sched_setaffinity 241
#define __NR_sched_getaffinity 242
#define __NR_check_topology 243
+#define __NR_mem_setbinding 244
+#define __NR_mem_getbinding 245
/* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
diff -Nur linux-2.5.27-vanilla/include/linux/init_task.h linux-2.5.27-api/include/linux/init_task.h
--- linux-2.5.27-vanilla/include/linux/init_task.h Sat Jul 20 12:11:07 2002
+++ linux-2.5.27-api/include/linux/init_task.h Wed Jul 24 17:38:34 2002
@@ -59,6 +59,11 @@
children: LIST_HEAD_INIT(tsk.children), \
sibling: LIST_HEAD_INIT(tsk.sibling), \
thread_group: LIST_HEAD_INIT(tsk.thread_group), \
+ memblk_binding: { \
+ bitmask: MEMBLK_NO_BINDING, \
+ behavior: MPOL_STRICT, \
+ lock: SPIN_LOCK_UNLOCKED \
+ }, \
wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
real_timer: { \
function: it_real_fn \
diff -Nur linux-2.5.27-vanilla/include/linux/membind.h linux-2.5.27-api/include/linux/membind.h
--- linux-2.5.27-vanilla/include/linux/membind.h Wed Dec 31 16:00:00 1969
+++ linux-2.5.27-api/include/linux/membind.h Wed Jul 24 17:38:34 2002
@@ -0,0 +1,51 @@
+/*
+ * linux/include/linux/membind.h
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#ifndef _LINUX_MEMBIND_H
+#define _LINUX_MEMBIND_H
+
+#include <linux/types.h>
+
+#define MEMBLK_NO_BINDING (~0UL)
+
+typedef struct memblk_list {
+ unsigned long bitmask;
+ int behavior;
+ spinlock_t lock;
+} memblk_list_t;
+
+
+#define is_valid_memblk_behavior(x) (1) /* for now */
+#define is_memblk_subset(x, y) (!(~(x) & (y))) /* test whether x is a subset of y */
+
+#define MPOL_STRICT 0 /* Memory MUST be allocated according to binding */
+#define MPOL_LOOSE 1 /* Memory will be allocated according to binding, but
+ can fall back to other memory blocks if necessary. */
+#define MPOL_FIRST 2 /* UNUSED FOR NOW */
+#define MPOL_STRIPE 4 /* UNUSED FOR NOW */
+#define MPOL_RR 8 /* UNUSED FOR NOW */
+
+#endif /* _LINUX_MEMBIND_H */
diff -Nur linux-2.5.27-vanilla/include/linux/mmzone.h linux-2.5.27-api/include/linux/mmzone.h
--- linux-2.5.27-vanilla/include/linux/mmzone.h Wed Jul 24 17:33:41 2002
+++ linux-2.5.27-api/include/linux/mmzone.h Wed Jul 24 17:38:34 2002
@@ -138,6 +138,7 @@
unsigned long node_start_mapnr;
unsigned long node_size;
int node_id;
+ int memblk_id; /* A unique ID for each memory block */
struct pglist_data *node_next;
} pg_data_t;
diff -Nur linux-2.5.27-vanilla/include/linux/sched.h linux-2.5.27-api/include/linux/sched.h
--- linux-2.5.27-vanilla/include/linux/sched.h Sat Jul 20 12:11:07 2002
+++ linux-2.5.27-api/include/linux/sched.h Wed Jul 24 17:38:34 2002
@@ -27,6 +27,7 @@
#include <linux/securebits.h>
#include <linux/fs_struct.h>
#include <linux/compiler.h>
+#include <linux/membind.h>
struct exec_domain;
@@ -302,6 +303,9 @@
struct task_struct *pidhash_next;
struct task_struct **pidhash_pprev;
+ /* additional Memory Binding stuff */
+ memblk_list_t memblk_binding;
+
wait_queue_head_t wait_chldexit; /* for wait4() */
struct completion *vfork_done; /* for vfork() */
diff -Nur linux-2.5.27-vanilla/kernel/sys.c linux-2.5.27-api/kernel/sys.c
--- linux-2.5.27-vanilla/kernel/sys.c Wed Jul 24 17:33:41 2002
+++ linux-2.5.27-api/kernel/sys.c Wed Jul 24 17:38:34 2002
@@ -1263,6 +1263,57 @@
return (long)ret;
}
+/*
+ * sys_mem_setbinding(): Sets up a new MemBlk Binding
+ */
+asmlinkage long sys_mem_setbinding(unsigned long memblks, int behavior)
+{
+ long ret;
+ unsigned long flags;
+ struct task_struct *curr = current;
+
+ ret = -ENODEV;
+ /* Make sure that at least one of the memblks in the new binding set is online. */
+ if (!(memblks & memblk_online_map))
+ goto out;
+
+ ret = -EINVAL;
+ /* Test to make sure the behavior argument is valid. */
+ if (!is_valid_memblk_behavior(behavior))
+ goto out;
+
+ ret = -EPERM;
+ spin_lock_irqsave(&curr->memblk_binding.lock, flags);
+ /* If the new binding expands upon the old binding, the caller
+ must have CAP_SYS_NICE. */
+ if (is_memblk_subset(memblks, curr->memblk_binding.bitmask) ||
+ capable(CAP_SYS_NICE)){
+ curr->memblk_binding.bitmask = memblks;
+ curr->memblk_binding.behavior = behavior;
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&curr->memblk_binding.lock, flags);
+
+ out:
+ return ret;
+}
+
+/*
+ * sys_mem_getbinding(): Returns the current MemBlk Binding
+ */
+asmlinkage long sys_mem_getbinding(void)
+{
+ long flags;
+ unsigned long memblk_binding;
+ struct task_struct *curr = current;
+
+ spin_lock_irqsave(&curr->memblk_binding.lock, flags);
+ memblk_binding = curr->memblk_binding.bitmask;
+ spin_unlock_irqrestore(&curr->memblk_binding.lock, flags);
+
+ return memblk_binding;
+}
+
asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
diff -Nur linux-2.5.27-vanilla/mm/numa.c linux-2.5.27-api/mm/numa.c
--- linux-2.5.27-vanilla/mm/numa.c Sat Jul 20 12:11:12 2002
+++ linux-2.5.27-api/mm/numa.c Wed Jul 24 17:38:34 2002
@@ -8,6 +8,7 @@
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/spinlock.h>
+#include <linux/membind.h>
int numnodes = 1; /* Initialized for UMA platforms */
@@ -27,6 +28,9 @@
{
free_area_init_core(0, &contig_page_data, &mem_map, zones_size,
zone_start_paddr, zholes_size, pmap);
+ contig_page_data.node_id = 0;
+ contig_page_data.memblk_id = 0;
+ memblk_online_map = 1UL;
}
#endif /* !CONFIG_DISCONTIGMEM */
@@ -71,6 +75,11 @@
free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_paddr,
zholes_size, pmap);
pgdat->node_id = nid;
+ pgdat->memblk_id = num_online_memblks();
+ if (test_and_set_bit(num_online_memblks() + 1, &memblk_online_map)){
+ printk("memblk alread counted?!?!\n");
+ BUG();
+ }
/*
* Get space for the valid bitmap.
@@ -88,6 +97,8 @@
return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK));
}
+#ifdef CONFIG_NUMA
+
/*
* This can be refined. Currently, tries to do round robin, instead
* should do concentratic circle search, starting from current node.
@@ -96,23 +107,67 @@
{
struct page *ret = 0;
pg_data_t *start, *temp;
-#ifndef CONFIG_NUMA
+ int search_twice = 0;
+ unsigned long memblk_mask;
+ struct task_struct *curr = current;
unsigned long flags;
- static pg_data_t *next = 0;
-#endif
if (order >= MAX_ORDER)
return NULL;
-#ifdef CONFIG_NUMA
+
+ spin_lock_irqsave(&curr->memblk_binding.lock, flags);
+ memblk_mask = curr->memblk_binding.bitmask;
+ /* if it is a loose binding, remember to search other memblks */
+ if ((curr->memblk_binding.behavior == MPOL_LOOSE) &&
+ (curr->memblk_binding.bitmask != MEMBLK_NO_BINDING))
+ search_twice = 1;
+ spin_unlock_irqrestore(&curr->memblk_binding.lock, flags);
+
+search_through_memblks:
temp = NODE_DATA(numa_node_id());
-#else
- spin_lock_irqsave(&node_lock, flags);
+ start = temp;
+ while (temp) {
+ if (memblk_mask & (1 << temp->memblk_id))
+ if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
+ return(ret);
+ temp = temp->node_next;
+ }
+ temp = pgdat_list;
+ while (temp != start) {
+ if (!(memblk_mask & (1 << temp->memblk_id)))
+ if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
+ return(ret);
+ temp = temp->node_next;
+ }
+
+ if (search_twice) {
+ /*
+ * If we failed to find a "preferred" memblk, try again
+ * looking for anything we haven't checked yet.
+ */
+ search_twice = 0; /* no infinite loops, please */
+ memblk_mask = ~memblk_mask;
+ goto search_through_memblks;
+ }
+ return(0);
+}
+
+#else /* !CONFIG_NUMA */
+
+struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+ struct page *ret = 0;
+ pg_data_t *start, *temp;
+ static pg_data_t *next = 0;
+ unsigned long flags;
+
+ if (order >= MAX_ORDER)
+ return NULL;
+
if (!next) next = pgdat_list;
- temp = next;
+ temp = start = next;
next = next->node_next;
- spin_unlock_irqrestore(&node_lock, flags);
-#endif
- start = temp;
+
while (temp) {
if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
return(ret);
@@ -127,4 +182,6 @@
return(0);
}
+#endif /* CONFIG_NUMA */
+
#endif /* CONFIG_DISCONTIGMEM */
diff -Nur linux-2.5.27-vanilla/mm/page_alloc.c linux-2.5.27-api/mm/page_alloc.c
--- linux-2.5.27-vanilla/mm/page_alloc.c Sat Jul 20 12:11:07 2002
+++ linux-2.5.27-api/mm/page_alloc.c Wed Jul 24 17:38:34 2002
@@ -42,6 +42,8 @@
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+extern unsigned long memblk_online_map;
+
/*
* Temporary debugging check for pages not lying within a given zone.
*/
@@ -927,6 +929,9 @@
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
+ contig_page_data.node_id = 0;
+ contig_page_data.memblk_id = 0;
+ memblk_online_map = 1UL;
}
static int __init setup_mem_frac(char *str)
next reply other threads:[~2002-07-25 22:40 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2002-07-25 22:40 Matthew Dobson [this message]
2002-07-29 10:37 ` [Lse-tech] [patch] Memory Binding API v0.2 Erich Focht
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=3D407E62.8080707@us.ibm.com \
--to=colpatch@us.ibm.com \
--cc=akpm@zip.com.au \
--cc=andrea@suse.de \
--cc=hohnbaum@us.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=lse-tech@lists.sourceforge.net \
--cc=mjbligh@us.ibm.com \
--cc=torvalds@transmeta.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.