public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Matthew Dobson <colpatch@us.ibm.com>
To: Andrew Morton <akpm@digeo.com>
Cc: linux-kernel@vger.kernel.org, mbligh@aracnet.com,
	lse-tech@lists.sourceforge.net
Subject: Re: [patch][rfc] Memory Binding (1/1)
Date: Tue, 01 Apr 2003 18:42:00 -0800	[thread overview]
Message-ID: <3E8A4DF8.8010504@us.ibm.com> (raw)
In-Reply-To: 20030401153945.17d26219.akpm@digeo.com

[-- Attachment #1: Type: text/plain, Size: 1432 bytes --]

Andrew Morton wrote:
> Matthew Dobson <colpatch@us.ibm.com> wrote:
> 
>>Okee dokee...  Here's the real core of the patch.
> 
> 
> Looks saneish to me.  I'd like to see thorough benchmark results when it is
> complete.  And it would be nice to make address_space.binding go away if
> !CONFIG_NUMA.
Yeah... I figured the one pointer wouldn't be a big deal, and it would 
save a few more ifdefs, but I guess consistency is better.  If I ifdef 
most of it, may as well ifdef it all!

> The explicit knowledge of ZONE_DMA/ZONE_NORMAL/ZONE_HIGHMEM in get_zonetype()
> should not be necessary - you don't want it to explode if ZONE_DMA32 is
> added.  It should be indexing into node_zonelists in some manner.
> 
> Will this code work if all memory is in ZONE_DMA, as some architectures do?

Well, what I do for now is use the gfp_mask in the address_space that is 
already there for the shm segments.  This means I have to look at that 
to decide if they want DMA, NORMAL, or HIGHMEM.  So it already would 
need to be changed if we added a new zone.  If the thing we're creating 
a binding for only can use DMA, for example, I need to make sure I use 
only the DMA zones.  I can't easily see a way to make sure I add only 
the appropriate zones to the zonelist, and also don't look at the 
gfp_flag to determine the zones they want.  I'll ponder it more deeply 
tonight, though.

New patch, sans binding pointer attatched.

Cheers!

-Matt

[-- Attachment #2: 01-membind.patch --]
[-- Type: text/plain, Size: 12520 bytes --]

diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/arch/i386/kernel/entry.S linux-2.5.66-membind/arch/i386/kernel/entry.S
--- linux-2.5.66-pre_membind/arch/i386/kernel/entry.S	Mon Mar 24 14:00:11 2003
+++ linux-2.5.66-membind/arch/i386/kernel/entry.S	Mon Mar 31 17:45:20 2003
@@ -852,6 +852,7 @@
  	.long sys_clock_gettime		/* 265 */
  	.long sys_clock_getres
  	.long sys_clock_nanosleep
+ 	.long sys_membind
  
  
 nr_syscalls=(.-sys_call_table)/4
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/fs/inode.c linux-2.5.66-membind/fs/inode.c
--- linux-2.5.66-pre_membind/fs/inode.c	Mon Mar 24 14:01:48 2003
+++ linux-2.5.66-membind/fs/inode.c	Mon Mar 31 17:45:20 2003
@@ -141,6 +141,7 @@
 		mapping->a_ops = &empty_aops;
  		mapping->host = inode;
 		mapping->gfp_mask = GFP_HIGHUSER;
+		mapping->binding = NULL;
 		mapping->dirtied_when = 0;
 		mapping->assoc_mapping = NULL;
 		mapping->backing_dev_info = &default_backing_dev_info;
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/asm-i386/unistd.h linux-2.5.66-membind/include/asm-i386/unistd.h
--- linux-2.5.66-pre_membind/include/asm-i386/unistd.h	Mon Mar 24 14:00:54 2003
+++ linux-2.5.66-membind/include/asm-i386/unistd.h	Mon Mar 31 17:45:20 2003
@@ -273,8 +273,9 @@
 #define __NR_clock_gettime	(__NR_timer_create+6)
 #define __NR_clock_getres	(__NR_timer_create+7)
 #define __NR_clock_nanosleep	(__NR_timer_create+8)
+#define __NR_membind		268
 
-#define NR_syscalls 268
+#define NR_syscalls 269
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
 
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/linux/binding.h linux-2.5.66-membind/include/linux/binding.h
--- linux-2.5.66-pre_membind/include/linux/binding.h	Wed Dec 31 16:00:00 1969
+++ linux-2.5.66-membind/include/linux/binding.h	Tue Apr  1 17:14:16 2003
@@ -0,0 +1,40 @@
+/*
+ * include/linux/binding.h
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2003, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#ifndef _LINUX_BINDING_H
+#define _LINUX_BINDING_H
+
+#ifdef CONFIG_NUMA
+
+#include <linux/mmzone.h>
+
+/* Structure to keep track of shared memory segment bindings */
+struct binding {
+	struct zonelist	zonelist;
+};
+
+#endif /* CONFIG_NUMA */
+#endif /* _LINUX_BINDING_H */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/linux/fs.h linux-2.5.66-membind/include/linux/fs.h
--- linux-2.5.66-pre_membind/include/linux/fs.h	Mon Mar 24 14:00:10 2003
+++ linux-2.5.66-membind/include/linux/fs.h	Tue Apr  1 17:14:37 2003
@@ -19,6 +19,7 @@
 #include <linux/cache.h>
 #include <linux/radix-tree.h>
 #include <linux/kobject.h>
+#include <linux/binding.h>
 #include <asm/atomic.h>
 
 struct iovec;
@@ -329,6 +330,9 @@
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+#ifdef CONFIG_NUMA
+	struct binding		*binding;	/* for memory bindings */
+#endif
 };
 
 struct block_device {
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/linux/pagemap.h linux-2.5.66-membind/include/linux/pagemap.h
--- linux-2.5.66-pre_membind/include/linux/pagemap.h	Mon Mar 24 13:59:54 2003
+++ linux-2.5.66-membind/include/linux/pagemap.h	Tue Apr  1 17:13:53 2003
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/list.h>
 #include <linux/highmem.h>
+#include <linux/binding.h>
 #include <asm/uaccess.h>
 
 /*
@@ -27,6 +28,8 @@
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+#ifndef CONFIG_NUMA
+
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
 	return alloc_pages(x->gfp_mask, 0);
@@ -37,6 +40,29 @@
 	return alloc_pages(x->gfp_mask|__GFP_COLD, 0);
 }
 
+#else /* CONFIG_NUMA */
+
+static inline struct page *__page_cache_alloc(struct address_space *x, int cold)
+{
+	int gfp_mask;
+	struct zonelist *zonelist;
+
+	gfp_mask = x->gfp_mask;
+	if (cold)
+		gfp_mask |= __GFP_COLD;
+	if (!x->binding)
+		zonelist = get_zonelist(gfp_mask);
+	else
+		zonelist = &x->binding->zonelist;
+
+	return __alloc_pages(gfp_mask, 0, zonelist);
+}
+
+#define page_cache_alloc(x)	 __page_cache_alloc((x), 0)
+#define page_cache_alloc_cold(x) __page_cache_alloc((x), 1)
+
+#endif /* !CONFIG_NUMA */
+
 typedef int filler_t(void *, struct page *);
 
 extern struct page * find_get_page(struct address_space *mapping,
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/mm/Makefile linux-2.5.66-membind/mm/Makefile
--- linux-2.5.66-pre_membind/mm/Makefile	Mon Mar 24 14:00:51 2003
+++ linux-2.5.66-membind/mm/Makefile	Mon Mar 31 17:45:20 2003
@@ -7,7 +7,7 @@
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   shmem.o vmalloc.o
 
-obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y			:= binding.o bootmem.o fadvise.o filemap.o mempool.o oom_kill.o \
 			   page_alloc.o page-writeback.o pdflush.o readahead.o \
 			   slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
 
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/mm/binding.c linux-2.5.66-membind/mm/binding.c
--- linux-2.5.66-pre_membind/mm/binding.c	Wed Dec 31 16:00:00 1969
+++ linux-2.5.66-membind/mm/binding.c	Tue Apr  1 18:32:45 2003
@@ -0,0 +1,199 @@
+/*
+ * mm/binding.c
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2003, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#include <linux/errno.h>
+#include <linux/mm.h>
+
+#ifndef CONFIG_NUMA
+
+asmlinkage unsigned long sys_membind(unsigned long start, unsigned long len, 
+		unsigned long *mask_ptr, unsigned int mask_len, unsigned long policy)
+{
+	return -ENOSYS;
+}
+
+#else /* CONFIG_NUMA */
+
+#include <linux/binding.h>
+#include <asm/string.h>
+#include <asm/topology.h>
+#include <asm/uaccess.h>
+
+/* Translate a cpumask to a nodemask */
+static inline void cpumask_to_nodemask(DECLARE_BITMAP(cpumask, NR_CPUS), 
+		DECLARE_BITMAP(nodemask, MAX_NUMNODES))
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+		if (test_bit(i, (cpumask)))
+			set_bit(cpu_to_node(i), (nodemask));
+}
+
+/*
+ * Takes a BITMAP of nodes as an argument, and ensures that at least one of 
+ * the nodes in the bitmap are actually online.
+ * Returns 0 if at least one specified node is online, -EINVAL otherwise.
+ */
+static inline int check_binding_nodemask(DECLARE_BITMAP(nodemask, MAX_NUMNODES))
+{
+	int i;
+
+	/* Make sure at least one specified node is online */
+	for (i = 0; i < MAX_NUMNODES; i++)
+		if (test_bit(i, nodemask) && node_online(i))
+			return 0;
+	return -EINVAL;
+}
+
+/*
+ * Adds the zones belonging to @pgdat to @zonelist.  Returns the next 
+ * index in @zonelist.
+ */
+static inline int add_zones(pg_data_t *pgdat, struct zonelist *zonelist, 
+			int zone_num, int zone_type)
+{
+	switch (zone_type) {
+		struct zone *zone;
+	default:
+		BUG();
+	case ZONE_HIGHMEM:
+		zone = pgdat->node_zones + ZONE_HIGHMEM;
+		if (zone->present_pages)
+			zonelist->zones[zone_num++] = zone;
+	case ZONE_NORMAL:
+		zone = pgdat->node_zones + ZONE_NORMAL;
+		if (zone->present_pages)
+			zonelist->zones[zone_num++] = zone;
+	case ZONE_DMA:
+		zone = pgdat->node_zones + ZONE_DMA;
+		if (zone->present_pages)
+			zonelist->zones[zone_num++] = zone;
+	}
+	return zone_num;
+}
+
+/* Determine the appropriate ZONE_* flag based on the given GFP_* flags */
+static inline int get_zonetype(int gfp_flag)
+{
+	int zone_type;
+
+	gfp_flag &= GFP_ZONEMASK;
+	if (gfp_flag & __GFP_HIGHMEM)
+		zone_type = ZONE_HIGHMEM;
+	else if (gfp_flag & __GFP_DMA)
+		zone_type = ZONE_DMA;
+	else
+		zone_type = ZONE_NORMAL;
+
+	return zone_type;
+}
+
+/* Top-level function for allocating a binding for a region of memory */
+static struct binding *alloc_binding(DECLARE_BITMAP(nodemask, MAX_NUMNODES), 
+		int gfp_flag, unsigned long policy)
+{
+	struct binding *binding;
+	int node, zone_num, zone_type;
+
+	if (check_binding_nodemask(nodemask))
+		return NULL;
+
+	binding = (struct binding *)kmalloc(sizeof(struct binding), GFP_KERNEL);
+	if (!binding)
+		return NULL;
+	memset(binding, 0, sizeof(struct binding));
+
+	/* Build binding zonelist */
+	zone_type = get_zonetype(gfp_flag);
+	zone_num = 0;
+	for (node = 0; node < MAX_NUMNODES; node++) {
+		if (test_bit(node, nodemask) && node_online(node))
+			zone_num = add_zones(NODE_DATA(node), &binding->zonelist, zone_num, zone_type);
+	}
+	binding->zonelist.zones[zone_num] = NULL;
+
+	if (!zone_num) {
+		/* No zones were added to the zonelist.  Let the caller know. */
+		kfree(binding);
+		binding = NULL;
+	}
+	return binding;
+} 
+
+
+/*
+ * membind -  Bind a range of a process' VM space to a set of memory blocks according to
+ *            a predefined policy.
+ * @start:    beginning address of memory region to bind
+ * @len:      length of memory region to bind
+ * @mask_ptr: pointer to bitmask of cpus
+ * @mask_len: length of the bitmask
+ * @policy:   flag specifying the policy to use for the segment
+ */
+asmlinkage unsigned long sys_membind(unsigned long start, unsigned long len, 
+		unsigned long *mask_ptr, unsigned int mask_len, unsigned long policy)
+{
+	DECLARE_BITMAP(cpu_mask, NR_CPUS);
+	DECLARE_BITMAP(node_mask, MAX_NUMNODES);
+	struct vm_area_struct *vma = NULL;
+	struct address_space *mapping;
+	int error = 0;
+
+	/* Deal with getting cpu_mask from userspace & translating to node_mask */
+	if (mask_len > NR_CPUS) {
+		error = -EINVAL;
+		goto out;
+	}
+	CLEAR_BITMAP(cpu_mask, NR_CPUS);
+	CLEAR_BITMAP(node_mask, MAX_NUMNODES);
+	if (copy_from_user(cpu_mask, mask_ptr, (mask_len+7)/8)) {
+		error = -EFAULT;
+		goto out;
+	}
+	cpumask_to_nodemask(cpu_mask, node_mask);
+
+	vma = find_vma(current->mm, start);
+	if (!(vma && vma->vm_file && vma->vm_ops && 
+		vma->vm_ops->nopage == shmem_nopage)) {
+		/* This isn't a shm segment.  For now, we bail. */
+		printk("%s: Can only bind shm(em) segments for now!\n", __FUNCTION__);
+		error = -EINVAL;
+		goto out;
+	}
+
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+	mapping->binding = alloc_binding(node_mask, mapping->gfp_mask, policy);
+	if (!mapping->binding) {
+		printk("%s: Error while building memory binding!\n", __FUNCTION__);
+		error = -EFAULT;
+	}
+
+out:
+	return error;
+}
+
+#endif /* !CONFIG_NUMA */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/mm/swap_state.c linux-2.5.66-membind/mm/swap_state.c
--- linux-2.5.66-pre_membind/mm/swap_state.c	Mon Mar 24 14:00:21 2003
+++ linux-2.5.66-membind/mm/swap_state.c	Tue Apr  1 17:12:00 2003
@@ -47,6 +47,9 @@
 	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
+#ifdef CONFIG_NUMA
+	.binding	= NULL,
+#endif
 };
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)

  reply	other threads:[~2003-04-02  2:40 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-04-01 22:31 [patch][rfc] Memory Binding (0/1) Matthew Dobson
2003-04-01 22:39 ` [patch][rfc] Memory Binding (1/1) Matthew Dobson
2003-04-01 23:39   ` Andrew Morton
2003-04-02  2:42     ` Matthew Dobson [this message]
2003-04-02 20:43       ` Bryan Rittmeyer
2003-04-02  7:26   ` Christoph Hellwig
2003-04-02  8:17   ` [Lse-tech] " Andi Kleen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3E8A4DF8.8010504@us.ibm.com \
    --to=colpatch@us.ibm.com \
    --cc=akpm@digeo.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lse-tech@lists.sourceforge.net \
    --cc=mbligh@aracnet.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox