public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Matthew Dobson <colpatch@us.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: "Martin J. Bligh" <mbligh@aracnet.com>,
	Andrew Morton <akpm@digeo.com>,
	Christoph Hellwig <hch@infradead.org>,
	Paolo Zeppegno <zeppegno.paolo@seat.it>, Andi Kleen <ak@muc.de>,
	lse-tech <lse-tech@lists.sourceforge.net>
Subject: [rfc][patch] Memory Binding Take 2 (1/1)
Date: Wed, 02 Apr 2003 21:56:49 -0800	[thread overview]
Message-ID: <3E8BCD21.2050307@us.ibm.com> (raw)
In-Reply-To: 3E8BCB96.6090908@us.ibm.com

[-- Attachment #1: Type: text/plain, Size: 865 bytes --]

Now for the good stuff! ;)

This one has had more changes...  I've changed the syscall from 
sys_membind to sys_mbind.  I liked Paolo's suggestion of aligning the 
naming.  I've fixed up the way the bitmaps are passed.  I pulled out all 
the ZONE_* code, and now just have it use all the zones on the node.  I 
made sure that the binding pointers are not compiled in for non-NUMA 
kernels.  All that is added for non-NUMA kernels is the cond_syscall and 
a small change in the page_cache_alloc callpath.  Now page_cache_alloc 
calls __page_cache_alloc, which is just the old page_cache_alloc for 
non-NUMA.  For NUMA, it's obviously a different function.  I also 
cleaned up the bitmask size issue, by just making sure userspace doesn't 
pass in a bitmask that's way too large.

I guess that's it...  As always, I'm looking forward to any comments!

Cheers!

-Matt

[-- Attachment #2: 01-membind.patch --]
[-- Type: text/plain, Size: 11322 bytes --]

diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/arch/i386/kernel/entry.S linux-2.5.66-membind/arch/i386/kernel/entry.S
--- linux-2.5.66-pre_membind/arch/i386/kernel/entry.S	Mon Mar 24 14:00:11 2003
+++ linux-2.5.66-membind/arch/i386/kernel/entry.S	Wed Apr  2 10:46:20 2003
@@ -807,7 +807,7 @@
 	.long sys_getdents64	/* 220 */
 	.long sys_fcntl64
 	.long sys_ni_syscall	/* reserved for TUX */
-	.long sys_ni_syscall
+ 	.long sys_mbind
 	.long sys_gettid
 	.long sys_readahead	/* 225 */
 	.long sys_setxattr
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/fs/inode.c linux-2.5.66-membind/fs/inode.c
--- linux-2.5.66-pre_membind/fs/inode.c	Mon Mar 24 14:01:48 2003
+++ linux-2.5.66-membind/fs/inode.c	Wed Apr  2 10:49:36 2003
@@ -144,6 +144,9 @@
 		mapping->dirtied_when = 0;
 		mapping->assoc_mapping = NULL;
 		mapping->backing_dev_info = &default_backing_dev_info;
+#ifdef CONFIG_NUMA
+		mapping->binding = NULL;
+#endif
 		if (sb->s_bdev)
 			mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 		memset(&inode->u, 0, sizeof(inode->u));
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/asm-i386/unistd.h linux-2.5.66-membind/include/asm-i386/unistd.h
--- linux-2.5.66-pre_membind/include/asm-i386/unistd.h	Mon Mar 24 14:00:54 2003
+++ linux-2.5.66-membind/include/asm-i386/unistd.h	Wed Apr  2 10:52:18 2003
@@ -228,7 +228,7 @@
 #define __NR_madvise1		219	/* delete when C lib stub is removed */
 #define __NR_getdents64		220
 #define __NR_fcntl64		221
-/* 223 is unused */
+#define __NR_mbind		223
 #define __NR_gettid		224
 #define __NR_readahead		225
 #define __NR_setxattr		226
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/linux/fs.h linux-2.5.66-membind/include/linux/fs.h
--- linux-2.5.66-pre_membind/include/linux/fs.h	Mon Mar 24 14:00:10 2003
+++ linux-2.5.66-membind/include/linux/fs.h	Wed Apr  2 10:54:17 2003
@@ -19,6 +19,7 @@
 #include <linux/cache.h>
 #include <linux/radix-tree.h>
 #include <linux/kobject.h>
+#include <linux/mbind.h>
 #include <asm/atomic.h>
 
 struct iovec;
@@ -329,6 +330,9 @@
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+#ifdef CONFIG_NUMA
+	struct binding		*binding;	/* for memory bindings */
+#endif
 };
 
 struct block_device {
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/linux/mbind.h linux-2.5.66-membind/include/linux/mbind.h
--- linux-2.5.66-pre_membind/include/linux/mbind.h	Wed Dec 31 16:00:00 1969
+++ linux-2.5.66-membind/include/linux/mbind.h	Wed Apr  2 18:52:41 2003
@@ -0,0 +1,40 @@
+/*
+ * include/linux/mbind.h
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2003, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#ifndef _LINUX_MBIND_H
+#define _LINUX_MBIND_H
+
+#ifdef CONFIG_NUMA
+
+#include <linux/mmzone.h>
+
+/* A memory binding: the zonelist used to allocate pages for a bound mapping */
+struct binding {
+	struct zonelist	zonelist;	/* zones to allocate from, in order of preference (TODO confirm order semantics) */
+};
+
+#endif /* CONFIG_NUMA */
+#endif /* _LINUX_MBIND_H */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/linux/pagemap.h linux-2.5.66-membind/include/linux/pagemap.h
--- linux-2.5.66-pre_membind/include/linux/pagemap.h	Mon Mar 24 13:59:54 2003
+++ linux-2.5.66-membind/include/linux/pagemap.h	Wed Apr  2 19:49:42 2003
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/list.h>
 #include <linux/highmem.h>
+#include <linux/mbind.h>
 #include <asm/uaccess.h>
 
 /*
@@ -27,14 +28,37 @@
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+#ifndef CONFIG_NUMA
+/* Without NUMA there is no binding to consult: @x is unused, plain alloc_pages(). */
+static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask)
+{
+	return alloc_pages(gfp_mask, 0);
+}
+
+#else /* CONFIG_NUMA */
+/* With NUMA, allocate from the binding's zonelist when mapping @x has a binding. */
+static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask)
+{
+	struct zonelist *zonelist;
+
+	if (!x->binding)
+		zonelist = get_zonelist(gfp_mask);	/* unbound: zonelist looked up from gfp_mask */
+	else
+		zonelist = &x->binding->zonelist;	/* bound: use the binding's own zonelist */
+
+	return __alloc_pages(gfp_mask, 0, zonelist);
+}
+
+#endif /* !CONFIG_NUMA */
+
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
-	return alloc_pages(x->gfp_mask, 0);
+	return __page_cache_alloc(x, x->gfp_mask);
 }
 
 static inline struct page *page_cache_alloc_cold(struct address_space *x)
 {
-	return alloc_pages(x->gfp_mask|__GFP_COLD, 0);
+	return __page_cache_alloc(x, x->gfp_mask|__GFP_COLD);
 }
 
 typedef int filler_t(void *, struct page *);
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/kernel/sys.c linux-2.5.66-membind/kernel/sys.c
--- linux-2.5.66-pre_membind/kernel/sys.c	Mon Mar 24 14:00:00 2003
+++ linux-2.5.66-membind/kernel/sys.c	Wed Apr  2 11:00:44 2003
@@ -226,6 +226,7 @@
 cond_syscall(sys_sendmsg)
 cond_syscall(sys_recvmsg)
 cond_syscall(sys_socketcall)
+cond_syscall(sys_mbind)
 
 static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/mm/Makefile linux-2.5.66-membind/mm/Makefile
--- linux-2.5.66-pre_membind/mm/Makefile	Mon Mar 24 14:00:51 2003
+++ linux-2.5.66-membind/mm/Makefile	Wed Apr  2 10:50:59 2003
@@ -7,8 +7,10 @@
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   shmem.o vmalloc.o
 
-obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y			:= bootmem.o fadvise.o filemap.o mempool.o oom_kill.o \
 			   page_alloc.o page-writeback.o pdflush.o readahead.o \
 			   slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+
+obj-$(CONFIG_NUMA)	+= mbind.o
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/mm/mbind.c linux-2.5.66-membind/mm/mbind.c
--- linux-2.5.66-pre_membind/mm/mbind.c	Wed Dec 31 16:00:00 1969
+++ linux-2.5.66-membind/mm/mbind.c	Wed Apr  2 21:45:39 2003
@@ -0,0 +1,131 @@
+/*
+ * mm/mbind.c
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2003, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mbind.h>
+#include <asm/string.h>
+#include <asm/topology.h>
+#include <asm/uaccess.h>
+
+/* Set in @nodemask the node of every cpu set in @cpumask; never clears bits */
+static inline void cpumask_to_nodemask(bitmap_t cpumask, bitmap_t nodemask)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+		if (test_bit(i, cpumask))
+			set_bit(cpu_to_node(i), nodemask);
+}
+
+/*
+ * Appends each zone of @pgdat that has present pages to @zonelist, highest
+ * zone index first, starting at slot @zone_num.  Returns the next free slot.
+ */
+static inline int add_node(pg_data_t *pgdat, struct zonelist *zonelist, int zone_num)
+{
+	int i;
+	struct zone *zone;
+
+	for (i = MAX_NR_ZONES-1; i >=0 ; i--) {
+		zone = pgdat->node_zones + i;
+		if (zone->present_pages)	/* skip zones with no present pages */
+			zonelist->zones[zone_num++] = zone;	/* NOTE(review): index is not bounds-checked here */
+	}
+	return zone_num;
+}
+
+/* Allocate a binding holding the zones of every online node set in @nodemask */
+static inline struct binding *alloc_binding(bitmap_t nodemask)
+{
+	struct binding *binding;
+	int node, zone_num;
+
+	binding = (struct binding *)kmalloc(sizeof(struct binding), GFP_KERNEL);
+	if (!binding)
+		return NULL;	/* allocation failed */
+	memset(binding, 0, sizeof(struct binding));
+
+	/* Build binding zonelist: nodes in ascending order, then NULL-terminate */
+	for (node = 0, zone_num = 0; node < MAX_NUMNODES; node++)
+		if (test_bit(node, nodemask) && node_online(node))
+			zone_num = add_node(NODE_DATA(node), 
+				&binding->zonelist, zone_num);
+	binding->zonelist.zones[zone_num] = NULL;
+
+	if (zone_num == 0) {
+		/* No zones were added to the zonelist.  Let the caller know. */
+		kfree(binding);
+		binding = NULL;
+	}
+	return binding;	/* NULL if kmalloc failed or no zone was added */
+} 
+
+
+/*
+ * sys_mbind - Bind the mapping behind a shm VMA to the zones of the nodes
+ *             that the given cpus belong to.
+ * @start:    address passed to find_vma() to locate the VMA to bind
+ * @len:      length of memory region to bind (not read by this version)
+ * @mask_ptr: userspace pointer to a bitmask of cpus
+ * @mask_len: length of the bitmask in bits; capped at NR_CPUS
+ * @policy:   policy flag for the segment (not read by this version)
+ */
+asmlinkage unsigned long sys_mbind(unsigned long start, unsigned long len, 
+		unsigned long *mask_ptr, unsigned int mask_len, unsigned long policy)
+{
+	DECLARE_BITMAP(cpu_mask, NR_CPUS);
+	DECLARE_BITMAP(node_mask, MAX_NUMNODES);
+	struct vm_area_struct *vma = NULL;
+	struct address_space *mapping;
+	int copy_len, error = 0;
+
+	/* Copy at most NR_CPUS bits of cpu_mask from userspace, then map cpus to nodes */
+	copy_len = min(mask_len, (unsigned int)NR_CPUS);
+	CLEAR_BITMAP(cpu_mask, NR_CPUS);
+	CLEAR_BITMAP(node_mask, MAX_NUMNODES);
+	if (copy_from_user(cpu_mask, mask_ptr, (copy_len+7)/8)) {	/* bits -> bytes, rounded up */
+		error = -EFAULT;
+		goto out;
+	}
+	cpumask_to_nodemask(cpu_mask, node_mask);
+
+	vma = find_vma(current->mm, start);	/* NOTE(review): no locking visible around this lookup - confirm */
+	if (!(vma && vma->vm_file && vma->vm_ops && 
+		vma->vm_ops->nopage == shmem_nopage)) {
+		/* This isn't a shm segment.  For now, we bail. */
+		error = -EINVAL;
+		goto out;
+	}
+
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+	mapping->binding = alloc_binding(node_mask);	/* NOTE(review): any previous binding is overwritten, not freed */
+	if (!mapping->binding)
+		error = -EFAULT;	/* NOTE(review): NULL here means kmalloc failure or no usable zones */
+
+out:
+	return error;	/* 0 on success, otherwise -EFAULT or -EINVAL */
+}
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/mm/swap_state.c linux-2.5.66-membind/mm/swap_state.c
--- linux-2.5.66-pre_membind/mm/swap_state.c	Mon Mar 24 14:00:21 2003
+++ linux-2.5.66-membind/mm/swap_state.c	Tue Apr  1 17:12:00 2003
@@ -47,6 +47,9 @@
 	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
+#ifdef CONFIG_NUMA
+	.binding	= NULL,
+#endif
 };
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)

  reply	other threads:[~2003-04-03  5:55 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-04-03  5:50 [rfc][patch] Memory Binding Take 2 (0/1) Matthew Dobson
2003-04-03  5:56 ` Matthew Dobson [this message]
2003-04-03  6:37   ` [rfc][patch] Memory Binding Take 2 (1/1) Andrew Morton
2003-04-03 23:30     ` Matthew Dobson
2003-04-03 12:20   ` Hugh Dickins
2003-04-03 13:25     ` Paolo Zeppegno
2003-04-03 23:57     ` Matthew Dobson
2003-04-04 13:40   ` Christoph Hellwig
2003-04-04 13:34 ` [rfc][patch] Memory Binding Take 2 (0/1) Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3E8BCD21.2050307@us.ibm.com \
    --to=colpatch@us.ibm.com \
    --cc=ak@muc.de \
    --cc=akpm@digeo.com \
    --cc=hch@infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lse-tech@lists.sourceforge.net \
    --cc=mbligh@aracnet.com \
    --cc=zeppegno.paolo@seat.it \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox