From: Matthew Dobson <colpatch@us.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: "Martin J. Bligh" <mbligh@aracnet.com>,
Andrew Morton <akpm@digeo.com>,
Christoph Hellwig <hch@infradead.org>,
Paolo Zeppegno <zeppegno.paolo@seat.it>, Andi Kleen <ak@muc.de>,
lse-tech <lse-tech@lists.sourceforge.net>
Subject: [rfc][patch] Memory Binding Take 2 (1/1)
Date: Wed, 02 Apr 2003 21:56:49 -0800 [thread overview]
Message-ID: <3E8BCD21.2050307@us.ibm.com> (raw)
In-Reply-To: 3E8BCB96.6090908@us.ibm.com
[-- Attachment #1: Type: text/plain, Size: 865 bytes --]
Now for the good stuff! ;)
This one has had more changes... I've changed the syscall from
sys_membind to sys_mbind. I liked Paolo's suggestion of aligning the
naming. I've fixed up the way the bitmaps are passed. I pulled out all
the ZONE_* code, and now just have it use all the zones on the node. I
made sure that the binding pointers are not compiled in for non-NUMA
kernels. All that is added for non-NUMA kernels is the cond_syscall and
a small change in the page_cache_alloc callpath. Now page_cache_alloc
calls __page_cache_alloc, which is just the old page_cache_alloc for
non-NUMA. For NUMA, it's obviously a different function. I also
cleaned up the bitmask size issue, by just making sure userspace doesn't
pass in a bitmask that's way too large.
I guess that's it... As always, I'm looking forward to any comments!
Cheers!
-Matt
[-- Attachment #2: 01-membind.patch --]
[-- Type: text/plain, Size: 11322 bytes --]
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/arch/i386/kernel/entry.S linux-2.5.66-membind/arch/i386/kernel/entry.S
--- linux-2.5.66-pre_membind/arch/i386/kernel/entry.S Mon Mar 24 14:00:11 2003
+++ linux-2.5.66-membind/arch/i386/kernel/entry.S Wed Apr 2 10:46:20 2003
@@ -807,7 +807,7 @@
.long sys_getdents64 /* 220 */
.long sys_fcntl64
.long sys_ni_syscall /* reserved for TUX */
- .long sys_ni_syscall
+ .long sys_mbind
.long sys_gettid
.long sys_readahead /* 225 */
.long sys_setxattr
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/fs/inode.c linux-2.5.66-membind/fs/inode.c
--- linux-2.5.66-pre_membind/fs/inode.c Mon Mar 24 14:01:48 2003
+++ linux-2.5.66-membind/fs/inode.c Wed Apr 2 10:49:36 2003
@@ -144,6 +144,9 @@
mapping->dirtied_when = 0;
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
+#ifdef CONFIG_NUMA
+ mapping->binding = NULL;
+#endif
if (sb->s_bdev)
mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
memset(&inode->u, 0, sizeof(inode->u));
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/asm-i386/unistd.h linux-2.5.66-membind/include/asm-i386/unistd.h
--- linux-2.5.66-pre_membind/include/asm-i386/unistd.h Mon Mar 24 14:00:54 2003
+++ linux-2.5.66-membind/include/asm-i386/unistd.h Wed Apr 2 10:52:18 2003
@@ -228,7 +228,7 @@
#define __NR_madvise1 219 /* delete when C lib stub is removed */
#define __NR_getdents64 220
#define __NR_fcntl64 221
-/* 223 is unused */
+#define __NR_mbind 223
#define __NR_gettid 224
#define __NR_readahead 225
#define __NR_setxattr 226
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/linux/fs.h linux-2.5.66-membind/include/linux/fs.h
--- linux-2.5.66-pre_membind/include/linux/fs.h Mon Mar 24 14:00:10 2003
+++ linux-2.5.66-membind/include/linux/fs.h Wed Apr 2 10:54:17 2003
@@ -19,6 +19,7 @@
#include <linux/cache.h>
#include <linux/radix-tree.h>
#include <linux/kobject.h>
+#include <linux/mbind.h>
#include <asm/atomic.h>
struct iovec;
@@ -329,6 +330,9 @@
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
+#ifdef CONFIG_NUMA
+ struct binding *binding; /* for memory bindings */
+#endif
};
struct block_device {
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/linux/mbind.h linux-2.5.66-membind/include/linux/mbind.h
--- linux-2.5.66-pre_membind/include/linux/mbind.h Wed Dec 31 16:00:00 1969
+++ linux-2.5.66-membind/include/linux/mbind.h Wed Apr 2 18:52:41 2003
@@ -0,0 +1,40 @@
+/*
+ * include/linux/mbind.h
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2003, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#ifndef _LINUX_MBIND_H
+#define _LINUX_MBIND_H
+
+#ifdef CONFIG_NUMA
+
+#include <linux/mmzone.h>
+
+/*
+ * Memory binding attached to a struct address_space through its ->binding
+ * pointer.  The zonelist held here is what page cache allocations for that
+ * mapping use in place of the default zonelist.
+ */
+struct binding {
+ struct zonelist zonelist; /* zones of the bound nodes, NULL-terminated */
+};
+
+#endif /* CONFIG_NUMA */
+#endif /* _LINUX_MBIND_H */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/include/linux/pagemap.h linux-2.5.66-membind/include/linux/pagemap.h
--- linux-2.5.66-pre_membind/include/linux/pagemap.h Mon Mar 24 13:59:54 2003
+++ linux-2.5.66-membind/include/linux/pagemap.h Wed Apr 2 19:49:42 2003
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/highmem.h>
+#include <linux/mbind.h>
#include <asm/uaccess.h>
/*
@@ -27,14 +28,37 @@
#define page_cache_release(page) put_page(page)
void release_pages(struct page **pages, int nr, int cold);
+#ifndef CONFIG_NUMA
+
+/*
+ * Non-NUMA variant: there is no per-mapping binding, so simply allocate a
+ * single (order 0) page with @gfp_mask.  @x is not looked at.
+ */
+static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask)
+{
+	struct page *page = alloc_pages(gfp_mask, 0);
+
+	return page;
+}
+
+#else /* CONFIG_NUMA */
+
+/*
+ * NUMA variant: allocate a single (order 0) page for mapping @x.  When the
+ * mapping carries a binding, the allocation walks the binding's zonelist;
+ * otherwise it uses the zonelist get_zonelist() returns for @gfp_mask.
+ */
+static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask)
+{
+	struct zonelist *zl = x->binding ? &x->binding->zonelist
+					 : get_zonelist(gfp_mask);
+
+	return __alloc_pages(gfp_mask, 0, zl);
+}
+
+#endif /* !CONFIG_NUMA */
+
static inline struct page *page_cache_alloc(struct address_space *x)
{
- return alloc_pages(x->gfp_mask, 0);
+ return __page_cache_alloc(x, x->gfp_mask);
}
static inline struct page *page_cache_alloc_cold(struct address_space *x)
{
- return alloc_pages(x->gfp_mask|__GFP_COLD, 0);
+ return __page_cache_alloc(x, x->gfp_mask|__GFP_COLD);
}
typedef int filler_t(void *, struct page *);
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/kernel/sys.c linux-2.5.66-membind/kernel/sys.c
--- linux-2.5.66-pre_membind/kernel/sys.c Mon Mar 24 14:00:00 2003
+++ linux-2.5.66-membind/kernel/sys.c Wed Apr 2 11:00:44 2003
@@ -226,6 +226,7 @@
cond_syscall(sys_sendmsg)
cond_syscall(sys_recvmsg)
cond_syscall(sys_socketcall)
+cond_syscall(sys_mbind)
static int set_one_prio(struct task_struct *p, int niceval, int error)
{
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/mm/Makefile linux-2.5.66-membind/mm/Makefile
--- linux-2.5.66-pre_membind/mm/Makefile Mon Mar 24 14:00:51 2003
+++ linux-2.5.66-membind/mm/Makefile Wed Apr 2 10:50:59 2003
@@ -7,8 +7,10 @@
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
shmem.o vmalloc.o
-obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y := bootmem.o fadvise.o filemap.o mempool.o oom_kill.o \
page_alloc.o page-writeback.o pdflush.o readahead.o \
slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+
+obj-$(CONFIG_NUMA) += mbind.o
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/mm/mbind.c linux-2.5.66-membind/mm/mbind.c
--- linux-2.5.66-pre_membind/mm/mbind.c Wed Dec 31 16:00:00 1969
+++ linux-2.5.66-membind/mm/mbind.c Wed Apr 2 21:45:39 2003
@@ -0,0 +1,131 @@
+/*
+ * mm/mbind.c
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2003, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mbind.h>
+#include <asm/string.h>
+#include <asm/topology.h>
+#include <asm/uaccess.h>
+
+/*
+ * Translate a cpumask to a nodemask: for every cpu whose bit is set in
+ * @cpumask, set the bit of the node cpu_to_node() reports for it in
+ * @nodemask.  Bits already set in @nodemask are left alone.
+ */
+static inline void cpumask_to_nodemask(bitmap_t cpumask, bitmap_t nodemask)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (!test_bit(cpu, cpumask))
+			continue;
+		set_bit(cpu_to_node(cpu), nodemask);
+	}
+}
+
+/*
+ * Append every zone of @pgdat that has present pages to @zonelist, starting
+ * at slot @zone_num and walking the node's zones from the highest index
+ * down to 0.  Returns the index of the first slot left unused.
+ */
+static inline int add_node(pg_data_t *pgdat, struct zonelist *zonelist, int zone_num)
+{
+	int idx;
+
+	for (idx = MAX_NR_ZONES; idx-- > 0; ) {
+		struct zone *z = &pgdat->node_zones[idx];
+
+		if (!z->present_pages)
+			continue;	/* empty zone: nothing to allocate from */
+		zonelist->zones[zone_num] = z;
+		zone_num++;
+	}
+	return zone_num;
+}
+
+/*
+ * Allocate a binding whose zonelist holds the zones of every online node
+ * selected in @nodemask.
+ *
+ * Returns the new binding, or NULL when either the allocation failed or no
+ * zone ended up in the zonelist.  The caller owns the returned memory.
+ */
+static inline struct binding *alloc_binding(bitmap_t nodemask)
+{
+	struct binding *b;
+	int nid, nr_zones = 0;
+
+	b = kmalloc(sizeof(*b), GFP_KERNEL);
+	if (!b)
+		return NULL;
+	memset(b, 0, sizeof(*b));
+
+	/* Build binding zonelist, one node at a time */
+	for (nid = 0; nid < MAX_NUMNODES; nid++) {
+		if (!test_bit(nid, nodemask) || !node_online(nid))
+			continue;
+		nr_zones = add_node(NODE_DATA(nid), &b->zonelist, nr_zones);
+	}
+	b->zonelist.zones[nr_zones] = NULL;
+
+	if (nr_zones)
+		return b;
+
+	/* No zones were added to the zonelist.  Let the caller know. */
+	kfree(b);
+	return NULL;
+}
+
+
+/*
+ * sys_mbind - Bind the page cache of a shared memory mapping to the memory
+ *             of a set of nodes.
+ * @start: address inside the memory region to bind
+ * @len: length of memory region to bind (currently unused)
+ * @mask_ptr: userspace pointer to bitmask of cpus
+ * @mask_len: length of the bitmask, in bits; at most NR_CPUS bits are read
+ * @policy: flag specifying the policy to use for the segment (currently unused)
+ *
+ * The cpu mask is translated into the set of nodes those cpus belong to and
+ * a binding built from the zones of those nodes is installed on the
+ * address_space backing the VMA that contains @start.
+ *
+ * Returns 0 on success, or
+ *   -EFAULT  the cpu mask could not be copied from userspace
+ *   -EINVAL  the mask selects no online node, or @start does not lie in a
+ *            shm-backed VMA
+ *   -ENOMEM  the binding could not be allocated
+ */
+asmlinkage unsigned long sys_mbind(unsigned long start, unsigned long len,
+		unsigned long *mask_ptr, unsigned int mask_len, unsigned long policy)
+{
+	DECLARE_BITMAP(cpu_mask, NR_CPUS);
+	DECLARE_BITMAP(node_mask, MAX_NUMNODES);
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct address_space *mapping;
+	struct binding *binding;
+	int copy_len, node, error = 0;
+
+	/* Deal with getting cpu_mask from userspace & translating to node_mask */
+	copy_len = min(mask_len, (unsigned int)NR_CPUS);
+	CLEAR_BITMAP(cpu_mask, NR_CPUS);
+	CLEAR_BITMAP(node_mask, MAX_NUMNODES);
+	if (copy_from_user(cpu_mask, mask_ptr, (copy_len+7)/8)) {
+		error = -EFAULT;
+		goto out;
+	}
+	cpumask_to_nodemask(cpu_mask, node_mask);
+
+	/*
+	 * A mask that selects no online node is a caller error; catch it
+	 * here so that a NULL from alloc_binding() below can be reported
+	 * as an allocation failure.
+	 */
+	for (node = 0; node < MAX_NUMNODES; node++)
+		if (test_bit(node, node_mask) && node_online(node))
+			break;
+	if (node == MAX_NUMNODES) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	/* The VMA list must not change while we look at the VMA. */
+	down_read(&mm->mmap_sem);
+
+	/*
+	 * find_vma() returns the first VMA ending above @start, which may
+	 * begin above @start as well, so check vm_start explicitly.
+	 */
+	vma = find_vma(mm, start);
+	if (!(vma && vma->vm_start <= start && vma->vm_file && vma->vm_ops &&
+			vma->vm_ops->nopage == shmem_nopage)) {
+		/* This isn't a shm segment. For now, we bail. */
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Allocate first and install only on success, so that a failure
+	 * leaves any binding already on the mapping untouched.
+	 */
+	binding = alloc_binding(node_mask);
+	if (!binding) {
+		error = -ENOMEM;
+		goto out_unlock;
+	}
+
+	/*
+	 * NOTE(review): a binding previously installed on this mapping is
+	 * not freed here.  Freeing it safely needs synchronization against
+	 * allocators that may still be walking its zonelist; confirm how
+	 * the old binding should be retired.
+	 */
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+	mapping->binding = binding;
+
+out_unlock:
+	up_read(&mm->mmap_sem);
+out:
+	return error;
+}
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.66-pre_membind/mm/swap_state.c linux-2.5.66-membind/mm/swap_state.c
--- linux-2.5.66-pre_membind/mm/swap_state.c Mon Mar 24 14:00:21 2003
+++ linux-2.5.66-membind/mm/swap_state.c Tue Apr 1 17:12:00 2003
@@ -47,6 +47,9 @@
.i_shared_sem = __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
.private_lock = SPIN_LOCK_UNLOCKED,
.private_list = LIST_HEAD_INIT(swapper_space.private_list),
+#ifdef CONFIG_NUMA
+ .binding = NULL,
+#endif
};
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
next prev parent reply other threads:[~2003-04-03 5:55 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2003-04-03 5:50 [rfc][patch] Memory Binding Take 2 (0/1) Matthew Dobson
2003-04-03 5:56 ` Matthew Dobson [this message]
2003-04-03 6:37 ` [rfc][patch] Memory Binding Take 2 (1/1) Andrew Morton
2003-04-03 23:30 ` Matthew Dobson
2003-04-03 12:20 ` Hugh Dickins
2003-04-03 13:25 ` Paolo Zeppegno
2003-04-03 23:57 ` Matthew Dobson
2003-04-04 13:40 ` Christoph Hellwig
2003-04-04 13:34 ` [rfc][patch] Memory Binding Take 2 (0/1) Christoph Hellwig
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=3E8BCD21.2050307@us.ibm.com \
--to=colpatch@us.ibm.com \
--cc=ak@muc.de \
--cc=akpm@digeo.com \
--cc=hch@infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=lse-tech@lists.sourceforge.net \
--cc=mbligh@aracnet.com \
--cc=zeppegno.paolo@seat.it \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.