public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed
* CPUSET Proposal
@ 2003-09-24 15:59 Simon Derr
  2003-09-24 22:06 ` David Mosberger
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Simon Derr @ 2003-09-24 15:59 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2193 bytes --]


Hi,

We have developed a new feature in the Linux kernel, controlling CPU
placements, which is useful on large SMP machines, especially NUMA ones.
We call it CPUSETS, and we would highly appreciate hearing from anyone
who would be interested in such a feature. It has been somewhat inspired
by the pset and cpumemset patches that exist for Linux 2.4.

CPUSETs are lightweight objects in the linux kernel that enable users to
partition their multiprocessor machine by creating execution areas. A
virtualization layer has been added so it becomes possible to split a
machine in terms of CPUs.

Furthermore, HPC applications often need to bind their processes to a
specific CPU, and can achieve this by calling sched_setaffinity() in the
recent Linux kernels. But running several HPC applications on a large
system will result in several processes running on the same processor.
This problem is addressed by the CPUSET mechanism.


CPUSETS allow users to:
----------------------
1/ create sets of CPUs on the system, and bind applications to them

2/ translate the masks of CPUs given to sched_setaffinity() so that they stay
   inside the set of CPUs. With this mechanism, processors are virtualized
   for the use of sched_setaffinity() and /proc information. Thus, any existing
   application using this syscall to bind processes to processors will
   work with virtual CPUs without any change.

3/ provide a way to create sets of cpus *inside* a set of cpus: hence a
   system administrator can partition a system among users, and users can
   partition their partition among their applications.

4/ Change on the fly the execution area of a whole set of processes (to
   give more resources to a critical application, for example).

5/ In the future, probably associate a memory allocation policy (such as
   local node, or round robin) with a set of cpus.


These features have been implemented as a kernel patch for Linux 2.6 and a
suite of userland tools.

You can find the associated manpages and a slightly more detailed
explanation here: http://www.bullopensource.org/cpuset/

Any feedback, comment or opinion is welcome:
	Simon.Derr@Bull.net,
	Sylvain.Jaugey@bull.net

Thanks,

	Simon and Sylvain.


[-- Attachment #2: Type: TEXT/PLAIN, Size: 33435 bytes --]

--- linux-2.6.0-test4/Makefile	2003-08-23 01:57:25.000000000 +0200
+++ linux-2.6.0-test4+cpusets/Makefile	2003-09-09 10:48:52.000000000 +0200
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 0
-EXTRAVERSION = -test4
+EXTRAVERSION = -test4+cpusets
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
--- linux-2.6.0-test4/arch/ia64/Kconfig	2003-09-09 10:37:48.000000000 +0200
+++ linux-2.6.0-test4+cpusets/arch/ia64/Kconfig	2003-09-09 10:39:53.000000000 +0200
@@ -388,6 +388,37 @@
 
 	  If you don't know what to do here, say N.
 
+config CPUSETS
+	bool "cpusets"
+	depends on SMP
+	help
+	  This option will let you create and manage sets of CPUs where you
+	  can run your processes.
+          
+          Say N if unsure.
+
+config CPUSETS_PROC
+	bool "/proc/cpusets support"
+	depends on CPUSETS
+	help
+	  Get some info about the existing cpusets in your system.
+	  To use this option, you have to ensure that the "/proc file system
+	  support" (CONFIG_PROC_FS) is enabled, too.
+
+config CPUSETS_PROC_CPUINFO
+	bool "/proc/cpuinfo uses current cpuset"
+	depends on CPUSETS_PROC
+	help
+	  With this option enabled, a process reading /proc/cpuinfo will
+	  only see the CPUs that are in its current cpuset.
+
+config CPUSETS_PROC_STAT
+	bool "/proc/stat uses current cpuset"
+	depends on CPUSETS_PROC
+	help
+	  With this option enabled, a process reading /proc/stat will
+	  only see the CPUs that are in its current cpuset.
+
 config PREEMPT
 	bool "Preemptible Kernel"
         help
--- linux-2.6.0-test4/arch/ia64/kernel/entry.S	2003-09-09 10:37:48.000000000 +0200
+++ linux-2.6.0-test4+cpusets/arch/ia64/kernel/entry.S	2003-09-09 10:39:53.000000000 +0200
@@ -1481,11 +1481,19 @@
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall			// 1265
 	data8 ia64_ni_syscall
+#ifdef CONFIG_CPUSETS	
+	data8 sys_cpuset_create
+	data8 sys_cpuset_destroy
+	data8 sys_cpuset_alloc
+	data8 sys_cpuset_attach			// 1270
+	data8 sys_cpuset_getfreecpus
+#else	
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall			// 1270
 	data8 ia64_ni_syscall
+#endif	
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
--- linux-2.6.0-test4/arch/ia64/kernel/setup.c	2003-09-09 10:37:48.000000000 +0200
+++ linux-2.6.0-test4+cpusets/arch/ia64/kernel/setup.c	2003-09-09 10:39:53.000000000 +0200
@@ -46,6 +46,10 @@
 #include <asm/system.h>
 #include <asm/unistd.h>
 
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+# include <linux/cpuset_types.h>
+#endif
+
 #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
 # error "struct cpuinfo_ia64 too big!"
 #endif
@@ -360,6 +364,16 @@
 	unsigned long mask;
 	int i;
 
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+#define GET_BIT(mask, index) ((mask) & (1 << (index)))
+	/* show only CPUs in current cpuset */
+	if (!current->cpuset)
+		BUG();
+		
+	if (!GET_BIT(current->cpuset->cpus_allowed, cpunum)) 
+		return 0;	
+#endif /* CONFIG_CPUSETS_PROC_CPUINFO */		
+
 	mask = c->features;
 
 	switch (c->family) {
--- linux-2.6.0-test4/fs/proc/base.c	2003-09-09 10:37:48.000000000 +0200
+++ linux-2.6.0-test4+cpusets/fs/proc/base.c	2003-09-09 10:48:52.000000000 +0200
@@ -59,6 +59,7 @@
 	PROC_PID_CPU,
 	PROC_PID_MOUNTS,
 	PROC_PID_WCHAN,
+	PROC_PID_CPUSET,
 #ifdef CONFIG_SECURITY
 	PROC_PID_ATTR,
 	PROC_PID_ATTR_CURRENT,
@@ -99,6 +100,9 @@
 #ifdef CONFIG_KALLSYMS
   E(PROC_PID_WCHAN,	"wchan",	S_IFREG|S_IRUGO),
 #endif
+#ifdef CONFIG_CPUSETS_PROC	
+  E(PROC_PID_CPUSET,	"cpuset",	S_IFREG|S_IRUGO),
+#endif	
   {0,0,NULL,0}
 };
 #ifdef CONFIG_SECURITY
@@ -292,6 +296,11 @@
 }
 #endif /* CONFIG_KALLSYMS */
 
+
+#ifdef CONFIG_CPUSETS_PROC
+int proc_pid_cpuset(struct task_struct *task, char *buffer);
+#endif /* CONFIG_CPUSETS_PROC */
+
 /************************************************************************/
 /*                       Here the fs part begins                        */
 /************************************************************************/
@@ -1217,6 +1226,12 @@
 			ei->op.proc_read = proc_pid_wchan;
 			break;
 #endif
+#ifdef CONFIG_CPUSETS_PROC
+		case PROC_PID_CPUSET:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = proc_pid_cpuset;
+			break;
+#endif
 		default:
 			printk("procfs: impossible type (%d)",p->type);
 			iput(inode);
--- linux-2.6.0-test4/fs/proc/base.c.orig	2003-08-23 01:57:13.000000000 +0200
+++ linux-2.6.0-test4+cpusets/fs/proc/base.c.orig	2003-09-09 10:13:08.000000000 +0200
@@ -56,6 +56,7 @@
 	PROC_PID_STAT,
 	PROC_PID_STATM,
 	PROC_PID_MAPS,
+	PROC_PID_CPU,
 	PROC_PID_MOUNTS,
 	PROC_PID_WCHAN,
 #ifdef CONFIG_SECURITY
@@ -83,6 +84,9 @@
   E(PROC_PID_CMDLINE,	"cmdline",	S_IFREG|S_IRUGO),
   E(PROC_PID_STAT,	"stat",		S_IFREG|S_IRUGO),
   E(PROC_PID_STATM,	"statm",	S_IFREG|S_IRUGO),
+#ifdef CONFIG_SMP
+  E(PROC_PID_CPU,	"cpu",		S_IFREG|S_IRUGO),
+#endif
   E(PROC_PID_MAPS,	"maps",		S_IFREG|S_IRUGO),
   E(PROC_PID_MEM,	"mem",		S_IFREG|S_IRUSR|S_IWUSR),
   E(PROC_PID_CWD,	"cwd",		S_IFLNK|S_IRWXUGO),
@@ -1181,6 +1185,12 @@
 		case PROC_PID_MAPS:
 			inode->i_fop = &proc_maps_operations;
 			break;
+#ifdef CONFIG_SMP
+		case PROC_PID_CPU:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = proc_pid_cpu;
+			break;
+#endif
 		case PROC_PID_MEM:
 			inode->i_op = &proc_mem_inode_operations;
 			inode->i_fop = &proc_mem_operations;
--- linux-2.6.0-test4/fs/proc/proc_misc.c	2003-08-23 01:52:23.000000000 +0200
+++ linux-2.6.0-test4+cpusets/fs/proc/proc_misc.c	2003-09-09 10:48:52.000000000 +0200
@@ -51,6 +51,10 @@
 #include <asm/tlb.h>
 #include <asm/div64.h>
 
+#ifdef CONFIG_CPUSETS_PROC_STAT
+# include <linux/cpuset_types.h>
+#endif
+
 #define LOAD_INT(x) ((x) >> FSHIFT)
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 /*
@@ -379,10 +383,21 @@
 	jif = ((u64)now.tv_sec * HZ) + (now.tv_usec/(1000000/HZ)) - jif;
 	do_div(jif, HZ);
 
+#ifdef CONFIG_CPUSETS_PROC_STAT
+#define GET_BIT(mask, index) ((mask) & (1 << (index)))
+#endif
+
 	for (i = 0 ; i < NR_CPUS; i++) {
 		int j;
 
 		if(!cpu_online(i)) continue;
+		
+#ifdef CONFIG_CPUSETS_PROC_STAT
+		/* show only CPUs in current cpuset */
+		if (!GET_BIT(current->cpuset->cpus_allowed, i)) 
+			continue;
+#endif 
+
 		user += kstat_cpu(i).cpustat.user;
 		nice += kstat_cpu(i).cpustat.nice;
 		system += kstat_cpu(i).cpustat.system;
@@ -404,6 +419,11 @@
 		jiffies_to_clock_t(softirq));
 	for (i = 0 ; i < NR_CPUS; i++){
 		if (!cpu_online(i)) continue;
+#ifdef CONFIG_CPUSETS_PROC_STAT
+		/* show only CPUs in current cpuset */
+		if (!GET_BIT(current->cpuset->cpus_allowed, i)) 
+			continue;
+#endif 
 		len += sprintf(page + len, "cpu%d %u %u %u %u %u %u %u\n",
 			i,
 			jiffies_to_clock_t(kstat_cpu(i).cpustat.user),
--- linux-2.6.0-test4/include/asm-ia64/unistd.h	2003-09-09 10:37:48.000000000 +0200
+++ linux-2.6.0-test4+cpusets/include/asm-ia64/unistd.h	2003-09-09 10:39:53.000000000 +0200
@@ -254,6 +254,12 @@
 
 #define NR_syscalls			256 /* length of syscall table */
 
+#define __NR_sys_cpuset_create		1267
+#define __NR_sys_cpuset_destroy		1268
+#define __NR_sys_cpuset_alloc		1269
+#define __NR_sys_cpuset_attach		1270
+#define __NR_sys_cpuset_getfreecpus	1271
+
 #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER)
 
 extern long __ia64_syscall (long a0, long a1, long a2, long a3, long a4, long nr);
--- linux-2.6.0-test4/include/linux/init_task.h	2003-08-23 01:52:10.000000000 +0200
+++ linux-2.6.0-test4+cpusets/include/linux/init_task.h	2003-09-10 14:02:16.000000000 +0200
@@ -56,6 +56,12 @@
 	.siglock	= SPIN_LOCK_UNLOCKED, 		\
 }
 
+#ifdef CONFIG_CPUSETS	
+#define CPUSET_TSKINIT(a,b)	.a = b,
+#else
+#define CPUSET_TSKINIT(a,b)
+#endif	
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -108,6 +114,9 @@
 	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
 	.journal_info	= NULL,						\
+	CPUSET_TSKINIT(cpus_wanted, CPU_MASK_ALL)				\
+	CPUSET_TSKINIT(cpuset, &top_cpuset)				\
+	CPUSET_TSKINIT(cpuset_attach_lock, SPIN_LOCK_UNLOCKED)		\
 }
 
 
--- linux-2.6.0-test4/include/linux/sched.h	2003-09-09 10:37:48.000000000 +0200
+++ linux-2.6.0-test4+cpusets/include/linux/sched.h	2003-09-09 10:39:53.000000000 +0200
@@ -29,6 +29,7 @@
 #include <linux/completion.h>
 #include <linux/pid.h>
 #include <linux/percpu.h>
+#include <linux/cpuset.h>
 
 struct exec_domain;
 
@@ -461,6 +462,13 @@
 
 	unsigned long ptrace_message;
 	siginfo_t *last_siginfo; /* For ptrace use.  */
+
+/* cpuset info */
+#ifdef CONFIG_CPUSETS	
+	struct cpuset * cpuset;
+	unsigned long cpus_wanted;
+	spinlock_t cpuset_attach_lock;
+#endif 	
 };
 
 extern void __put_task_struct(struct task_struct *tsk);
--- linux-2.6.0-test4/init/main.c	2003-08-23 01:52:56.000000000 +0200
+++ linux-2.6.0-test4+cpusets/init/main.c	2003-09-09 10:39:53.000000000 +0200
@@ -84,6 +84,7 @@
 extern void free_initmem(void);
 extern void populate_rootfs(void);
 extern void driver_init(void);
+extern void cpusets_init(void);
 
 #ifdef CONFIG_TC
 extern void tc_init(void);
@@ -452,6 +453,10 @@
 #if defined(CONFIG_SYSVIPC)
 	ipc_init();
 #endif
+#ifdef CONFIG_CPUSETS
+	cpusets_init();
+#endif	
+
 	check_bugs();
 	printk("POSIX conformance testing by UNIFIX\n");
 
--- linux-2.6.0-test4/kernel/Makefile	2003-08-23 01:53:04.000000000 +0200
+++ linux-2.6.0-test4+cpusets/kernel/Makefile	2003-09-09 10:48:52.000000000 +0200
@@ -19,6 +19,8 @@
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
+
 
 # files to be removed upon make clean
 clean-files := ikconfig.h
--- linux-2.6.0-test4/kernel/exit.c	2003-08-23 01:59:26.000000000 +0200
+++ linux-2.6.0-test4+cpusets/kernel/exit.c	2003-09-09 10:39:53.000000000 +0200
@@ -54,6 +54,19 @@
  
 	BUG_ON(p->state < TASK_ZOMBIE);
  
+	
+#ifdef CONFIG_CPUSETS	
+	spin_lock(&p->cpuset_attach_lock);
+	release_cpuset(p->cpuset);
+
+	/* mark that this process's cpuset has already been released 
+	 * another process might still try to cpuset_attach this process
+	 */
+	p->cpuset = NULL; 
+	spin_unlock(&p->cpuset_attach_lock);
+#endif /* CONFIG_CPUSETS */	
+	
+
 	atomic_dec(&p->user->processes);
 	spin_lock(&p->proc_lock);
 	proc_dentry = proc_pid_unhash(p);
@@ -85,6 +98,7 @@
 	spin_unlock(&p->proc_lock);
 	proc_pid_flush(proc_dentry);
 	release_thread(p);
+
 	put_task_struct(p);
 }
 
--- linux-2.6.0-test4/kernel/fork.c	2003-09-09 10:37:48.000000000 +0200
+++ linux-2.6.0-test4+cpusets/kernel/fork.c	2003-09-09 10:39:53.000000000 +0200
@@ -31,6 +31,10 @@
 #include <linux/ptrace.h>
 #include <linux/mount.h>
 
+#ifdef CONFIG_CPUSETS
+#include <linux/cpuset.h>
+#endif
+
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1008,6 +1012,11 @@
 	if (p->ptrace & PT_PTRACED)
 		__ptrace_link(p, current->parent);
 
+#ifdef CONFIG_CPUSETS	
+	use_cpuset(p->cpuset);
+#endif	
+
+
 	attach_pid(p, PIDTYPE_PID, p->pid);
 	if (thread_group_leader(p)) {
 		attach_pid(p, PIDTYPE_TGID, p->tgid);
--- linux-2.6.0-test4/kernel/sched.c	2003-08-23 01:58:43.000000000 +0200
+++ linux-2.6.0-test4+cpusets/kernel/sched.c	2003-09-09 10:48:16.000000000 +0200
@@ -35,6 +35,10 @@
 #include <linux/cpu.h>
 #include <linux/percpu.h>
 
+#ifdef CONFIG_CPUSETS	
+#include <linux/cpuset.h>
+#endif
+
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
 #else
@@ -1954,7 +1958,11 @@
 			!capable(CAP_SYS_NICE))
 		goto out_unlock;
 
+#ifdef CONFIG_CPUSETS 
+	retval = cpuset_setaffinity(p, new_mask);
+#else
 	retval = set_cpus_allowed(p, new_mask);
+#endif
 
 out_unlock:
 	put_task_struct(p);
@@ -1987,7 +1995,11 @@
 		goto out_unlock;
 
 	retval = 0;
+#ifdef CONFIG_CPUSETS
+	mask = p->cpus_wanted;
+#else
 	cpus_and(mask, p->cpus_allowed, cpu_online_map);
+#endif
 
 out_unlock:
 	read_unlock(&tasklist_lock);
--- linux-2.6.0-test4/include/linux/cpuset_types.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.0-test4+cpusets/include/linux/cpuset_types.h	2003-09-10 13:30:34.000000000 +0200
@@ -0,0 +1,39 @@
+#ifndef _LINUX_CPUSET_TYPES_H
+#define _LINUX_CPUSET_TYPES_H
+
+
+struct cpuset {
+        cpuset_t id;
+        int flags;
+	int has_been_attached;
+
+        /* bitmask of the cpus present in this cpuset */
+        cpumask_t cpus_allowed;
+
+        /* bitmask of the cpus reserved in this cpuset */
+        cpumask_t cpus_reserved;
+
+        /* bitmask of the cpus reserved with CPUSET_STRICT */
+        cpumask_t cpus_strictly_reserved;
+
+        struct cpuset * parent;
+        struct list_head list; /* for the whole list */
+
+        struct list_head children; 
+        struct list_head brothers;
+
+	/* overall users (processes + children) */
+	/* will be replaced by atomic_t in the future */
+        atomic_t count; 
+
+	spinlock_t attach_lock;
+
+	/* owner */
+	uid_t uid;
+	uid_t suid;
+
+
+};
+
+
+#endif
--- linux-2.6.0-test4/include/linux/cpuset.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.0-test4+cpusets/include/linux/cpuset.h	2003-09-09 10:50:05.000000000 +0200
@@ -0,0 +1,25 @@
+/*
+ * BULL cpuset interface
+ */
+
+#ifndef _LINUX_CPUSET_H
+#define _LINUX_CPUSET_H
+
+typedef unsigned int cpuset_t;
+
+#define CPUSET_STRICT           0x00000001
+#define CPUSET_AUTOCLEAN        0x00000002
+
+#ifdef __KERNEL__
+
+extern struct cpuset top_cpuset;
+
+void use_cpuset(struct cpuset *);
+void release_cpuset(struct cpuset *);
+
+struct task_struct;
+int cpuset_setaffinity(struct task_struct * task, unsigned long mask);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_CPUSET_H */
--- linux-2.6.0-test4/kernel/cpuset.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.0-test4+cpusets/kernel/cpuset.c	2003-09-10 16:20:46.000000000 +0200
@@ -0,0 +1,748 @@
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/slab.h> /* for kmalloc */
+#include <linux/list.h>
+#include <linux/sched.h> /* for find_task_by_pid and task_struct */
+#include <asm/uaccess.h> 
+#include <linux/errno.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/cpuset_types.h>
+
+#define info(args...) do {} while(0) 
+//#define info(args...) printk(KERN_INFO args)
+
+rwlock_t cpuset_lock = RW_LOCK_UNLOCKED;
+
+#define CPUSET_TOP_ID 1
+
+struct cpuset top_cpuset = {
+	.id = CPUSET_TOP_ID,
+	.flags = CPUSET_STRICT,
+	.cpus_reserved = CPU_MASK_NONE,
+	.cpus_strictly_reserved = CPU_MASK_NONE,
+	.parent = 0,
+	.children = LIST_HEAD_INIT(top_cpuset.children),
+	.brothers = LIST_HEAD_INIT(top_cpuset.brothers),
+	.list = LIST_HEAD_INIT(top_cpuset.list),
+	.count = ATOMIC_INIT(1), /* this cpuset can't be deleted */
+	.has_been_attached = 0,
+	.uid = 0,
+	.attach_lock = SPIN_LOCK_UNLOCKED,
+	.suid = 0
+};
+
+	
+static int proc_cpusets_init(void);
+
+int __init cpusets_init(void)
+{
+	info("cpusets ("__FILE__ " compiled " __DATE__ " " __TIME__ "initializing..\n");
+	top_cpuset.cpus_allowed = phys_cpu_present_map;	
+
+#ifdef CONFIG_CPUSETS_PROC
+	proc_cpusets_init();
+#endif /* CONFIG_CPUSETS_PROC */	
+	return 0;
+}
+
+
+static const int N = (8*sizeof(cpumask_t));
+/* mask must NOT be ZERO ! */
+/* this is a cyclic version of next_cpu */
+static inline void _next_cpu(const cpumask_t mask, int * index)
+{
+	for(;;) {
+		if (++*index >= N) *index = 0;
+		if (cpu_isset(*index, mask)) return;
+	}
+}
+			
+static unsigned long cpuset_combine_mask(const cpumask_t wanted, const cpumask_t allowed)
+{
+	int i;
+	cpumask_t mask;
+
+	/* start with current cpu out of the mask
+	 * so the first call to next_cpu will take the first cpu
+	 * even if it is cpu zero
+	 */
+	int cpu = N;
+
+	if (cpus_empty(wanted)) return 0;
+	if (cpus_empty(allowed)) return 0;
+
+	cpus_clear(mask);
+
+	for(i=0; i < N; i++) {
+		_next_cpu(allowed, &cpu);
+		if (cpu_isset(i, wanted)) 
+			cpu_set(cpu, mask);
+	}
+	info("cpuset_combine_mask: %016lx + %016lx --> %016lx\n", 
+				wanted, allowed, mask); 
+	return mask;
+}
+
+	
+
+static struct cpuset * find_cpuset_by_id(cpuset_t id)
+{
+	struct cpuset * cs;
+	if (id == CPUSET_TOP_ID) return &top_cpuset; 
+
+	list_for_each_entry(cs, &top_cpuset.list, list) {
+		if (cs->id == id) return cs;
+	}
+	/* Not found */
+	return 0;
+}
+
+/* increment a cpuset use count */
+void use_cpuset(struct cpuset * cs)
+{
+	atomic_inc(&cs->count);
+}
+
+static void check_cpuset_autoclean(struct cpuset *);
+
+/* decrement a cpuset use count, and maybe autodestroy it */
+/* cpuset_lock MUST NOT BE HELD */
+void release_cpuset(struct cpuset * cs)
+{
+	if (atomic_dec_and_test(&cs->count))	
+		check_cpuset_autoclean(cs);
+}
+
+/* find a free cpuset ID */
+static cpuset_t cpuset_mkid(void)
+{
+	static cpuset_t curid = CPUSET_TOP_ID;
+
+	while (find_cpuset_by_id(++curid));
+
+	return curid;
+}
+
+asmlinkage long sys_cpuset_create(cpuset_t * cpusetp, int flags)
+{
+	struct cpuset * cs;		
+	
+	info("sys_cpuset_create(%016lx, %d) called\n", 
+		(unsigned long) cpusetp, flags);
+
+	/* can only create a strict cs in another strict cs */
+	if ((flags & CPUSET_STRICT) && (!(current->cpuset->flags & CPUSET_STRICT)))	
+		return -EINVAL;
+
+	/* check if given pointer is valid */
+	if (verify_area(VERIFY_WRITE, cpusetp, sizeof(cpuset_t))) 
+		return -EFAULT;
+	
+	cs = (struct cpuset *) kmalloc(sizeof(struct cpuset), GFP_KERNEL);
+	if (!cs)
+		return -ENOMEM;
+
+	cs->flags = flags;
+	atomic_set(&cs->count, 0);
+	INIT_LIST_HEAD(&cs->children);
+	cs->cpus_allowed = 0; 
+	cs->cpus_reserved = 0;
+	cs->cpus_strictly_reserved = 0;
+	cs->has_been_attached = 0;
+	cs->uid = current->uid;
+	cs->suid = current->suid;
+	cs->attach_lock = SPIN_LOCK_UNLOCKED;
+	
+	cs->parent = current->cpuset;
+
+	use_cpuset(cs->parent);
+	
+	write_lock(&cpuset_lock); 
+	
+	cs->id = cpuset_mkid();
+	list_add(&cs->brothers, &cs->parent->children);
+	list_add(&cs->list, &top_cpuset.list);
+	
+	write_unlock(&cpuset_lock);
+
+	if (put_user(cs->id, cpusetp))
+		info("put_user failed !\n");
+
+	return 0;
+}
+
+
+static inline int bad_permission(struct cpuset * cs) 
+{
+	return ((current->euid) && (current->euid != cs->uid) && (current->euid != cs->suid));
+}
+
+static void __cpuset_destroy(struct cpuset * cs);
+
+asmlinkage long sys_cpuset_destroy(cpuset_t cpuset) 
+{
+	struct cpuset * cs;
+
+	info("sys_cpuset_destroy(%d) called\n", cpuset);
+
+	if (cpuset == CPUSET_TOP_ID)
+		return -EINVAL;
+
+	read_lock(&cpuset_lock); 
+	cs = find_cpuset_by_id(cpuset);
+	
+	if (!cs) {
+		read_unlock(&cpuset_lock); 
+		return -EINVAL;
+	}
+
+	use_cpuset(cs);
+	read_unlock(&cpuset_lock); 
+	
+	if (bad_permission(cs)) {
+		release_cpuset(cs);
+		return -EPERM;
+	}
+
+	write_lock(&cpuset_lock);
+	/* there's at least 1 user (us), if there's more, we can't destroy cs */	
+	if (atomic_read(&cs->count) > 1) {
+		write_unlock(&cpuset_lock);
+		release_cpuset(cs);	
+		return -EBUSY;
+	}
+
+	/* everything OK, destroy it */
+	__cpuset_destroy(cs);
+	/* write_unlock(&cpuset_lock) will be done inside __cpuset_destroy */
+
+	return 0;
+}
+
+static void rebuild_reserved_masks(struct cpuset * csp) {
+	cpumask_t r;
+	cpumask_t sr;
+	struct cpuset * cs;
+	info("Updating cpuset %d masks\n", csp->id);
+
+	cpus_clear(r);
+	cpus_clear(sr);
+
+	list_for_each_entry(cs, &csp->children, brothers) {
+		info("	child %d\n", cs->id);
+		cpus_or(r, r, cs->cpus_allowed);
+		if (cs->flags & CPUSET_STRICT)
+			cpus_or(sr, sr, cs->cpus_allowed);
+	}
+	csp->cpus_reserved = r;
+	csp->cpus_strictly_reserved = sr;
+}
+
+/* REALLY destroy a cpuset 
+ * NOTE: 
+ * -> write cpuset_lock must be held 
+ * -> ----------------- WILL BE RELEASED
+ * this ugly hack is necessary to call release_cpuset(parent)
+ */
+static void __cpuset_destroy(struct cpuset * cs)
+{
+        list_del(&cs->list);
+	list_del(&cs->brothers);
+	
+	/* cs will never be top_cpuset, so ->parent exists */
+	rebuild_reserved_masks(cs->parent); 
+
+	write_unlock(&cpuset_lock);
+	release_cpuset(cs->parent);
+
+	kfree(cs);
+}
+	
+/* remove an unused cpuset if it has the CPUSET_AUTOCLEAN flag */
+static void check_cpuset_autoclean(struct cpuset * cs)
+{
+	if (!(cs->flags & CPUSET_AUTOCLEAN)) return; /* not autoclean */
+	if (!cs->has_been_attached) return;	
+
+	write_lock(&cpuset_lock);
+
+	if (atomic_read(&cs->count) > 0) { /* still in use */
+		write_unlock(&cpuset_lock);
+		return; 
+	}
+
+	info("autocleaning cpuset %d\n", cs->id);
+
+	__cpuset_destroy(cs);
+	/* write_unlock(&cpuset_lock) will be done inside __cpuset_destroy */
+}
+
+asmlinkage long sys_cpuset_attach(cpuset_t cpuset, pid_t pid)
+{
+	struct cpuset * cs;
+	struct task_struct * task;
+	
+	info("sys_cpuset_attach(%d, %d) called\n", cpuset, pid);
+
+	read_lock(&cpuset_lock); 
+	cs = find_cpuset_by_id(cpuset);
+
+	if (!cs) {
+		read_unlock(&cpuset_lock); 
+		return -EINVAL;
+	}
+
+
+	use_cpuset(cs);
+
+	read_unlock(&cpuset_lock); 
+	
+	if (bad_permission(cs)) {
+		release_cpuset(cs);
+		return -EPERM;
+	}
+
+	if (!cs->cpus_allowed) { /* cannot attach a cpuset with no CPU */
+		release_cpuset(cs);
+		return -EINVAL;
+	}
+
+	if (pid) {	
+		read_lock(&tasklist_lock);
+
+		task = find_task_by_pid(pid);
+		if (!task) {
+			read_unlock(&tasklist_lock);
+			release_cpuset(cs);
+			return -ESRCH;
+		}
+
+		get_task_struct(task);
+		read_unlock(&tasklist_lock);
+
+		if ((current->euid) && (current->euid != task->uid) && (current->euid != task->suid)) {
+			put_task_struct(task);
+			release_cpuset(cs);
+			return -EPERM;
+		}
+	}
+	else {
+		task = current;
+		get_task_struct(task);
+	}
+
+	set_cpus_allowed(task, cpuset_combine_mask(task->cpus_wanted, cs->cpus_allowed));
+	cs->has_been_attached = 1;
+
+	/* release the current cpu set of the task */
+	/* lock to prevent a race where two cpuset_attach would be called on the same 
+	 * task at the same time, and task->cpuset would be released twice
+         */
+	spin_lock(&task->cpuset_attach_lock);
+	if (!task->cpuset) { /* task with no cpuset ? means it is exiting */ 
+		spin_unlock(&task->cpuset_attach_lock);
+		put_task_struct(task);
+		release_cpuset(cs);
+		return -ESRCH;
+	}	
+	release_cpuset(task->cpuset);
+	/* now lock the cpuset, to protect any running migrate_cpuset...()
+	 * from being disturbed by us
+	 */
+	spin_lock(&cs->attach_lock);
+	task->cpuset = cs;
+	spin_unlock(&cs->attach_lock);
+
+	spin_unlock(&task->cpuset_attach_lock);
+	
+
+	put_task_struct(task);
+
+	/* don't call release_cpuset here, 
+	 * the task being attached to the cpuset 
+	 * is really a new user !
+	 */
+
+	return 0;
+}
+
+		
+static int __cpuset_setaffinity(struct task_struct * task)
+{
+	cpumask_t allowed;
+	cpumask_t last = CPU_MASK_NONE; /* remember : 0 is not a valid mask */
+
+	/* We cannot hold any lock while calling set_cpus_allowed
+	 * since it might sleep
+	 * Thus we try until we are sure we did it with the right mask
+	 */
+	for(;;) {	
+		spin_lock(&task->cpuset_attach_lock);
+		if (!task->cpuset) {
+			/* task exiting */
+			spin_unlock(&task->cpuset_attach_lock);
+			return 0;
+		}
+		allowed = task->cpuset->cpus_allowed;
+		spin_unlock(&task->cpuset_attach_lock);
+
+		if (last == allowed) 
+			return 0;
+
+		int ret;
+		ret = set_cpus_allowed(task, cpuset_combine_mask(task->cpus_wanted, allowed));
+		if (ret < 0) 
+			return ret;
+
+		last = allowed;
+	}
+}
+
+/* Our replacement function for set_cpus_allowed */
+int cpuset_setaffinity(struct task_struct * task, cpumask_t mask)
+{
+	task->cpus_wanted = mask;
+	return __cpuset_setaffinity(task);
+}
+
+/* When a cpuset with attached processes is being realloc'ed CPUs
+ * update the processes' masks and migrate them
+ */
+static void migrate_cpuset_processes(struct cpuset * cs)
+{		
+	struct task_struct *g, *p;
+	/* This should be a RARE use of the cpusets.
+	 * therefore we'll prefer an inefficient operation here
+	 * (searching the whole process list)
+	 * than adding another list_head in task_t
+	 * and locks and list_add for each fork()
+	 */
+
+	/* we need to lock tasklist_lock for reading the processes list
+	 * BUT we cannot call set_cpus_allowed with any spinlock held
+	 * => we need to store the list of task struct in an array
+	 */
+	struct task_struct ** array;
+	int nb = 0;
+	int sz;
+
+	spin_lock(&cs->attach_lock);
+	/* at most cs->count - 1 processes to migrate */
+	sz = atomic_read(&cs->count) - 1;
+	array = (struct task_struct **) kmalloc(sz * sizeof(struct task_struct *), GFP_ATOMIC);
+	if (!array) {
+		spin_unlock(&cs->attach_lock);
+		printk("Error allocating array in migrate_cpuset_processes !\n");
+		return;
+	}
+	/* see linux/sched.h for this nested for/do-while loop */
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (p->cpuset == cs) {
+			if (nb == sz) {
+				printk("migrate_cpuset_processes: array full !\n");
+				goto end_loop; /* break won't work in this double loop */
+			}
+			get_task_struct(p);
+			array[nb++] = p;
+		}
+	} while_each_thread(g, p); 
+end_loop:	
+	read_unlock(&tasklist_lock);
+	spin_unlock(&cs->attach_lock);
+	
+	while(nb) {
+		struct task_struct * p = array[--nb];	
+		__cpuset_setaffinity(p); 
+		put_task_struct(p);
+	}
+	kfree(array);
+}
+
+
+
+/* see if mask b is included in mask a */
+/* old version : #define MASK_INCLUDED(a, b) (((a)|(b)) == (a)) */
+static inline int MASK_INCLUDED(cpumask_t a, cpumask_t b)
+{
+	cpumask_t r;
+	cpus_or(r, a, b);
+	return cpus_equal(r, a);
+}
+
+static inline cpumask_t CPUS_NOT(cpumask_t a)
+{
+	cpus_complement(a);
+	return a;
+}
+
+static inline cpumask_t CPUS_OR(cpumask_t a, cpumask_t b)
+{
+	cpumask_t r;
+	cpus_or(r, a, b);
+	return r;
+}
+
+static inline cpumask_t CPUS_AND(cpumask_t a, cpumask_t b)
+{
+	cpumask_t r;
+	cpus_and(r, a, b);
+	return r;
+}
+
+
+asmlinkage long sys_cpuset_alloc(cpuset_t cpuset, int len, unsigned long * user_mask_ptr)
+{
+	cpumask_t new_mask;
+	cpumask_t old_mask;
+	struct cpuset * cs ;
+	int retval;
+
+	info("sys_cpuset_alloc(%d, ...) called\n", cpuset);
+
+	if (cpuset == CPUSET_TOP_ID)
+		return -EINVAL;
+
+	if (len < sizeof(new_mask))
+		return -EINVAL;
+
+	if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
+		return -EFAULT;
+
+	/* do some sanity checks on the mask */
+	/* must have at least ONE cpu */
+	if (cpus_empty(new_mask))
+		return -EINVAL;
+
+	/* must only have existing CPUs */
+	if (!MASK_INCLUDED(phys_cpu_present_map, new_mask))
+		return -EINVAL;
+
+	
+	info(" with mask %016lx\n", new_mask);
+
+	read_lock(&cpuset_lock); 
+	cs = find_cpuset_by_id(cpuset);
+	
+	
+	if (!cs) {
+		read_unlock(&cpuset_lock); 
+		return -EINVAL;
+	}
+	
+	use_cpuset(cs);
+	read_unlock(&cpuset_lock); 
+
+	if (bad_permission(cs)) {
+		release_cpuset(cs);
+		return -EPERM;
+	}
+		
+	/* lock early - we do not want the parent's masks to change under us */
+	write_lock(&cpuset_lock);
+	/* must only have CPUs in the parent cpuset (if any) */
+	retval = -EACCES;
+	if (!MASK_INCLUDED(cs->parent->cpus_allowed, new_mask)) 
+		goto mask_error;
+	
+	old_mask = cs->cpus_allowed;
+
+	retval = -EBUSY;
+	/* must only have free cpus */
+	if (cs->flags & CPUSET_STRICT) {
+		/* CPUs already in this cs ARE free for us ! -> old_mask */
+		/* The next few lines mean :
+		 * if (!MASK_INCLUDED(~cs->parent->cpus_reserved, new_mask & (~old_mask))) 
+		 * (just obfuscated by the cpus_ macros)
+		 */
+		if (!MASK_INCLUDED(CPUS_NOT(cs->parent->cpus_reserved), 
+				   CPUS_AND(new_mask, CPUS_NOT(old_mask))))
+			goto mask_error;
+	}
+	else {
+		if (!MASK_INCLUDED(CPUS_NOT(cs->parent->cpus_strictly_reserved), new_mask))
+			goto mask_error;
+	}
+
+
+	/* are we trying to FREE reserved CPUs 
+	 * (i.e. reserved by children cpusets)
+	 * from a non-unused cpuset ? */
+	/* if (cs->cpus_reserved & ~new_mask) */
+	if (!cpus_empty(CPUS_AND(cs->cpus_reserved, CPUS_NOT(new_mask))))
+		goto mask_error;
+
+	/* everything is OK */
+	cs->cpus_allowed = new_mask;
+	rebuild_reserved_masks(cs->parent);
+	write_unlock(&cpuset_lock); 
+
+	/* did we change a non-unused cpuset ? */
+	if ((atomic_read(&cs->count) > 1) && !cpus_equal(new_mask, old_mask)) {
+		migrate_cpuset_processes(cs);
+	}
+
+	release_cpuset(cs);
+	return 0;
+
+mask_error:
+	write_unlock(&cpuset_lock); 
+	release_cpuset(cs);
+	return retval;
+}
+
+asmlinkage long sys_cpuset_getfreecpus(int flags, int len, unsigned long * user_mask_ptr)
+{
+	cpumask_t reserved;
+	cpumask_t free;
+
+	int real_len = sizeof(unsigned long);
+	if (len < real_len)
+		return -EINVAL;
+	
+	if (flags & CPUSET_STRICT)
+		reserved = current->cpuset->cpus_reserved;
+	else	
+		reserved = current->cpuset->cpus_strictly_reserved;
+
+	free = CPUS_AND(current->cpuset->cpus_allowed, CPUS_NOT(reserved));
+
+	if (copy_to_user(user_mask_ptr, &free, real_len))
+		return -EFAULT;
+
+	return real_len;
+}
+
+/*************************************************************
+ ***************** /proc/cpusets stuff ***********************
+ *************************************************************
+ */
+#ifdef CONFIG_CPUSETS_PROC
+
+static void *proc_cpusets_start(struct seq_file *m, loff_t *pos)
+{
+        loff_t n = *pos;
+        struct list_head *p;
+
+	read_lock(&cpuset_lock); 
+        if (!n) seq_puts(m, "cpusets info \n");
+        
+	p = &top_cpuset.list;
+        while (n--) {
+                p = p->next;
+                if (p == &top_cpuset.list)
+                        return NULL;
+        }
+        return list_entry(p, struct cpuset, list);
+}
+
+static void *proc_cpusets_next(struct seq_file *m, void *p, loff_t *pos)
+{
+        struct cpuset * cs = p;
+        ++*pos;
+        return cs->list.next == &top_cpuset.list ? NULL
+                : list_entry(cs->list.next, struct cpuset, list);
+}
+
+/* How many chars needed to print a long (as a mask) ? */
+#define CHARS_FOR_LONG 	(BITS_PER_LONG / 4)
+#define CFL 		CHARS_FOR_LONG
+static void sprint_mask(char * buf, cpumask_t mask)
+{
+#ifdef CPU_ARRAY_SIZE	
+	int l;
+	for (l = CPU_ARRAY_SIZE - 1; l>=0; l--) {
+		/* XXX only 64 bits long supported here ! */
+		sprintf(buf, "%016lx", mask.mask[l]);
+		buf += CFL;
+	}
+#else
+	/* XXX only 64 bits long supported here ! */
+	sprintf(buf, "%016lx", mask);
+#endif
+}
+		
+
+static int proc_cpusets_show(struct seq_file *m, void *p)
+{
+        struct cpuset * cs = p;
+#ifdef CPU_ARRAY_SIZE
+	char maskbuf[CPU_ARRAY_SIZE * CFL + 1];
+#else
+	char maskbuf[CFL + 1];
+#endif
+
+	seq_printf(m, "cpuset %d {\n"
+		"\tparent = %d\n"
+		"\tflags = %d\n"
+		"\tcount = %d\n"
+		"\thba = %d\n"
+		"\tuid & suid = %d & %d\n",
+		cs->id, cs->parent ? cs->parent->id : -1, 
+		cs->flags, atomic_read(&cs->count), cs->has_been_attached,
+		cs->uid, cs->suid);
+
+	sprint_mask(maskbuf, cs->cpus_allowed);
+	seq_printf(m,"\tcpus_allowed = %s\n", maskbuf);
+	sprint_mask(maskbuf, cs->cpus_reserved);
+	seq_printf(m,"\tcpus_reserved = %s\n", maskbuf);
+	sprint_mask(maskbuf, cs->cpus_strictly_reserved);
+	seq_printf(m,"\tcpus_strictly_reserved = %s\n", maskbuf);
+
+	seq_printf(m, "}\n\n");
+
+	return 0;
+}
+
+static void proc_cpusets_stop(struct seq_file *m, void *p)
+{
+	read_unlock(&cpuset_lock);
+}
+
+static struct seq_operations cpusets_op = {
+	.start =	proc_cpusets_start,
+	.next =		proc_cpusets_next,
+	.stop =		proc_cpusets_stop,
+	.show =		proc_cpusets_show
+};
+
+
+static int proc_cpusets_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cpusets_op);
+}
+
+static struct file_operations proc_cpusets_operations = {
+	.open		= proc_cpusets_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+
+static int __init proc_cpusets_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("cpusets", 0, NULL);
+	if (entry)
+		entry->proc_fops = &proc_cpusets_operations;
+	return 0;
+}
+
+/*************************************************************
+ *********** /proc/xxx/cpuset ********************************
+ *************************************************************
+ */
+int proc_pid_cpuset(struct task_struct *task, char *buffer)
+{
+	return sprintf(buffer, "%d\n", task->cpuset->id);
+}
+
+#endif /* CONFIG_CPUSETS_PROC */	
+

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2003-09-25 14:36 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2003-09-24 15:59 CPUSET Proposal Simon Derr
2003-09-24 22:06 ` David Mosberger
2003-09-24 22:27 ` Erich Focht
2003-09-25 14:36 ` Simon Derr

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox