[RFC][PATCH 1/1] MAZE: Mazed processes monitor

All of lore.kernel.org
 help / color / mirror / Atom feed

* [RFC][PATCH 1/1] MAZE: Mazed processes monitor
@ 2008-05-13 11:47 Hirofumi Nakagawa
  2008-05-13 16:04 ` Andrew Morton
  0 siblings, 1 reply; 2+ messages in thread
From: Hirofumi Nakagawa @ 2008-05-13 11:47 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm

This patch for linux-2.6.25-mm1 that is tested on x86_64 hardware.

Signed-off-by: Hirofumi Nakagawa <hnakagawa@miraclelinux.com>
---
 include/linux/maze.h  |   92 +++++++++
 include/linux/sched.h |    3
 init/Kconfig          |   10 +
 init/main.c           |    2
 kernel/Makefile       |    1
 kernel/exit.c         |    2
 kernel/fork.c         |    2
 kernel/maze.c         |  466 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched.c        |    3
 9 files changed, 581 insertions(+)

--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25-mm1-maze/include/linux/maze.h	2008-05-13 17:52:23.000000000 +0900
@@ -0,0 +1,92 @@
+/*
+ *  Copyright (C) 2007-2008 MIRACLE LINUX Corp.
+ *
+ *  Written by Hirofumi Nakagawa <hnakagawa@miraclelinux.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License as
+ *	published by the Free Software Foundation, version 2.
+ */
+#ifndef _LINUX_MAZE_H
+#define _LINUX_MAZE_H
+
+#ifdef __KERNEL__
+
+#include <linux/sched.h>
+#include <linux/list.h>
+
+struct maze_context {
+	struct task_struct *task;
+	int pid;
+
+	/* The stime of task when the maze count was reseted */
+	cputime_t reset_stime;
+	/* The utime of task when the maze count was reseted */
+	cputime_t reset_utime;
+
+	/* The soft limit of maze count */
+	unsigned long soft_limit;
+	/* THe hard limit of maze count */
+	unsigned long hard_limit;
+
+	/* The send signal when the maze count reach soft limit */
+	int soft_signal;
+	/* The send signal when the maze count reach hard limit */
+	int hard_signal;
+
+#define MAZE_SENT_SOFT_SIG  1
+#define MAZE_SENT_HARD_SIG  2
+	/* The flags of sent signal */
+	unsigned long flags;
+
+	/* This value is 1, if maze_context is enqueueing */
+	int enqueue;
+	/* This value is cpu number of enqueueing */
+	int enqueue_cpu;
+
+	/* This value is 1, if preempt_notifier is registered */
+	atomic_t registered_notifier;
+
+	struct preempt_notifier notifier;
+
+	/* This list_head is linked from the maze_queue */
+	struct list_head queue;
+	/* This list_head is linked from the maze_list */
+	struct list_head list;
+};
+
+#ifdef CONFIG_MAZE
+
+/* This function is called start_kernel() */
+extern void maze_init_early(void);
+
+/* This function is called do_exit() */
+extern void maze_exit(struct task_struct *task);
+
+/* This function is called copy_process() */
+extern void maze_fork(struct task_struct *task);
+
+/* This function is called sys_sched_yield() */
+extern void maze_sched_yield(struct task_struct *task);
+
+#else  /* !CONFIG_MAZE */
+
+static inline void maze_init_early(void)
+{
+}
+
+static inline maze_exit(struct task_struct *task)
+{
+}
+
+static inline void maze_fork(struct task_struct *task)
+{
+}
+
+static inline void maze_sched_yield(struct task_struct *task)
+{
+}
+#endif /* CONFIG_MAZE */
+
+#endif /* __KERNEL__    */
+#endif /* _LINUX_MAZE_H */
--- linux-2.6.25-mm1-maze.orig/include/linux/sched.h	2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/include/linux/sched.h	2008-05-13 17:52:23.000000000 +0900
@@ -1254,6 +1254,9 @@ struct task_struct {
 	/* cg_list protected by css_set_lock and tsk->alloc_lock */
 	struct list_head cg_list;
 #endif
+#ifdef CONFIG_MAZE
+	struct maze_context *maze_context;
+#endif
 #ifdef CONFIG_FUTEX
 	struct robust_list_head __user *robust_list;
 #ifdef CONFIG_COMPAT
--- linux-2.6.25-mm1-maze.orig/init/Kconfig	2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/init/Kconfig	2008-05-13 17:52:23.000000000 +0900
@@ -371,6 +371,16 @@ config RESOURCE_COUNTERS
           infrastructure that works with cgroups
 	depends on CGROUPS

+config MAZE
+	bool "MAZE monitor support"
+	select PREEMPT_NOTIFIERS
+	help
+	  MAZE is a function to monitor mazed processes which use excessive CPU cycles.
+	  This is a CGL (Carrier Grade Linux) requirement (AVL.14.0). MAZE detects such
+	  processes and sends specified signals to the processes for their termination.
+
+	  Say N if unsure.
+
 config MM_OWNER
 	bool

--- linux-2.6.25-mm1-maze.orig/init/main.c	2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/init/main.c	2008-05-13 17:52:23.000000000 +0900
@@ -61,6 +61,7 @@
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/idr.h>
+#include <linux/maze.h>

 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -543,6 +544,7 @@ asmlinkage void __init start_kernel(void
 	boot_init_stack_canary();
 	debug_objects_early_init();
 	cgroup_init_early();
+	maze_init_early();

 	local_irq_disable();
 	early_boot_irqs_off();
--- linux-2.6.25-mm1-maze.orig/kernel/Makefile	2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/kernel/Makefile	2008-05-13 17:52:23.000000000 +0900
@@ -80,6 +80,7 @@ obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_FTRACE) += trace/
 obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_MAZE) += maze.o

 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
--- linux-2.6.25-mm1-maze.orig/kernel/exit.c	2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/kernel/exit.c	2008-05-13 17:52:23.000000000 +0900
@@ -44,6 +44,7 @@
 #include <linux/resource.h>
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/maze.h>

 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1058,6 +1059,7 @@ NORET_TYPE void do_exit(long code)
 	__exit_fs(tsk);
 	check_stack_usage();
 	exit_thread();
+	maze_exit(tsk);
 	cgroup_exit(tsk, 1);
 	exit_keys(tsk);

--- linux-2.6.25-mm1-maze.orig/kernel/fork.c	2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/kernel/fork.c	2008-05-13 17:52:23.000000000 +0900
@@ -53,6 +53,7 @@
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
+#include <linux/maze.h>

 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1363,6 +1364,7 @@ static struct task_struct *copy_process(
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
+	maze_fork(p);
 	return p;

 bad_fork_free_pid:
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25-mm1-maze/kernel/maze.c	2008-05-13 20:07:55.000000000 +0900
@@ -0,0 +1,466 @@
+/*
+ *  Copyright (C) 2007-2008 MIRACLE LINUX Corp.
+ *
+ *  Written by Hirofumi Nakagawa <hnakagawa@miraclelinux.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License as
+ *	published by the Free Software Foundation, version 2.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/capability.h>
+#include <linux/maze.h>
+
+#define ERRPRINT(str, ...) printk("ERROR [%s] " str, __FUNCTION__, ##__VA_ARGS__);
+
+/* The root directory of procfs*/
+static struct proc_dir_entry *maze_proc_dir;
+
+static int maze_initialized;
+
+static struct list_head maze_list;
+static spinlock_t maze_list_lock;
+
+static int maze_timer_interval = 10;
+
+DEFINE_PER_CPU(struct list_head, maze_queue);
+DEFINE_PER_CPU(spinlock_t, maze_queue_lock);
+
+DEFINE_PER_CPU(struct timer_list, maze_timer);
+
+static inline void reset_maze_count(struct maze_context *context)
+{
+	context->reset_utime = context->task->utime;
+	context->reset_stime = context->task->stime;
+	context->flags = 0;
+}
+
+static inline cputime_t get_maze_count(struct maze_context *context)
+{
+	return (context->task->utime - context->reset_utime) +
+		(context->task->stime - context->reset_stime);
+}
+
+/*
+ * This function is called start_kernel()
+ */
+void maze_init_early(void)
+{
+	init_task.maze_context = NULL;
+}
+
+/*
+ * This function is called do_exit()
+ */
+void maze_exit(struct task_struct *task)
+{
+	unsigned long flags;
+
+	task_lock(task);
+	if (task->maze_context) {
+		spin_lock_irqsave(&per_cpu(maze_queue_lock,
+								   task->maze_context->enqueue_cpu), flags);
+		if (task->maze_context->enqueue)
+			list_del(&task->maze_context->queue);
+		spin_unlock_irqrestore(&per_cpu(maze_queue_lock,
+										task->maze_context->enqueue_cpu), flags);
+
+		spin_lock(&maze_list_lock);
+		list_del(&task->maze_context->list);
+		spin_unlock(&maze_list_lock);
+
+		kfree(task->maze_context);
+		task->maze_context = NULL;
+	}
+	task_unlock(task);
+}
+
+static inline void copy_limit_and_signal(struct maze_context *to,
+										 struct maze_context *from)
+{
+	to->soft_limit = from->soft_limit;
+	to->hard_limit = from->hard_limit;
+	to->soft_signal = from->soft_signal;
+	to->hard_signal = from->hard_signal;
+}
+
+/*
+ * Must be called under task_lock().
+ */
+static inline void enqueue(struct maze_context *context)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&__get_cpu_var(maze_queue_lock), flags);
+	if (!context->enqueue) {
+		list_add(&context->queue, &__get_cpu_var(maze_queue));
+		context->enqueue_cpu = smp_processor_id();
+		context->enqueue = 1;
+	}
+	spin_unlock_irqrestore(&__get_cpu_var(maze_queue_lock), flags);
+}
+
+static void sched_in_event(struct preempt_notifier *notifier, int cpu)
+{
+	task_lock(current);
+	if (current->maze_context) {
+		if (current->state != TASK_RUNNING)
+			reset_maze_count(current->maze_context);
+		enqueue(current->maze_context);
+	}
+	task_unlock(current);
+}
+
+static void sched_out_event(struct preempt_notifier *notifier,
+							struct task_struct *next)
+{
+	task_lock(next);
+	if (next->maze_context)
+		enqueue(next->maze_context);
+	task_unlock(next);
+}
+
+static struct preempt_ops preempt_ops = {
+	.sched_in = sched_in_event,
+	.sched_out = sched_out_event,
+};
+
+/*
+ * This function is called copy_process()
+ */
+void maze_fork(struct task_struct *task)
+{
+	task->maze_context = NULL;
+
+	if (!current->maze_context)
+		return;
+
+	task_lock(task);
+
+	task->maze_context = kzalloc(sizeof(struct maze_context), GFP_ATOMIC);
+	if (unlikely(!task->maze_context)) {
+		ERRPRINT("fail to alloc maze_context.\n");
+		goto task_unlock;
+	}
+	task->maze_context->task = task;
+	task->maze_context->pid = task->pid;
+
+	copy_limit_and_signal(task->maze_context, current->maze_context);
+	preempt_notifier_init(&task->maze_context->notifier, &preempt_ops);
+
+	spin_lock(&maze_list_lock);
+	list_add(&task->maze_context->list, &maze_list);
+	spin_unlock(&maze_list_lock);
+
+task_unlock:
+	task_unlock(task);
+}
+
+/*
+ * This function is called sys_sched_yield()
+ */
+void maze_sched_yield(struct task_struct *task)
+{
+	if (task->maze_context)
+		reset_maze_count(task->maze_context);
+}
+
+static void soft_limit_event(struct task_struct *task)
+{
+	send_sig(task->maze_context->soft_signal, task, 1);
+	task->maze_context->flags |= MAZE_SENT_SOFT_SIG;
+}
+
+static void hard_limit_event(struct task_struct *task)
+{
+	send_sig(task->maze_context->hard_signal, task, 1);
+	task->maze_context->flags |= MAZE_SENT_HARD_SIG;
+}
+
+static inline void set_preempt_notifir(struct maze_context *context)
+{
+	if (!atomic_xchg(&context->registered_notifier, 1))
+		preempt_notifier_register(&context->notifier);
+}
+
+static int add_maze_context(struct maze_context *context)
+{
+	struct task_struct *task;
+	int ret = 0;
+
+	rcu_read_lock();
+
+	task = find_task_by_pid(context->pid);
+	if (unlikely(!task))
+		goto read_unlock;
+
+	task_lock(task);
+
+	if (!task->maze_context) {
+		task->maze_context = context;
+		context->task = task;
+
+		spin_lock(&maze_list_lock);
+		list_add(&context->list, &maze_list);
+		spin_unlock(&maze_list_lock);
+
+		reset_maze_count(context);
+
+		preempt_notifier_init(&context->notifier, &preempt_ops);
+		if (current == task)
+		   set_preempt_notifir(context);
+	} else {
+		copy_limit_and_signal(task->maze_context, context);
+		ret = 1;
+	}
+
+	task_unlock(task);
+read_unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
+static int build_maze_context(const char *read_line, struct maze_context *context)
+{
+	unsigned long soft_limit, hard_limit;
+	int soft_signal, hard_signal;
+	int pid, res;
+
+	res = sscanf(read_line, "%d %ld %ld %d %d", &pid, &soft_limit,
+				 &hard_limit, &soft_signal, &hard_signal);
+
+	if (res != 5 || pid < 0 ||
+		soft_limit < 0 || hard_limit < 0 ||
+		soft_signal < 0 || hard_signal < 0)
+		return -EINVAL;
+
+	context->pid = pid;
+	context->soft_limit = soft_limit;
+	context->hard_limit = hard_limit;
+	context->soft_signal = soft_signal;
+	context->hard_signal = hard_signal;
+
+	return 0;
+}
+
+static void timer_handler(unsigned long data);
+
+/*
+ * Setup softirq timer
+ */
+static void continue_timer(int cpu)
+{
+	struct timer_list *t;
+
+	t = &per_cpu(maze_timer, cpu);
+	t->function = timer_handler;
+	t->expires = jiffies + maze_timer_interval;
+	init_timer(t);
+	add_timer_on(t, cpu);
+}
+
+static inline void check_limit(struct maze_context *context)
+{
+	cputime_t t = get_maze_count(context);
+
+	if (!(context->flags & MAZE_SENT_SOFT_SIG)) {
+		if (t >= context->soft_limit)
+			soft_limit_event(context->task);
+	} else if (!(context->flags & MAZE_SENT_HARD_SIG)) {
+		if (t >= context->hard_limit)
+			hard_limit_event(context->task);
+	}
+}
+
+static void timer_handler(unsigned long data)
+{
+	struct maze_context *context, *next;
+
+	spin_lock(&__get_cpu_var(maze_queue_lock));
+
+	if (current->maze_context) {
+		set_preempt_notifir(current->maze_context);
+		if (!current->maze_context->enqueue)
+			check_limit(current->maze_context);
+	}
+
+	if (!list_empty(&__get_cpu_var(maze_queue))) {
+		list_for_each_entry_safe (context, next,
+								  &__get_cpu_var(maze_queue), queue) {
+			check_limit(context);
+			list_del(&context->queue);
+			context->enqueue = 0;
+		}
+	}
+
+	spin_unlock(&__get_cpu_var(maze_queue_lock));
+
+	continue_timer(smp_processor_id());
+}
+
+
+static int maze_entries_file_show(struct seq_file *seq, void *nouse)
+{
+	struct maze_context *context;
+
+	spin_lock(&maze_list_lock);
+
+	list_for_each_entry(context, &maze_list, list) {
+		seq_printf(seq, "pid:%5d   ", context->pid);
+		seq_printf(seq, "count:%6ld  ", get_maze_count(context));
+		seq_printf(seq, "soft limit:%6ld  ", context->soft_limit);
+		seq_printf(seq, "hard limit:%6ld  ", context->hard_limit);
+		seq_printf(seq, "soft signal:%2d  ", context->soft_signal);
+		seq_printf(seq, "hard signal:%2d  ", context->hard_signal);
+		seq_printf(seq, "\n");
+	}
+
+	spin_unlock(&maze_list_lock);
+
+	return 0;
+}
+
+/*
+ * Open operation of /proc/maze/entries
+ */
+static int maze_entries_file_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, maze_entries_file_show, NULL);
+}
+
+/*
+ * Release operation of /proc/maze/entries
+ */
+static int maze_entries_file_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+/*
+ * Write operation of /proc/maze/entries
+ */
+static ssize_t maze_entries_file_write(struct file *file,
+					const char __user *buffer,
+					size_t count, loff_t *ppos)
+{
+	struct maze_context *context;
+	char read_line[32];
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (count > sizeof(read_line) - 1)
+		return -EINVAL;
+
+	if (copy_from_user(read_line, buffer, count))
+		return -EFAULT;
+
+	read_line[count] = '\0';
+
+	context = kzalloc(sizeof(struct maze_context), GFP_ATOMIC);
+	if (unlikely(!context)) {
+		ERRPRINT("fail to alloc maze_context.\n");
+		return -ENOMEM;
+	}
+
+	ret = build_maze_context(read_line, context);
+	if (ret) {
+		kfree(context);
+		return ret;
+	}
+
+	if (add_maze_context(context))
+		kfree(context);
+
+	return count;
+}
+
+static struct file_operations maze_entries_file_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = maze_entries_file_open,
+	.read	 = seq_read,
+	.write   = maze_entries_file_write,
+	.llseek	 = seq_lseek,
+	.release = maze_entries_file_release,
+};
+
+/*
+ * Creating /proc/maze/
+ */
+static inline int init_dir(void)
+{
+	maze_proc_dir =
+		create_proc_entry("maze",
+						  S_IFDIR | S_IRUGO | S_IXUGO,
+						  NULL);
+
+	if (!maze_proc_dir)
+		panic("fail to create /proc/maze\n");
+
+	return 0;
+}
+
+
+/*
+ * Creating /proc/maze/entries
+ */
+static inline int init_entries(void)
+{
+	struct proc_dir_entry *p;
+
+	p = create_proc_entry("entries", S_IRUGO, maze_proc_dir);
+	if (p == NULL)
+		panic("fail to create /proc/maze/entries\n");
+
+	p->proc_fops = &maze_entries_file_ops;
+
+	return 0;
+}
+
+/*
+ * Initializing maze procfs
+ */
+static void __init init_proc(void)
+{
+	init_dir();
+	init_entries();
+}
+
+static void __init init_cpu(int cpu)
+{
+	spin_lock_init(&per_cpu(maze_queue_lock, cpu));
+	INIT_LIST_HEAD(&per_cpu(maze_queue, cpu));
+
+	/* Setup softirq timer */
+	continue_timer(cpu);
+}
+
+static int __init maze_init(void)
+{
+	int cpu;
+
+	printk(KERN_INFO "Maze: Initializing\n");
+	maze_initialized = 1;
+	init_proc();
+
+	spin_lock_init(&maze_list_lock);
+	INIT_LIST_HEAD(&maze_list);
+
+	for_each_online_cpu(cpu)
+		init_cpu(cpu);
+
+	return 0;
+}
+late_initcall(maze_init);
--- linux-2.6.25-mm1-maze.orig/kernel/sched.c	2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/kernel/sched.c	2008-05-13 17:52:23.000000000 +0900
@@ -68,6 +68,7 @@
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
 #include <linux/ftrace.h>
+#include <linux/maze.h>

 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -5157,6 +5158,8 @@ asmlinkage long sys_sched_yield(void)
 	_raw_spin_unlock(&rq->lock);
 	preempt_enable_no_resched();

+	maze_sched_yield(current);
+
 	schedule();

 	return 0;




^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [RFC][PATCH 1/1] MAZE: Mazed processes monitor
  2008-05-13 11:47 [RFC][PATCH 1/1] MAZE: Mazed processes monitor Hirofumi Nakagawa
@ 2008-05-13 16:04 ` Andrew Morton
  0 siblings, 0 replies; 2+ messages in thread
From: Andrew Morton @ 2008-05-13 16:04 UTC (permalink / raw)
  To: Hirofumi Nakagawa; +Cc: linux-kernel

On Tue, 13 May 2008 20:47:32 +0900 Hirofumi Nakagawa <hnakagawa@miraclelinux.com> wrote:

> This patch for linux-2.6.25-mm1 that is tested on x86_64 hardware.
> 
> ...
>
> +static struct proc_dir_entry *maze_proc_dir;
> +
> +static int maze_initialized;
> +
> +static struct list_head maze_list;
> +static spinlock_t maze_list_lock;
> +
> +static int maze_timer_interval = 10;
> +
> +DEFINE_PER_CPU(struct list_head, maze_queue);
> +DEFINE_PER_CPU(spinlock_t, maze_queue_lock);
> +
> +DEFINE_PER_CPU(struct timer_list, maze_timer);

These three can be static?

Please document the data structures here at their definition sites with
code comments: what they are used for, what their locking rules are. 
Also, a per-cpu spinlock is somewhat unusual and a few words describing
why it is needed would be useful.

> +static inline void reset_maze_count(struct maze_context *context)
> +{
> +	context->reset_utime = context->task->utime;
> +	context->reset_stime = context->task->stime;
> +	context->flags = 0;
> +}
> +
> +static inline cputime_t get_maze_count(struct maze_context *context)
> +{
> +	return (context->task->utime - context->reset_utime) +
> +		(context->task->stime - context->reset_stime);
> +}
> +
> +/*
> + * This function is called start_kernel()
> + */
> +void maze_init_early(void)
> +{
> +	init_task.maze_context = NULL;
> +}
> +
> +/*
> + * This function is called do_exit()
> + */
> +void maze_exit(struct task_struct *task)
> +{
> +	unsigned long flags;
> +
> +	task_lock(task);
> +	if (task->maze_context) {
> +		spin_lock_irqsave(&per_cpu(maze_queue_lock,
> +								   task->maze_context->enqueue_cpu), flags);

There's a moderate amount of whitespace brokenness in here. 
scripts/checkpatch.pl detects some of it.

> +		if (task->maze_context->enqueue)
> +			list_del(&task->maze_context->queue);
> +		spin_unlock_irqrestore(&per_cpu(maze_queue_lock,
> +										task->maze_context->enqueue_cpu), flags);
> +
> +		spin_lock(&maze_list_lock);
> +		list_del(&task->maze_context->list);
> +		spin_unlock(&maze_list_lock);
> +
> +		kfree(task->maze_context);
> +		task->maze_context = NULL;
> +	}
> +	task_unlock(task);
> +}
>
> ...
>
> +/*
> + * Must be called under task_lock().
> + */
> +static inline void enqueue(struct maze_context *context)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&__get_cpu_var(maze_queue_lock), flags);
> +	if (!context->enqueue) {
> +		list_add(&context->queue, &__get_cpu_var(maze_queue));
> +		context->enqueue_cpu = smp_processor_id();
> +		context->enqueue = 1;
> +	}
> +	spin_unlock_irqrestore(&__get_cpu_var(maze_queue_lock), flags);
> +}

The code uses `inline' too much, and is probably slower as a result. 
Most of all of them should be removed, please.

> +static void sched_in_event(struct preempt_notifier *notifier, int cpu)
> +{
> +	task_lock(current);
> +	if (current->maze_context) {
> +		if (current->state != TASK_RUNNING)
> +			reset_maze_count(current->maze_context);
> +		enqueue(current->maze_context);
> +	}
> +	task_unlock(current);
> +}

Please update the description of task_lock() in sched.h when adding new
rules to it.  I assume that it protects task_struct.maze_context.

I'm a little surprised to see a global lock being taken under
task_lock(): generally I think of task_lock() as a low-level thing
which protects only task internals.  I assume this code was runtime
tested with lockdep enabled?

But that doesn't mean that the chosen lock ranking is desirable.  From
a quick look the locking does seem logical.

> +static void sched_out_event(struct preempt_notifier *notifier,
> +							struct task_struct *next)
> +{
> +	task_lock(next);
> +	if (next->maze_context)
> +		enqueue(next->maze_context);
> +	task_unlock(next);
> +}
> +
> +static struct preempt_ops preempt_ops = {
> +	.sched_in = sched_in_event,
> +	.sched_out = sched_out_event,
> +};
> +
> +/*
> + * This function is called copy_process()
> + */

Some additional description here would be useful.  What does the function do?

> +void maze_fork(struct task_struct *task)
> +{
> +	task->maze_context = NULL;
> +
> +	if (!current->maze_context)
> +		return;
> +
> +	task_lock(task);
> +
> +	task->maze_context = kzalloc(sizeof(struct maze_context), GFP_ATOMIC);

GFP_ATOMIC is weak, and can fail.  With some reorganisation this could
be moved outside the lock and switched to GFP_KERNEL.

> +	if (unlikely(!task->maze_context)) {
> +		ERRPRINT("fail to alloc maze_context.\n");
> +		goto task_unlock;
> +	}
> +	task->maze_context->task = task;
> +	task->maze_context->pid = task->pid;
> +
> +	copy_limit_and_signal(task->maze_context, current->maze_context);
> +	preempt_notifier_init(&task->maze_context->notifier, &preempt_ops);
> +
> +	spin_lock(&maze_list_lock);
> +	list_add(&task->maze_context->list, &maze_list);
> +	spin_unlock(&maze_list_lock);
> +
> +task_unlock:
> +	task_unlock(task);
> +}
>
> ...
>
> +static inline void set_preempt_notifir(struct maze_context *context)

"notifier"

> +{
> +	if (!atomic_xchg(&context->registered_notifier, 1))
> +		preempt_notifier_register(&context->notifier);
> +}
> +
> +static int add_maze_context(struct maze_context *context)
> +{
> +	struct task_struct *task;
> +	int ret = 0;
> +
> +	rcu_read_lock();
> +
> +	task = find_task_by_pid(context->pid);
> +	if (unlikely(!task))
> +		goto read_unlock;
> +
> +	task_lock(task);
> +
> +	if (!task->maze_context) {
> +		task->maze_context = context;
> +		context->task = task;
> +
> +		spin_lock(&maze_list_lock);
> +		list_add(&context->list, &maze_list);
> +		spin_unlock(&maze_list_lock);
> +
> +		reset_maze_count(context);
> +
> +		preempt_notifier_init(&context->notifier, &preempt_ops);
> +		if (current == task)
> +		   set_preempt_notifir(context);

Please use only hard tabs for code indenting.

> +	} else {
> +		copy_limit_and_signal(task->maze_context, context);
> +		ret = 1;
> +	}
> +
> +	task_unlock(task);
> +read_unlock:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
> +static int build_maze_context(const char *read_line, struct maze_context *context)
> +{
> +	unsigned long soft_limit, hard_limit;
> +	int soft_signal, hard_signal;
> +	int pid, res;
> +
> +	res = sscanf(read_line, "%d %ld %ld %d %d", &pid, &soft_limit,
> +				 &hard_limit, &soft_signal, &hard_signal);
> +
> +	if (res != 5 || pid < 0 ||
> +		soft_limit < 0 || hard_limit < 0 ||
> +		soft_signal < 0 || hard_signal < 0)
> +		return -EINVAL;
> +
> +	context->pid = pid;
> +	context->soft_limit = soft_limit;
> +	context->hard_limit = hard_limit;
> +	context->soft_signal = soft_signal;
> +	context->hard_signal = hard_signal;
> +
> +	return 0;
> +}

pids are not unique in a containerised system.  What are the
implications of this?

> +static void timer_handler(unsigned long data);
> +
> +/*
> + * Setup softirq timer
> + */
> +static void continue_timer(int cpu)
> +{
> +	struct timer_list *t;
> +
> +	t = &per_cpu(maze_timer, cpu);
> +	t->function = timer_handler;
> +	t->expires = jiffies + maze_timer_interval;
> +	init_timer(t);

setup_timer()

> +	add_timer_on(t, cpu);
> +}
>
> ...
>
> +static void timer_handler(unsigned long data)
> +{
> +	struct maze_context *context, *next;
> +
> +	spin_lock(&__get_cpu_var(maze_queue_lock));
> +
> +	if (current->maze_context) {
> +		set_preempt_notifir(current->maze_context);
> +		if (!current->maze_context->enqueue)
> +			check_limit(current->maze_context);
> +	}
> +
> +	if (!list_empty(&__get_cpu_var(maze_queue))) {
> +		list_for_each_entry_safe (context, next,
> +								  &__get_cpu_var(maze_queue), queue) {
> +			check_limit(context);
> +			list_del(&context->queue);
> +			context->enqueue = 0;
> +		}
> +	}
> +
> +	spin_unlock(&__get_cpu_var(maze_queue_lock));
> +
> +	continue_timer(smp_processor_id());
> +}

It is odd that the timer runs every ten jiffies.  Because jiffies can
alter by a factor of ten, depending upon Kconfig settings and
architecture, etc.

Having numerous timers expire once per ten jiffies will be a power
consumption problem for some people.  Can this be fixed?

There's not really enough information in either code comments or the
changelog to permit me to understand what this timer is here for.

> +
> +static int maze_entries_file_show(struct seq_file *seq, void *nouse)
> +{
> +	struct maze_context *context;
> +
> +	spin_lock(&maze_list_lock);
> +
> +	list_for_each_entry(context, &maze_list, list) {
> +		seq_printf(seq, "pid:%5d   ", context->pid);
> +		seq_printf(seq, "count:%6ld  ", get_maze_count(context));
> +		seq_printf(seq, "soft limit:%6ld  ", context->soft_limit);
> +		seq_printf(seq, "hard limit:%6ld  ", context->hard_limit);
> +		seq_printf(seq, "soft signal:%2d  ", context->soft_signal);
> +		seq_printf(seq, "hard signal:%2d  ", context->hard_signal);
> +		seq_printf(seq, "\n");
> +	}
> +
> +	spin_unlock(&maze_list_lock);
> +
> +	return 0;
> +}
>
> ...
>
> +/*
> + * Write operation of /proc/maze/entries
> + */
> +static ssize_t maze_entries_file_write(struct file *file,
> +					const char __user *buffer,
> +					size_t count, loff_t *ppos)
> +{
> +	struct maze_context *context;
> +	char read_line[32];
> +	int ret;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	if (count > sizeof(read_line) - 1)
> +		return -EINVAL;
> +
> +	if (copy_from_user(read_line, buffer, count))
> +		return -EFAULT;
> +
> +	read_line[count] = '\0';
> +
> +	context = kzalloc(sizeof(struct maze_context), GFP_ATOMIC);

Use GFP_KERNEL.

> +	if (unlikely(!context)) {
> +		ERRPRINT("fail to alloc maze_context.\n");
> +		return -ENOMEM;
> +	}
> +
> +	ret = build_maze_context(read_line, context);
> +	if (ret) {
> +		kfree(context);
> +		return ret;
> +	}
> +
> +	if (add_maze_context(context))
> +		kfree(context);
> +
> +	return count;
> +}

It looks like this can leak the memory at *context?

> +static struct file_operations maze_entries_file_ops = {
> +	.owner	 = THIS_MODULE,
> +	.open	 = maze_entries_file_open,
> +	.read	 = seq_read,
> +	.write   = maze_entries_file_write,
> +	.llseek	 = seq_lseek,
> +	.release = maze_entries_file_release,
> +};

Please document newly-added /proc files in Documentation/filesystems/proc.txt

> +/*
> + * Creating /proc/maze/
> + */
> +static inline int init_dir(void)
> +{
> +	maze_proc_dir =
> +		create_proc_entry("maze",
> +						  S_IFDIR | S_IRUGO | S_IXUGO,
> +						  NULL);
> +
> +	if (!maze_proc_dir)
> +		panic("fail to create /proc/maze\n");
> +
> +	return 0;
> +}

uninline, fix whitespace...

> +
> +/*
> + * Creating /proc/maze/entries
> + */
> +static inline int init_entries(void)
> +{
> +	struct proc_dir_entry *p;
> +
> +	p = create_proc_entry("entries", S_IRUGO, maze_proc_dir);
> +	if (p == NULL)
> +		panic("fail to create /proc/maze/entries\n");
> +
> +	p->proc_fops = &maze_entries_file_ops;
> +
> +	return 0;
> +}
>
> ...
>
> +static int __init maze_init(void)
> +{
> +	int cpu;
> +
> +	printk(KERN_INFO "Maze: Initializing\n");
> +	maze_initialized = 1;
> +	init_proc();
> +
> +	spin_lock_init(&maze_list_lock);

Remove this, use DEFINE_SPINLOCK().

> +	INIT_LIST_HEAD(&maze_list);

Remove this, use LIST_HEAD().

> +	for_each_online_cpu(cpu)
> +		init_cpu(cpu);
> +
> +	return 0;
> +}
> +late_initcall(maze_init);

I am unable to tell from the implementation why late_initcall() was
chosen instead of plain old __initcall() (or module_innit()).  Please
add a comment.

> --- linux-2.6.25-mm1-maze.orig/kernel/sched.c	2008-05-13 17:40:43.000000000 +0900
> +++ linux-2.6.25-mm1-maze/kernel/sched.c	2008-05-13 17:52:23.000000000 +0900
> @@ -68,6 +68,7 @@
>  #include <linux/hrtimer.h>
>  #include <linux/tick.h>
>  #include <linux/ftrace.h>
> +#include <linux/maze.h>
> 
>  #include <asm/tlb.h>
>  #include <asm/irq_regs.h>
> @@ -5157,6 +5158,8 @@ asmlinkage long sys_sched_yield(void)
>  	_raw_spin_unlock(&rq->lock);
>  	preempt_enable_no_resched();
> 
> +	maze_sched_yield(current);
> +
>  	schedule();
> 
>  	return 0;
> 
> 

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2008-05-13 16:05 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-13 11:47 [RFC][PATCH 1/1] MAZE: Mazed processes monitor Hirofumi Nakagawa
2008-05-13 16:04 ` Andrew Morton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.