* VM fixes [1/4]
@ 2004-12-24 17:35 Andrea Arcangeli
2004-12-24 18:00 ` David S. Miller
0 siblings, 1 reply; 7+ messages in thread
From: Andrea Arcangeli @ 2004-12-24 17:35 UTC (permalink / raw)
To: linux-kernel; +Cc: Thomas Gleixner, Andrew Morton
Likely I'll bring a -aa tree up in a few weeks to avoid this stuff to
get lost, but I'm attempting to merge it in mainline first.
This is protect-pids, a patch to allow the admin to tune the oom killer.
The tweak is inherited between parent and child so it's easy to write a
wrapper for complex apps.
I made used_math a char at the light of later patches. per-cpu atomicity
with byte granularity is provided by all archs AFIK.
From: garloff@suse.de
Subject: protect-pids
Signed-off-by: Andrea Arcangeli <andrea@suse.de>
From: garloff@suse.de
Subject: protect-pids
Signed-off-by: Andrea Arcangeli <andrea@suse.de>
--- x/fs/proc/base.c.orig 2004-12-04 08:56:31.000000000 +0100
+++ x/fs/proc/base.c 2004-12-24 17:50:29.043208992 +0100
@@ -71,6 +71,8 @@ enum pid_directory_inos {
PROC_TGID_ATTR_FSCREATE,
#endif
PROC_TGID_FD_DIR,
+ PROC_TGID_OOM_SCORE,
+ PROC_TGID_OOM_ADJUST,
PROC_TID_INO,
PROC_TID_STATUS,
PROC_TID_MEM,
@@ -97,6 +99,8 @@ enum pid_directory_inos {
PROC_TID_ATTR_FSCREATE,
#endif
PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */
+ PROC_TID_OOM_SCORE,
+ PROC_TID_OOM_ADJUST,
};
struct pid_entry {
@@ -132,6 +136,8 @@ static struct pid_entry tgid_base_stuff[
#ifdef CONFIG_SCHEDSTATS
E(PROC_TGID_SCHEDSTAT, "schedstat", S_IFREG|S_IRUGO),
#endif
+ E(PROC_TGID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO),
+ E(PROC_TGID_OOM_ADJUST,"oom_adj", S_IFREG|S_IRUGO|S_IWUSR),
{0,0,NULL,0}
};
static struct pid_entry tid_base_stuff[] = {
@@ -157,6 +163,8 @@ static struct pid_entry tid_base_stuff[]
#ifdef CONFIG_SCHEDSTATS
E(PROC_TID_SCHEDSTAT, "schedstat",S_IFREG|S_IRUGO),
#endif
+ E(PROC_TID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO),
+ E(PROC_TID_OOM_ADJUST, "oom_adj", S_IFREG|S_IRUGO|S_IWUSR),
{0,0,NULL,0}
};
@@ -425,6 +433,18 @@ static int proc_pid_schedstat(struct tas
}
#endif
+/* The badness from the OOM killer */
+unsigned long badness(struct task_struct *p, unsigned long uptime);
+static int proc_oom_score(struct task_struct *task, char *buffer)
+{
+ unsigned long points;
+ struct timespec uptime;
+
+ do_posix_clock_monotonic_gettime(&uptime);
+ points = badness(task, uptime.tv_sec);
+ return sprintf(buffer, "%lu\n", points);
+}
+
/************************************************************************/
/* Here the fs part begins */
/************************************************************************/
@@ -698,6 +718,55 @@ static struct file_operations proc_mem_o
.open = mem_open,
};
+static ssize_t oom_adjust_read(struct file * file, char * buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = proc_task(file->f_dentry->d_inode);
+ char buffer[8];
+ size_t len;
+ int oom_adjust = task->oomkilladj;
+
+ len = sprintf(buffer, "%i\n", oom_adjust) + 1;
+ if (*ppos >= len)
+ return 0;
+ if (count > len-*ppos)
+ count = len-*ppos;
+ if (copy_to_user(buf, buffer + *ppos, count))
+ return -EFAULT;
+ *ppos += count;
+ return count;
+}
+
+static ssize_t oom_adjust_write(struct file * file, const char * buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = proc_task(file->f_dentry->d_inode);
+ char buffer[8], *end;
+ int oom_adjust;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+ memset(buffer, 0, 8);
+ if (count > 6)
+ count = 6;
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+ oom_adjust = simple_strtol(buffer, &end, 0);
+ if (oom_adjust < -16 || oom_adjust > 15)
+ return -EINVAL;
+ if (*end == '\n')
+ end++;
+ task->oomkilladj = oom_adjust;
+ if (end - buffer == 0)
+ return -EIO;
+ return end - buffer;
+}
+
+static struct file_operations proc_oom_adjust_operations = {
+ read: oom_adjust_read,
+ write: oom_adjust_write,
+};
+
static struct inode_operations proc_mem_inode_operations = {
.permission = proc_permission,
};
@@ -1377,6 +1446,15 @@ static struct dentry *proc_pident_lookup
ei->op.proc_read = proc_pid_schedstat;
break;
#endif
+ case PROC_TID_OOM_SCORE:
+ case PROC_TGID_OOM_SCORE:
+ inode->i_fop = &proc_info_file_operations;
+ ei->op.proc_read = proc_oom_score;
+ break;
+ case PROC_TID_OOM_ADJUST:
+ case PROC_TGID_OOM_ADJUST:
+ inode->i_fop = &proc_oom_adjust_operations;
+ break;
default:
printk("procfs: impossible type (%d)",p->type);
iput(inode);
--- x/include/linux/sched.h.orig 2004-12-04 08:56:32.000000000 +0100
+++ x/include/linux/sched.h 2004-12-24 17:48:36.743281176 +0100
@@ -600,7 +600,19 @@ struct task_struct {
struct key *process_keyring; /* keyring private to this process (CLONE_THREAD) */
struct key *thread_keyring; /* keyring private to this thread */
#endif
- unsigned short used_math;
+/*
+ * Must be changed atomically so it shouldn't be
+ * be a shareable bitflag.
+ */
+ unsigned char used_math;
+/*
+ * OOM kill score adjustment (bit shift).
+ * Cannot live together with used_math since
+ * used_math and oomkilladj can be changed at the
+ * same time, so they would race if they're in the
+ * same atomic block.
+ */
+ short oomkilladj;
char comm[16];
/* file system info */
int link_count, total_link_count;
--- x/mm/oom_kill.c.orig 2004-12-04 08:55:05.000000000 +0100
+++ x/mm/oom_kill.c 2004-12-24 17:49:07.098666456 +0100
@@ -42,7 +42,7 @@
* of least surprise ... (be careful when you change it)
*/
-static unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime)
{
unsigned long points, cpu_time, run_time, s;
@@ -98,6 +98,17 @@ static unsigned long badness(struct task
*/
if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
points /= 4;
+
+ /*
+ * Adjust the score by oomkilladj.
+ */
+ if (p->oomkilladj) {
+ if (p->oomkilladj > 0)
+ points <<= p->oomkilladj;
+ else
+ points >>= -(p->oomkilladj);
+ }
+
#ifdef DEBUG
printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
p->pid, p->comm, points);
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: VM fixes [1/4]
2004-12-24 17:35 VM fixes [1/4] Andrea Arcangeli
@ 2004-12-24 18:00 ` David S. Miller
2004-12-24 18:20 ` Andrea Arcangeli
0 siblings, 1 reply; 7+ messages in thread
From: David S. Miller @ 2004-12-24 18:00 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: linux-kernel, tglx, akpm
On Fri, 24 Dec 2004 18:35:19 +0100
Andrea Arcangeli <andrea@suse.de> wrote:
> I made used_math a char at the light of later patches. per-cpu atomicity
> with byte granularity is provided by all archs AFIK.
Older Alpha's need to read-modify-write a word to implement
byte ops.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: VM fixes [1/4]
2004-12-24 18:00 ` David S. Miller
@ 2004-12-24 18:20 ` Andrea Arcangeli
2004-12-24 18:41 ` Lee Revell
2004-12-25 4:40 ` Jeff Garzik
0 siblings, 2 replies; 7+ messages in thread
From: Andrea Arcangeli @ 2004-12-24 18:20 UTC (permalink / raw)
To: David S. Miller; +Cc: linux-kernel, tglx, akpm
On Fri, Dec 24, 2004 at 10:00:16AM -0800, David S. Miller wrote:
> On Fri, 24 Dec 2004 18:35:19 +0100
> Andrea Arcangeli <andrea@suse.de> wrote:
>
> > I made used_math a char at the light of later patches. per-cpu atomicity
> > with byte granularity is provided by all archs AFIK.
>
> Older Alpha's need to read-modify-write a word to implement
> byte ops.
Yep, I remeber this was the case in some old alpha. But did they support
smp too? I can't see how that old hardware could support smp. If they're
UP they're fine.
The race is extremely tiny anyway, you'd need to write to the
/proc/<pid>/ file at the same time that used_math is toggled.
Or alternatively you'd need to kill the task due oom at the same time
used_math is toggled.
The race in PF_MEMDIE is more serious.
And false sharing with memdie and oomkilladj is zero, since they're
pratically readonly.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: VM fixes [1/4]
2004-12-24 18:20 ` Andrea Arcangeli
@ 2004-12-24 18:41 ` Lee Revell
2004-12-24 18:48 ` Andrea Arcangeli
2004-12-25 4:40 ` Jeff Garzik
1 sibling, 1 reply; 7+ messages in thread
From: Lee Revell @ 2004-12-24 18:41 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: David S. Miller, linux-kernel, tglx, akpm
On Fri, 2004-12-24 at 19:20 +0100, Andrea Arcangeli wrote:
> Yep, I remeber this was the case in some old alpha. But did they
> support smp too? I can't see how that old hardware could support smp.
> If they're UP they're fine.
Isn't there still a race with PREEMPT? Or am I missing something?
Lee
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: VM fixes [1/4]
2004-12-24 18:41 ` Lee Revell
@ 2004-12-24 18:48 ` Andrea Arcangeli
2004-12-24 18:52 ` Lee Revell
0 siblings, 1 reply; 7+ messages in thread
From: Andrea Arcangeli @ 2004-12-24 18:48 UTC (permalink / raw)
To: Lee Revell; +Cc: David S. Miller, linux-kernel, tglx, akpm
On Fri, Dec 24, 2004 at 01:41:03PM -0500, Lee Revell wrote:
> On Fri, 2004-12-24 at 19:20 +0100, Andrea Arcangeli wrote:
> > Yep, I remeber this was the case in some old alpha. But did they
> > support smp too? I can't see how that old hardware could support smp.
> > If they're UP they're fine.
>
> Isn't there still a race with PREEMPT? Or am I missing something?
I was thinking the same right now, OTOH used_math is most important
during exceptions and sure preempt can't affect exceptions.
There are a few corner cases that you'd need to check to know if
used_math can be changed during normal kernel context and not only
during exceptions. Probably it's racy though. But what's the point of
enabling preempt in such an ancient hardare?
I'm not against fixing it up though, but at least we'd need a
#define _HAVE_BYTE_ATOMIC_GRANULARITY
#define _HAVE_SHORT_ATOMIC_GRANULARITY
(assuming short is the minimal smp atomic granularity we support in
linux)
to be able to write optimal code for anything that 99.999% of the
userbase would be using.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: VM fixes [1/4]
2004-12-24 18:48 ` Andrea Arcangeli
@ 2004-12-24 18:52 ` Lee Revell
0 siblings, 0 replies; 7+ messages in thread
From: Lee Revell @ 2004-12-24 18:52 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: David S. Miller, linux-kernel, tglx, akpm
On Fri, 2004-12-24 at 19:48 +0100, Andrea Arcangeli wrote:
> But what's the point of
> enabling preempt in such an ancient hardare?
>
None really, but it seems like many people enable preempt who don't need
it.
Lee
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: VM fixes [1/4]
2004-12-24 18:20 ` Andrea Arcangeli
2004-12-24 18:41 ` Lee Revell
@ 2004-12-25 4:40 ` Jeff Garzik
1 sibling, 0 replies; 7+ messages in thread
From: Jeff Garzik @ 2004-12-25 4:40 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: David S. Miller, linux-kernel, tglx, akpm
On Fri, Dec 24, 2004 at 07:20:31PM +0100, Andrea Arcangeli wrote:
> On Fri, Dec 24, 2004 at 10:00:16AM -0800, David S. Miller wrote:
> > On Fri, 24 Dec 2004 18:35:19 +0100
> > Andrea Arcangeli <andrea@suse.de> wrote:
> >
> > > I made used_math a char at the light of later patches. per-cpu atomicity
> > > with byte granularity is provided by all archs AFIK.
> >
> > Older Alpha's need to read-modify-write a word to implement
> > byte ops.
>
> Yep, I remeber this was the case in some old alpha. But did they support
> smp too? I can't see how that old hardware could support smp. If they're
> UP they're fine.
Sure... there were older Alpha SMP boxes without the BWX and CIX
extensions.
Jeff
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2004-12-25 4:40 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-12-24 17:35 VM fixes [1/4] Andrea Arcangeli
2004-12-24 18:00 ` David S. Miller
2004-12-24 18:20 ` Andrea Arcangeli
2004-12-24 18:41 ` Lee Revell
2004-12-24 18:48 ` Andrea Arcangeli
2004-12-24 18:52 ` Lee Revell
2004-12-25 4:40 ` Jeff Garzik
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox