From: Marcelo Tosatti <marcelo@kvack.org>
To: linux-mm@kvack.org
Cc: drepper@redhat.com, riel@redhat.com, akpm@linux-foundation.org,
mbligh@mbligh.org, balbir@linux.vnet.ibm.com
Subject: [RFC] oom notifications via /dev/oom_notify
Date: Tue, 30 Oct 2007 15:18:27 -0400 [thread overview]
Message-ID: <20071030191827.GB31038@dmt> (raw)
Hi,
Following patch creates a /dev/oom_notify device which applications can
select()/poll() to get informed of memory pressure.
The basic idea here is that applications can be part of the memory
reclaim process. The notification is loosely defined as "please free
some small percentage of your memory".
There is no easy way of finding whether the system is approaching a
state where swapping is required in the reclaim paths, so a defensive
approach is taken by using a timer with 1Hz frequency which verifies
whether swapping has occurred.
For scenarios which require a "severe pressure notification" (please
read Nokia's implementation at http://www.linuxjournal.com/article/8502 for
more details), I believe the best solution is to create a separate
/dev/oom_notify_critical device to avoid complication of the main device
code paths. Take into account that such notification needs careful
synchronization with the OOM killer.
Comments please...
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/drivers/char/mem.c linux-2.6/drivers/char/mem.c
--- linux-2.6.orig/drivers/char/mem.c 2007-10-24 15:52:54.000000000 -0300
+++ linux-2.6/drivers/char/mem.c 2007-10-29 00:22:31.000000000 -0300
@@ -34,6 +34,8 @@
# include <linux/efi.h>
#endif
+extern struct file_operations oom_notify_fops;
+
/*
* Architectures vary in how they handle caching for addresses
* outside of main memory.
@@ -854,6 +856,9 @@
filp->f_op = &oldmem_fops;
break;
#endif
+ case 13:
+ filp->f_op = &oom_notify_fops;
+ break;
default:
return -ENXIO;
}
@@ -886,6 +891,7 @@
#ifdef CONFIG_CRASH_DUMP
{12,"oldmem", S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
#endif
+ {13,"oom_notify", S_IRUGO, &oom_notify_fops},
};
static struct class *mem_class;
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
--- linux-2.6.orig/include/linux/vmstat.h 2007-10-24 15:55:30.000000000 -0300
+++ linux-2.6/include/linux/vmstat.h 2007-10-27 23:28:48.000000000 -0300
@@ -80,6 +80,7 @@
}
extern void all_vm_events(unsigned long *);
+extern unsigned int sum_vm_event(int);
#ifdef CONFIG_HOTPLUG
extern void vm_events_fold_cpu(int cpu);
#else
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Kconfig linux-2.6/mm/Kconfig
--- linux-2.6.orig/mm/Kconfig 2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/Kconfig 2007-10-25 13:58:38.000000000 -0300
@@ -170,6 +170,13 @@
example on NUMA systems to put pages nearer to the processors accessing
the page.
+config OOM_NOTIFY
+ bool "Memory notification"
+ def_bool n
+ help
+ This option allows the kernel to notify applications of memory
+ shortage.
+
config RESOURCES_64BIT
bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
default 64BIT
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Makefile linux-2.6/mm/Makefile
--- linux-2.6.orig/mm/Makefile 2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/Makefile 2007-10-25 13:54:34.000000000 -0300
@@ -30,4 +30,5 @@
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_OOM_NOTIFY) += oom_notify.o
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/oom_notify.c linux-2.6/mm/oom_notify.c
--- linux-2.6.orig/mm/oom_notify.c 1969-12-31 21:00:00.000000000 -0300
+++ linux-2.6/mm/oom_notify.c 2007-10-30 12:35:24.000000000 -0300
@@ -0,0 +1,96 @@
+/*
+ * Notify applications of memory pressure via /dev/oom_notify
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/vmstat.h>
+
+static int oom_notify_users = 0;
+static bool oom_notify_status = 0;
+static unsigned int prev_swapped_pages = 0;
+
+static void oom_check_fn(unsigned long);
+
+DECLARE_WAIT_QUEUE_HEAD(oom_wait);
+DEFINE_SPINLOCK(oom_notify_lock);
+static struct timer_list oom_check_timer =
+ TIMER_INITIALIZER(oom_check_fn, 0, 0);
+
+void oom_check_fn(unsigned long unused)
+{
+ bool wake = 0;
+ unsigned int swapped_pages;
+
+ swapped_pages = sum_vm_event(PSWPOUT);
+ if (swapped_pages > prev_swapped_pages)
+ wake = 1;
+ prev_swapped_pages = swapped_pages;
+
+ oom_notify_status = wake;
+
+ if (wake)
+ wake_up_all(&oom_wait);
+
+ return;
+}
+
+static int oom_notify_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&oom_notify_lock);
+ if (!oom_notify_users) {
+ oom_notify_status = 0;
+ oom_check_timer.expires = jiffies + msecs_to_jiffies(1000);
+ mod_timer(&oom_check_timer, oom_check_timer.expires);
+ }
+ oom_notify_users++;
+ spin_unlock(&oom_notify_lock);
+
+ return 0;
+}
+
+static int oom_notify_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&oom_notify_lock);
+ oom_notify_users--;
+ if (!oom_notify_users) {
+ del_timer(&oom_check_timer);
+ oom_notify_status = 0;
+ }
+ spin_unlock(&oom_notify_lock);
+ return 0;
+}
+
+static unsigned int oom_notify_poll(struct file *file, poll_table *wait)
+{
+ unsigned int val = 0;
+ struct zone *zone;
+ int cz_idx = zone_idx(NODE_DATA(nid)->node_zonelists->zones[0]);
+
+ poll_wait(file, &oom_wait, wait);
+
+ if (oom_notify_status)
+ val = POLLIN;
+
+ for_each_zone(zone) {
+ if (!populated_zone(zone))
+ continue;
+ if (!zone_watermark_ok(zone, 0, zone->pages_low, cz_idx, 0)) {
+ val = POLLIN;
+ break;
+ }
+ }
+
+ return val;
+}
+
+struct file_operations oom_notify_fops = {
+ .open = oom_notify_open,
+ .release = oom_notify_release,
+ .poll = oom_notify_poll,
+};
+EXPORT_SYMBOL(oom_notify_fops);
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/vmstat.c linux-2.6/mm/vmstat.c
--- linux-2.6.orig/mm/vmstat.c 2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/vmstat.c 2007-10-27 22:45:35.000000000 -0300
@@ -52,6 +52,28 @@
}
EXPORT_SYMBOL_GPL(all_vm_events);
+unsigned int sum_vm_event(int vm_event)
+{
+ int cpu = 0;
+ int i;
+ unsigned int ret = 0;
+ cpumask_t *cpumask = &cpu_online_map;
+
+ cpu = first_cpu(*cpumask);
+ while (cpu < NR_CPUS) {
+ struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
+
+ cpu = next_cpu(cpu, *cpumask);
+
+ if (cpu < NR_CPUS)
+ prefetch(&per_cpu(vm_event_states, cpu));
+
+ ret += this->event[vm_event];
+ }
+ return ret;
+}
+EXPORT_SYMBOL(sum_vm_event);
+
#ifdef CONFIG_HOTPLUG
/*
* Fold the foreign cpu events into our own.
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
--- linux-2.6.orig/include/linux/vmstat.h 2007-10-24 15:55:30.000000000 -0300
+++ linux-2.6/include/linux/vmstat.h 2007-10-27 23:28:48.000000000 -0300
@@ -80,6 +80,7 @@
}
extern void all_vm_events(unsigned long *);
+extern unsigned int sum_vm_event(int);
#ifdef CONFIG_HOTPLUG
extern void vm_events_fold_cpu(int cpu);
#else
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next reply other threads:[~2007-10-30 19:18 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-10-30 19:18 Marcelo Tosatti [this message]
2007-10-30 20:57 ` [RFC] oom notifications via /dev/oom_notify Balbir Singh
2007-10-30 22:16 ` Marcelo Tosatti
2007-10-30 21:00 ` Rik van Riel
2007-10-30 21:07 ` Marcelo Tosatti
2007-10-30 21:19 ` Rik van Riel
2007-10-30 22:26 ` Marcelo Tosatti
2007-10-31 17:20 ` Dave Jones
2007-11-01 23:58 ` Marcelo Tosatti
2007-10-30 21:59 ` Badari Pulavarty
2007-10-30 21:12 ` Rik van Riel
2007-10-31 4:17 ` Badari
2007-10-31 4:31 ` Rik van Riel
2007-10-31 17:01 ` Badari Pulavarty
2007-10-31 16:15 ` Rik van Riel
2007-10-31 5:38 ` Balbir Singh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20071030191827.GB31038@dmt \
--to=marcelo@kvack.org \
--cc=akpm@linux-foundation.org \
--cc=balbir@linux.vnet.ibm.com \
--cc=drepper@redhat.com \
--cc=linux-mm@kvack.org \
--cc=mbligh@mbligh.org \
--cc=riel@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).