linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Marcelo Tosatti <marcelo@kvack.org>
To: linux-mm@kvack.org
Cc: drepper@redhat.com, riel@redhat.com, akpm@linux-foundation.org,
	mbligh@mbligh.org, balbir@linux.vnet.ibm.com
Subject: Re: [RFC] oom notifications via /dev/oom_notify
Date: Tue, 30 Oct 2007 17:07:43 -0400	[thread overview]
Message-ID: <20071030210743.GA304@dmt> (raw)
In-Reply-To: <20071030191827.GB31038@dmt>

On Tue, Oct 30, 2007 at 03:18:27PM -0400, Marcelo Tosatti wrote:
> Hi,
> 
> The following patch creates a /dev/oom_notify device which applications can
> select()/poll() on to be informed of memory pressure.
> 
> The basic idea here is that applications can be part of the memory
> reclaim process. The notification is loosely defined as "please free
> some small percentage of your memory".
> 
> There is no easy way of finding whether the system is approaching a
> state where swapping is required in the reclaim paths, so a defensive
> approach is taken by using a timer with 1Hz frequency which verifies
> whether swapping has occurred.
> 
> For scenarios which require a "severe pressure notification" (please
> read Nokia's implementation at http://www.linuxjournal.com/article/8502 for
> more details), I believe the best solution is to create a separate
> /dev/oom_notify_critical device to avoid complication of the main device
> code paths. Take into account that such notification needs careful
> synchronization with the OOM killer.
> 
> Comments please...

changes:
- rearm timer (!)
- wake up one thread instead of all in swapout detection
- msecs_to_jiffies(1000) -> HZ

--- linux-2.6.orig/drivers/char/mem.c	2007-10-24 15:52:54.000000000 -0300
+++ linux-2.6/drivers/char/mem.c	2007-10-29 00:22:31.000000000 -0300
@@ -34,6 +34,8 @@
 # include <linux/efi.h>
 #endif
 
+extern struct file_operations oom_notify_fops;
+
 /*
  * Architectures vary in how they handle caching for addresses
  * outside of main memory.
@@ -854,6 +856,9 @@
 			filp->f_op = &oldmem_fops;
 			break;
 #endif
+		case 13:
+			filp->f_op = &oom_notify_fops;
+			break;
 		default:
 			return -ENXIO;
 	}
@@ -886,6 +891,7 @@
 #ifdef CONFIG_CRASH_DUMP
 	{12,"oldmem",    S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
 #endif
+	{13,"oom_notify", S_IRUGO, &oom_notify_fops},
 };
 
 static struct class *mem_class;
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
--- linux-2.6.orig/include/linux/vmstat.h	2007-10-24 15:55:30.000000000 -0300
+++ linux-2.6/include/linux/vmstat.h	2007-10-27 23:28:48.000000000 -0300
@@ -80,6 +80,7 @@
 }
 
 extern void all_vm_events(unsigned long *);
+extern unsigned int sum_vm_event(int);
 #ifdef CONFIG_HOTPLUG
 extern void vm_events_fold_cpu(int cpu);
 #else
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Kconfig linux-2.6/mm/Kconfig
--- linux-2.6.orig/mm/Kconfig	2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/Kconfig	2007-10-25 13:58:38.000000000 -0300
@@ -170,6 +170,13 @@
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.
 
+config OOM_NOTIFY
+	bool "Memory notification"
+	default n
+	help
+	  This option allows the kernel to notify applications of memory
+	  shortage.
+
 config RESOURCES_64BIT
 	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
 	default 64BIT
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Makefile linux-2.6/mm/Makefile
--- linux-2.6.orig/mm/Makefile	2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/Makefile	2007-10-25 13:54:34.000000000 -0300
@@ -30,4 +30,5 @@
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_OOM_NOTIFY) += oom_notify.o
 
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/oom_notify.c linux-2.6/mm/oom_notify.c
--- linux-2.6.orig/mm/oom_notify.c	1969-12-31 21:00:00.000000000 -0300
+++ linux-2.6/mm/oom_notify.c	2007-10-30 16:02:29.000000000 -0300
@@ -0,0 +1,97 @@
+/*
+ * Notify applications of memory pressure via /dev/oom_notify
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/vmstat.h>
+
+static int oom_notify_users = 0;	/* count of currently open /dev/oom_notify files */
+static bool oom_notify_status = 0;	/* written by oom_check_fn(), reported as POLLIN by poll */
+static unsigned int prev_swapped_pages = 0;	/* PSWPOUT sum seen by the previous timer run */
+
+static void oom_check_fn(unsigned long);
+
+DECLARE_WAIT_QUEUE_HEAD(oom_wait);	/* wait queue registered by oom_notify_poll() */
+DEFINE_SPINLOCK(oom_notify_lock);	/* taken by open/release around the user count */
+static struct timer_list oom_check_timer =
+		TIMER_INITIALIZER(oom_check_fn, 0, 0);
+
+void oom_check_fn(unsigned long unused)	/* timer callback: detect new swap-out, wake waiters */
+{
+	bool wake = 0;
+	unsigned int swapped_pages;
+
+	swapped_pages = sum_vm_event(PSWPOUT);	/* PSWPOUT summed over online CPUs */
+	if (swapped_pages > prev_swapped_pages)	/* counter grew since the previous run */
+		wake = 1;
+	prev_swapped_pages = swapped_pages;
+
+	oom_notify_status = wake;	/* cleared again when a run sees no new swap-out */
+
+	if (wake)
+		wake_up(&oom_wait);
+
+	mod_timer(&oom_check_timer, jiffies+HZ);	/* re-arm: next check in HZ jiffies (one second) */
+	return;
+}
+
+static int oom_notify_open(struct inode *inode, struct file *file)
+{
+	spin_lock(&oom_notify_lock);
+	if (!oom_notify_users) {	/* first opener starts the periodic check */
+		oom_notify_status = 0;
+		prev_swapped_pages = sum_vm_event(PSWPOUT);	/* baseline: ignore swap-out that predates this open */
+		mod_timer(&oom_check_timer, jiffies+HZ);
+	}
+	oom_notify_users++;
+	spin_unlock(&oom_notify_lock);
+	return 0;	/* always succeeds */
+}
+
+/* release(): drop one user; the last user out stops the periodic check. */
+static int oom_notify_release(struct inode *inode, struct file *file)
+{
+	spin_lock(&oom_notify_lock);
+	oom_notify_users--;
+	if (!oom_notify_users) {
+		del_timer_sync(&oom_check_timer);	/* handler re-arms itself: wait for it */
+		oom_notify_status = 0;
+	}
+	spin_unlock(&oom_notify_lock);
+	return 0;
+}
+
+/* poll(): POLLIN after recent swap-out or when a populated zone is under pages_low. */
+static unsigned int oom_notify_poll(struct file *file, poll_table *wait)
+{
+	unsigned int val = 0;
+	struct zone *zone;
+	/* 'nid' was never declared here; use the node of the current CPU */
+	int cz_idx = zone_idx(NODE_DATA(numa_node_id())->node_zonelists->zones[0]);
+
+	poll_wait(file, &oom_wait, wait);
+	if (oom_notify_status)		/* set by oom_check_fn() */
+		val = POLLIN;
+
+	for_each_zone(zone) {
+		if (!populated_zone(zone))
+			continue;
+		if (!zone_watermark_ok(zone, 0, zone->pages_low, cz_idx, 0)) {
+			val = POLLIN;	/* watermark check against pages_low failed */
+			break;
+		}
+	}
+	return val;
+}
+
+struct file_operations oom_notify_fops = {	/* referenced by the drivers/char/mem.c hunks (entry 13, "oom_notify") */
+	.open = oom_notify_open,
+	.release = oom_notify_release,
+	.poll = oom_notify_poll,	/* no read/write handlers are set */
+};
+EXPORT_SYMBOL(oom_notify_fops);
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/vmstat.c linux-2.6/mm/vmstat.c
--- linux-2.6.orig/mm/vmstat.c	2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/vmstat.c	2007-10-27 22:45:35.000000000 -0300
@@ -52,6 +52,28 @@
 }
 EXPORT_SYMBOL_GPL(all_vm_events);
 
+/* Return the sum of one vm event counter (index vm_event) over all online CPUs. */
+unsigned int sum_vm_event(int vm_event)
+{
+	int cpu = 0;
+	unsigned int ret = 0;
+	cpumask_t *cpumask = &cpu_online_map;
+
+	cpu = first_cpu(*cpumask);
+	while (cpu < NR_CPUS) {
+		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
+
+		cpu = next_cpu(cpu, *cpumask);
+
+		if (cpu < NR_CPUS)	/* prefetch the next CPU's counters before reading this one */
+			prefetch(&per_cpu(vm_event_states, cpu));
+
+		ret += this->event[vm_event];
+	}
+	return ret;
+}
+EXPORT_SYMBOL(sum_vm_event);
+
 #ifdef CONFIG_HOTPLUG
 /*
  * Fold the foreign cpu events into our own.
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
--- linux-2.6.orig/include/linux/vmstat.h	2007-10-24 15:55:30.000000000 -0300
+++ linux-2.6/include/linux/vmstat.h	2007-10-27 23:28:48.000000000 -0300
@@ -80,6 +80,7 @@
 }
 
 extern void all_vm_events(unsigned long *);
+extern unsigned int sum_vm_event(int);
 #ifdef CONFIG_HOTPLUG
 extern void vm_events_fold_cpu(int cpu);
 #else

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2007-10-30 21:07 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-10-30 19:18 [RFC] oom notifications via /dev/oom_notify Marcelo Tosatti
2007-10-30 20:57 ` Balbir Singh
2007-10-30 22:16   ` Marcelo Tosatti
2007-10-30 21:00 ` Rik van Riel
2007-10-30 21:07 ` Marcelo Tosatti [this message]
2007-10-30 21:19   ` Rik van Riel
2007-10-30 22:26     ` Marcelo Tosatti
2007-10-31 17:20   ` Dave Jones
2007-11-01 23:58     ` Marcelo Tosatti
2007-10-30 21:59 ` Badari Pulavarty
2007-10-30 21:12   ` Rik van Riel
2007-10-31  4:17     ` Badari
2007-10-31  4:31       ` Rik van Riel
2007-10-31 17:01         ` Badari Pulavarty
2007-10-31 16:15           ` Rik van Riel
2007-10-31  5:38       ` Balbir Singh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20071030210743.GA304@dmt \
    --to=marcelo@kvack.org \
    --cc=akpm@linux-foundation.org \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=drepper@redhat.com \
    --cc=linux-mm@kvack.org \
    --cc=mbligh@mbligh.org \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).