linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC] oom notifications via /dev/oom_notify
@ 2007-10-30 19:18 Marcelo Tosatti
  2007-10-30 20:57 ` Balbir Singh
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2007-10-30 19:18 UTC (permalink / raw)
  To: linux-mm; +Cc: drepper, riel, akpm, mbligh, balbir

Hi,

The following patch creates a /dev/oom_notify device which applications can
select()/poll() to get informed of memory pressure.

The basic idea here is that applications can be part of the memory
reclaim process. The notification is loosely defined as "please free
some small percentage of your memory".

There is no easy way of finding whether the system is approaching a
state where swapping is required in the reclaim paths, so a defensive
approach is taken by using a timer with 1Hz frequency which verifies
whether swapping has occurred.

For scenarios which require a "severe pressure notification" (please
read Nokia's implementation at http://www.linuxjournal.com/article/8502 for
more details), I believe the best solution is to create a separate
/dev/oom_notify_critical device to avoid complication of the main device
code paths. Take into account that such notification needs careful
synchronization with the OOM killer.

Comments please...

diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/drivers/char/mem.c linux-2.6/drivers/char/mem.c
--- linux-2.6.orig/drivers/char/mem.c	2007-10-24 15:52:54.000000000 -0300
+++ linux-2.6/drivers/char/mem.c	2007-10-29 00:22:31.000000000 -0300
@@ -34,6 +34,8 @@
 # include <linux/efi.h>
 #endif
 
+extern struct file_operations oom_notify_fops;
+
 /*
  * Architectures vary in how they handle caching for addresses
  * outside of main memory.
@@ -854,6 +856,9 @@
 			filp->f_op = &oldmem_fops;
 			break;
 #endif
+		case 13:
+			filp->f_op = &oom_notify_fops;
+			break;
 		default:
 			return -ENXIO;
 	}
@@ -886,6 +891,7 @@
 #ifdef CONFIG_CRASH_DUMP
 	{12,"oldmem",    S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
 #endif
+	{13,"oom_notify", S_IRUGO, &oom_notify_fops},
 };
 
 static struct class *mem_class;
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
--- linux-2.6.orig/include/linux/vmstat.h	2007-10-24 15:55:30.000000000 -0300
+++ linux-2.6/include/linux/vmstat.h	2007-10-27 23:28:48.000000000 -0300
@@ -80,6 +80,7 @@
 }
 
 extern void all_vm_events(unsigned long *);
+extern unsigned int sum_vm_event(int);
 #ifdef CONFIG_HOTPLUG
 extern void vm_events_fold_cpu(int cpu);
 #else
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Kconfig linux-2.6/mm/Kconfig
--- linux-2.6.orig/mm/Kconfig	2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/Kconfig	2007-10-25 13:58:38.000000000 -0300
@@ -170,6 +170,13 @@
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.
 
+config OOM_NOTIFY
+	bool "Memory notification"
+	def_bool n
+	help
+	  This option allows the kernel to notify applications of memory 
+	  shortage.
+
 config RESOURCES_64BIT
 	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
 	default 64BIT
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Makefile linux-2.6/mm/Makefile
--- linux-2.6.orig/mm/Makefile	2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/Makefile	2007-10-25 13:54:34.000000000 -0300
@@ -30,4 +30,5 @@
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_OOM_NOTIFY) += oom_notify.o
 
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/oom_notify.c linux-2.6/mm/oom_notify.c
--- linux-2.6.orig/mm/oom_notify.c	1969-12-31 21:00:00.000000000 -0300
+++ linux-2.6/mm/oom_notify.c	2007-10-30 12:35:24.000000000 -0300
@@ -0,0 +1,96 @@
+/*
+ * Notify applications of memory pressure via /dev/oom_notify
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/vmstat.h>
+
+static int oom_notify_users = 0;
+static bool oom_notify_status = 0;
+static unsigned int prev_swapped_pages = 0;
+
+static void oom_check_fn(unsigned long);
+
+DECLARE_WAIT_QUEUE_HEAD(oom_wait);
+DEFINE_SPINLOCK(oom_notify_lock);
+static struct timer_list oom_check_timer =
+		TIMER_INITIALIZER(oom_check_fn, 0, 0);
+
+void oom_check_fn(unsigned long unused)
+{
+	bool wake = 0;
+	unsigned int swapped_pages;
+
+	swapped_pages = sum_vm_event(PSWPOUT);
+	if (swapped_pages > prev_swapped_pages)
+		wake = 1;
+	prev_swapped_pages = swapped_pages;
+
+	oom_notify_status = wake;
+
+	if (wake)
+		wake_up_all(&oom_wait);
+
+	return;
+}
+
+static int oom_notify_open(struct inode *inode, struct file *file)
+{
+	spin_lock(&oom_notify_lock);
+	if (!oom_notify_users) {
+		oom_notify_status = 0;
+		oom_check_timer.expires = jiffies + msecs_to_jiffies(1000);
+		mod_timer(&oom_check_timer, oom_check_timer.expires);
+	}
+	oom_notify_users++;
+	spin_unlock(&oom_notify_lock);
+
+	return 0;
+}
+
+static int oom_notify_release(struct inode *inode, struct file *file)
+{
+	spin_lock(&oom_notify_lock);
+	oom_notify_users--;
+	if (!oom_notify_users) {
+		del_timer(&oom_check_timer);
+		oom_notify_status = 0;
+	}
+	spin_unlock(&oom_notify_lock);
+	return 0;
+}
+
+static unsigned int oom_notify_poll(struct file *file, poll_table *wait)
+{
+	unsigned int val = 0;
+	struct zone *zone;
+	int cz_idx = zone_idx(NODE_DATA(nid)->node_zonelists->zones[0]);
+
+	poll_wait(file, &oom_wait, wait);
+
+	if (oom_notify_status)
+		val = POLLIN;
+
+	for_each_zone(zone) {
+		if (!populated_zone(zone))
+			continue;	
+		if (!zone_watermark_ok(zone, 0, zone->pages_low, cz_idx, 0)) {
+			val = POLLIN;
+			break;
+		}
+	}
+
+	return val;
+}
+
+struct file_operations oom_notify_fops = {
+	.open = oom_notify_open,
+	.release = oom_notify_release,
+	.poll = oom_notify_poll,
+};
+EXPORT_SYMBOL(oom_notify_fops);
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/vmstat.c linux-2.6/mm/vmstat.c
--- linux-2.6.orig/mm/vmstat.c	2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/vmstat.c	2007-10-27 22:45:35.000000000 -0300
@@ -52,6 +52,28 @@
 }
 EXPORT_SYMBOL_GPL(all_vm_events);
 
+unsigned int sum_vm_event(int vm_event)
+{
+	int cpu = 0;
+	int i;
+	unsigned int ret = 0;
+	cpumask_t *cpumask = &cpu_online_map;
+
+	cpu = first_cpu(*cpumask);
+	while (cpu < NR_CPUS) {
+		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
+
+		cpu = next_cpu(cpu, *cpumask);
+
+		if (cpu < NR_CPUS)
+			prefetch(&per_cpu(vm_event_states, cpu));
+
+		ret += this->event[vm_event];
+	}
+	return ret;
+}
+EXPORT_SYMBOL(sum_vm_event);
+
 #ifdef CONFIG_HOTPLUG
 /*
  * Fold the foreign cpu events into our own.
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
--- linux-2.6.orig/include/linux/vmstat.h	2007-10-24 15:55:30.000000000 -0300
+++ linux-2.6/include/linux/vmstat.h	2007-10-27 23:28:48.000000000 -0300
@@ -80,6 +80,7 @@
 }
 
 extern void all_vm_events(unsigned long *);
+extern unsigned int sum_vm_event(int);
 #ifdef CONFIG_HOTPLUG
 extern void vm_events_fold_cpu(int cpu);
 #else

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-30 19:18 [RFC] oom notifications via /dev/oom_notify Marcelo Tosatti
@ 2007-10-30 20:57 ` Balbir Singh
  2007-10-30 22:16   ` Marcelo Tosatti
  2007-10-30 21:00 ` Rik van Riel
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 16+ messages in thread
From: Balbir Singh @ 2007-10-30 20:57 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-mm, drepper, riel, akpm, mbligh, Gautham shenoy

Marcelo Tosatti wrote:
> Hi,
> 
> Following patch creates a /dev/oom_notify device which applications can
> select()/poll() to get informed of memory pressure.
> 
> The basic idea here is that applications can be part of the memory
> reclaim process. The notification is loosely defined as "please free
> some small percentage of your memory".
> 
> There is no easy way of finding whether the system is approaching a
> state where swapping is required in the reclaim paths, so a defensive
> approach is taken by using a timer with 1Hz frequency which verifies
> whether swapping has occurred.
> 
> For scenarios which require a "severe pressure notification" (please
> read Nokia's implementation at http://www.linuxjournal.com/article/8502 for
> more details), I believe the best solution is to create a separate
> /dev/oom_notify_critical device to avoid complication of the main device
> code paths. Take into account that such notification needs careful
> synchronization with the OOM killer.
> 
> Comments please...
> 
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/drivers/char/mem.c linux-2.6/drivers/char/mem.c
> --- linux-2.6.orig/drivers/char/mem.c	2007-10-24 15:52:54.000000000 -0300
> +++ linux-2.6/drivers/char/mem.c	2007-10-29 00:22:31.000000000 -0300
> @@ -34,6 +34,8 @@
>  # include <linux/efi.h>
>  #endif
> 
> +extern struct file_operations oom_notify_fops;
> +
>  /*
>   * Architectures vary in how they handle caching for addresses
>   * outside of main memory.
> @@ -854,6 +856,9 @@
>  			filp->f_op = &oldmem_fops;
>  			break;
>  #endif
> +		case 13:
> +			filp->f_op = &oom_notify_fops;
> +			break;
>  		default:
>  			return -ENXIO;
>  	}
> @@ -886,6 +891,7 @@
>  #ifdef CONFIG_CRASH_DUMP
>  	{12,"oldmem",    S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
>  #endif
> +	{13,"oom_notify", S_IRUGO, &oom_notify_fops},
>  };
> 
>  static struct class *mem_class;
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
> --- linux-2.6.orig/include/linux/vmstat.h	2007-10-24 15:55:30.000000000 -0300
> +++ linux-2.6/include/linux/vmstat.h	2007-10-27 23:28:48.000000000 -0300
> @@ -80,6 +80,7 @@
>  }
> 
>  extern void all_vm_events(unsigned long *);
> +extern unsigned int sum_vm_event(int);
>  #ifdef CONFIG_HOTPLUG
>  extern void vm_events_fold_cpu(int cpu);
>  #else
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Kconfig linux-2.6/mm/Kconfig
> --- linux-2.6.orig/mm/Kconfig	2007-10-24 15:53:02.000000000 -0300
> +++ linux-2.6/mm/Kconfig	2007-10-25 13:58:38.000000000 -0300
> @@ -170,6 +170,13 @@
>  	  example on NUMA systems to put pages nearer to the processors accessing
>  	  the page.
> 
> +config OOM_NOTIFY
> +	bool "Memory notification"
> +	def_bool n
> +	help
> +	  This option allows the kernel to notify applications of memory 
> +	  shortage.
> +
>  config RESOURCES_64BIT
>  	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
>  	default 64BIT
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Makefile linux-2.6/mm/Makefile
> --- linux-2.6.orig/mm/Makefile	2007-10-24 15:53:02.000000000 -0300
> +++ linux-2.6/mm/Makefile	2007-10-25 13:54:34.000000000 -0300
> @@ -30,4 +30,5 @@
>  obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
> +obj-$(CONFIG_OOM_NOTIFY) += oom_notify.o
> 
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/oom_notify.c linux-2.6/mm/oom_notify.c
> --- linux-2.6.orig/mm/oom_notify.c	1969-12-31 21:00:00.000000000 -0300
> +++ linux-2.6/mm/oom_notify.c	2007-10-30 12:35:24.000000000 -0300
> @@ -0,0 +1,96 @@
> +/*
> + * Notify applications of memory pressure via /dev/oom_notify
> + */
> +
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <linux/wait.h>
> +#include <linux/poll.h>
> +#include <linux/timer.h>
> +#include <linux/spinlock.h>
> +#include <linux/vmstat.h>
> +
> +static int oom_notify_users = 0;
> +static bool oom_notify_status = 0;
> +static unsigned int prev_swapped_pages = 0;
> +
> +static void oom_check_fn(unsigned long);
> +
> +DECLARE_WAIT_QUEUE_HEAD(oom_wait);
> +DEFINE_SPINLOCK(oom_notify_lock);
> +static struct timer_list oom_check_timer =
> +		TIMER_INITIALIZER(oom_check_fn, 0, 0);
> +
> +void oom_check_fn(unsigned long unused)
> +{
> +	bool wake = 0;
> +	unsigned int swapped_pages;
> +
> +	swapped_pages = sum_vm_event(PSWPOUT);
> +	if (swapped_pages > prev_swapped_pages)
> +		wake = 1;
> +	prev_swapped_pages = swapped_pages;
> +

Two comments

1. So this is a rate growth function and continues to wake
   up tasks as long as the rate of swapout keeps growing?
2. How will this function work in the absence of swap? Does
  this feature work in the absence of swap?


> +	oom_notify_status = wake;
> +
> +	if (wake)
> +		wake_up_all(&oom_wait);
> +
> +	return;
> +}
> +
> +static int oom_notify_open(struct inode *inode, struct file *file)
> +{

Should we check current->oomkilladj before allowing open to proceed?

> +	spin_lock(&oom_notify_lock);
> +	if (!oom_notify_users) {
> +		oom_notify_status = 0;
> +		oom_check_timer.expires = jiffies + msecs_to_jiffies(1000);

Could we have a more meaningful name (a named constant) for 1000 here, please?

> +		mod_timer(&oom_check_timer, oom_check_timer.expires);
> +	}
> +	oom_notify_users++;
> +	spin_unlock(&oom_notify_lock);
> +
> +	return 0;
> +}
> +
> +static int oom_notify_release(struct inode *inode, struct file *file)
> +{
> +	spin_lock(&oom_notify_lock);
> +	oom_notify_users--;
> +	if (!oom_notify_users) {
> +		del_timer(&oom_check_timer);
> +		oom_notify_status = 0;
> +	}
> +	spin_unlock(&oom_notify_lock);
> +	return 0;
> +}
> +
> +static unsigned int oom_notify_poll(struct file *file, poll_table *wait)
> +{
> +	unsigned int val = 0;
> +	struct zone *zone;
> +	int cz_idx = zone_idx(NODE_DATA(nid)->node_zonelists->zones[0]);
> +
> +	poll_wait(file, &oom_wait, wait);
> +
> +	if (oom_notify_status)
> +		val = POLLIN;
> +
> +	for_each_zone(zone) {
> +		if (!populated_zone(zone))
> +			continue;	
> +		if (!zone_watermark_ok(zone, 0, zone->pages_low, cz_idx, 0)) {
> +			val = POLLIN;
> +			break;
> +		}
> +	}
> +
> +	return val;
> +}
> +
> +struct file_operations oom_notify_fops = {
> +	.open = oom_notify_open,
> +	.release = oom_notify_release,
> +	.poll = oom_notify_poll,
> +};

Can we also implement an oom_notify_read() function, so that a read on
/dev/oom_notify will give the reason why select() on /dev/oom_notify
returned?

> +EXPORT_SYMBOL(oom_notify_fops);
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/vmstat.c linux-2.6/mm/vmstat.c
> --- linux-2.6.orig/mm/vmstat.c	2007-10-24 15:53:02.000000000 -0300
> +++ linux-2.6/mm/vmstat.c	2007-10-27 22:45:35.000000000 -0300
> @@ -52,6 +52,28 @@
>  }
>  EXPORT_SYMBOL_GPL(all_vm_events);
> 
> +unsigned int sum_vm_event(int vm_event)
> +{
> +	int cpu = 0;
> +	int i;
> +	unsigned int ret = 0;
> +	cpumask_t *cpumask = &cpu_online_map;
> +
> +	cpu = first_cpu(*cpumask);
> +	while (cpu < NR_CPUS) {
> +		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
> +
> +		cpu = next_cpu(cpu, *cpumask);
> +
> +		if (cpu < NR_CPUS)
> +			prefetch(&per_cpu(vm_event_states, cpu));
> +
> +		ret += this->event[vm_event];
> +	}
> +	return ret;
> +}
> +EXPORT_SYMBOL(sum_vm_event);

Is this routine CPU Hotplug safe?

> +
>  #ifdef CONFIG_HOTPLUG
>  /*
>   * Fold the foreign cpu events into our own.
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
> --- linux-2.6.orig/include/linux/vmstat.h	2007-10-24 15:55:30.000000000 -0300
> +++ linux-2.6/include/linux/vmstat.h	2007-10-27 23:28:48.000000000 -0300
> @@ -80,6 +80,7 @@
>  }
> 
>  extern void all_vm_events(unsigned long *);
> +extern unsigned int sum_vm_event(int);
>  #ifdef CONFIG_HOTPLUG
>  extern void vm_events_fold_cpu(int cpu);
>  #else
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-30 19:18 [RFC] oom notifications via /dev/oom_notify Marcelo Tosatti
  2007-10-30 20:57 ` Balbir Singh
@ 2007-10-30 21:00 ` Rik van Riel
  2007-10-30 21:07 ` Marcelo Tosatti
  2007-10-30 21:59 ` Badari Pulavarty
  3 siblings, 0 replies; 16+ messages in thread
From: Rik van Riel @ 2007-10-30 21:00 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-mm, drepper, akpm, mbligh, balbir

On Tue, 30 Oct 2007 15:18:27 -0400
Marcelo Tosatti <marcelo@kvack.org> wrote:

> The basic idea here is that applications can be part of the memory
> reclaim process. The notification is loosely defined as "please free
> some small percentage of your memory".

This will be especially useful for things like databases, JVMs
and applications that cache data that can be easily recreated
or read in from disk.  File IO tends to be much faster than
swap IO.

It could also be useful for glibc.  When a userland process
calls free(), a lot of the time the memory is not given back
to the kernel.  After all, chances are the process will need
it again.

If the kernel needs the memory, however, it will be faster for
applications to simply give it back than for the apps to wait
on disk IO.
 
> There is no easy way of finding whether the system is approaching a
> state where swapping is required in the reclaim paths, so a defensive
> approach is taken by using a timer with 1Hz frequency which verifies
> whether swapping has occurred.

Good enough for initial testing.  I will make sure that we will
have a more clearly defined threshold in the split VM code that
I am working on, so we can send the signal before we actually
start swapping.

> +void oom_check_fn(unsigned long unused)
> +{
> +	bool wake = 0;
> +	unsigned int swapped_pages;
> +
> +	swapped_pages = sum_vm_event(PSWPOUT);
> +	if (swapped_pages > prev_swapped_pages)
> +		wake = 1;
> +	prev_swapped_pages = swapped_pages;
> +
> +	oom_notify_status = wake;
> +
> +	if (wake)
> +		wake_up_all(&oom_wait);
> +
> +	return;
> +}

Maybe it would be better if we could do the wakeup earlier, so
we could wake up fewer processes at a time?  Maybe only one?

Thundering herd problems could be bad...

On the other hand, if memory is low on one NUMA node it would
not help at all if we woke up processes from other NUMA nodes...

-- 
All Rights Reversed

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-30 19:18 [RFC] oom notifications via /dev/oom_notify Marcelo Tosatti
  2007-10-30 20:57 ` Balbir Singh
  2007-10-30 21:00 ` Rik van Riel
@ 2007-10-30 21:07 ` Marcelo Tosatti
  2007-10-30 21:19   ` Rik van Riel
  2007-10-31 17:20   ` Dave Jones
  2007-10-30 21:59 ` Badari Pulavarty
  3 siblings, 2 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2007-10-30 21:07 UTC (permalink / raw)
  To: linux-mm; +Cc: drepper, riel, akpm, mbligh, balbir

On Tue, Oct 30, 2007 at 03:18:27PM -0400, Marcelo Tosatti wrote:
> Hi,
> 
> Following patch creates a /dev/oom_notify device which applications can
> select()/poll() to get informed of memory pressure.
> 
> The basic idea here is that applications can be part of the memory
> reclaim process. The notification is loosely defined as "please free
> some small percentage of your memory".
> 
> There is no easy way of finding whether the system is approaching a
> state where swapping is required in the reclaim paths, so a defensive
> approach is taken by using a timer with 1Hz frequency which verifies
> whether swapping has occurred.
> 
> For scenarios which require a "severe pressure notification" (please
> read Nokia's implementation at http://www.linuxjournal.com/article/8502 for
> more details), I believe the best solution is to create a separate
> /dev/oom_notify_critical device to avoid complication of the main device
> code paths. Take into account that such notification needs careful
> synchronization with the OOM killer.
> 
> Comments please...

changes:
- rearm timer (!)
- wake up one thread instead of all in swapout detection
- msecs_to_jiffies(1000) -> HZ

--- linux-2.6.orig/drivers/char/mem.c	2007-10-24 15:52:54.000000000 -0300
+++ linux-2.6/drivers/char/mem.c	2007-10-29 00:22:31.000000000 -0300
@@ -34,6 +34,8 @@
 # include <linux/efi.h>
 #endif
 
+extern struct file_operations oom_notify_fops;
+
 /*
  * Architectures vary in how they handle caching for addresses
  * outside of main memory.
@@ -854,6 +856,9 @@
 			filp->f_op = &oldmem_fops;
 			break;
 #endif
+		case 13:
+			filp->f_op = &oom_notify_fops;
+			break;
 		default:
 			return -ENXIO;
 	}
@@ -886,6 +891,7 @@
 #ifdef CONFIG_CRASH_DUMP
 	{12,"oldmem",    S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
 #endif
+	{13,"oom_notify", S_IRUGO, &oom_notify_fops},
 };
 
 static struct class *mem_class;
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
--- linux-2.6.orig/include/linux/vmstat.h	2007-10-24 15:55:30.000000000 -0300
+++ linux-2.6/include/linux/vmstat.h	2007-10-27 23:28:48.000000000 -0300
@@ -80,6 +80,7 @@
 }
 
 extern void all_vm_events(unsigned long *);
+extern unsigned int sum_vm_event(int);
 #ifdef CONFIG_HOTPLUG
 extern void vm_events_fold_cpu(int cpu);
 #else
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Kconfig linux-2.6/mm/Kconfig
--- linux-2.6.orig/mm/Kconfig	2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/Kconfig	2007-10-25 13:58:38.000000000 -0300
@@ -170,6 +170,13 @@
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.
 
+config OOM_NOTIFY
+	bool "Memory notification"
+	def_bool n
+	help
+	  This option allows the kernel to notify applications of memory 
+	  shortage.
+
 config RESOURCES_64BIT
 	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
 	default 64BIT
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Makefile linux-2.6/mm/Makefile
--- linux-2.6.orig/mm/Makefile	2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/Makefile	2007-10-25 13:54:34.000000000 -0300
@@ -30,4 +30,5 @@
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_OOM_NOTIFY) += oom_notify.o
 
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/oom_notify.c linux-2.6/mm/oom_notify.c
--- linux-2.6.orig/mm/oom_notify.c	1969-12-31 21:00:00.000000000 -0300
+++ linux-2.6/mm/oom_notify.c	2007-10-30 16:02:29.000000000 -0300
@@ -0,0 +1,97 @@
+/*
+ * Notify applications of memory pressure via /dev/oom_notify
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/vmstat.h>
+
+static int oom_notify_users = 0;
+static bool oom_notify_status = 0;
+static unsigned int prev_swapped_pages = 0;
+
+static void oom_check_fn(unsigned long);
+
+DECLARE_WAIT_QUEUE_HEAD(oom_wait);
+DEFINE_SPINLOCK(oom_notify_lock);
+static struct timer_list oom_check_timer =
+		TIMER_INITIALIZER(oom_check_fn, 0, 0);
+
+void oom_check_fn(unsigned long unused)
+{
+	bool wake = 0;
+	unsigned int swapped_pages;
+
+	swapped_pages = sum_vm_event(PSWPOUT);
+	if (swapped_pages > prev_swapped_pages)
+		wake = 1;
+	prev_swapped_pages = swapped_pages;
+
+	oom_notify_status = wake;
+
+	if (wake)
+		wake_up(&oom_wait);
+
+	mod_timer(&oom_check_timer, jiffies+HZ);
+	return;
+}
+
+static int oom_notify_open(struct inode *inode, struct file *file)
+{
+	spin_lock(&oom_notify_lock);
+	if (!oom_notify_users) {
+		oom_notify_status = 0;
+		mod_timer(&oom_check_timer, jiffies+HZ);
+	}
+	oom_notify_users++;
+	spin_unlock(&oom_notify_lock);
+
+	return 0;
+}
+
+static int oom_notify_release(struct inode *inode, struct file *file)
+{
+	spin_lock(&oom_notify_lock);
+	oom_notify_users--;
+	if (!oom_notify_users) {
+		del_timer(&oom_check_timer);
+		oom_notify_status = 0;
+	}
+	spin_unlock(&oom_notify_lock);
+
+	return 0;
+}
+
+static unsigned int oom_notify_poll(struct file *file, poll_table *wait)
+{
+	unsigned int val = 0;
+	struct zone *zone;
+	int cz_idx = zone_idx(NODE_DATA(nid)->node_zonelists->zones[0]);
+
+	poll_wait(file, &oom_wait, wait);
+
+	if (oom_notify_status)
+		val = POLLIN;
+
+	for_each_zone(zone) {
+		if (!populated_zone(zone))
+			continue;	
+		if (!zone_watermark_ok(zone, 0, zone->pages_low, cz_idx, 0)) {
+			val = POLLIN;
+			break;
+		}
+	}
+
+	return val;
+}
+
+struct file_operations oom_notify_fops = {
+	.open = oom_notify_open,
+	.release = oom_notify_release,
+	.poll = oom_notify_poll,
+};
+EXPORT_SYMBOL(oom_notify_fops);
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/vmstat.c linux-2.6/mm/vmstat.c
--- linux-2.6.orig/mm/vmstat.c	2007-10-24 15:53:02.000000000 -0300
+++ linux-2.6/mm/vmstat.c	2007-10-27 22:45:35.000000000 -0300
@@ -52,6 +52,28 @@
 }
 EXPORT_SYMBOL_GPL(all_vm_events);
 
+unsigned int sum_vm_event(int vm_event)
+{
+	int cpu = 0;
+	int i;
+	unsigned int ret = 0;
+	cpumask_t *cpumask = &cpu_online_map;
+
+	cpu = first_cpu(*cpumask);
+	while (cpu < NR_CPUS) {
+		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
+
+		cpu = next_cpu(cpu, *cpumask);
+
+		if (cpu < NR_CPUS)
+			prefetch(&per_cpu(vm_event_states, cpu));
+
+		ret += this->event[vm_event];
+	}
+	return ret;
+}
+EXPORT_SYMBOL(sum_vm_event);
+
 #ifdef CONFIG_HOTPLUG
 /*
  * Fold the foreign cpu events into our own.
diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
--- linux-2.6.orig/include/linux/vmstat.h	2007-10-24 15:55:30.000000000 -0300
+++ linux-2.6/include/linux/vmstat.h	2007-10-27 23:28:48.000000000 -0300
@@ -80,6 +80,7 @@
 }
 
 extern void all_vm_events(unsigned long *);
+extern unsigned int sum_vm_event(int);
 #ifdef CONFIG_HOTPLUG
 extern void vm_events_fold_cpu(int cpu);
 #else

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-30 21:59 ` Badari Pulavarty
@ 2007-10-30 21:12   ` Rik van Riel
  2007-10-31  4:17     ` Badari
  0 siblings, 1 reply; 16+ messages in thread
From: Rik van Riel @ 2007-10-30 21:12 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Marcelo Tosatti, linux-mm, drepper, Andrew Morton, mbligh, balbir

On Tue, 30 Oct 2007 13:59:28 -0800
Badari Pulavarty <pbadari@us.ibm.com> wrote:

> Interesting.. Our database folks wanted some kind of notification when
> there is memory pressure and we are about to kill the biggest consumer
> (in most cases, the most useful application :(). What actually they
> want is a way to get notified, so that they can shrink their memory
> footprint in response. Just notifying before OOM may not help, since
> they don't have time to react. How does this notification help ? Are
> they supposed to monitor swapping activity and decide ?

Marcelo's code monitors swapping activity and will let userspace
programs (that poll/select the device node) know when they should
shrink their memory footprint.

This is not "OOM" in the sense of "no more memory or swap", but
in the sense of "we're low on memory - if you don't free something
we'll slow you down by swapping stuff".

-- 
All Rights Reversed

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-30 21:07 ` Marcelo Tosatti
@ 2007-10-30 21:19   ` Rik van Riel
  2007-10-30 22:26     ` Marcelo Tosatti
  2007-10-31 17:20   ` Dave Jones
  1 sibling, 1 reply; 16+ messages in thread
From: Rik van Riel @ 2007-10-30 21:19 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-mm, drepper, akpm, mbligh, balbir

On Tue, 30 Oct 2007 17:07:43 -0400
Marcelo Tosatti <marcelo@kvack.org> wrote:

> > Comments please...
> 
> changes:
> - rearm timer (!)
> - wake up one thread instead of all in swapout detection
> - msecs_to_jiffies(1000) -> HZ

Would it be an idea to use round_jiffies()?

-- 
All Rights Reversed

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-30 19:18 [RFC] oom notifications via /dev/oom_notify Marcelo Tosatti
                   ` (2 preceding siblings ...)
  2007-10-30 21:07 ` Marcelo Tosatti
@ 2007-10-30 21:59 ` Badari Pulavarty
  2007-10-30 21:12   ` Rik van Riel
  3 siblings, 1 reply; 16+ messages in thread
From: Badari Pulavarty @ 2007-10-30 21:59 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-mm, drepper, riel, Andrew Morton, mbligh, balbir

On Tue, 2007-10-30 at 15:18 -0400, Marcelo Tosatti wrote:
> Hi,
> 
> Following patch creates a /dev/oom_notify device which applications can
> select()/poll() to get informed of memory pressure.
> 
> The basic idea here is that applications can be part of the memory
> reclaim process. The notification is loosely defined as "please free
> some small percentage of your memory".
> 
> There is no easy way of finding whether the system is approaching a
> state where swapping is required in the reclaim paths, so a defensive
> approach is taken by using a timer with 1Hz frequency which verifies
> whether swapping has occurred.
> 
> For scenarios which require a "severe pressure notification" (please
> read Nokia's implementation at http://www.linuxjournal.com/article/8502 for
> more details), I believe the best solution is to create a separate
> /dev/oom_notify_critical device to avoid complication of the main device
> code paths. Take into account that such notification needs careful
> synchronization with the OOM killer.
> 
> Comments please...

Interesting. Our database folks wanted some kind of notification when
there is memory pressure and we are about to kill the biggest consumer
(in most cases, the most useful application :(). What they actually
want is a way to get notified, so that they can shrink their memory
footprint in response. Just notifying before OOM may not help, since
they don't have time to react. How does this notification help? Are
they supposed to monitor swapping activity and decide?

Thanks,
Badari



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-30 20:57 ` Balbir Singh
@ 2007-10-30 22:16   ` Marcelo Tosatti
  0 siblings, 0 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2007-10-30 22:16 UTC (permalink / raw)
  To: Balbir Singh
  Cc: Marcelo Tosatti, linux-mm, drepper, riel, akpm, mbligh,
	Gautham shenoy, roland

Hi Balbir, 

Last message was lacking details and clarity, sorry.

And yes, the OOM acronym is confusing since it usually refers to OOM
killer.. mem_notify sounds way better.

On Wed, Oct 31, 2007 at 02:27:01AM +0530, Balbir Singh wrote:

> > +void oom_check_fn(unsigned long unused)
> > +{
> > +	bool wake = 0;
> > +	unsigned int swapped_pages;
> > +
> > +	swapped_pages = sum_vm_event(PSWPOUT);
> > +	if (swapped_pages > prev_swapped_pages)
> > +		wake = 1;
> > +	prev_swapped_pages = swapped_pages;
> > +
> 
> Two comments
> 
> 1. So this is a rate growth function and continues to wake
>    up tasks as long as the rate of swapout keeps growing?

Correct.

> 2. How will this function work in the absence of swap? Does
>   this feature work in the absence of swap?

In the absence of swap, PSWPOUT does not increase, therefore the function
won't wake up tasks.

> > +	oom_notify_status = wake;
> > +
> > +	if (wake)
> > +		wake_up_all(&oom_wait);
> > +
> > +	return;
> > +}
> > +
> > +static int oom_notify_open(struct inode *inode, struct file *file)
> > +{
> 
> Should we check current->oomkilladj before allowing open to proceed?
> 
> > +	spin_lock(&oom_notify_lock);
> > +	if (!oom_notify_users) {
> > +		oom_notify_status = 0;
> > +		oom_check_timer.expires = jiffies + msecs_to_jiffies(1000);
> 
> A more meaningful name for 1000, here please?

Fixed.

> > +		mod_timer(&oom_check_timer, oom_check_timer.expires);
> > +	}
> > +	oom_notify_users++;
> > +	spin_unlock(&oom_notify_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +static int oom_notify_release(struct inode *inode, struct file *file)
> > +{
> > +	spin_lock(&oom_notify_lock);
> > +	oom_notify_users--;
> > +	if (!oom_notify_users) {
> > +		del_timer(&oom_check_timer);
> > +		oom_notify_status = 0;
> > +	}
> > +	spin_unlock(&oom_notify_lock);
> > +	return 0;
> > +}
> > +
> > +static unsigned int oom_notify_poll(struct file *file, poll_table *wait)
> > +{
> > +	unsigned int val = 0;
> > +	struct zone *zone;
> > +	int cz_idx = zone_idx(NODE_DATA(nid)->node_zonelists->zones[0]);
> > +
> > +	poll_wait(file, &oom_wait, wait);
> > +
> > +	if (oom_notify_status)
> > +		val = POLLIN;
> > +
> > +	for_each_zone(zone) {
> > +		if (!populated_zone(zone))
> > +			continue;	
> > +		if (!zone_watermark_ok(zone, 0, zone->pages_low, cz_idx, 0)) {
> > +			val = POLLIN;
> > +			break;
> > +		}
> > +	}
> > +
> > +	return val;
> > +}
> > +
> > +struct file_operations oom_notify_fops = {
> > +	.open = oom_notify_open,
> > +	.release = oom_notify_release,
> > +	.poll = oom_notify_poll,
> > +};
> 
> Can we also implement an oom_notify_read() function, so that a read on
> /dev/oom_notify will give the reason for returning from select on
> /dev/oom_notify?

There are two different notifications:

1) normal memory shortage, allowing userspace to intelligently free
data.

2) critical memory shortage, allowing userspace to take an action before
the OOM killer kicks in.

1 is a fast path AND the large majority of applications only care
about it anyway... which means that I see little value in reporting
both events via the same descriptor.

However, that might be bullshit?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-30 21:19   ` Rik van Riel
@ 2007-10-30 22:26     ` Marcelo Tosatti
  0 siblings, 0 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2007-10-30 22:26 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Marcelo Tosatti, linux-mm, drepper, akpm, mbligh, balbir

On Tue, Oct 30, 2007 at 05:19:09PM -0400, Rik van Riel wrote:
> On Tue, 30 Oct 2007 17:07:43 -0400
> Marcelo Tosatti <marcelo@kvack.org> wrote:
> 
> > > Comments please...
> > 
> > changes:
> > - rearm timer (!)
> > - wake up one thread instead of all in swapout detection
> > - msecs_to_jiffies(1000) -> HZ
> 
> Would it be an idea to use round_jiffies() ?

Certainly, fixed.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-30 21:12   ` Rik van Riel
@ 2007-10-31  4:17     ` Badari
  2007-10-31  4:31       ` Rik van Riel
  2007-10-31  5:38       ` Balbir Singh
  0 siblings, 2 replies; 16+ messages in thread
From: Badari @ 2007-10-31  4:17 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Marcelo Tosatti, linux-mm, drepper, Andrew Morton, mbligh, balbir

Rik van Riel wrote:
> On Tue, 30 Oct 2007 13:59:28 -0800
> Badari Pulavarty <pbadari@us.ibm.com> wrote:
>
>   
>> Interesting.. Our database folks wanted some kind of notification when
>> there is memory pressure and we are about to kill the biggest consumer
>> (in most cases, the most useful application :(). What actually they
>> want is a way to get notified, so that they can shrink their memory
>> footprint in response. Just notifying before OOM may not help, since
>> they don't have time to react. How does this notification help ? Are
>> they supposed to monitor swapping activity and decide ?
>>     
>
> Marcelo's code monitors swapping activity and will let userspace
> programs (that poll/select the device node) know when they should
> shrink their memory footprint.
>
> This is not "OOM" in the sense of "no more memory or swap", but
> in the sense of "we're low on memory - if you don't free something
> we'll slow you down by swapping stuff".
>
>   
I think having this kind of OOM notification is a decent start. But any
application that wants to receive notifications would be more interested
in whether the kernel is swapping out any of its own data than in overall
system swapping events. I guess making it per-process or per-cgroup may
be a logical extension. I am not sure if it's really practical, though...

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-31  4:17     ` Badari
@ 2007-10-31  4:31       ` Rik van Riel
  2007-10-31 17:01         ` Badari Pulavarty
  2007-10-31  5:38       ` Balbir Singh
  1 sibling, 1 reply; 16+ messages in thread
From: Rik van Riel @ 2007-10-31  4:31 UTC (permalink / raw)
  To: Badari; +Cc: Marcelo Tosatti, linux-mm, drepper, Andrew Morton, mbligh, balbir

On Tue, 30 Oct 2007 21:17:32 -0700
Badari <pbadari@us.ibm.com> wrote:

> Rik van Riel wrote:
> > On Tue, 30 Oct 2007 13:59:28 -0800
> > Badari Pulavarty <pbadari@us.ibm.com> wrote:
> >
> >   
> >> Interesting.. Our database folks wanted some kind of notification
> >> when there is memory pressure and we are about to kill the biggest
> >> consumer (in most cases, the most useful application :(). What
> >> actually they want is a way to get notified, so that they can
> >> shrink their memory footprint in response. Just notifying before
> >> OOM may not help, since they don't have time to react. How does
> >> this notification help ? Are they supposed to monitor swapping
> >> activity and decide ? 
> >
> > Marcelo's code monitors swapping activity and will let userspace
> > programs (that poll/select the device node) know when they should
> > shrink their memory footprint.
> >
> > This is not "OOM" in the sense of "no more memory or swap", but
> > in the sense of "we're low on memory - if you don't free something
> > we'll slow you down by swapping stuff".
> >
> >   
> I think having this kind of OOM notification is a decent start. But
> any applications that
> wants to know notifications, would be more interested if kernel is 
> swapping out any of
> its data, 

Well, if the scheme is implemented "right", then what you
describe will never happen because programs will have freed
their excess memory already before any swapping happens.

Tweaking the wakeup selection by NUMA node probably makes
sense as a future enhancement, though.

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-31  4:17     ` Badari
  2007-10-31  4:31       ` Rik van Riel
@ 2007-10-31  5:38       ` Balbir Singh
  1 sibling, 0 replies; 16+ messages in thread
From: Balbir Singh @ 2007-10-31  5:38 UTC (permalink / raw)
  To: Badari
  Cc: Rik van Riel, Marcelo Tosatti, linux-mm, drepper, Andrew Morton,
	mbligh

Badari wrote:
> Rik van Riel wrote:
>> On Tue, 30 Oct 2007 13:59:28 -0800
>> Badari Pulavarty <pbadari@us.ibm.com> wrote:
>>
>>  
>>> Interesting.. Our database folks wanted some kind of notification when
>>> there is memory pressure and we are about to kill the biggest consumer
>>> (in most cases, the most useful application :(). What actually they
>>> want is a way to get notified, so that they can shrink their memory
>>> footprint in response. Just notifying before OOM may not help, since
>>> they don't have time to react. How does this notification help ? Are
>>> they supposed to monitor swapping activity and decide ?
>>>     
>>
>> Marcelo's code monitors swapping activity and will let userspace
>> programs (that poll/select the device node) know when they should
>> shrink their memory footprint.
>>
>> This is not "OOM" in the sense of "no more memory or swap", but
>> in the sense of "we're low on memory - if you don't free something
>> we'll slow you down by swapping stuff".
>>
>>   
> I think having this kind of OOM notification is a decent start. But any
> applications that
> wants to know notifications, would be more interested if kernel is
> swapping out any of
> its data, than overall system swapping events. I guess, making it
> per-process or per-cgroup
> may be logical extension. I am not sure if its really practical , though...
> 

Badari,

We have information of swapping per process in taskstats. We have
swap io count and delay. This can be easily extended to per cgroup.
It does not have the finesse of this patch, but it might meet basic
needs.

> Thanks,
> Badari


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-31 17:01         ` Badari Pulavarty
@ 2007-10-31 16:15           ` Rik van Riel
  0 siblings, 0 replies; 16+ messages in thread
From: Rik van Riel @ 2007-10-31 16:15 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Marcelo Tosatti, linux-mm, drepper, Andrew Morton, mbligh, balbir

On Wed, 31 Oct 2007 09:01:13 -0800
Badari Pulavarty <pbadari@us.ibm.com> wrote:

> > Well, if the scheme is implemented "right", then what you
> > describe will never happen because programs will have freed
> > their excess memory already before any swapping happens.
> 
> Hmm.. Most cases, application doesn't care about swapping
> activity of the kernel - unless its something to do with
> one of its own processes/threads. So having notifications
> per-process/app/cgroup is what they are looking for.

That seems awfully short sighted to me.  Additional IO has
the potential to slow any application down, simply by keeping
the disk busy.

Also, if you only send out a notification by the time it is
too late, it will be too late.  You cannot avoid IO if you
do not send out the notification until you've done IO.

If you send out the notification before IO has been done, you
don't know for sure who would have been swapped out.

> But again, how they would react to the notification is 
> an interesting thing. If they really act nice and free
> up stuff they don't need or read more crap and cause
> more swapping :(

The easiest thing an app can do when getting the notification
is to madvise(MADV_DONTNEED) any pages that are on internal
free lists, eg. free(3)d memory, memory on a garbage collector
free list, etc...

Other things that could be done are to reduce the size of a
cache, or to kick off a garbage collector run.

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-31  4:31       ` Rik van Riel
@ 2007-10-31 17:01         ` Badari Pulavarty
  2007-10-31 16:15           ` Rik van Riel
  0 siblings, 1 reply; 16+ messages in thread
From: Badari Pulavarty @ 2007-10-31 17:01 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Marcelo Tosatti, linux-mm, drepper, Andrew Morton, mbligh, balbir

On Wed, 2007-10-31 at 00:31 -0400, Rik van Riel wrote:
> On Tue, 30 Oct 2007 21:17:32 -0700
> Badari <pbadari@us.ibm.com> wrote:
> 
> > Rik van Riel wrote:
> > > On Tue, 30 Oct 2007 13:59:28 -0800
> > > Badari Pulavarty <pbadari@us.ibm.com> wrote:
> > >
> > >   
> > >> Interesting.. Our database folks wanted some kind of notification
> > >> when there is memory pressure and we are about to kill the biggest
> > >> consumer (in most cases, the most useful application :(). What
> > >> actually they want is a way to get notified, so that they can
> > >> shrink their memory footprint in response. Just notifying before
> > >> OOM may not help, since they don't have time to react. How does
> > >> this notification help ? Are they supposed to monitor swapping
> > >> activity and decide ? 
> > >
> > > Marcelo's code monitors swapping activity and will let userspace
> > > programs (that poll/select the device node) know when they should
> > > shrink their memory footprint.
> > >
> > > This is not "OOM" in the sense of "no more memory or swap", but
> > > in the sense of "we're low on memory - if you don't free something
> > > we'll slow you down by swapping stuff".
> > >
> > >   
> > I think having this kind of OOM notification is a decent start. But
> > any applications that
> > wants to know notifications, would be more interested if kernel is 
> > swapping out any of
> > its data, 
> 
> Well, if the scheme is implemented "right", then what you
> describe will never happen because programs will have freed
> their excess memory already before any swapping happens.

Hmm.. In most cases, an application doesn't care about the swapping
activity of the kernel - unless it's something to do with
one of its own processes/threads. So having notifications
per-process/app/cgroup is what they are looking for.

But again, how they would react to the notification is 
an interesting thing. If they really act nice and free
up stuff they don't need or read more crap and cause
more swapping :(

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-30 21:07 ` Marcelo Tosatti
  2007-10-30 21:19   ` Rik van Riel
@ 2007-10-31 17:20   ` Dave Jones
  2007-11-01 23:58     ` Marcelo Tosatti
  1 sibling, 1 reply; 16+ messages in thread
From: Dave Jones @ 2007-10-31 17:20 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-mm, drepper, riel, akpm, mbligh, balbir

On Tue, Oct 30, 2007 at 05:07:43PM -0400, Marcelo Tosatti wrote:
 > +		case 13:
 > +			filp->f_op = &oom_notify_fops;
 > +			break;

Don't forget to add this to Documentation/devices.txt

 > +	while (cpu < NR_CPUS) {
 > +		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
 > +
 > +		cpu = next_cpu(cpu, *cpumask);
 > +
 > +		if (cpu < NR_CPUS)
 > +			prefetch(&per_cpu(vm_event_states, cpu));
 > +
 > +		ret += this->event[vm_event];
 > +	}
 > +	return ret;
 > +}

Is the prefetching worth it?

	Dave 

-- 
http://www.codemonkey.org.uk

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] oom notifications via /dev/oom_notify
  2007-10-31 17:20   ` Dave Jones
@ 2007-11-01 23:58     ` Marcelo Tosatti
  0 siblings, 0 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2007-11-01 23:58 UTC (permalink / raw)
  To: Dave Jones; +Cc: Marcelo Tosatti, linux-mm, drepper, riel, akpm, mbligh, balbir

On Wed, Oct 31, 2007 at 01:20:10PM -0400, Dave Jones wrote:
> On Tue, Oct 30, 2007 at 05:07:43PM -0400, Marcelo Tosatti wrote:
>  > +		case 13:
>  > +			filp->f_op = &oom_notify_fops;
>  > +			break;
> 
> Don't forget to add this to Documentation/devices.txt

Done.

>  > +	while (cpu < NR_CPUS) {
>  > +		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
>  > +
>  > +		cpu = next_cpu(cpu, *cpumask);
>  > +
>  > +		if (cpu < NR_CPUS)
>  > +			prefetch(&per_cpu(vm_event_states, cpu));
>  > +
>  > +		ret += this->event[vm_event];
>  > +	}
>  > +	return ret;
>  > +}
> 
> Is the prefetching worth it?

Taking into account that this is a generic function and as such any
member of this->event[] might be accessed, no... removed. 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2007-11-01 23:58 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-10-30 19:18 [RFC] oom notifications via /dev/oom_notify Marcelo Tosatti
2007-10-30 20:57 ` Balbir Singh
2007-10-30 22:16   ` Marcelo Tosatti
2007-10-30 21:00 ` Rik van Riel
2007-10-30 21:07 ` Marcelo Tosatti
2007-10-30 21:19   ` Rik van Riel
2007-10-30 22:26     ` Marcelo Tosatti
2007-10-31 17:20   ` Dave Jones
2007-11-01 23:58     ` Marcelo Tosatti
2007-10-30 21:59 ` Badari Pulavarty
2007-10-30 21:12   ` Rik van Riel
2007-10-31  4:17     ` Badari
2007-10-31  4:31       ` Rik van Riel
2007-10-31 17:01         ` Badari Pulavarty
2007-10-31 16:15           ` Rik van Riel
2007-10-31  5:38       ` Balbir Singh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).