linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Balbir Singh <balbir@linux.vnet.ibm.com>
To: Marcelo Tosatti <marcelo@kvack.org>
Cc: linux-mm@kvack.org, drepper@redhat.com, riel@redhat.com,
	akpm@linux-foundation.org, mbligh@mbligh.org,
	Gautham shenoy <ego@in.ibm.com>
Subject: Re: [RFC] oom notifications via /dev/oom_notify
Date: Wed, 31 Oct 2007 02:27:01 +0530	[thread overview]
Message-ID: <47279A9D.70504@linux.vnet.ibm.com> (raw)
In-Reply-To: <20071030191827.GB31038@dmt>

Marcelo Tosatti wrote:
> Hi,
> 
> Following patch creates a /dev/oom_notify device which applications can
> select()/poll() to get informed of memory pressure.
> 
> The basic idea here is that applications can be part of the memory
> reclaim process. The notification is loosely defined as "please free
> some small percentage of your memory".
> 
> There is no easy way of finding whether the system is approaching a
> state where swapping is required in the reclaim paths, so a defensive
> approach is taken by using a timer with 1Hz frequency which verifies
> whether swapping has occurred.
> 
> For scenarios which require a "severe pressure notification" (please
> read Nokia's implementation at http://www.linuxjournal.com/article/8502 for
> more details), I believe the best solution is to create a separate
> /dev/oom_notify_critical device to avoid complication of the main device
> code paths. Take into account that such notification needs careful
> synchronization with the OOM killer.
> 
> Comments please...
> 
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/drivers/char/mem.c linux-2.6/drivers/char/mem.c
> --- linux-2.6.orig/drivers/char/mem.c	2007-10-24 15:52:54.000000000 -0300
> +++ linux-2.6/drivers/char/mem.c	2007-10-29 00:22:31.000000000 -0300
> @@ -34,6 +34,8 @@
>  # include <linux/efi.h>
>  #endif
> 
> +extern struct file_operations oom_notify_fops;
> +
>  /*
>   * Architectures vary in how they handle caching for addresses
>   * outside of main memory.
> @@ -854,6 +856,9 @@
>  			filp->f_op = &oldmem_fops;
>  			break;
>  #endif
> +		case 13:
> +			filp->f_op = &oom_notify_fops;
> +			break;
>  		default:
>  			return -ENXIO;
>  	}
> @@ -886,6 +891,7 @@
>  #ifdef CONFIG_CRASH_DUMP
>  	{12,"oldmem",    S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
>  #endif
> +	{13,"oom_notify", S_IRUGO, &oom_notify_fops},
>  };
> 
>  static struct class *mem_class;
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
> --- linux-2.6.orig/include/linux/vmstat.h	2007-10-24 15:55:30.000000000 -0300
> +++ linux-2.6/include/linux/vmstat.h	2007-10-27 23:28:48.000000000 -0300
> @@ -80,6 +80,7 @@
>  }
> 
>  extern void all_vm_events(unsigned long *);
> +extern unsigned int sum_vm_event(int);
>  #ifdef CONFIG_HOTPLUG
>  extern void vm_events_fold_cpu(int cpu);
>  #else
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Kconfig linux-2.6/mm/Kconfig
> --- linux-2.6.orig/mm/Kconfig	2007-10-24 15:53:02.000000000 -0300
> +++ linux-2.6/mm/Kconfig	2007-10-25 13:58:38.000000000 -0300
> @@ -170,6 +170,13 @@
>  	  example on NUMA systems to put pages nearer to the processors accessing
>  	  the page.
> 
> +config OOM_NOTIFY
> +	bool "Memory notification"
> +	def_bool n
> +	help
> +	  This option allows the kernel to notify applications of memory 
> +	  shortage.
> +
>  config RESOURCES_64BIT
>  	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
>  	default 64BIT
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/Makefile linux-2.6/mm/Makefile
> --- linux-2.6.orig/mm/Makefile	2007-10-24 15:53:02.000000000 -0300
> +++ linux-2.6/mm/Makefile	2007-10-25 13:54:34.000000000 -0300
> @@ -30,4 +30,5 @@
>  obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
> +obj-$(CONFIG_OOM_NOTIFY) += oom_notify.o
> 
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/oom_notify.c linux-2.6/mm/oom_notify.c
> --- linux-2.6.orig/mm/oom_notify.c	1969-12-31 21:00:00.000000000 -0300
> +++ linux-2.6/mm/oom_notify.c	2007-10-30 12:35:24.000000000 -0300
> @@ -0,0 +1,96 @@
> +/*
> + * Notify applications of memory pressure via /dev/oom_notify
> + */
> +
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <linux/wait.h>
> +#include <linux/poll.h>
> +#include <linux/timer.h>
> +#include <linux/spinlock.h>
> +#include <linux/vmstat.h>
> +
> +static int oom_notify_users = 0;
> +static bool oom_notify_status = 0;
> +static unsigned int prev_swapped_pages = 0;
> +
> +static void oom_check_fn(unsigned long);
> +
> +DECLARE_WAIT_QUEUE_HEAD(oom_wait);
> +DEFINE_SPINLOCK(oom_notify_lock);
> +static struct timer_list oom_check_timer =
> +		TIMER_INITIALIZER(oom_check_fn, 0, 0);
> +
> +void oom_check_fn(unsigned long unused)
> +{
> +	bool wake = 0;
> +	unsigned int swapped_pages;
> +
> +	swapped_pages = sum_vm_event(PSWPOUT);
> +	if (swapped_pages > prev_swapped_pages)
> +		wake = 1;
> +	prev_swapped_pages = swapped_pages;
> +

Two comments

1. So this is a rate growth function and continues to wake
   up tasks as long as the rate of swapout keeps growing?"
2. How will this function work in the absence of swap? Does
  this feature work in the absence of swap?


> +	oom_notify_status = wake;
> +
> +	if (wake)
> +		wake_up_all(&oom_wait);
> +
> +	return;
> +}
> +
> +static int oom_notify_open(struct inode *inode, struct file *file)
> +{

Should we check current->oomkilladj before allowing open to proceed?

> +	spin_lock(&oom_notify_lock);
> +	if (!oom_notify_users) {
> +		oom_notify_status = 0;
> +		oom_check_timer.expires = jiffies + msecs_to_jiffies(1000);

A more meaningful name for 1000, here please?

> +		mod_timer(&oom_check_timer, oom_check_timer.expires);
> +	}
> +	oom_notify_users++;
> +	spin_unlock(&oom_notify_lock);
> +
> +	return 0;
> +}
> +
> +static int oom_notify_release(struct inode *inode, struct file *file)
> +{
> +	spin_lock(&oom_notify_lock);
> +	oom_notify_users--;
> +	if (!oom_notify_users) {
> +		del_timer(&oom_check_timer);
> +		oom_notify_status = 0;
> +	}
> +	spin_unlock(&oom_notify_lock);
> +	return 0;
> +}
> +
> +static unsigned int oom_notify_poll(struct file *file, poll_table *wait)
> +{
> +	unsigned int val = 0;
> +	struct zone *zone;
> +	int cz_idx = zone_idx(NODE_DATA(nid)->node_zonelists->zones[0]);
> +
> +	poll_wait(file, &oom_wait, wait);
> +
> +	if (oom_notify_status)
> +		val = POLLIN;
> +
> +	for_each_zone(zone) {
> +		if (!populated_zone(zone))
> +			continue;	
> +		if (!zone_watermark_ok(zone, 0, zone->pages_low, cz_idx, 0)) {
> +			val = POLLIN;
> +			break;
> +		}
> +	}
> +
> +	return val;
> +}
> +
> +struct file_operations oom_notify_fops = {
> +	.open = oom_notify_open,
> +	.release = oom_notify_release,
> +	.poll = oom_notify_poll,
> +};

Can we also implement a oom_notify_read() function, so that a read on
/dev/oom_notify will give the reason for returning on select on
/dev/oom_notify.

> +EXPORT_SYMBOL(oom_notify_fops);
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/mm/vmstat.c linux-2.6/mm/vmstat.c
> --- linux-2.6.orig/mm/vmstat.c	2007-10-24 15:53:02.000000000 -0300
> +++ linux-2.6/mm/vmstat.c	2007-10-27 22:45:35.000000000 -0300
> @@ -52,6 +52,28 @@
>  }
>  EXPORT_SYMBOL_GPL(all_vm_events);
> 
> +unsigned int sum_vm_event(int vm_event)
> +{
> +	int cpu = 0;
> +	int i;
> +	unsigned int ret = 0;
> +	cpumask_t *cpumask = &cpu_online_map;
> +
> +	cpu = first_cpu(*cpumask);
> +	while (cpu < NR_CPUS) {
> +		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
> +
> +		cpu = next_cpu(cpu, *cpumask);
> +
> +		if (cpu < NR_CPUS)
> +			prefetch(&per_cpu(vm_event_states, cpu));
> +
> +		ret += this->event[vm_event];
> +	}
> +	return ret;
> +}
> +EXPORT_SYMBOL(sum_vm_event);

Is this routine CPU Hotplug safe?

> +
>  #ifdef CONFIG_HOTPLUG
>  /*
>   * Fold the foreign cpu events into our own.
> diff -Nur --exclude-from=linux-2.6/Documentation/dontdiff linux-2.6.orig/include/linux/vmstat.h linux-2.6/include/linux/vmstat.h
> --- linux-2.6.orig/include/linux/vmstat.h	2007-10-24 15:55:30.000000000 -0300
> +++ linux-2.6/include/linux/vmstat.h	2007-10-27 23:28:48.000000000 -0300
> @@ -80,6 +80,7 @@
>  }
> 
>  extern void all_vm_events(unsigned long *);
> +extern unsigned int sum_vm_event(int);
>  #ifdef CONFIG_HOTPLUG
>  extern void vm_events_fold_cpu(int cpu);
>  #else
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2007-10-30 20:57 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-10-30 19:18 [RFC] oom notifications via /dev/oom_notify Marcelo Tosatti
2007-10-30 20:57 ` Balbir Singh [this message]
2007-10-30 22:16   ` Marcelo Tosatti
2007-10-30 21:00 ` Rik van Riel
2007-10-30 21:07 ` Marcelo Tosatti
2007-10-30 21:19   ` Rik van Riel
2007-10-30 22:26     ` Marcelo Tosatti
2007-10-31 17:20   ` Dave Jones
2007-11-01 23:58     ` Marcelo Tosatti
2007-10-30 21:59 ` Badari Pulavarty
2007-10-30 21:12   ` Rik van Riel
2007-10-31  4:17     ` Badari
2007-10-31  4:31       ` Rik van Riel
2007-10-31 17:01         ` Badari Pulavarty
2007-10-31 16:15           ` Rik van Riel
2007-10-31  5:38       ` Balbir Singh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=47279A9D.70504@linux.vnet.ibm.com \
    --to=balbir@linux.vnet.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=drepper@redhat.com \
    --cc=ego@in.ibm.com \
    --cc=linux-mm@kvack.org \
    --cc=marcelo@kvack.org \
    --cc=mbligh@mbligh.org \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).