* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-17 18:51 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-17 18:51 UTC (permalink / raw)
To: Rik van Riel
Cc: Minchan Kim, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
Hello,
Ok, so here's a proof of concept patch that implements sample-base
per-process free threshold VM event watching using perf-like syscall ABI.
I'd really like to see something like this that's much more extensible and
clean than the /dev based ABIs that people have proposed so far.
Pekka
------------------->
>From a07f93fdca360b20daef4a5d66f2a5746f31f6a6 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 17 Jan 2012 17:51:48 +0200
Subject: [PATCH] vmnotify: VM event notification system
This patch implements a new sys_vmnotify_fd() system call that returns a
pollable file descriptor that can be used to watch VM events.
For example, to watch for VM event when free memory is below 99% of available
memory using 1 second sample period, you'd do something like this:
struct vmnotify_config config;
struct vmnotify_event event;
struct pollfd pollfd;
int fd;
config = (struct vmnotify_config) {
.type = VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
.sample_period_ns = 1000000000L,
.free_threshold = 99,
};
fd = sys_vmnotify_fd(&config);
pollfd.fd = fd;
pollfd.events = POLLIN;
if (poll(&pollfd, 1, -1) < 0) {
perror("poll failed");
exit(1);
}
memset(&event, 0, sizeof(event));
if (read(fd, &event, sizeof(event)) < 0) {
perror("read failed");
exit(1);
}
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
arch/x86/include/asm/unistd_64.h | 2 +
include/linux/vmnotify.h | 44 ++++++
mm/Kconfig | 6 +
mm/Makefile | 1 +
mm/vmnotify.c | 235 ++++++++++++++++++++++++++++++++
tools/testing/vmnotify/vmnotify-test.c | 68 +++++++++
6 files changed, 356 insertions(+), 0 deletions(-)
create mode 100644 include/linux/vmnotify.h
create mode 100644 mm/vmnotify.c
create mode 100644 tools/testing/vmnotify/vmnotify-test.c
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 0431f19..b0928cd 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -686,6 +686,8 @@ __SYSCALL(__NR_getcpu, sys_getcpu)
__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
#define __NR_process_vm_writev 311
__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
+#define __NR_vmnotify_fd 312
+__SYSCALL(__NR_vmnotify_fd, sys_vmnotify_fd)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/vmnotify.h b/include/linux/vmnotify.h
new file mode 100644
index 0000000..8f8642b
--- /dev/null
+++ b/include/linux/vmnotify.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_VMNOTIFY_H
+#define _LINUX_VMNOTIFY_H
+
+#include <linux/types.h>
+
+enum {
+ VMNOTIFY_TYPE_FREE_THRESHOLD = 1ULL << 0,
+ VMNOTIFY_TYPE_SAMPLE = 1ULL << 1,
+};
+
+struct vmnotify_config {
+ /*
+ * Size of the struct for ABI extensibility.
+ */
+ __u32 size;
+
+ /*
+ * Notification type bitmask
+ */
+ __u64 type;
+
+ /*
+ * Free memory threshold in percentages [1..99]
+ */
+ __u32 free_threshold;
+
+ /*
+ * Sample period in nanoseconds
+ */
+ __u64 sample_period_ns;
+};
+
+struct vmnotify_event {
+ /* Size of the struct for ABI extensibility. */
+ __u32 size;
+
+ __u64 nr_avail_pages;
+
+ __u64 nr_swap_pages;
+
+ __u64 nr_free_pages;
+};
+
+#endif /* _LINUX_VMNOTIFY_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 011b110..6631167 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -373,3 +373,9 @@ config CLEANCACHE
in a negligible performance hit.
If unsure, say Y to enable cleancache
+
+config VMNOTIFY
+ bool "Enable VM event notification system"
+ default n
+ help
+ If unsure, say N to disable vmnotify
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00e..e1b5db3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,3 +51,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_VMNOTIFY) += vmnotify.o
diff --git a/mm/vmnotify.c b/mm/vmnotify.c
new file mode 100644
index 0000000..6800450
--- /dev/null
+++ b/mm/vmnotify.c
@@ -0,0 +1,235 @@
+#include <linux/anon_inodes.h>
+#include <linux/vmnotify.h>
+#include <linux/syscalls.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+
+#define VMNOTIFY_MAX_FREE_THRESHOD 100
+
+struct vmnotify_watch {
+ struct vmnotify_config config;
+
+ struct mutex mutex;
+ bool pending;
+ struct vmnotify_event event;
+
+ /* sampling */
+ struct hrtimer timer;
+
+ /* poll */
+ wait_queue_head_t waitq;
+};
+
+static bool vmnotify_match(struct vmnotify_watch *watch, struct vmnotify_event *event)
+{
+ if (watch->config.type & VMNOTIFY_TYPE_FREE_THRESHOLD) {
+ u64 threshold;
+
+ if (!event->nr_avail_pages)
+ return false;
+
+ threshold = event->nr_free_pages * 100 / event->nr_avail_pages;
+ if (threshold > watch->config.free_threshold)
+ return false;
+ }
+
+ return true;
+}
+
+static void vmnotify_sample(struct vmnotify_watch *watch)
+{
+ struct vmnotify_event event;
+ struct sysinfo si;
+
+ memset(&event, 0, sizeof(event));
+
+ event.size = sizeof(event);
+ event.nr_free_pages = global_page_state(NR_FREE_PAGES);
+
+ si_meminfo(&si);
+ event.nr_avail_pages = si.totalram;
+
+#ifdef CONFIG_SWAP
+ si_swapinfo(&si);
+ event.nr_swap_pages = si.totalswap;
+#endif
+
+ if (!vmnotify_match(watch, &event))
+ return;
+
+ mutex_lock(&watch->mutex);
+
+ watch->pending = true;
+
+ memcpy(&watch->event, &event, sizeof(event));
+
+ mutex_unlock(&watch->mutex);
+}
+
+static enum hrtimer_restart vmnotify_timer_fn(struct hrtimer *hrtimer)
+{
+ struct vmnotify_watch *watch = container_of(hrtimer, struct vmnotify_watch, timer);
+ u64 sample_period = watch->config.sample_period_ns;
+
+ vmnotify_sample(watch);
+
+ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
+
+ wake_up(&watch->waitq);
+
+ return HRTIMER_RESTART;
+}
+
+static void vmnotify_start_timer(struct vmnotify_watch *watch)
+{
+ u64 sample_period = watch->config.sample_period_ns;
+
+ hrtimer_init(&watch->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ watch->timer.function = vmnotify_timer_fn;
+
+ hrtimer_start(&watch->timer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED);
+}
+
+static unsigned int vmnotify_poll(struct file *file, poll_table *wait)
+{
+ struct vmnotify_watch *watch = file->private_data;
+ unsigned int events = 0;
+
+ poll_wait(file, &watch->waitq, wait);
+
+ mutex_lock(&watch->mutex);
+
+ if (watch->pending)
+ events |= POLLIN;
+
+ mutex_unlock(&watch->mutex);
+
+ return events;
+}
+
+static ssize_t vmnotify_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct vmnotify_watch *watch = file->private_data;
+ int ret = 0;
+
+ mutex_lock(&watch->mutex);
+
+ if (!watch->pending)
+ goto out_unlock;
+
+ if (copy_to_user(buf, &watch->event, sizeof(struct vmnotify_event))) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = watch->event.size;
+
+ watch->pending = false;
+
+out_unlock:
+ mutex_unlock(&watch->mutex);
+
+ return ret;
+}
+
+static int vmnotify_release(struct inode *inode, struct file *file)
+{
+ struct vmnotify_watch *watch = file->private_data;
+
+ hrtimer_cancel(&watch->timer);
+
+ kfree(watch);
+
+ return 0;
+}
+
+static const struct file_operations vmnotify_fops = {
+ .poll = vmnotify_poll,
+ .read = vmnotify_read,
+ .release = vmnotify_release,
+};
+
+static struct vmnotify_watch *vmnotify_watch_alloc(void)
+{
+ struct vmnotify_watch *watch;
+
+ watch = kzalloc(sizeof *watch, GFP_KERNEL);
+ if (!watch)
+ return NULL;
+
+ mutex_init(&watch->mutex);
+
+ init_waitqueue_head(&watch->waitq);
+
+ return watch;
+}
+
+static int vmnotify_copy_config(struct vmnotify_config __user *uconfig,
+ struct vmnotify_config *config)
+{
+ int ret;
+
+ ret = copy_from_user(config, uconfig, sizeof(struct vmnotify_config));
+ if (ret)
+ return -EFAULT;
+
+ if (!config->type)
+ return -EINVAL;
+
+ if (config->type & VMNOTIFY_TYPE_SAMPLE) {
+ if (config->sample_period_ns < NSEC_PER_MSEC)
+ return -EINVAL;
+ }
+
+ if (config->type & VMNOTIFY_TYPE_FREE_THRESHOLD) {
+ if (config->free_threshold > VMNOTIFY_MAX_FREE_THRESHOD)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+SYSCALL_DEFINE1(vmnotify_fd,
+ struct vmnotify_config __user *, uconfig)
+{
+ struct vmnotify_watch *watch;
+ struct file *file;
+ int err;
+ int fd;
+
+ watch = vmnotify_watch_alloc();
+ if (!watch)
+ return -ENOMEM;
+
+ err = vmnotify_copy_config(uconfig, &watch->config);
+ if (err)
+ goto err_free;
+
+ fd = get_unused_fd_flags(O_RDONLY);
+ if (fd < 0) {
+ err = fd;
+ goto err_free;
+ }
+
+ file = anon_inode_getfile("[vmnotify]", &vmnotify_fops, watch, O_RDONLY);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_fd;
+ }
+
+ fd_install(fd, file);
+
+ if (watch->config.type & VMNOTIFY_TYPE_SAMPLE)
+ vmnotify_start_timer(watch);
+
+ return fd;
+
+err_fd:
+ put_unused_fd(fd);
+err_free:
+ kfree(watch);
+ return err;
+}
diff --git a/tools/testing/vmnotify/vmnotify-test.c b/tools/testing/vmnotify/vmnotify-test.c
new file mode 100644
index 0000000..3c6b26d
--- /dev/null
+++ b/tools/testing/vmnotify/vmnotify-test.c
@@ -0,0 +1,68 @@
+#include "../../../include/linux/vmnotify.h"
+
+#if defined(__x86_64__)
+#include "../../../arch/x86/include/asm/unistd.h"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <poll.h>
+
+static int sys_vmnotify_fd(struct vmnotify_config *config)
+{
+ config->size = sizeof(*config);
+
+ return syscall(__NR_vmnotify_fd, config);
+}
+
+int main(int argc, char *argv[])
+{
+ struct vmnotify_config config;
+ struct vmnotify_event event;
+ struct pollfd pollfd;
+ int i;
+ int fd;
+
+ config = (struct vmnotify_config) {
+ .type = VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
+ .sample_period_ns = 1000000000L,
+ .free_threshold = 99,
+ };
+
+ fd = sys_vmnotify_fd(&config);
+ if (fd < 0) {
+ perror("vmnotify_fd failed");
+ exit(1);
+ }
+
+ for (i = 0; i < 10; i++) {
+ pollfd.fd = fd;
+ pollfd.events = POLLIN;
+
+ if (poll(&pollfd, 1, -1) < 0) {
+ perror("poll failed");
+ exit(1);
+ }
+
+ memset(&event, 0, sizeof(event));
+
+ if (read(fd, &event, sizeof(event)) < 0) {
+ perror("read failed");
+ exit(1);
+ }
+
+ printf("VM event:\n");
+ printf("\tsize=%lu\n", event.size);
+ printf("\tnr_avail_pages=%Lu\n", event.nr_avail_pages);
+ printf("\tnr_swap_pages=%Lu\n", event.nr_swap_pages);
+ printf("\tnr_free_pages=%Lu\n", event.nr_free_pages);
+ }
+ if (close(fd) < 0) {
+ perror("close failed");
+ exit(1);
+ }
+
+ return 0;
+}
--
1.7.6.4
^ permalink raw reply related [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-17 18:51 ` Pekka Enberg
@ 2012-01-17 19:30 ` Rik van Riel
-1 siblings, 0 replies; 124+ messages in thread
From: Rik van Riel @ 2012-01-17 19:30 UTC (permalink / raw)
To: Pekka Enberg
Cc: Minchan Kim, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On 01/17/2012 01:51 PM, Pekka Enberg wrote:
> Hello,
>
> Ok, so here's a proof of concept patch that implements sample-base
> per-process free threshold VM event watching using perf-like syscall
> ABI. I'd really like to see something like this that's much more
> extensible and clean than the /dev based ABIs that people have proposed
> so far.
Looks like a nice extensible interface to me.
The only thing is, I expect we will not want to wake
up processes most of the time, when there is no memory
pressure, because that would just waste battery power
and/or cpu time that could be used for something else.
The desire to avoid such wakeups makes it harder to
wake up processes at arbitrary points set by the API.
Another issue is that we might be running two programs
on the system, each with a different threshold for
"lets free some of my cache". Say one program sets
the threshold at 20% free/cache memory, the other
program at 10%.
We could end up with the first process continually
throwing away its caches, while the second process
never gives its unused memory back to the kernel.
I am not sure what the right thing to do would be...
--
All rights reversed
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-17 19:30 ` Rik van Riel
0 siblings, 0 replies; 124+ messages in thread
From: Rik van Riel @ 2012-01-17 19:30 UTC (permalink / raw)
To: Pekka Enberg
Cc: Minchan Kim, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On 01/17/2012 01:51 PM, Pekka Enberg wrote:
> Hello,
>
> Ok, so here's a proof of concept patch that implements sample-base
> per-process free threshold VM event watching using perf-like syscall
> ABI. I'd really like to see something like this that's much more
> extensible and clean than the /dev based ABIs that people have proposed
> so far.
Looks like a nice extensible interface to me.
The only thing is, I expect we will not want to wake
up processes most of the time, when there is no memory
pressure, because that would just waste battery power
and/or cpu time that could be used for something else.
The desire to avoid such wakeups makes it harder to
wake up processes at arbitrary points set by the API.
Another issue is that we might be running two programs
on the system, each with a different threshold for
"lets free some of my cache". Say one program sets
the threshold at 20% free/cache memory, the other
program at 10%.
We could end up with the first process continually
throwing away its caches, while the second process
never gives its unused memory back to the kernel.
I am not sure what the right thing to do would be...
--
All rights reversed
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-17 19:30 ` Rik van Riel
@ 2012-01-17 19:49 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-17 19:49 UTC (permalink / raw)
To: Rik van Riel
Cc: Minchan Kim, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, Jan 17, 2012 at 9:30 PM, Rik van Riel <riel@redhat.com> wrote:
> Looks like a nice extensible interface to me.
>
> The only thing is, I expect we will not want to wake
> up processes most of the time, when there is no memory
> pressure, because that would just waste battery power
> and/or cpu time that could be used for something else.
>
> The desire to avoid such wakeups makes it harder to
> wake up processes at arbitrary points set by the API.
Sure. You could either bump up the threshold or use Minchan's hooks - or both.
On Tue, Jan 17, 2012 at 9:30 PM, Rik van Riel <riel@redhat.com> wrote:
> Another issue is that we might be running two programs
> on the system, each with a different threshold for
> "lets free some of my cache". Say one program sets
> the threshold at 20% free/cache memory, the other
> program at 10%.
>
> We could end up with the first process continually
> throwing away its caches, while the second process
> never gives its unused memory back to the kernel.
>
> I am not sure what the right thing to do would be...
One option is to use per-process thresholds on RSS, for example, and
also support system-wide thresholds.
That said, I'd really like to see the N9 and Android policies
supported with this ABI. It's much easier to make it generic once we
support real-world use cases.
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-17 19:49 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-17 19:49 UTC (permalink / raw)
To: Rik van Riel
Cc: Minchan Kim, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, Jan 17, 2012 at 9:30 PM, Rik van Riel <riel@redhat.com> wrote:
> Looks like a nice extensible interface to me.
>
> The only thing is, I expect we will not want to wake
> up processes most of the time, when there is no memory
> pressure, because that would just waste battery power
> and/or cpu time that could be used for something else.
>
> The desire to avoid such wakeups makes it harder to
> wake up processes at arbitrary points set by the API.
Sure. You could either bump up the threshold or use Minchan's hooks - or both.
On Tue, Jan 17, 2012 at 9:30 PM, Rik van Riel <riel@redhat.com> wrote:
> Another issue is that we might be running two programs
> on the system, each with a different threshold for
> "lets free some of my cache". Say one program sets
> the threshold at 20% free/cache memory, the other
> program at 10%.
>
> We could end up with the first process continually
> throwing away its caches, while the second process
> never gives its unused memory back to the kernel.
>
> I am not sure what the right thing to do would be...
One option is to use per-process thresholds on RSS, for example, and
also support system-wide thresholds.
That said, I'd really like to see the N9 and Android policies
supported with this ABI. It's much easier to make it generic once we
support real-world use cases.
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-17 19:49 ` Pekka Enberg
@ 2012-01-17 19:54 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-17 19:54 UTC (permalink / raw)
To: Rik van Riel
Cc: Minchan Kim, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, Jan 17, 2012 at 9:49 PM, Pekka Enberg <penberg@kernel.org> wrote:
> That said, I'd really like to see the N9 and Android policies
> supported with this ABI. It's much easier to make it generic once we
> support real-world use cases.
If people are interested in hacking on the thing, I pushed the commit
in 'vmnotify/core' branch of
git://github.com/penberg/linux.git
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-17 19:54 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-17 19:54 UTC (permalink / raw)
To: Rik van Riel
Cc: Minchan Kim, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, Jan 17, 2012 at 9:49 PM, Pekka Enberg <penberg@kernel.org> wrote:
> That said, I'd really like to see the N9 and Android policies
> supported with this ABI. It's much easier to make it generic once we
> support real-world use cases.
If people are interested in hacking on the thing, I pushed the commit
in 'vmnotify/core' branch of
git://github.com/penberg/linux.git
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-17 19:49 ` Pekka Enberg
@ 2012-01-17 19:57 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-17 19:57 UTC (permalink / raw)
To: Rik van Riel
Cc: Minchan Kim, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, Jan 17, 2012 at 9:49 PM, Pekka Enberg <penberg@kernel.org> wrote:
>> The desire to avoid such wakeups makes it harder to
>> wake up processes at arbitrary points set by the API.
>
> Sure. You could either bump up the threshold or use Minchan's hooks - or both.
s/threshold/sample period/g
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-17 19:57 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-17 19:57 UTC (permalink / raw)
To: Rik van Riel
Cc: Minchan Kim, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, Jan 17, 2012 at 9:49 PM, Pekka Enberg <penberg@kernel.org> wrote:
>> The desire to avoid such wakeups makes it harder to
>> wake up processes at arbitrary points set by the API.
>
> Sure. You could either bump up the threshold or use Minchan's hooks - or both.
s/threshold/sample period/g
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-17 18:51 ` Pekka Enberg
@ 2012-01-17 23:20 ` Minchan Kim
-1 siblings, 0 replies; 124+ messages in thread
From: Minchan Kim @ 2012-01-17 23:20 UTC (permalink / raw)
To: Pekka Enberg
Cc: Rik van Riel, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, Jan 17, 2012 at 08:51:13PM +0200, Pekka Enberg wrote:
> Hello,
>
> Ok, so here's a proof of concept patch that implements sample-base
> per-process free threshold VM event watching using perf-like syscall
> ABI. I'd really like to see something like this that's much more
> extensible and clean than the /dev based ABIs that people have
> proposed so far.
>
> Pekka
>
> ------------------->
>
> From a07f93fdca360b20daef4a5d66f2a5746f31f6a6 Mon Sep 17 00:00:00 2001
> From: Pekka Enberg <penberg@kernel.org>
> Date: Tue, 17 Jan 2012 17:51:48 +0200
> Subject: [PATCH] vmnotify: VM event notification system
>
> This patch implements a new sys_vmnotify_fd() system call that returns a
> pollable file descriptor that can be used to watch VM events.
>
> For example, to watch for VM event when free memory is below 99% of available
> memory using 1 second sample period, you'd do something like this:
>
> struct vmnotify_config config;
> struct vmnotify_event event;
> struct pollfd pollfd;
> int fd;
>
> config = (struct vmnotify_config) {
> .type = VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
> .sample_period_ns = 1000000000L,
> .free_threshold = 99,
> };
>
> fd = sys_vmnotify_fd(&config);
>
> pollfd.fd = fd;
> pollfd.events = POLLIN;
>
> if (poll(&pollfd, 1, -1) < 0) {
> perror("poll failed");
> exit(1);
> }
>
> memset(&event, 0, sizeof(event));
>
> if (read(fd, &event, sizeof(event)) < 0) {
> perror("read failed");
> exit(1);
> }
Hi Pekka,
I didn't look into your code(will do) but as I read description,
still I don't convince we need really some process specific threshold like 99%
I think application can know it by polling /proc/meminfo without this mechanism
if they really want.
I would like to notify when system has a trobule with memory pressure without
some process specific threshold. Of course, applicatoin can't expect it.(ie,
application can know system memory pressure by /proc/meminfo but it can't know
when swapout really happens). Kernel low mem notify have to give such notification
to user space, I think.
>
> Signed-off-by: Pekka Enberg <penberg@kernel.org>
> ---
> arch/x86/include/asm/unistd_64.h | 2 +
> include/linux/vmnotify.h | 44 ++++++
> mm/Kconfig | 6 +
> mm/Makefile | 1 +
> mm/vmnotify.c | 235 ++++++++++++++++++++++++++++++++
> tools/testing/vmnotify/vmnotify-test.c | 68 +++++++++
> 6 files changed, 356 insertions(+), 0 deletions(-)
> create mode 100644 include/linux/vmnotify.h
> create mode 100644 mm/vmnotify.c
> create mode 100644 tools/testing/vmnotify/vmnotify-test.c
>
> diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
> index 0431f19..b0928cd 100644
> --- a/arch/x86/include/asm/unistd_64.h
> +++ b/arch/x86/include/asm/unistd_64.h
> @@ -686,6 +686,8 @@ __SYSCALL(__NR_getcpu, sys_getcpu)
> __SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
> #define __NR_process_vm_writev 311
> __SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
> +#define __NR_vmnotify_fd 312
> +__SYSCALL(__NR_vmnotify_fd, sys_vmnotify_fd)
>
> #ifndef __NO_STUBS
> #define __ARCH_WANT_OLD_READDIR
> diff --git a/include/linux/vmnotify.h b/include/linux/vmnotify.h
> new file mode 100644
> index 0000000..8f8642b
> --- /dev/null
> +++ b/include/linux/vmnotify.h
> @@ -0,0 +1,44 @@
> +#ifndef _LINUX_VMNOTIFY_H
> +#define _LINUX_VMNOTIFY_H
> +
> +#include <linux/types.h>
> +
> +enum {
> + VMNOTIFY_TYPE_FREE_THRESHOLD = 1ULL << 0,
> + VMNOTIFY_TYPE_SAMPLE = 1ULL << 1,
> +};
> +
> +struct vmnotify_config {
> + /*
> + * Size of the struct for ABI extensibility.
> + */
> + __u32 size;
> +
> + /*
> + * Notification type bitmask
> + */
> + __u64 type;
> +
> + /*
> + * Free memory threshold in percentages [1..99]
> + */
> + __u32 free_threshold;
> +
> + /*
> + * Sample period in nanoseconds
> + */
> + __u64 sample_period_ns;
> +};
> +
> +struct vmnotify_event {
> + /* Size of the struct for ABI extensibility. */
> + __u32 size;
> +
> + __u64 nr_avail_pages;
> +
> + __u64 nr_swap_pages;
> +
> + __u64 nr_free_pages;
> +};
> +
> +#endif /* _LINUX_VMNOTIFY_H */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 011b110..6631167 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -373,3 +373,9 @@ config CLEANCACHE
> in a negligible performance hit.
>
> If unsure, say Y to enable cleancache
> +
> +config VMNOTIFY
> + bool "Enable VM event notification system"
> + default n
> + help
> + If unsure, say N to disable vmnotify
> diff --git a/mm/Makefile b/mm/Makefile
> index 50ec00e..e1b5db3 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -51,3 +51,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
> obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
> obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
> obj-$(CONFIG_CLEANCACHE) += cleancache.o
> +obj-$(CONFIG_VMNOTIFY) += vmnotify.o
> diff --git a/mm/vmnotify.c b/mm/vmnotify.c
> new file mode 100644
> index 0000000..6800450
> --- /dev/null
> +++ b/mm/vmnotify.c
> @@ -0,0 +1,235 @@
> +#include <linux/anon_inodes.h>
> +#include <linux/vmnotify.h>
> +#include <linux/syscalls.h>
> +#include <linux/file.h>
> +#include <linux/list.h>
> +#include <linux/poll.h>
> +#include <linux/slab.h>
> +#include <linux/swap.h>
> +
> +#define VMNOTIFY_MAX_FREE_THRESHOD 100
> +
> +struct vmnotify_watch {
> + struct vmnotify_config config;
> +
> + struct mutex mutex;
> + bool pending;
> + struct vmnotify_event event;
> +
> + /* sampling */
> + struct hrtimer timer;
> +
> + /* poll */
> + wait_queue_head_t waitq;
> +};
> +
> +static bool vmnotify_match(struct vmnotify_watch *watch, struct vmnotify_event *event)
> +{
> + if (watch->config.type & VMNOTIFY_TYPE_FREE_THRESHOLD) {
> + u64 threshold;
> +
> + if (!event->nr_avail_pages)
> + return false;
> +
> + threshold = event->nr_free_pages * 100 / event->nr_avail_pages;
> + if (threshold > watch->config.free_threshold)
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static void vmnotify_sample(struct vmnotify_watch *watch)
> +{
> + struct vmnotify_event event;
> + struct sysinfo si;
> +
> + memset(&event, 0, sizeof(event));
> +
> + event.size = sizeof(event);
> + event.nr_free_pages = global_page_state(NR_FREE_PAGES);
> +
> + si_meminfo(&si);
> + event.nr_avail_pages = si.totalram;
> +
> +#ifdef CONFIG_SWAP
> + si_swapinfo(&si);
> + event.nr_swap_pages = si.totalswap;
> +#endif
> +
> + if (!vmnotify_match(watch, &event))
> + return;
> +
> + mutex_lock(&watch->mutex);
> +
> + watch->pending = true;
> +
> + memcpy(&watch->event, &event, sizeof(event));
> +
> + mutex_unlock(&watch->mutex);
> +}
> +
> +static enum hrtimer_restart vmnotify_timer_fn(struct hrtimer *hrtimer)
> +{
> + struct vmnotify_watch *watch = container_of(hrtimer, struct vmnotify_watch, timer);
> + u64 sample_period = watch->config.sample_period_ns;
> +
> + vmnotify_sample(watch);
> +
> + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
> +
> + wake_up(&watch->waitq);
> +
> + return HRTIMER_RESTART;
> +}
> +
> +static void vmnotify_start_timer(struct vmnotify_watch *watch)
> +{
> + u64 sample_period = watch->config.sample_period_ns;
> +
> + hrtimer_init(&watch->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + watch->timer.function = vmnotify_timer_fn;
> +
> + hrtimer_start(&watch->timer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static unsigned int vmnotify_poll(struct file *file, poll_table *wait)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> + unsigned int events = 0;
> +
> + poll_wait(file, &watch->waitq, wait);
> +
> + mutex_lock(&watch->mutex);
> +
> + if (watch->pending)
> + events |= POLLIN;
> +
> + mutex_unlock(&watch->mutex);
> +
> + return events;
> +}
> +
> +static ssize_t vmnotify_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> + int ret = 0;
> +
> + mutex_lock(&watch->mutex);
> +
> + if (!watch->pending)
> + goto out_unlock;
> +
> + if (copy_to_user(buf, &watch->event, sizeof(struct vmnotify_event))) {
> + ret = -EFAULT;
> + goto out_unlock;
> + }
> +
> + ret = watch->event.size;
> +
> + watch->pending = false;
> +
> +out_unlock:
> + mutex_unlock(&watch->mutex);
> +
> + return ret;
> +}
> +
> +static int vmnotify_release(struct inode *inode, struct file *file)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> +
> + hrtimer_cancel(&watch->timer);
> +
> + kfree(watch);
> +
> + return 0;
> +}
> +
> +static const struct file_operations vmnotify_fops = {
> + .poll = vmnotify_poll,
> + .read = vmnotify_read,
> + .release = vmnotify_release,
> +};
> +
> +static struct vmnotify_watch *vmnotify_watch_alloc(void)
> +{
> + struct vmnotify_watch *watch;
> +
> + watch = kzalloc(sizeof *watch, GFP_KERNEL);
> + if (!watch)
> + return NULL;
> +
> + mutex_init(&watch->mutex);
> +
> + init_waitqueue_head(&watch->waitq);
> +
> + return watch;
> +}
> +
> +static int vmnotify_copy_config(struct vmnotify_config __user *uconfig,
> + struct vmnotify_config *config)
> +{
> + int ret;
> +
> + ret = copy_from_user(config, uconfig, sizeof(struct vmnotify_config));
> + if (ret)
> + return -EFAULT;
> +
> + if (!config->type)
> + return -EINVAL;
> +
> + if (config->type & VMNOTIFY_TYPE_SAMPLE) {
> + if (config->sample_period_ns < NSEC_PER_MSEC)
> + return -EINVAL;
> + }
> +
> + if (config->type & VMNOTIFY_TYPE_FREE_THRESHOLD) {
> + if (config->free_threshold > VMNOTIFY_MAX_FREE_THRESHOD)
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +SYSCALL_DEFINE1(vmnotify_fd,
> + struct vmnotify_config __user *, uconfig)
> +{
> + struct vmnotify_watch *watch;
> + struct file *file;
> + int err;
> + int fd;
> +
> + watch = vmnotify_watch_alloc();
> + if (!watch)
> + return -ENOMEM;
> +
> + err = vmnotify_copy_config(uconfig, &watch->config);
> + if (err)
> + goto err_free;
> +
> + fd = get_unused_fd_flags(O_RDONLY);
> + if (fd < 0) {
> + err = fd;
> + goto err_free;
> + }
> +
> + file = anon_inode_getfile("[vmnotify]", &vmnotify_fops, watch, O_RDONLY);
> + if (IS_ERR(file)) {
> + err = PTR_ERR(file);
> + goto err_fd;
> + }
> +
> + fd_install(fd, file);
> +
> + if (watch->config.type & VMNOTIFY_TYPE_SAMPLE)
> + vmnotify_start_timer(watch);
> +
> + return fd;
> +
> +err_fd:
> + put_unused_fd(fd);
> +err_free:
> + kfree(watch);
> + return err;
> +}
> diff --git a/tools/testing/vmnotify/vmnotify-test.c b/tools/testing/vmnotify/vmnotify-test.c
> new file mode 100644
> index 0000000..3c6b26d
> --- /dev/null
> +++ b/tools/testing/vmnotify/vmnotify-test.c
> @@ -0,0 +1,68 @@
> +#include "../../../include/linux/vmnotify.h"
> +
> +#if defined(__x86_64__)
> +#include "../../../arch/x86/include/asm/unistd.h"
> +#endif
> +
> +#include <stdlib.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <poll.h>
> +
> +static int sys_vmnotify_fd(struct vmnotify_config *config)
> +{
> + config->size = sizeof(*config);
> +
> + return syscall(__NR_vmnotify_fd, config);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + struct vmnotify_config config;
> + struct vmnotify_event event;
> + struct pollfd pollfd;
> + int i;
> + int fd;
> +
> + config = (struct vmnotify_config) {
> + .type = VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
> + .sample_period_ns = 1000000000L,
> + .free_threshold = 99,
> + };
> +
> + fd = sys_vmnotify_fd(&config);
> + if (fd < 0) {
> + perror("vmnotify_fd failed");
> + exit(1);
> + }
> +
> + for (i = 0; i < 10; i++) {
> + pollfd.fd = fd;
> + pollfd.events = POLLIN;
> +
> + if (poll(&pollfd, 1, -1) < 0) {
> + perror("poll failed");
> + exit(1);
> + }
> +
> + memset(&event, 0, sizeof(event));
> +
> + if (read(fd, &event, sizeof(event)) < 0) {
> + perror("read failed");
> + exit(1);
> + }
> +
> + printf("VM event:\n");
> + printf("\tsize=%lu\n", event.size);
> + printf("\tnr_avail_pages=%Lu\n", event.nr_avail_pages);
> + printf("\tnr_swap_pages=%Lu\n", event.nr_swap_pages);
> + printf("\tnr_free_pages=%Lu\n", event.nr_free_pages);
> + }
> + if (close(fd) < 0) {
> + perror("close failed");
> + exit(1);
> + }
> +
> + return 0;
> +}
> --
> 1.7.6.4
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-17 23:20 ` Minchan Kim
0 siblings, 0 replies; 124+ messages in thread
From: Minchan Kim @ 2012-01-17 23:20 UTC (permalink / raw)
To: Pekka Enberg
Cc: Rik van Riel, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, Jan 17, 2012 at 08:51:13PM +0200, Pekka Enberg wrote:
> Hello,
>
> Ok, so here's a proof of concept patch that implements sample-base
> per-process free threshold VM event watching using perf-like syscall
> ABI. I'd really like to see something like this that's much more
> extensible and clean than the /dev based ABIs that people have
> proposed so far.
>
> Pekka
>
> ------------------->
>
> From a07f93fdca360b20daef4a5d66f2a5746f31f6a6 Mon Sep 17 00:00:00 2001
> From: Pekka Enberg <penberg@kernel.org>
> Date: Tue, 17 Jan 2012 17:51:48 +0200
> Subject: [PATCH] vmnotify: VM event notification system
>
> This patch implements a new sys_vmnotify_fd() system call that returns a
> pollable file descriptor that can be used to watch VM events.
>
> For example, to watch for VM event when free memory is below 99% of available
> memory using 1 second sample period, you'd do something like this:
>
> struct vmnotify_config config;
> struct vmnotify_event event;
> struct pollfd pollfd;
> int fd;
>
> config = (struct vmnotify_config) {
> .type = VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
> .sample_period_ns = 1000000000L,
> .free_threshold = 99,
> };
>
> fd = sys_vmnotify_fd(&config);
>
> pollfd.fd = fd;
> pollfd.events = POLLIN;
>
> if (poll(&pollfd, 1, -1) < 0) {
> perror("poll failed");
> exit(1);
> }
>
> memset(&event, 0, sizeof(event));
>
> if (read(fd, &event, sizeof(event)) < 0) {
> perror("read failed");
> exit(1);
> }
Hi Pekka,
I didn't look into your code(will do) but as I read description,
still I don't convince we need really some process specific threshold like 99%
I think application can know it by polling /proc/meminfo without this mechanism
if they really want.
I would like to notify when system has a trobule with memory pressure without
some process specific threshold. Of course, applicatoin can't expect it.(ie,
application can know system memory pressure by /proc/meminfo but it can't know
when swapout really happens). Kernel low mem notify have to give such notification
to user space, I think.
>
> Signed-off-by: Pekka Enberg <penberg@kernel.org>
> ---
> arch/x86/include/asm/unistd_64.h | 2 +
> include/linux/vmnotify.h | 44 ++++++
> mm/Kconfig | 6 +
> mm/Makefile | 1 +
> mm/vmnotify.c | 235 ++++++++++++++++++++++++++++++++
> tools/testing/vmnotify/vmnotify-test.c | 68 +++++++++
> 6 files changed, 356 insertions(+), 0 deletions(-)
> create mode 100644 include/linux/vmnotify.h
> create mode 100644 mm/vmnotify.c
> create mode 100644 tools/testing/vmnotify/vmnotify-test.c
>
> diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
> index 0431f19..b0928cd 100644
> --- a/arch/x86/include/asm/unistd_64.h
> +++ b/arch/x86/include/asm/unistd_64.h
> @@ -686,6 +686,8 @@ __SYSCALL(__NR_getcpu, sys_getcpu)
> __SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
> #define __NR_process_vm_writev 311
> __SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
> +#define __NR_vmnotify_fd 312
> +__SYSCALL(__NR_vmnotify_fd, sys_vmnotify_fd)
>
> #ifndef __NO_STUBS
> #define __ARCH_WANT_OLD_READDIR
> diff --git a/include/linux/vmnotify.h b/include/linux/vmnotify.h
> new file mode 100644
> index 0000000..8f8642b
> --- /dev/null
> +++ b/include/linux/vmnotify.h
> @@ -0,0 +1,44 @@
> +#ifndef _LINUX_VMNOTIFY_H
> +#define _LINUX_VMNOTIFY_H
> +
> +#include <linux/types.h>
> +
> +enum {
> + VMNOTIFY_TYPE_FREE_THRESHOLD = 1ULL << 0,
> + VMNOTIFY_TYPE_SAMPLE = 1ULL << 1,
> +};
> +
> +struct vmnotify_config {
> + /*
> + * Size of the struct for ABI extensibility.
> + */
> + __u32 size;
> +
> + /*
> + * Notification type bitmask
> + */
> + __u64 type;
> +
> + /*
> + * Free memory threshold in percentages [1..99]
> + */
> + __u32 free_threshold;
> +
> + /*
> + * Sample period in nanoseconds
> + */
> + __u64 sample_period_ns;
> +};
> +
> +struct vmnotify_event {
> + /* Size of the struct for ABI extensibility. */
> + __u32 size;
> +
> + __u64 nr_avail_pages;
> +
> + __u64 nr_swap_pages;
> +
> + __u64 nr_free_pages;
> +};
> +
> +#endif /* _LINUX_VMNOTIFY_H */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 011b110..6631167 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -373,3 +373,9 @@ config CLEANCACHE
> in a negligible performance hit.
>
> If unsure, say Y to enable cleancache
> +
> +config VMNOTIFY
> + bool "Enable VM event notification system"
> + default n
> + help
> + If unsure, say N to disable vmnotify
> diff --git a/mm/Makefile b/mm/Makefile
> index 50ec00e..e1b5db3 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -51,3 +51,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
> obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
> obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
> obj-$(CONFIG_CLEANCACHE) += cleancache.o
> +obj-$(CONFIG_VMNOTIFY) += vmnotify.o
> diff --git a/mm/vmnotify.c b/mm/vmnotify.c
> new file mode 100644
> index 0000000..6800450
> --- /dev/null
> +++ b/mm/vmnotify.c
> @@ -0,0 +1,235 @@
> +#include <linux/anon_inodes.h>
> +#include <linux/vmnotify.h>
> +#include <linux/syscalls.h>
> +#include <linux/file.h>
> +#include <linux/list.h>
> +#include <linux/poll.h>
> +#include <linux/slab.h>
> +#include <linux/swap.h>
> +
> +#define VMNOTIFY_MAX_FREE_THRESHOD 100
> +
> +struct vmnotify_watch {
> + struct vmnotify_config config;
> +
> + struct mutex mutex;
> + bool pending;
> + struct vmnotify_event event;
> +
> + /* sampling */
> + struct hrtimer timer;
> +
> + /* poll */
> + wait_queue_head_t waitq;
> +};
> +
> +static bool vmnotify_match(struct vmnotify_watch *watch, struct vmnotify_event *event)
> +{
> + if (watch->config.type & VMNOTIFY_TYPE_FREE_THRESHOLD) {
> + u64 threshold;
> +
> + if (!event->nr_avail_pages)
> + return false;
> +
> + threshold = event->nr_free_pages * 100 / event->nr_avail_pages;
> + if (threshold > watch->config.free_threshold)
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static void vmnotify_sample(struct vmnotify_watch *watch)
> +{
> + struct vmnotify_event event;
> + struct sysinfo si;
> +
> + memset(&event, 0, sizeof(event));
> +
> + event.size = sizeof(event);
> + event.nr_free_pages = global_page_state(NR_FREE_PAGES);
> +
> + si_meminfo(&si);
> + event.nr_avail_pages = si.totalram;
> +
> +#ifdef CONFIG_SWAP
> + si_swapinfo(&si);
> + event.nr_swap_pages = si.totalswap;
> +#endif
> +
> + if (!vmnotify_match(watch, &event))
> + return;
> +
> + mutex_lock(&watch->mutex);
> +
> + watch->pending = true;
> +
> + memcpy(&watch->event, &event, sizeof(event));
> +
> + mutex_unlock(&watch->mutex);
> +}
> +
> +static enum hrtimer_restart vmnotify_timer_fn(struct hrtimer *hrtimer)
> +{
> + struct vmnotify_watch *watch = container_of(hrtimer, struct vmnotify_watch, timer);
> + u64 sample_period = watch->config.sample_period_ns;
> +
> + vmnotify_sample(watch);
> +
> + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
> +
> + wake_up(&watch->waitq);
> +
> + return HRTIMER_RESTART;
> +}
> +
> +static void vmnotify_start_timer(struct vmnotify_watch *watch)
> +{
> + u64 sample_period = watch->config.sample_period_ns;
> +
> + hrtimer_init(&watch->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + watch->timer.function = vmnotify_timer_fn;
> +
> + hrtimer_start(&watch->timer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static unsigned int vmnotify_poll(struct file *file, poll_table *wait)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> + unsigned int events = 0;
> +
> + poll_wait(file, &watch->waitq, wait);
> +
> + mutex_lock(&watch->mutex);
> +
> + if (watch->pending)
> + events |= POLLIN;
> +
> + mutex_unlock(&watch->mutex);
> +
> + return events;
> +}
> +
> +static ssize_t vmnotify_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> + int ret = 0;
> +
> + mutex_lock(&watch->mutex);
> +
> + if (!watch->pending)
> + goto out_unlock;
> +
> + if (copy_to_user(buf, &watch->event, sizeof(struct vmnotify_event))) {
> + ret = -EFAULT;
> + goto out_unlock;
> + }
> +
> + ret = watch->event.size;
> +
> + watch->pending = false;
> +
> +out_unlock:
> + mutex_unlock(&watch->mutex);
> +
> + return ret;
> +}
> +
> +static int vmnotify_release(struct inode *inode, struct file *file)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> +
> + hrtimer_cancel(&watch->timer);
> +
> + kfree(watch);
> +
> + return 0;
> +}
> +
> +static const struct file_operations vmnotify_fops = {
> + .poll = vmnotify_poll,
> + .read = vmnotify_read,
> + .release = vmnotify_release,
> +};
> +
> +static struct vmnotify_watch *vmnotify_watch_alloc(void)
> +{
> + struct vmnotify_watch *watch;
> +
> + watch = kzalloc(sizeof *watch, GFP_KERNEL);
> + if (!watch)
> + return NULL;
> +
> + mutex_init(&watch->mutex);
> +
> + init_waitqueue_head(&watch->waitq);
> +
> + return watch;
> +}
> +
> +static int vmnotify_copy_config(struct vmnotify_config __user *uconfig,
> + struct vmnotify_config *config)
> +{
> + int ret;
> +
> + ret = copy_from_user(config, uconfig, sizeof(struct vmnotify_config));
> + if (ret)
> + return -EFAULT;
> +
> + if (!config->type)
> + return -EINVAL;
> +
> + if (config->type & VMNOTIFY_TYPE_SAMPLE) {
> + if (config->sample_period_ns < NSEC_PER_MSEC)
> + return -EINVAL;
> + }
> +
> + if (config->type & VMNOTIFY_TYPE_FREE_THRESHOLD) {
> + if (config->free_threshold > VMNOTIFY_MAX_FREE_THRESHOD)
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +SYSCALL_DEFINE1(vmnotify_fd,
> + struct vmnotify_config __user *, uconfig)
> +{
> + struct vmnotify_watch *watch;
> + struct file *file;
> + int err;
> + int fd;
> +
> + watch = vmnotify_watch_alloc();
> + if (!watch)
> + return -ENOMEM;
> +
> + err = vmnotify_copy_config(uconfig, &watch->config);
> + if (err)
> + goto err_free;
> +
> + fd = get_unused_fd_flags(O_RDONLY);
> + if (fd < 0) {
> + err = fd;
> + goto err_free;
> + }
> +
> + file = anon_inode_getfile("[vmnotify]", &vmnotify_fops, watch, O_RDONLY);
> + if (IS_ERR(file)) {
> + err = PTR_ERR(file);
> + goto err_fd;
> + }
> +
> + fd_install(fd, file);
> +
> + if (watch->config.type & VMNOTIFY_TYPE_SAMPLE)
> + vmnotify_start_timer(watch);
> +
> + return fd;
> +
> +err_fd:
> + put_unused_fd(fd);
> +err_free:
> + kfree(watch);
> + return err;
> +}
> diff --git a/tools/testing/vmnotify/vmnotify-test.c b/tools/testing/vmnotify/vmnotify-test.c
> new file mode 100644
> index 0000000..3c6b26d
> --- /dev/null
> +++ b/tools/testing/vmnotify/vmnotify-test.c
> @@ -0,0 +1,68 @@
> +#include "../../../include/linux/vmnotify.h"
> +
> +#if defined(__x86_64__)
> +#include "../../../arch/x86/include/asm/unistd.h"
> +#endif
> +
> +#include <stdlib.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <poll.h>
> +
> +static int sys_vmnotify_fd(struct vmnotify_config *config)
> +{
> + config->size = sizeof(*config);
> +
> + return syscall(__NR_vmnotify_fd, config);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + struct vmnotify_config config;
> + struct vmnotify_event event;
> + struct pollfd pollfd;
> + int i;
> + int fd;
> +
> + config = (struct vmnotify_config) {
> + .type = VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
> + .sample_period_ns = 1000000000L,
> + .free_threshold = 99,
> + };
> +
> + fd = sys_vmnotify_fd(&config);
> + if (fd < 0) {
> + perror("vmnotify_fd failed");
> + exit(1);
> + }
> +
> + for (i = 0; i < 10; i++) {
> + pollfd.fd = fd;
> + pollfd.events = POLLIN;
> +
> + if (poll(&pollfd, 1, -1) < 0) {
> + perror("poll failed");
> + exit(1);
> + }
> +
> + memset(&event, 0, sizeof(event));
> +
> + if (read(fd, &event, sizeof(event)) < 0) {
> + perror("read failed");
> + exit(1);
> + }
> +
> + printf("VM event:\n");
> + printf("\tsize=%lu\n", event.size);
> + printf("\tnr_avail_pages=%Lu\n", event.nr_avail_pages);
> + printf("\tnr_swap_pages=%Lu\n", event.nr_swap_pages);
> + printf("\tnr_free_pages=%Lu\n", event.nr_free_pages);
> + }
> + if (close(fd) < 0) {
> + perror("close failed");
> + exit(1);
> + }
> +
> + return 0;
> +}
> --
> 1.7.6.4
>
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-17 23:20 ` Minchan Kim
@ 2012-01-18 7:16 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-18 7:16 UTC (permalink / raw)
To: Minchan Kim
Cc: Rik van Riel, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Wed, 18 Jan 2012, Minchan Kim wrote:
> I didn't look into your code(will do) but as I read description,
> still I don't convince we need really some process specific threshold like 99%
> I think application can know it by polling /proc/meminfo without this mechanism
> if they really want.
I'm not sure if we need arbitrary threshold either. However, we need to
support the following cases:
- We're about to swap
- We're about to run out of memory
- We're about to start OOM killing
and I don't think your patch solves that. One possibility is to implement:
VMNOTIFY_TYPE_ABOUT_TO_SWAP
VMNOTIFY_TYPE_ABOUT_TO_OOM
VMNOTIFY_TYPE_ABOUT_TO_OOM_KILL
and maybe rip out support for arbitrary thresholds. Does that more
reasonable?
As for polling /proc/meminfo, I'd much rather deliver stats as part of
vmnotify_read() because it's easier to extend the ABI rather than adding
new fields to /proc/meminfo.
On Wed, 18 Jan 2012, Minchan Kim wrote:
> I would like to notify when system has a trobule with memory pressure without
> some process specific threshold. Of course, applicatoin can't expect it.(ie,
> application can know system memory pressure by /proc/meminfo but it can't know
> when swapout really happens). Kernel low mem notify have to give such notification
> to user space, I think.
It should be simple to add support for VMNOTIFY_TYPE_MEM_PRESSURE that
uses your hooks.
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-18 7:16 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-18 7:16 UTC (permalink / raw)
To: Minchan Kim
Cc: Rik van Riel, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Wed, 18 Jan 2012, Minchan Kim wrote:
> I didn't look into your code(will do) but as I read description,
> still I don't convince we need really some process specific threshold like 99%
> I think application can know it by polling /proc/meminfo without this mechanism
> if they really want.
I'm not sure if we need arbitrary threshold either. However, we need to
support the following cases:
- We're about to swap
- We're about to run out of memory
- We're about to start OOM killing
and I don't think your patch solves that. One possibility is to implement:
VMNOTIFY_TYPE_ABOUT_TO_SWAP
VMNOTIFY_TYPE_ABOUT_TO_OOM
VMNOTIFY_TYPE_ABOUT_TO_OOM_KILL
and maybe rip out support for arbitrary thresholds. Does that more
reasonable?
As for polling /proc/meminfo, I'd much rather deliver stats as part of
vmnotify_read() because it's easier to extend the ABI rather than adding
new fields to /proc/meminfo.
On Wed, 18 Jan 2012, Minchan Kim wrote:
> I would like to notify when system has a trobule with memory pressure without
> some process specific threshold. Of course, applicatoin can't expect it.(ie,
> application can know system memory pressure by /proc/meminfo but it can't know
> when swapout really happens). Kernel low mem notify have to give such notification
> to user space, I think.
It should be simple to add support for VMNOTIFY_TYPE_MEM_PRESSURE that
uses your hooks.
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-18 7:16 ` Pekka Enberg
@ 2012-01-18 7:49 ` Minchan Kim
-1 siblings, 0 replies; 124+ messages in thread
From: Minchan Kim @ 2012-01-18 7:49 UTC (permalink / raw)
To: Pekka Enberg
Cc: Rik van Riel, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Wed, Jan 18, 2012 at 09:16:49AM +0200, Pekka Enberg wrote:
> On Wed, 18 Jan 2012, Minchan Kim wrote:
> >I didn't look into your code(will do) but as I read description,
> >still I don't convince we need really some process specific threshold like 99%
> >I think application can know it by polling /proc/meminfo without this mechanism
> >if they really want.
>
> I'm not sure if we need arbitrary threshold either. However, we need
> to support the following cases:
>
> - We're about to swap
>
> - We're about to run out of memory
>
> - We're about to start OOM killing
>
> and I don't think your patch solves that. One possibility is to implement:
I think my patch can extend it but your ABI looks good to me than my approach.
>
> VMNOTIFY_TYPE_ABOUT_TO_SWAP
> VMNOTIFY_TYPE_ABOUT_TO_OOM
> VMNOTIFY_TYPE_ABOUT_TO_OOM_KILL
Yes. We can define some levels.
1. page cache reclaim
2. code page reclaim
3. anonymous page swap out
4. OOM kill.
Application might handle it differenlty by the memory pressure level.
>
> and maybe rip out support for arbitrary thresholds. Does that more
> reasonable?
Currently, Nokia people seem to want process specific thresholds so
we might need it.
>
> As for polling /proc/meminfo, I'd much rather deliver stats as part
> of vmnotify_read() because it's easier to extend the ABI rather than
> adding new fields to /proc/meminfo.
Agree.
>
> On Wed, 18 Jan 2012, Minchan Kim wrote:
> >I would like to notify when system has a trobule with memory pressure without
> >some process specific threshold. Of course, applicatoin can't expect it.(ie,
> >application can know system memory pressure by /proc/meminfo but it can't know
> >when swapout really happens). Kernel low mem notify have to give such notification
> >to user space, I think.
>
> It should be simple to add support for VMNOTIFY_TYPE_MEM_PRESSURE
> that uses your hooks.
Indeed.
>
> Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-18 7:49 ` Minchan Kim
0 siblings, 0 replies; 124+ messages in thread
From: Minchan Kim @ 2012-01-18 7:49 UTC (permalink / raw)
To: Pekka Enberg
Cc: Rik van Riel, linux-mm, LKML, leonid.moiseichuk, kamezawa.hiroyu,
mel, rientjes, KOSAKI Motohiro, Johannes Weiner, Marcelo Tosatti,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Wed, Jan 18, 2012 at 09:16:49AM +0200, Pekka Enberg wrote:
> On Wed, 18 Jan 2012, Minchan Kim wrote:
> >I didn't look into your code(will do) but as I read description,
> >still I don't convince we need really some process specific threshold like 99%
> >I think application can know it by polling /proc/meminfo without this mechanism
> >if they really want.
>
> I'm not sure if we need arbitrary threshold either. However, we need
> to support the following cases:
>
> - We're about to swap
>
> - We're about to run out of memory
>
> - We're about to start OOM killing
>
> and I don't think your patch solves that. One possibility is to implement:
I think my patch can extend it but your ABI looks good to me than my approach.
>
> VMNOTIFY_TYPE_ABOUT_TO_SWAP
> VMNOTIFY_TYPE_ABOUT_TO_OOM
> VMNOTIFY_TYPE_ABOUT_TO_OOM_KILL
Yes. We can define some levels.
1. page cache reclaim
2. code page reclaim
3. anonymous page swap out
4. OOM kill.
Application might handle it differenlty by the memory pressure level.
>
> and maybe rip out support for arbitrary thresholds. Does that more
> reasonable?
Currently, Nokia people seem to want process specific thresholds so
we might need it.
>
> As for polling /proc/meminfo, I'd much rather deliver stats as part
> of vmnotify_read() because it's easier to extend the ABI rather than
> adding new fields to /proc/meminfo.
Agree.
>
> On Wed, 18 Jan 2012, Minchan Kim wrote:
> >I would like to notify when system has a trobule with memory pressure without
> >some process specific threshold. Of course, applicatoin can't expect it.(ie,
> >application can know system memory pressure by /proc/meminfo but it can't know
> >when swapout really happens). Kernel low mem notify have to give such notification
> >to user space, I think.
>
> It should be simple to add support for VMNOTIFY_TYPE_MEM_PRESSURE
> that uses your hooks.
Indeed.
>
> Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
2012-01-17 18:51 ` Pekka Enberg
@ 2012-01-18 9:06 ` leonid.moiseichuk
-1 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-18 9:06 UTC (permalink / raw)
To: penberg, riel
Cc: minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel, rientjes,
kosaki.motohiro, hannes, mtosatti, akpm, rhod, kosaki.motohiro
Hi,
Just couple of observations, which maybe wrong below
> -----Original Message-----
> From: Pekka Enberg [mailto:penberg@gmail.com] On Behalf Of ext Pekka
> Enberg
> Sent: 17 January, 2012 20:51
....
> +struct vmnotify_config {
> + /*
> + * Size of the struct for ABI extensibility.
> + */
> + __u32 size;
> +
> + /*
> + * Notification type bitmask
> + */
> + __u64 type;
> +
> + /*
> + * Free memory threshold in percentages [1..99]
> + */
> + __u32 free_threshold;
Would be possible to not use percents for thesholds? Accounting in pages even not so difficult to user-space.
Also, looking on vmnotify_match I understand that events propagated to user-space only in case threshold trigger change state from 0 to 1 but not back, 1-> 0 is very useful event as well.
Would be possible to use for threshold pointed value(s) e.g. according to enum zone_state_item, because kinds of memory to track could be different?
E.g. to tracking paging activity NR_ACTIVE_ANON and NR_ACTIVE_FILE could be interesting, not only free.
> +
> + /*
> + * Sample period in nanoseconds
> + */
> + __u64 sample_period_ns;
> +};
> +
....
> +struct vmnotify_event {
> + /* Size of the struct for ABI extensibility. */
> + __u32 size;
> +
> + __u64 nr_avail_pages;
> +
> + __u64 nr_swap_pages;
> +
> + __u64 nr_free_pages;
> +};
Two fields here most likely session-constant, (nr_avail_pages and nr_swap_pages), seems not much sense to report them in every event.
If we have memory/swap hotplug user-space can use sysinfo() call.
> +static void vmnotify_sample(struct vmnotify_watch *watch) {
...
> + si_meminfo(&si);
> + event.nr_avail_pages = si.totalram;
> +
> +#ifdef CONFIG_SWAP
> + si_swapinfo(&si);
> + event.nr_swap_pages = si.totalswap;
> +#endif
> +
Why not to use global_page_state() directly? si_meminfo() and especial si_swapinfo are quite expensive call.
> +static void vmnotify_start_timer(struct vmnotify_watch *watch) {
> + u64 sample_period = watch->config.sample_period_ns;
> +
> + hrtimer_init(&watch->timer, CLOCK_MONOTONIC,
> HRTIMER_MODE_REL);
> + watch->timer.function = vmnotify_timer_fn;
> +
> + hrtimer_start(&watch->timer, ns_to_ktime(sample_period),
> +HRTIMER_MODE_REL_PINNED); }
Do I understand correct you allocate timer for every user-space client and propagate events every pointed interval?
What will happened with system if we have a timer but need to turn CPU off? The timer must not be a reason to wakeup if user-space is sleeping.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* RE: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-18 9:06 ` leonid.moiseichuk
0 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-18 9:06 UTC (permalink / raw)
To: penberg, riel
Cc: minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel, rientjes,
kosaki.motohiro, hannes, mtosatti, akpm, rhod, kosaki.motohiro
Hi,
Just couple of observations, which maybe wrong below
> -----Original Message-----
> From: Pekka Enberg [mailto:penberg@gmail.com] On Behalf Of ext Pekka
> Enberg
> Sent: 17 January, 2012 20:51
....
> +struct vmnotify_config {
> + /*
> + * Size of the struct for ABI extensibility.
> + */
> + __u32 size;
> +
> + /*
> + * Notification type bitmask
> + */
> + __u64 type;
> +
> + /*
> + * Free memory threshold in percentages [1..99]
> + */
> + __u32 free_threshold;
Would be possible to not use percents for thesholds? Accounting in pages even not so difficult to user-space.
Also, looking on vmnotify_match I understand that events propagated to user-space only in case threshold trigger change state from 0 to 1 but not back, 1-> 0 is very useful event as well.
Would be possible to use for threshold pointed value(s) e.g. according to enum zone_state_item, because kinds of memory to track could be different?
E.g. to tracking paging activity NR_ACTIVE_ANON and NR_ACTIVE_FILE could be interesting, not only free.
> +
> + /*
> + * Sample period in nanoseconds
> + */
> + __u64 sample_period_ns;
> +};
> +
....
> +struct vmnotify_event {
> + /* Size of the struct for ABI extensibility. */
> + __u32 size;
> +
> + __u64 nr_avail_pages;
> +
> + __u64 nr_swap_pages;
> +
> + __u64 nr_free_pages;
> +};
Two fields here most likely session-constant, (nr_avail_pages and nr_swap_pages), seems not much sense to report them in every event.
If we have memory/swap hotplug user-space can use sysinfo() call.
> +static void vmnotify_sample(struct vmnotify_watch *watch) {
...
> + si_meminfo(&si);
> + event.nr_avail_pages = si.totalram;
> +
> +#ifdef CONFIG_SWAP
> + si_swapinfo(&si);
> + event.nr_swap_pages = si.totalswap;
> +#endif
> +
Why not to use global_page_state() directly? si_meminfo() and especial si_swapinfo are quite expensive call.
> +static void vmnotify_start_timer(struct vmnotify_watch *watch) {
> + u64 sample_period = watch->config.sample_period_ns;
> +
> + hrtimer_init(&watch->timer, CLOCK_MONOTONIC,
> HRTIMER_MODE_REL);
> + watch->timer.function = vmnotify_timer_fn;
> +
> + hrtimer_start(&watch->timer, ns_to_ktime(sample_period),
> +HRTIMER_MODE_REL_PINNED); }
Do I understand correct you allocate timer for every user-space client and propagate events every pointed interval?
What will happened with system if we have a timer but need to turn CPU off? The timer must not be a reason to wakeup if user-space is sleeping.
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-18 9:06 ` leonid.moiseichuk
@ 2012-01-18 9:15 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-18 9:15 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
On Wed, Jan 18, 2012 at 11:06 AM, <leonid.moiseichuk@nokia.com> wrote:
> Would be possible to not use percents for thesholds? Accounting in pages even
> not so difficult to user-space.
How does that work with memory hotplug?
On Wed, Jan 18, 2012 at 11:06 AM, <leonid.moiseichuk@nokia.com> wrote:
> Also, looking on vmnotify_match I understand that events propagated to
> user-space only in case threshold trigger change state from 0 to 1 but not
> back, 1-> 0 is very useful event as well.
>
> Would be possible to use for threshold pointed value(s) e.g. according to
> enum zone_state_item, because kinds of memory to track could be different?
> E.g. to tracking paging activity NR_ACTIVE_ANON and NR_ACTIVE_FILE could be
> interesting, not only free.
I don't think there's anything in the ABI that would prevent that.
>> +struct vmnotify_event {
>> + /* Size of the struct for ABI extensibility. */
>> + __u32 size;
>> +
>> + __u64 nr_avail_pages;
>> +
>> + __u64 nr_swap_pages;
>> +
>> + __u64 nr_free_pages;
>> +};
>
> Two fields here most likely session-constant, (nr_avail_pages and
> nr_swap_pages), seems not much sense to report them in every event. If we
> have memory/swap hotplug user-space can use sysinfo() call.
I actually changed the ABI to look like this:
struct vmnotify_event {
/*
* Size of the struct for ABI extensibility.
*/
__u32 size;
__u64 attrs;
__u64 attr_values[];
};
So userspace can decide which fields to include in notifications.
On Wed, Jan 18, 2012 at 11:06 AM, <leonid.moiseichuk@nokia.com> wrote:
>> +static void vmnotify_sample(struct vmnotify_watch *watch) {
> ...
>> + si_meminfo(&si);
>> + event.nr_avail_pages = si.totalram;
>> +
>> +#ifdef CONFIG_SWAP
>> + si_swapinfo(&si);
>> + event.nr_swap_pages = si.totalswap;
>> +#endif
>> +
>
> Why not to use global_page_state() directly? si_meminfo() and especial
> si_swapinfo are quite expensive call.
Sure, we can do that. Feel free to send a patch :-).
>> +static void vmnotify_start_timer(struct vmnotify_watch *watch) {
>> + u64 sample_period = watch->config.sample_period_ns;
>> +
>> + hrtimer_init(&watch->timer, CLOCK_MONOTONIC,
>> HRTIMER_MODE_REL);
>> + watch->timer.function = vmnotify_timer_fn;
>> +
>> + hrtimer_start(&watch->timer, ns_to_ktime(sample_period),
>> +HRTIMER_MODE_REL_PINNED); }
>
> Do I understand correct you allocate timer for every user-space client and
> propagate events every pointed interval? What will happened with system if
> we have a timer but need to turn CPU off? The timer must not be a reason to
> wakeup if user-space is sleeping.
No idea what happens. The sampling code is just a proof of concept thing and I
expect it to be buggy as hell. :-)
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-18 9:15 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-18 9:15 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
On Wed, Jan 18, 2012 at 11:06 AM, <leonid.moiseichuk@nokia.com> wrote:
> Would be possible to not use percents for thesholds? Accounting in pages even
> not so difficult to user-space.
How does that work with memory hotplug?
On Wed, Jan 18, 2012 at 11:06 AM, <leonid.moiseichuk@nokia.com> wrote:
> Also, looking on vmnotify_match I understand that events propagated to
> user-space only in case threshold trigger change state from 0 to 1 but not
> back, 1-> 0 is very useful event as well.
>
> Would be possible to use for threshold pointed value(s) e.g. according to
> enum zone_state_item, because kinds of memory to track could be different?
> E.g. to tracking paging activity NR_ACTIVE_ANON and NR_ACTIVE_FILE could be
> interesting, not only free.
I don't think there's anything in the ABI that would prevent that.
>> +struct vmnotify_event {
>> + /* Size of the struct for ABI extensibility. */
>> + __u32 size;
>> +
>> + __u64 nr_avail_pages;
>> +
>> + __u64 nr_swap_pages;
>> +
>> + __u64 nr_free_pages;
>> +};
>
> Two fields here most likely session-constant, (nr_avail_pages and
> nr_swap_pages), seems not much sense to report them in every event. If we
> have memory/swap hotplug user-space can use sysinfo() call.
I actually changed the ABI to look like this:
struct vmnotify_event {
/*
* Size of the struct for ABI extensibility.
*/
__u32 size;
__u64 attrs;
__u64 attr_values[];
};
So userspace can decide which fields to include in notifications.
On Wed, Jan 18, 2012 at 11:06 AM, <leonid.moiseichuk@nokia.com> wrote:
>> +static void vmnotify_sample(struct vmnotify_watch *watch) {
> ...
>> + si_meminfo(&si);
>> + event.nr_avail_pages = si.totalram;
>> +
>> +#ifdef CONFIG_SWAP
>> + si_swapinfo(&si);
>> + event.nr_swap_pages = si.totalswap;
>> +#endif
>> +
>
> Why not to use global_page_state() directly? si_meminfo() and especial
> si_swapinfo are quite expensive call.
Sure, we can do that. Feel free to send a patch :-).
>> +static void vmnotify_start_timer(struct vmnotify_watch *watch) {
>> + u64 sample_period = watch->config.sample_period_ns;
>> +
>> + hrtimer_init(&watch->timer, CLOCK_MONOTONIC,
>> HRTIMER_MODE_REL);
>> + watch->timer.function = vmnotify_timer_fn;
>> +
>> + hrtimer_start(&watch->timer, ns_to_ktime(sample_period),
>> +HRTIMER_MODE_REL_PINNED); }
>
> Do I understand correct you allocate timer for every user-space client and
> propagate events every pointed interval? What will happened with system if
> we have a timer but need to turn CPU off? The timer must not be a reason to
> wakeup if user-space is sleeping.
No idea what happens. The sampling code is just a proof of concept thing and I
expect it to be buggy as hell. :-)
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread* RE: [RFC 1/3] /dev/low_mem_notify
2012-01-18 9:15 ` Pekka Enberg
@ 2012-01-18 9:41 ` leonid.moiseichuk
-1 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-18 9:41 UTC (permalink / raw)
To: penberg
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
> -----Original Message-----
> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
> Pekka Enberg
> Sent: 18 January, 2012 11:16
...
> > Would be possible to not use percents for thesholds? Accounting in pages
> even
> > not so difficult to user-space.
>
> How does that work with memory hotplug?
Not worse than %%. For example you had 10% free memory threshold for 512 MB RAM meaning 51.2 MB in absolute number.
Then hotplug turned off 256 MB, you for sure must update threshold for %% because these 10% for 25.6 MB most likely will be not suitable for different operating mode.
Using pages makes calculations must simpler.
>
> On Wed, Jan 18, 2012 at 11:06 AM, <leonid.moiseichuk@nokia.com> wrote:
> > Also, looking on vmnotify_match I understand that events propagated to
> > user-space only in case threshold trigger change state from 0 to 1 but not
> > back, 1-> 0 is very useful event as well
(*)
> >
> > Would be possible to use for threshold pointed value(s) e.g. according to
> > enum zone_state_item, because kinds of memory to track could be
> different?
> > E.g. to tracking paging activity NR_ACTIVE_ANON and NR_ACTIVE_FILE
> could be
> > interesting, not only free.
>
> I don't think there's anything in the ABI that would prevent that.
If this statement also related my question (*) I have to point need to track attributes history, otherwise user-space will be constantly kicked with updates.
> I actually changed the ABI to look like this:
>
> struct vmnotify_event {
> /*
> * Size of the struct for ABI extensibility.
> */
> __u32 size;
>
> __u64 attrs;
>
> __u64 attr_values[];
> };
>
> So userspace can decide which fields to include in notifications.
Good. But how you can provide current status of attributes to user-space? Need to have read() call support to deliver all supported attr_values[] on demand.
> >> +
> >> +#ifdef CONFIG_SWAP
> >> + si_swapinfo(&si);
> >> + event.nr_swap_pages = si.totalswap;
> >> +#endif
> >> +
> >
> > Why not to use global_page_state() directly? si_meminfo() and especial
> > si_swapinfo are quite expensive call.
>
> Sure, we can do that. Feel free to send a patch :-).
When I see code because from emails it is quite difficult to understand.
For short-term I need to focus on integration "memnotify" version internally which is kind of work for me already and provides all required interfaces n9 needs.
Btw, when API starts to work with pointed thresholds logically it is not anymore low_mem_notify, you need to invent some other name.
> No idea what happens. The sampling code is just a proof of concept thing and
> I expect it to be buggy as hell. :-)
>
> Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* RE: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-18 9:41 ` leonid.moiseichuk
0 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-18 9:41 UTC (permalink / raw)
To: penberg
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
> -----Original Message-----
> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
> Pekka Enberg
> Sent: 18 January, 2012 11:16
...
> > Would be possible to not use percents for thesholds? Accounting in pages
> even
> > not so difficult to user-space.
>
> How does that work with memory hotplug?
Not worse than %%. For example you had 10% free memory threshold for 512 MB RAM meaning 51.2 MB in absolute number.
Then hotplug turned off 256 MB, you for sure must update threshold for %% because these 10% for 25.6 MB most likely will be not suitable for different operating mode.
Using pages makes calculations must simpler.
>
> On Wed, Jan 18, 2012 at 11:06 AM, <leonid.moiseichuk@nokia.com> wrote:
> > Also, looking on vmnotify_match I understand that events propagated to
> > user-space only in case threshold trigger change state from 0 to 1 but not
> > back, 1-> 0 is very useful event as well
(*)
> >
> > Would be possible to use for threshold pointed value(s) e.g. according to
> > enum zone_state_item, because kinds of memory to track could be
> different?
> > E.g. to tracking paging activity NR_ACTIVE_ANON and NR_ACTIVE_FILE
> could be
> > interesting, not only free.
>
> I don't think there's anything in the ABI that would prevent that.
If this statement also related my question (*) I have to point need to track attributes history, otherwise user-space will be constantly kicked with updates.
> I actually changed the ABI to look like this:
>
> struct vmnotify_event {
> /*
> * Size of the struct for ABI extensibility.
> */
> __u32 size;
>
> __u64 attrs;
>
> __u64 attr_values[];
> };
>
> So userspace can decide which fields to include in notifications.
Good. But how you can provide current status of attributes to user-space? Need to have read() call support to deliver all supported attr_values[] on demand.
> >> +
> >> +#ifdef CONFIG_SWAP
> >> + si_swapinfo(&si);
> >> + event.nr_swap_pages = si.totalswap;
> >> +#endif
> >> +
> >
> > Why not to use global_page_state() directly? si_meminfo() and especial
> > si_swapinfo are quite expensive call.
>
> Sure, we can do that. Feel free to send a patch :-).
When I see code because from emails it is quite difficult to understand.
For short-term I need to focus on integration "memnotify" version internally which is kind of work for me already and provides all required interfaces n9 needs.
Btw, when API starts to work with pointed thresholds logically it is not anymore low_mem_notify, you need to invent some other name.
> No idea what happens. The sampling code is just a proof of concept thing and
> I expect it to be buggy as hell. :-)
>
> Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-18 9:41 ` leonid.moiseichuk
@ 2012-01-18 10:40 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-18 10:40 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
On Wed, Jan 18, 2012 at 11:41 AM, <leonid.moiseichuk@nokia.com> wrote:
>> -----Original Message-----
>> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
>> Pekka Enberg
>> Sent: 18 January, 2012 11:16
> ...
>> > Would be possible to not use percents for thesholds? Accounting in pages
>> even
>> > not so difficult to user-space.
>>
>> How does that work with memory hotplug?
>
> Not worse than %%. For example you had 10% free memory threshold for 512 MB
> RAM meaning 51.2 MB in absolute number. Then hotplug turned off 256 MB, you
> for sure must update threshold for %% because these 10% for 25.6 MB most
> likely will be not suitable for different operating mode.
> Using pages makes calculations must simpler.
Right. Does threshold in percentages make any sense then? Is it enough to use
number of free pages?
On Wed, Jan 18, 2012 at 11:06 AM, <leonid.moiseichuk@nokia.com> wrote:
>> > Also, looking on vmnotify_match I understand that events propagated to
>> > user-space only in case threshold trigger change state from 0 to 1 but not
>> > back, 1-> 0 is very useful event as well
> (*)
>
>> >
>> > Would be possible to use for threshold pointed value(s) e.g. according to
>> > enum zone_state_item, because kinds of memory to track could be
>> different?
>> > E.g. to tracking paging activity NR_ACTIVE_ANON and NR_ACTIVE_FILE
>> could be
>> > interesting, not only free.
>>
>> I don't think there's anything in the ABI that would prevent that.
>
> If this statement also related my question (*) I have to point need to track
> attributes history, otherwise user-space will be constantly kicked with
> updates.
Well sure, I think it makes sense to support state change to both directions.
> When I see code because from emails it is quite difficult to understand. For
> short-term I need to focus on integration "memnotify" version internally
> which is kind of work for me already and provides all required interfaces n9
> needs.
Sure. I'm only talking about mainline here.
> Btw, when API starts to work with pointed thresholds logically it is not
Definitely, it's about generic VM event notification now.
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-18 10:40 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-18 10:40 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
On Wed, Jan 18, 2012 at 11:41 AM, <leonid.moiseichuk@nokia.com> wrote:
>> -----Original Message-----
>> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
>> Pekka Enberg
>> Sent: 18 January, 2012 11:16
> ...
>> > Would be possible to not use percents for thesholds? Accounting in pages
>> even
>> > not so difficult to user-space.
>>
>> How does that work with memory hotplug?
>
> Not worse than %%. For example you had 10% free memory threshold for 512 MB
> RAM meaning 51.2 MB in absolute number. Then hotplug turned off 256 MB, you
> for sure must update threshold for %% because these 10% for 25.6 MB most
> likely will be not suitable for different operating mode.
> Using pages makes calculations must simpler.
Right. Does threshold in percentages make any sense then? Is it enough to use
number of free pages?
On Wed, Jan 18, 2012 at 11:06 AM, <leonid.moiseichuk@nokia.com> wrote:
>> > Also, looking on vmnotify_match I understand that events propagated to
>> > user-space only in case threshold trigger change state from 0 to 1 but not
>> > back, 1-> 0 is very useful event as well
> (*)
>
>> >
>> > Would be possible to use for threshold pointed value(s) e.g. according to
>> > enum zone_state_item, because kinds of memory to track could be
>> different?
>> > E.g. to tracking paging activity NR_ACTIVE_ANON and NR_ACTIVE_FILE
>> could be
>> > interesting, not only free.
>>
>> I don't think there's anything in the ABI that would prevent that.
>
> If this statement also related my question (*) I have to point need to track
> attributes history, otherwise user-space will be constantly kicked with
> updates.
Well sure, I think it makes sense to support state change to both directions.
> When I see code because from emails it is quite difficult to understand. For
> short-term I need to focus on integration "memnotify" version internally
> which is kind of work for me already and provides all required interfaces n9
> needs.
Sure. I'm only talking about mainline here.
> Btw, when API starts to work with pointed thresholds logically it is not
Definitely, it's about generic VM event notification now.
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
2012-01-18 10:40 ` Pekka Enberg
@ 2012-01-18 10:44 ` leonid.moiseichuk
-1 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-18 10:44 UTC (permalink / raw)
To: penberg
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
> -----Original Message-----
> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
> Pekka Enberg
> Sent: 18 January, 2012 12:40
...
> > Not worse than %%. For example you had 10% free memory threshold for
> > 512 MB RAM meaning 51.2 MB in absolute number. Then hotplug turned
> > off 256 MB, you for sure must update threshold for %% because these
> > 10% for 25.6 MB most likely will be not suitable for different operating
> mode.
> > Using pages makes calculations must simpler.
>
> Right. Does threshold in percentages make any sense then? Is it enough to
> use number of free pages?
Paul Mundt noticed that and we stopped use percentage in 2006 for n770 update.
He was right.
Percents are useless and do not correlate with other kernel APIs like sysinfo().
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-18 10:44 ` leonid.moiseichuk
0 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-18 10:44 UTC (permalink / raw)
To: penberg
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
> -----Original Message-----
> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
> Pekka Enberg
> Sent: 18 January, 2012 12:40
...
> > Not worse than %%. For example you had 10% free memory threshold for
> > 512 MB RAM meaning 51.2 MB in absolute number. Then hotplug turned
> > off 256 MB, you for sure must update threshold for %% because these
> > 10% for 25.6 MB most likely will be not suitable for different operating
> mode.
> > Using pages makes calculations must simpler.
>
> Right. Does threshold in percentages make any sense then? Is it enough to
> use number of free pages?
Paul Mundt noticed that and we stopped use percentage in 2006 for n770 update.
He was right.
Percents are useless and do not correlate with other kernel APIs like sysinfo().
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-18 10:44 ` leonid.moiseichuk
@ 2012-01-18 23:34 ` Ronen Hod
-1 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-18 23:34 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: penberg, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu,
mel, rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
On 01/18/2012 12:44 PM, leonid.moiseichuk@nokia.com wrote:
>> -----Original Message-----
>> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
>> Pekka Enberg
>> Sent: 18 January, 2012 12:40
> ...
>>> Not worse than %%. For example you had 10% free memory threshold for
>>> 512 MB RAM meaning 51.2 MB in absolute number. Then hotplug turned
>>> off 256 MB, you for sure must update threshold for %% because these
>>> 10% for 25.6 MB most likely will be not suitable for different operating
>> mode.
>>> Using pages makes calculations must simpler.
>> Right. Does threshold in percentages make any sense then? Is it enough to
>> use number of free pages?
> Paul Mundt noticed that and we stopped use percentage in 2006 for n770 update.
> He was right.
> Percents are useless and do not correlate with other kernel APIs like sysinfo().
I believe that it will be best if the kernel publishes an ideal number_of_free_pages (in /proc/meminfo or whatever). Such number is easy to work with since this is what applications do, they free pages. Applications will be able to refer to this number from their garbage collector, or before allocating memory also if they did not get a notification, and it is also useful if several applications free memory at the same time.
Ronen.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-18 23:34 ` Ronen Hod
0 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-18 23:34 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: penberg, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu,
mel, rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
On 01/18/2012 12:44 PM, leonid.moiseichuk@nokia.com wrote:
>> -----Original Message-----
>> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
>> Pekka Enberg
>> Sent: 18 January, 2012 12:40
> ...
>>> Not worse than %%. For example you had 10% free memory threshold for
>>> 512 MB RAM meaning 51.2 MB in absolute number. Then hotplug turned
>>> off 256 MB, you for sure must update threshold for %% because these
>>> 10% for 25.6 MB most likely will be not suitable for different operating
>> mode.
>>> Using pages makes calculations must simpler.
>> Right. Does threshold in percentages make any sense then? Is it enough to
>> use number of free pages?
> Paul Mundt noticed that and we stopped use percentage in 2006 for n770 update.
> He was right.
> Percents are useless and do not correlate with other kernel APIs like sysinfo().
I believe that it will be best if the kernel publishes an ideal number_of_free_pages (in /proc/meminfo or whatever). Such number is easy to work with since this is what applications do, they free pages. Applications will be able to refer to this number from their garbage collector, or before allocating memory also if they did not get a notification, and it is also useful if several applications free memory at the same time.
Ronen.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-18 23:34 ` Ronen Hod
@ 2012-01-19 7:25 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 7:25 UTC (permalink / raw)
To: Ronen Hod
Cc: leonid.moiseichuk, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, kosaki.motohiro
On Thu, 19 Jan 2012, Ronen Hod wrote:
> I believe that it will be best if the kernel publishes an ideal
> number_of_free_pages (in /proc/meminfo or whatever). Such number is easy to
> work with since this is what applications do, they free pages. Applications
> will be able to refer to this number from their garbage collector, or before
> allocating memory also if they did not get a notification, and it is also
> useful if several applications free memory at the same time.
Isn't
/proc/sys/vm/min_free_kbytes
pretty much just that?
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-19 7:25 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 7:25 UTC (permalink / raw)
To: Ronen Hod
Cc: leonid.moiseichuk, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, kosaki.motohiro
On Thu, 19 Jan 2012, Ronen Hod wrote:
> I believe that it will be best if the kernel publishes an ideal
> number_of_free_pages (in /proc/meminfo or whatever). Such number is easy to
> work with since this is what applications do, they free pages. Applications
> will be able to refer to this number from their garbage collector, or before
> allocating memory also if they did not get a notification, and it is also
> useful if several applications free memory at the same time.
Isn't
/proc/sys/vm/min_free_kbytes
pretty much just that?
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-19 7:25 ` Pekka Enberg
@ 2012-01-19 9:05 ` Ronen Hod
-1 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-19 9:05 UTC (permalink / raw)
To: Pekka Enberg
Cc: leonid.moiseichuk, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, kosaki.motohiro
On 01/19/2012 09:25 AM, Pekka Enberg wrote:
> On Thu, 19 Jan 2012, Ronen Hod wrote:
>> I believe that it will be best if the kernel publishes an ideal number_of_free_pages (in /proc/meminfo or whatever). Such number is easy to work with since this is what applications do, they free pages. Applications will be able to refer to this number from their garbage collector, or before allocating memory also if they did not get a notification, and it is also useful if several applications free memory at the same time.
>
> Isn't
>
> /proc/sys/vm/min_free_kbytes
>
> pretty much just that?
>
> Pekka
Would you suggest to use min_free_kbytes as the threshold for sending low_memory_notifications to applications, and separately as a target value for the applications' memory giveaway?
Thanks, Ronen.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-19 9:05 ` Ronen Hod
0 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-19 9:05 UTC (permalink / raw)
To: Pekka Enberg
Cc: leonid.moiseichuk, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, kosaki.motohiro
On 01/19/2012 09:25 AM, Pekka Enberg wrote:
> On Thu, 19 Jan 2012, Ronen Hod wrote:
>> I believe that it will be best if the kernel publishes an ideal number_of_free_pages (in /proc/meminfo or whatever). Such number is easy to work with since this is what applications do, they free pages. Applications will be able to refer to this number from their garbage collector, or before allocating memory also if they did not get a notification, and it is also useful if several applications free memory at the same time.
>
> Isn't
>
> /proc/sys/vm/min_free_kbytes
>
> pretty much just that?
>
> Pekka
Would you suggest to use min_free_kbytes as the threshold for sending low_memory_notifications to applications, and separately as a target value for the applications' memory giveaway?
Thanks, Ronen.
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-19 9:05 ` Ronen Hod
@ 2012-01-19 9:10 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 9:10 UTC (permalink / raw)
To: Ronen Hod
Cc: leonid.moiseichuk, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, kosaki.motohiro
On Thu, Jan 19, 2012 at 11:05 AM, Ronen Hod <rhod@redhat.com> wrote:
>>> I believe that it will be best if the kernel publishes an ideal
>>> number_of_free_pages (in /proc/meminfo or whatever). Such number is easy to
>>> work with since this is what applications do, they free pages. Applications
>>> will be able to refer to this number from their garbage collector, or before
>>> allocating memory also if they did not get a notification, and it is also
>>> useful if several applications free memory at the same time.
>>
>> Isn't
>>
>> /proc/sys/vm/min_free_kbytes
>>
>> pretty much just that?
>
> Would you suggest to use min_free_kbytes as the threshold for sending
> low_memory_notifications to applications, and separately as a target value
> for the applications' memory giveaway?
I'm not saying that the kernel should use it directly but it seems
like the kind of "ideal number of free pages" threshold you're
suggesting. So userspace can read that value and use it as the "number
of free pages" threshold for VM events, no?
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-19 9:10 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 9:10 UTC (permalink / raw)
To: Ronen Hod
Cc: leonid.moiseichuk, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, kosaki.motohiro
On Thu, Jan 19, 2012 at 11:05 AM, Ronen Hod <rhod@redhat.com> wrote:
>>> I believe that it will be best if the kernel publishes an ideal
>>> number_of_free_pages (in /proc/meminfo or whatever). Such number is easy to
>>> work with since this is what applications do, they free pages. Applications
>>> will be able to refer to this number from their garbage collector, or before
>>> allocating memory also if they did not get a notification, and it is also
>>> useful if several applications free memory at the same time.
>>
>> Isn't
>>
>> /proc/sys/vm/min_free_kbytes
>>
>> pretty much just that?
>
> Would you suggest to use min_free_kbytes as the threshold for sending
> low_memory_notifications to applications, and separately as a target value
> for the applications' memory giveaway?
I'm not saying that the kernel should use it directly but it seems
like the kind of "ideal number of free pages" threshold you're
suggesting. So userspace can read that value and use it as the "number
of free pages" threshold for VM events, no?
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-19 9:10 ` Pekka Enberg
@ 2012-01-19 9:20 ` Ronen Hod
-1 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-19 9:20 UTC (permalink / raw)
To: Pekka Enberg
Cc: leonid.moiseichuk, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, kosaki.motohiro
On 01/19/2012 11:10 AM, Pekka Enberg wrote:
> On Thu, Jan 19, 2012 at 11:05 AM, Ronen Hod<rhod@redhat.com> wrote:
>>>> I believe that it will be best if the kernel publishes an ideal
>>>> number_of_free_pages (in /proc/meminfo or whatever). Such number is easy to
>>>> work with since this is what applications do, they free pages. Applications
>>>> will be able to refer to this number from their garbage collector, or before
>>>> allocating memory also if they did not get a notification, and it is also
>>>> useful if several applications free memory at the same time.
>>> Isn't
>>>
>>> /proc/sys/vm/min_free_kbytes
>>>
>>> pretty much just that?
>> Would you suggest to use min_free_kbytes as the threshold for sending
>> low_memory_notifications to applications, and separately as a target value
>> for the applications' memory giveaway?
> I'm not saying that the kernel should use it directly but it seems
> like the kind of "ideal number of free pages" threshold you're
> suggesting. So userspace can read that value and use it as the "number
> of free pages" threshold for VM events, no?
Yes, I like it. The rules of the game are simple and consistent all over, be it the alert threshold, voluntary poling by the apps, and for concurrent work by several applications.
Well, as long as it provides a good indication for low_mem_pressure.
Thanks, Ronen.
>
> Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-19 9:20 ` Ronen Hod
0 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-19 9:20 UTC (permalink / raw)
To: Pekka Enberg
Cc: leonid.moiseichuk, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, kosaki.motohiro
On 01/19/2012 11:10 AM, Pekka Enberg wrote:
> On Thu, Jan 19, 2012 at 11:05 AM, Ronen Hod<rhod@redhat.com> wrote:
>>>> I believe that it will be best if the kernel publishes an ideal
>>>> number_of_free_pages (in /proc/meminfo or whatever). Such number is easy to
>>>> work with since this is what applications do, they free pages. Applications
>>>> will be able to refer to this number from their garbage collector, or before
>>>> allocating memory also if they did not get a notification, and it is also
>>>> useful if several applications free memory at the same time.
>>> Isn't
>>>
>>> /proc/sys/vm/min_free_kbytes
>>>
>>> pretty much just that?
>> Would you suggest to use min_free_kbytes as the threshold for sending
>> low_memory_notifications to applications, and separately as a target value
>> for the applications' memory giveaway?
> I'm not saying that the kernel should use it directly but it seems
> like the kind of "ideal number of free pages" threshold you're
> suggesting. So userspace can read that value and use it as the "number
> of free pages" threshold for VM events, no?
Yes, I like it. The rules of the game are simple and consistent all over, be it the alert threshold, voluntary poling by the apps, and for concurrent work by several applications.
Well, as long as it provides a good indication for low_mem_pressure.
Thanks, Ronen.
>
> Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
2012-01-19 9:20 ` Ronen Hod
@ 2012-01-19 10:53 ` leonid.moiseichuk
-1 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-19 10:53 UTC (permalink / raw)
To: rhod, penberg
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
> -----Original Message-----
> From: ext Ronen Hod [mailto:rhod@redhat.com]
> Sent: 19 January, 2012 11:20
> To: Pekka Enberg
...
> >>> Isn't
> >>>
> >>> /proc/sys/vm/min_free_kbytes
> >>>
> >>> pretty much just that?
> >> Would you suggest to use min_free_kbytes as the threshold for sending
> >> low_memory_notifications to applications, and separately as a target
> >> value for the applications' memory giveaway?
> > I'm not saying that the kernel should use it directly but it seems
> > like the kind of "ideal number of free pages" threshold you're
> > suggesting. So userspace can read that value and use it as the "number
> > of free pages" threshold for VM events, no?
>
> Yes, I like it. The rules of the game are simple and consistent all over, be it the
> alert threshold, voluntary poling by the apps, and for concurrent work by
> several applications.
> Well, as long as it provides a good indication for low_mem_pressure.
For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
>From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
6. I do not understand how work with attributes performed ( ) but it has sense to use mask and fill requested attributes using mask and callback table i.e. if free pages requested - they are reported, otherwise not.
7. would have sense to backport couple of attributes from memnotify.c
I can submit couple of patches if some of proposals looks sane for everyone.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-19 10:53 ` leonid.moiseichuk
0 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-19 10:53 UTC (permalink / raw)
To: rhod, penberg
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
> -----Original Message-----
> From: ext Ronen Hod [mailto:rhod@redhat.com]
> Sent: 19 January, 2012 11:20
> To: Pekka Enberg
...
> >>> Isn't
> >>>
> >>> /proc/sys/vm/min_free_kbytes
> >>>
> >>> pretty much just that?
> >> Would you suggest to use min_free_kbytes as the threshold for sending
> >> low_memory_notifications to applications, and separately as a target
> >> value for the applications' memory giveaway?
> > I'm not saying that the kernel should use it directly but it seems
> > like the kind of "ideal number of free pages" threshold you're
> > suggesting. So userspace can read that value and use it as the "number
> > of free pages" threshold for VM events, no?
>
> Yes, I like it. The rules of the game are simple and consistent all over, be it the
> alert threshold, voluntary poling by the apps, and for concurrent work by
> several applications.
> Well, as long as it provides a good indication for low_mem_pressure.
For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
>From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
6. I do not understand how work with attributes performed ( ) but it has sense to use mask and fill requested attributes using mask and callback table i.e. if free pages requested - they are reported, otherwise not.
7. would have sense to backport couple of attributes from memnotify.c
I can submit couple of patches if some of proposals looks sane for everyone.
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-19 10:53 ` leonid.moiseichuk
@ 2012-01-19 11:07 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 11:07 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: rhod, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> From potential user point of view the proposed API has number of lacks which
> would be nice to have implemented:
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> From potential user point of view the proposed API has number of lacks which
> would be nice to have implemented:
> 1. rename this API from low_mem_pressure to something more related to
> notification and memory situation in system: memory_pressure, memnotify,
> memory_level etc. The word "low" is misleading here
The thing is called vmevent:
http://git.kernel.org/?p=linux/kernel/git/penberg/linux.git;a=shortlog;h=refs/heads/vmevent/core
[penberg@tux ~]$ vi
[penberg@tux ~]$ cat email
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> From potential user point of view the proposed API has number of lacks which
> would be nice to have implemented:
> 1. rename this API from low_mem_pressure to something more related to
> notification and memory situation in system: memory_pressure, memnotify,
> memory_level etc. The word "low" is misleading here
The thing is called vmevent:
http://git.kernel.org/?p=linux/kernel/git/penberg/linux.git;a=shortlog;h=refs/heads/vmevent/core
I haven't used "low mem" at all in the patches.
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 2. API must use deferred timers to prevent use-time impact. Deferred timer
> will be triggered only in case HW event or non-deferrable timer, so if device
> sleeps timer might be skipped and that is what expected for user-space
I'm currently looking at the possibility of hooking VM events to perf which
also uses hrtimers. Can't we make hrtimers do the right thing?
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 3. API should be tunable for propagate changes when level is Up or Down,
> maybe both ways.
Agreed.
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 4. to avoid triggering too much events probably has sense to filter according
> to amount of change but that is optional. If subscriber set timer to 1s the
> amount of events should not be very big.
Agreed.
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 5. API must provide interface to request parameters e.g. available swap or
> free memory just to have some base.
The current ABI already supports that. You can specify which attributes you're
interested in and they will be delivered as part of th event.
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 6. I do not understand how work with attributes performed ( ) but it has
> sense to use mask and fill requested attributes using mask and callback table
> i.e. if free pages requested - they are reported, otherwise not.
That's how it works now in the git tree.
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 7. would have sense to backport couple of attributes from memnotify.c
>
> I can submit couple of patches if some of proposals looks sane for everyone.
Feel free to do that.
I'm currently looking at how to support Minchan's non-sampled events. It seems
to me integrating with perf would be nice because we could simply use
tracepoints for this.
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-19 11:07 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 11:07 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: rhod, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> From potential user point of view the proposed API has number of lacks which
> would be nice to have implemented:
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> From potential user point of view the proposed API has number of lacks which
> would be nice to have implemented:
> 1. rename this API from low_mem_pressure to something more related to
> notification and memory situation in system: memory_pressure, memnotify,
> memory_level etc. The word "low" is misleading here
The thing is called vmevent:
http://git.kernel.org/?p=linux/kernel/git/penberg/linux.git;a=shortlog;h=refs/heads/vmevent/core
[penberg@tux ~]$ vi
[penberg@tux ~]$ cat email
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> From potential user point of view the proposed API has number of lacks which
> would be nice to have implemented:
> 1. rename this API from low_mem_pressure to something more related to
> notification and memory situation in system: memory_pressure, memnotify,
> memory_level etc. The word "low" is misleading here
The thing is called vmevent:
http://git.kernel.org/?p=linux/kernel/git/penberg/linux.git;a=shortlog;h=refs/heads/vmevent/core
I haven't used "low mem" at all in the patches.
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 2. API must use deferred timers to prevent use-time impact. Deferred timer
> will be triggered only in case HW event or non-deferrable timer, so if device
> sleeps timer might be skipped and that is what expected for user-space
I'm currently looking at the possibility of hooking VM events to perf which
also uses hrtimers. Can't we make hrtimers do the right thing?
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 3. API should be tunable for propagate changes when level is Up or Down,
> maybe both ways.
Agreed.
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 4. to avoid triggering too much events probably has sense to filter according
> to amount of change but that is optional. If subscriber set timer to 1s the
> amount of events should not be very big.
Agreed.
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 5. API must provide interface to request parameters e.g. available swap or
> free memory just to have some base.
The current ABI already supports that. You can specify which attributes you're
interested in and they will be delivered as part of th event.
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 6. I do not understand how work with attributes performed ( ) but it has
> sense to use mask and fill requested attributes using mask and callback table
> i.e. if free pages requested - they are reported, otherwise not.
That's how it works now in the git tree.
On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> 7. would have sense to backport couple of attributes from memnotify.c
>
> I can submit couple of patches if some of proposals looks sane for everyone.
Feel free to do that.
I'm currently looking at how to support Minchan's non-sampled events. It seems
to me integrating with perf would be nice because we could simply use
tracepoints for this.
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
2012-01-19 11:07 ` Pekka Enberg
@ 2012-01-19 11:54 ` leonid.moiseichuk
-1 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-19 11:54 UTC (permalink / raw)
To: penberg
Cc: rhod, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
> -----Original Message-----
> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
> Pekka Enberg
> Sent: 19 January, 2012 13:08
...
> > 1. rename this API from low_mem_pressure to something more related to
> > notification and memory situation in system: memory_pressure,
> > memnotify, memory_level etc. The word "low" is misleading here
>
> The thing is called vmevent:
Yes, I see it. But I was a bit confused with vmnotify_fops and was sure it is mapped through dev. Now it anonymous inode.
>
> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> > 2. API must use deferred timers to prevent use-time impact. Deferred
> > timer will be triggered only in case HW event or non-deferrable timer,
> > so if device sleeps timer might be skipped and that is what expected
> > for user-space
>
> I'm currently looking at the possibility of hooking VM events to perf which
> also uses hrtimers. Can't we make hrtimers do the right thing?
I had no answer for this question. According to hrtimer_cpu_notify the cpu state is tracked but timer may set HW event to wake up.
In this case use-time will be affected due to you will have too much HW events and reasons to wakeup.
At least powertop reports hrtimers in relation to <kernel core> as an activities sources.
>
> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> > 3. API should be tunable for propagate changes when level is Up or
> > Down, maybe both ways.
>
> Agreed.
>
> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> > 4. to avoid triggering too much events probably has sense to filter
> > according to amount of change but that is optional. If subscriber set
> > timer to 1s the amount of events should not be very big.
>
> Agreed.
>
> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> > 5. API must provide interface to request parameters e.g. available
> > swap or free memory just to have some base.
>
> The current ABI already supports that. You can specify which attributes
> you're interested in and they will be delivered as part of th event.
But you have in vmnotify.h suspicious free_pages_threshold field.
>
> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> > 6. I do not understand how work with attributes performed ( ) but it
> > has sense to use mask and fill requested attributes using mask and
> > callback table i.e. if free pages requested - they are reported, otherwise
> not.
>
> That's how it works now in the git tree.
Vmnotify.c has vmnotify_watch_event which collects fixed set of parameters.
> I'm currently looking at how to support Minchan's non-sampled events. It
> seems to me integrating with perf would be nice because we could simply
> use tracepoints for this.
If tracepoints not jeopardize use time has sense to do it.
>
> Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-19 11:54 ` leonid.moiseichuk
0 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-19 11:54 UTC (permalink / raw)
To: penberg
Cc: rhod, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
> -----Original Message-----
> From: penberg@gmail.com [mailto:penberg@gmail.com] On Behalf Of ext
> Pekka Enberg
> Sent: 19 January, 2012 13:08
...
> > 1. rename this API from low_mem_pressure to something more related to
> > notification and memory situation in system: memory_pressure,
> > memnotify, memory_level etc. The word "low" is misleading here
>
> The thing is called vmevent:
Yes, I see it. But I was a bit confused with vmnotify_fops and was sure it is mapped through dev. Now it anonymous inode.
>
> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> > 2. API must use deferred timers to prevent use-time impact. Deferred
> > timer will be triggered only in case HW event or non-deferrable timer,
> > so if device sleeps timer might be skipped and that is what expected
> > for user-space
>
> I'm currently looking at the possibility of hooking VM events to perf which
> also uses hrtimers. Can't we make hrtimers do the right thing?
I had no answer for this question. According to hrtimer_cpu_notify the cpu state is tracked but timer may set HW event to wake up.
In this case use-time will be affected due to you will have too much HW events and reasons to wakeup.
At least powertop reports hrtimers in relation to <kernel core> as an activities sources.
>
> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> > 3. API should be tunable for propagate changes when level is Up or
> > Down, maybe both ways.
>
> Agreed.
>
> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> > 4. to avoid triggering too much events probably has sense to filter
> > according to amount of change but that is optional. If subscriber set
> > timer to 1s the amount of events should not be very big.
>
> Agreed.
>
> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> > 5. API must provide interface to request parameters e.g. available
> > swap or free memory just to have some base.
>
> The current ABI already supports that. You can specify which attributes
> you're interested in and they will be delivered as part of th event.
But you have in vmnotify.h suspicious free_pages_threshold field.
>
> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
> > 6. I do not understand how work with attributes performed ( ) but it
> > has sense to use mask and fill requested attributes using mask and
> > callback table i.e. if free pages requested - they are reported, otherwise
> not.
>
> That's how it works now in the git tree.
Vmnotify.c has vmnotify_watch_event which collects fixed set of parameters.
> I'm currently looking at how to support Minchan's non-sampled events. It
> seems to me integrating with perf would be nice because we could simply
> use tracepoints for this.
If tracepoints not jeopardize use time has sense to do it.
>
> Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-19 11:54 ` leonid.moiseichuk
@ 2012-01-19 11:59 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 11:59 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: rhod, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
On Thu, Jan 19, 2012 at 1:54 PM, <leonid.moiseichuk@nokia.com> wrote:
>> The current ABI already supports that. You can specify which attributes
>> you're interested in and they will be delivered as part of th event.
>
> But you have in vmnotify.h suspicious free_pages_threshold field.
Aah, I was actually talking about the events userspace _reads_.
The free_pages_threshold field is only used if
VMEVENT_TYPE_FREE_THRESHOLD bit is set. It should be cleaned up a bit
but it in theory it supports watching other attributes as well. I've
postponed the cleanup until I've figured out whether we can use perf
which would make the whole syscall go away.
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-19 11:59 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 11:59 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: rhod, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
On Thu, Jan 19, 2012 at 1:54 PM, <leonid.moiseichuk@nokia.com> wrote:
>> The current ABI already supports that. You can specify which attributes
>> you're interested in and they will be delivered as part of th event.
>
> But you have in vmnotify.h suspicious free_pages_threshold field.
Aah, I was actually talking about the events userspace _reads_.
The free_pages_threshold field is only used if
VMEVENT_TYPE_FREE_THRESHOLD bit is set. It should be cleaned up a bit
but it in theory it supports watching other attributes as well. I've
postponed the cleanup until I've figured out whether we can use perf
which would make the whole syscall go away.
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-19 11:54 ` leonid.moiseichuk
@ 2012-01-19 12:06 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 12:06 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: rhod, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
On Thu, Jan 19, 2012 at 1:54 PM, <leonid.moiseichuk@nokia.com> wrote:
>> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
>> > 6. I do not understand how work with attributes performed ( ) but it
>> > has sense to use mask and fill requested attributes using mask and
>> > callback table i.e. if free pages requested - they are reported, otherwise
>> not.
>>
>> That's how it works now in the git tree.
>
> Vmnotify.c has vmnotify_watch_event which collects fixed set of parameters.
That's would be a bug. We should check event_attrs like we do for NR_SWAP_PAGES.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-19 12:06 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 12:06 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: rhod, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm,
kosaki.motohiro
On Thu, Jan 19, 2012 at 1:54 PM, <leonid.moiseichuk@nokia.com> wrote:
>> On Thu, Jan 19, 2012 at 12:53 PM, <leonid.moiseichuk@nokia.com> wrote:
>> > 6. I do not understand how work with attributes performed ( ) but it
>> > has sense to use mask and fill requested attributes using mask and
>> > callback table i.e. if free pages requested - they are reported, otherwise
>> not.
>>
>> That's how it works now in the git tree.
>
> Vmnotify.c has vmnotify_watch_event which collects fixed set of parameters.
That's would be a bug. We should check event_attrs like we do for NR_SWAP_PAGES.
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-19 10:53 ` leonid.moiseichuk
@ 2012-01-24 15:38 ` Marcelo Tosatti
-1 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-24 15:38 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: rhod, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
> > -----Original Message-----
> > From: ext Ronen Hod [mailto:rhod@redhat.com]
> > Sent: 19 January, 2012 11:20
> > To: Pekka Enberg
> ...
> > >>> Isn't
> > >>>
> > >>> /proc/sys/vm/min_free_kbytes
> > >>>
> > >>> pretty much just that?
> > >> Would you suggest to use min_free_kbytes as the threshold for sending
> > >> low_memory_notifications to applications, and separately as a target
> > >> value for the applications' memory giveaway?
> > > I'm not saying that the kernel should use it directly but it seems
> > > like the kind of "ideal number of free pages" threshold you're
> > > suggesting. So userspace can read that value and use it as the "number
> > > of free pages" threshold for VM events, no?
> >
> > Yes, I like it. The rules of the game are simple and consistent all over, be it the
> > alert threshold, voluntary poling by the apps, and for concurrent work by
> > several applications.
> > Well, as long as it provides a good indication for low_mem_pressure.
>
> For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
> of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
>
> >From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
> 1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
> 2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
Having userspace specify the "sample period" for low memory notification
makes no sense. The frequency of notifications is a function of the
memory pressure.
> 3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
> 4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
> 5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
It would make the interface easier to use if it provided the number of
pages to free, in the notification (kernel can calculate that as the
delta between current_free_pages -> comfortable_free_pages relative to
process RSS).
> 6. I do not understand how work with attributes performed ( ) but it has sense to use mask and fill requested attributes using mask and callback table i.e. if free pages requested - they are reported, otherwise not.
> 7. would have sense to backport couple of attributes from memnotify.c
>
> I can submit couple of patches if some of proposals looks sane for everyone.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 15:38 ` Marcelo Tosatti
0 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-24 15:38 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: rhod, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
> > -----Original Message-----
> > From: ext Ronen Hod [mailto:rhod@redhat.com]
> > Sent: 19 January, 2012 11:20
> > To: Pekka Enberg
> ...
> > >>> Isn't
> > >>>
> > >>> /proc/sys/vm/min_free_kbytes
> > >>>
> > >>> pretty much just that?
> > >> Would you suggest to use min_free_kbytes as the threshold for sending
> > >> low_memory_notifications to applications, and separately as a target
> > >> value for the applications' memory giveaway?
> > > I'm not saying that the kernel should use it directly but it seems
> > > like the kind of "ideal number of free pages" threshold you're
> > > suggesting. So userspace can read that value and use it as the "number
> > > of free pages" threshold for VM events, no?
> >
> > Yes, I like it. The rules of the game are simple and consistent all over, be it the
> > alert threshold, voluntary poling by the apps, and for concurrent work by
> > several applications.
> > Well, as long as it provides a good indication for low_mem_pressure.
>
> For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
> of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
>
> >From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
> 1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
> 2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
Having userspace specify the "sample period" for low memory notification
makes no sense. The frequency of notifications is a function of the
memory pressure.
> 3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
> 4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
> 5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
It would make the interface easier to use if it provided the number of
pages to free, in the notification (kernel can calculate that as the
delta between current_free_pages -> comfortable_free_pages relative to
process RSS).
> 6. I do not understand how work with attributes performed ( ) but it has sense to use mask and fill requested attributes using mask and callback table i.e. if free pages requested - they are reported, otherwise not.
> 7. would have sense to backport couple of attributes from memnotify.c
>
> I can submit couple of patches if some of proposals looks sane for everyone.
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-24 15:38 ` Marcelo Tosatti
@ 2012-01-24 16:08 ` Ronen Hod
-1 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-24 16:08 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On 01/24/2012 05:38 PM, Marcelo Tosatti wrote:
> On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
>>> -----Original Message-----
>>> From: ext Ronen Hod [mailto:rhod@redhat.com]
>>> Sent: 19 January, 2012 11:20
>>> To: Pekka Enberg
>> ...
>>>>>> Isn't
>>>>>>
>>>>>> /proc/sys/vm/min_free_kbytes
>>>>>>
>>>>>> pretty much just that?
>>>>> Would you suggest to use min_free_kbytes as the threshold for sending
>>>>> low_memory_notifications to applications, and separately as a target
>>>>> value for the applications' memory giveaway?
>>>> I'm not saying that the kernel should use it directly but it seems
>>>> like the kind of "ideal number of free pages" threshold you're
>>>> suggesting. So userspace can read that value and use it as the "number
>>>> of free pages" threshold for VM events, no?
>>> Yes, I like it. The rules of the game are simple and consistent all over, be it the
>>> alert threshold, voluntary poling by the apps, and for concurrent work by
>>> several applications.
>>> Well, as long as it provides a good indication for low_mem_pressure.
>> For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
>> of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
>>
>> > From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
>> 1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
>> 2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
> Having userspace specify the "sample period" for low memory notification
> makes no sense. The frequency of notifications is a function of the
> memory pressure.
>
>> 3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
>
>> 4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
>> 5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
> It would make the interface easier to use if it provided the number of
> pages to free, in the notification (kernel can calculate that as the
> delta between current_free_pages -> comfortable_free_pages relative to
> process RSS).
If you rely on the notification's argument you lose several features:
- Handling of notifications by several applications in parallel
- Voluntary application's decisions, such as cleanup or avoiding allocations, at the application's convenience.
- Iterative release loops, until there are enough free pages.
I believe that the notification should only serve as a trigger to run the cleanup.
Ronen.
>
>> 6. I do not understand how work with attributes performed ( ) but it has sense to use mask and fill requested attributes using mask and callback table i.e. if free pages requested - they are reported, otherwise not.
>> 7. would have sense to backport couple of attributes from memnotify.c
>>
>> I can submit couple of patches if some of proposals looks sane for everyone.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 16:08 ` Ronen Hod
0 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-24 16:08 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On 01/24/2012 05:38 PM, Marcelo Tosatti wrote:
> On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
>>> -----Original Message-----
>>> From: ext Ronen Hod [mailto:rhod@redhat.com]
>>> Sent: 19 January, 2012 11:20
>>> To: Pekka Enberg
>> ...
>>>>>> Isn't
>>>>>>
>>>>>> /proc/sys/vm/min_free_kbytes
>>>>>>
>>>>>> pretty much just that?
>>>>> Would you suggest to use min_free_kbytes as the threshold for sending
>>>>> low_memory_notifications to applications, and separately as a target
>>>>> value for the applications' memory giveaway?
>>>> I'm not saying that the kernel should use it directly but it seems
>>>> like the kind of "ideal number of free pages" threshold you're
>>>> suggesting. So userspace can read that value and use it as the "number
>>>> of free pages" threshold for VM events, no?
>>> Yes, I like it. The rules of the game are simple and consistent all over, be it the
>>> alert threshold, voluntary poling by the apps, and for concurrent work by
>>> several applications.
>>> Well, as long as it provides a good indication for low_mem_pressure.
>> For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
>> of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
>>
>> > From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
>> 1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
>> 2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
> Having userspace specify the "sample period" for low memory notification
> makes no sense. The frequency of notifications is a function of the
> memory pressure.
>
>> 3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
>
>> 4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
>> 5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
> It would make the interface easier to use if it provided the number of
> pages to free, in the notification (kernel can calculate that as the
> delta between current_free_pages -> comfortable_free_pages relative to
> process RSS).
If you rely on the notification's argument you lose several features:
- Handling of notifications by several applications in parallel
- Voluntary application's decisions, such as cleanup or avoiding allocations, at the application's convenience.
- Iterative release loops, until there are enough free pages.
I believe that the notification should only serve as a trigger to run the cleanup.
Ronen.
>
>> 6. I do not understand how work with attributes performed ( ) but it has sense to use mask and fill requested attributes using mask and callback table i.e. if free pages requested - they are reported, otherwise not.
>> 7. would have sense to backport couple of attributes from memnotify.c
>>
>> I can submit couple of patches if some of proposals looks sane for everyone.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-24 16:08 ` Ronen Hod
@ 2012-01-24 18:10 ` Marcelo Tosatti
-1 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-24 18:10 UTC (permalink / raw)
To: Ronen Hod
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On Tue, Jan 24, 2012 at 06:08:31PM +0200, Ronen Hod wrote:
> On 01/24/2012 05:38 PM, Marcelo Tosatti wrote:
> >On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
> >>>-----Original Message-----
> >>>From: ext Ronen Hod [mailto:rhod@redhat.com]
> >>>Sent: 19 January, 2012 11:20
> >>>To: Pekka Enberg
> >>...
> >>>>>>Isn't
> >>>>>>
> >>>>>>/proc/sys/vm/min_free_kbytes
> >>>>>>
> >>>>>>pretty much just that?
> >>>>>Would you suggest to use min_free_kbytes as the threshold for sending
> >>>>>low_memory_notifications to applications, and separately as a target
> >>>>>value for the applications' memory giveaway?
> >>>>I'm not saying that the kernel should use it directly but it seems
> >>>>like the kind of "ideal number of free pages" threshold you're
> >>>>suggesting. So userspace can read that value and use it as the "number
> >>>>of free pages" threshold for VM events, no?
> >>>Yes, I like it. The rules of the game are simple and consistent all over, be it the
> >>>alert threshold, voluntary poling by the apps, and for concurrent work by
> >>>several applications.
> >>>Well, as long as it provides a good indication for low_mem_pressure.
> >>For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
> >>of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
> >>
> >>> From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
> >>1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
> >>2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
> >Having userspace specify the "sample period" for low memory notification
> >makes no sense. The frequency of notifications is a function of the
> >memory pressure.
> >
> >>3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
> >
> >>4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
> >>5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
> >It would make the interface easier to use if it provided the number of
> >pages to free, in the notification (kernel can calculate that as the
> >delta between current_free_pages -> comfortable_free_pages relative to
> >process RSS).
>
> If you rely on the notification's argument you lose several features:
> - Handling of notifications by several applications in parallel
Each application has its argument built in a custom fashion
(pages_to_free = delta between current_free_pages ->
comfortable_free_pages relative to process RSS), or something to that
effect. It is compatible with parallel notifications.
> - Voluntary application's decisions, such as cleanup or avoiding allocations, at the application's convenience.
I am suggesting an additional field in the notification data so that the
freeing routine has a goal. But it is not mandatory.
> - Iterative release loops, until there are enough free pages.
What is the advantage versus releasing the necessary amount of
memory in a given moment?
> I believe that the notification should only serve as a trigger to run the cleanup.
Agree.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 18:10 ` Marcelo Tosatti
0 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-24 18:10 UTC (permalink / raw)
To: Ronen Hod
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On Tue, Jan 24, 2012 at 06:08:31PM +0200, Ronen Hod wrote:
> On 01/24/2012 05:38 PM, Marcelo Tosatti wrote:
> >On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
> >>>-----Original Message-----
> >>>From: ext Ronen Hod [mailto:rhod@redhat.com]
> >>>Sent: 19 January, 2012 11:20
> >>>To: Pekka Enberg
> >>...
> >>>>>>Isn't
> >>>>>>
> >>>>>>/proc/sys/vm/min_free_kbytes
> >>>>>>
> >>>>>>pretty much just that?
> >>>>>Would you suggest to use min_free_kbytes as the threshold for sending
> >>>>>low_memory_notifications to applications, and separately as a target
> >>>>>value for the applications' memory giveaway?
> >>>>I'm not saying that the kernel should use it directly but it seems
> >>>>like the kind of "ideal number of free pages" threshold you're
> >>>>suggesting. So userspace can read that value and use it as the "number
> >>>>of free pages" threshold for VM events, no?
> >>>Yes, I like it. The rules of the game are simple and consistent all over, be it the
> >>>alert threshold, voluntary poling by the apps, and for concurrent work by
> >>>several applications.
> >>>Well, as long as it provides a good indication for low_mem_pressure.
> >>For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
> >>of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
> >>
> >>> From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
> >>1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
> >>2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
> >Having userspace specify the "sample period" for low memory notification
> >makes no sense. The frequency of notifications is a function of the
> >memory pressure.
> >
> >>3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
> >
> >>4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
> >>5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
> >It would make the interface easier to use if it provided the number of
> >pages to free, in the notification (kernel can calculate that as the
> >delta between current_free_pages -> comfortable_free_pages relative to
> >process RSS).
>
> If you rely on the notification's argument you lose several features:
> - Handling of notifications by several applications in parallel
Each application has its argument built in a custom fashion
(pages_to_free = delta between current_free_pages ->
comfortable_free_pages relative to process RSS), or something to that
effect. It is compatible with parallel notifications.
> - Voluntary application's decisions, such as cleanup or avoiding allocations, at the application's convenience.
I am suggesting an additional field in the notification data so that the
freeing routine has a goal. But it is not mandatory.
> - Iterative release loops, until there are enough free pages.
What is the advantage versus releasing the necessary amount of
memory in a given moment?
> I believe that the notification should only serve as a trigger to run the cleanup.
Agree.
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-24 18:10 ` Marcelo Tosatti
@ 2012-01-25 8:52 ` Ronen Hod
-1 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-25 8:52 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On 01/24/2012 08:10 PM, Marcelo Tosatti wrote:
> On Tue, Jan 24, 2012 at 06:08:31PM +0200, Ronen Hod wrote:
>> On 01/24/2012 05:38 PM, Marcelo Tosatti wrote:
>>> On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
>>>>> -----Original Message-----
>>>>> From: ext Ronen Hod [mailto:rhod@redhat.com]
>>>>> Sent: 19 January, 2012 11:20
>>>>> To: Pekka Enberg
>>>> ...
>>>>>>>> Isn't
>>>>>>>>
>>>>>>>> /proc/sys/vm/min_free_kbytes
>>>>>>>>
>>>>>>>> pretty much just that?
>>>>>>> Would you suggest to use min_free_kbytes as the threshold for sending
>>>>>>> low_memory_notifications to applications, and separately as a target
>>>>>>> value for the applications' memory giveaway?
>>>>>> I'm not saying that the kernel should use it directly but it seems
>>>>>> like the kind of "ideal number of free pages" threshold you're
>>>>>> suggesting. So userspace can read that value and use it as the "number
>>>>>> of free pages" threshold for VM events, no?
>>>>> Yes, I like it. The rules of the game are simple and consistent all over, be it the
>>>>> alert threshold, voluntary poling by the apps, and for concurrent work by
>>>>> several applications.
>>>>> Well, as long as it provides a good indication for low_mem_pressure.
>>>> For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
>>>> of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
>>>>
>>>>> From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
>>>> 1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
>>>> 2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
>>> Having userspace specify the "sample period" for low memory notification
>>> makes no sense. The frequency of notifications is a function of the
>>> memory pressure.
>>>
>>>> 3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
>>>> 4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
>>>> 5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
>>> It would make the interface easier to use if it provided the number of
>>> pages to free, in the notification (kernel can calculate that as the
>>> delta between current_free_pages -> comfortable_free_pages relative to
>>> process RSS).
>> If you rely on the notification's argument you lose several features:
>> - Handling of notifications by several applications in parallel
> Each application has its argument built in a custom fashion
> (pages_to_free = delta between current_free_pages ->
> comfortable_free_pages relative to process RSS), or something to that
> effect. It is compatible with parallel notifications.
Not sure that I got it. Do you suggest to ask all the applications to free say 3% of their memory?. Some may be able to free more, and some cannot free any. Isn't it more practical to just notify them, and let each app contribute its part to the global moving target?
>> - Voluntary application's decisions, such as cleanup or avoiding allocations, at the application's convenience.
> I am suggesting an additional field in the notification data so that the
> freeing routine has a goal. But it is not mandatory.
If you do want to support voluntary (notification less) app decisions, based on the current status, then why not satisfy with this API and only use the notifications to trigger this procedure?
>
>> - Iterative release loops, until there are enough free pages.
> What is the advantage versus releasing the necessary amount of
> memory in a given moment?
The cleanup logic may be unaware of the page-level effects of its alloc and free, more so when freeing complex internal data structures (such as cached web pages), and this way you let it free until things settle down.
Ronen.
>
>> I believe that the notification should only serve as a trigger to run the cleanup.
> Agree.
>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-25 8:52 ` Ronen Hod
0 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-25 8:52 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On 01/24/2012 08:10 PM, Marcelo Tosatti wrote:
> On Tue, Jan 24, 2012 at 06:08:31PM +0200, Ronen Hod wrote:
>> On 01/24/2012 05:38 PM, Marcelo Tosatti wrote:
>>> On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
>>>>> -----Original Message-----
>>>>> From: ext Ronen Hod [mailto:rhod@redhat.com]
>>>>> Sent: 19 January, 2012 11:20
>>>>> To: Pekka Enberg
>>>> ...
>>>>>>>> Isn't
>>>>>>>>
>>>>>>>> /proc/sys/vm/min_free_kbytes
>>>>>>>>
>>>>>>>> pretty much just that?
>>>>>>> Would you suggest to use min_free_kbytes as the threshold for sending
>>>>>>> low_memory_notifications to applications, and separately as a target
>>>>>>> value for the applications' memory giveaway?
>>>>>> I'm not saying that the kernel should use it directly but it seems
>>>>>> like the kind of "ideal number of free pages" threshold you're
>>>>>> suggesting. So userspace can read that value and use it as the "number
>>>>>> of free pages" threshold for VM events, no?
>>>>> Yes, I like it. The rules of the game are simple and consistent all over, be it the
>>>>> alert threshold, voluntary poling by the apps, and for concurrent work by
>>>>> several applications.
>>>>> Well, as long as it provides a good indication for low_mem_pressure.
>>>> For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
>>>> of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
>>>>
>>>>> From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
>>>> 1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
>>>> 2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
>>> Having userspace specify the "sample period" for low memory notification
>>> makes no sense. The frequency of notifications is a function of the
>>> memory pressure.
>>>
>>>> 3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
>>>> 4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
>>>> 5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
>>> It would make the interface easier to use if it provided the number of
>>> pages to free, in the notification (kernel can calculate that as the
>>> delta between current_free_pages -> comfortable_free_pages relative to
>>> process RSS).
>> If you rely on the notification's argument you lose several features:
>> - Handling of notifications by several applications in parallel
> Each application has its argument built in a custom fashion
> (pages_to_free = delta between current_free_pages ->
> comfortable_free_pages relative to process RSS), or something to that
> effect. It is compatible with parallel notifications.
Not sure that I got it. Do you suggest to ask all the applications to free say 3% of their memory?. Some may be able to free more, and some cannot free any. Isn't it more practical to just notify them, and let each app contribute its part to the global moving target?
>> - Voluntary application's decisions, such as cleanup or avoiding allocations, at the application's convenience.
> I am suggesting an additional field in the notification data so that the
> freeing routine has a goal. But it is not mandatory.
If you do want to support voluntary (notification less) app decisions, based on the current status, then why not satisfy with this API and only use the notifications to trigger this procedure?
>
>> - Iterative release loops, until there are enough free pages.
> What is the advantage versus releasing the necessary amount of
> memory in a given moment?
The cleanup logic may be unaware of the page-level effects of its alloc and free, more so when freeing complex internal data structures (such as cached web pages), and this way you let it free until things settle down.
Ronen.
>
>> I believe that the notification should only serve as a trigger to run the cleanup.
> Agree.
>
>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-25 8:52 ` Ronen Hod
@ 2012-01-25 10:12 ` Marcelo Tosatti
-1 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-25 10:12 UTC (permalink / raw)
To: Ronen Hod
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On Wed, Jan 25, 2012 at 10:52:24AM +0200, Ronen Hod wrote:
> On 01/24/2012 08:10 PM, Marcelo Tosatti wrote:
> >On Tue, Jan 24, 2012 at 06:08:31PM +0200, Ronen Hod wrote:
> >>On 01/24/2012 05:38 PM, Marcelo Tosatti wrote:
> >>>On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
> >>>>>-----Original Message-----
> >>>>>From: ext Ronen Hod [mailto:rhod@redhat.com]
> >>>>>Sent: 19 January, 2012 11:20
> >>>>>To: Pekka Enberg
> >>>>...
> >>>>>>>>Isn't
> >>>>>>>>
> >>>>>>>>/proc/sys/vm/min_free_kbytes
> >>>>>>>>
> >>>>>>>>pretty much just that?
> >>>>>>>Would you suggest to use min_free_kbytes as the threshold for sending
> >>>>>>>low_memory_notifications to applications, and separately as a target
> >>>>>>>value for the applications' memory giveaway?
> >>>>>>I'm not saying that the kernel should use it directly but it seems
> >>>>>>like the kind of "ideal number of free pages" threshold you're
> >>>>>>suggesting. So userspace can read that value and use it as the "number
> >>>>>>of free pages" threshold for VM events, no?
> >>>>>Yes, I like it. The rules of the game are simple and consistent all over, be it the
> >>>>>alert threshold, voluntary poling by the apps, and for concurrent work by
> >>>>>several applications.
> >>>>>Well, as long as it provides a good indication for low_mem_pressure.
> >>>>For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
> >>>>of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
> >>>>
> >>>>> From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
> >>>>1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
> >>>>2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
> >>>Having userspace specify the "sample period" for low memory notification
> >>>makes no sense. The frequency of notifications is a function of the
> >>>memory pressure.
> >>>
> >>>>3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
> >>>>4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
> >>>>5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
> >>>It would make the interface easier to use if it provided the number of
> >>>pages to free, in the notification (kernel can calculate that as the
> >>>delta between current_free_pages -> comfortable_free_pages relative to
> >>>process RSS).
> >>If you rely on the notification's argument you lose several features:
> >> - Handling of notifications by several applications in parallel
> >Each application has its argument built in a custom fashion
> >(pages_to_free = delta between current_free_pages ->
> >comfortable_free_pages relative to process RSS), or something to that
> >effect. It is compatible with parallel notifications.
>
> Not sure that I got it. Do you suggest to ask all the applications to free say 3% of their memory?.
> Some may be able to free more, and some cannot free any. Isn't it more practical to just notify them, and let each app contribute its part to the global moving target?
The problem is, how is each process supposed to know how much memory
it should free for each notification received, that is, its part?
Its easier if there is a goal, a hint of how many pages the process
should release.
> >> - Voluntary application's decisions, such as cleanup or avoiding allocations, at the application's convenience.
> >I am suggesting an additional field in the notification data so that the
> >freeing routine has a goal. But it is not mandatory.
>
> If you do want to support voluntary (notification less) app decisions, based on the current status, then why not satisfy with this API and only use the notifications to trigger this procedure?
>
> >
> >>- Iterative release loops, until there are enough free pages.
> >What is the advantage versus releasing the necessary amount of
> >memory in a given moment?
>
> The cleanup logic may be unaware of the page-level effects of its alloc and free, more so when freeing complex internal data structures (such as cached web pages), and this way you let it free until things settle down.
>
> Ronen.
>
> >
> >>I believe that the notification should only serve as a trigger to run the cleanup.
> >Agree.
> >
> >
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-25 10:12 ` Marcelo Tosatti
0 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-25 10:12 UTC (permalink / raw)
To: Ronen Hod
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On Wed, Jan 25, 2012 at 10:52:24AM +0200, Ronen Hod wrote:
> On 01/24/2012 08:10 PM, Marcelo Tosatti wrote:
> >On Tue, Jan 24, 2012 at 06:08:31PM +0200, Ronen Hod wrote:
> >>On 01/24/2012 05:38 PM, Marcelo Tosatti wrote:
> >>>On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
> >>>>>-----Original Message-----
> >>>>>From: ext Ronen Hod [mailto:rhod@redhat.com]
> >>>>>Sent: 19 January, 2012 11:20
> >>>>>To: Pekka Enberg
> >>>>...
> >>>>>>>>Isn't
> >>>>>>>>
> >>>>>>>>/proc/sys/vm/min_free_kbytes
> >>>>>>>>
> >>>>>>>>pretty much just that?
> >>>>>>>Would you suggest to use min_free_kbytes as the threshold for sending
> >>>>>>>low_memory_notifications to applications, and separately as a target
> >>>>>>>value for the applications' memory giveaway?
> >>>>>>I'm not saying that the kernel should use it directly but it seems
> >>>>>>like the kind of "ideal number of free pages" threshold you're
> >>>>>>suggesting. So userspace can read that value and use it as the "number
> >>>>>>of free pages" threshold for VM events, no?
> >>>>>Yes, I like it. The rules of the game are simple and consistent all over, be it the
> >>>>>alert threshold, voluntary poling by the apps, and for concurrent work by
> >>>>>several applications.
> >>>>>Well, as long as it provides a good indication for low_mem_pressure.
> >>>>For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
> >>>>of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
> >>>>
> >>>>> From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
> >>>>1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
> >>>>2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
> >>>Having userspace specify the "sample period" for low memory notification
> >>>makes no sense. The frequency of notifications is a function of the
> >>>memory pressure.
> >>>
> >>>>3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
> >>>>4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
> >>>>5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
> >>>It would make the interface easier to use if it provided the number of
> >>>pages to free, in the notification (kernel can calculate that as the
> >>>delta between current_free_pages -> comfortable_free_pages relative to
> >>>process RSS).
> >>If you rely on the notification's argument you lose several features:
> >> - Handling of notifications by several applications in parallel
> >Each application has its argument built in a custom fashion
> >(pages_to_free = delta between current_free_pages ->
> >comfortable_free_pages relative to process RSS), or something to that
> >effect. It is compatible with parallel notifications.
>
> Not sure that I got it. Do you suggest to ask all the applications to free say 3% of their memory?.
> Some may be able to free more, and some cannot free any. Isn't it more practical to just notify them, and let each app contribute its part to the global moving target?
The problem is, how is each process supposed to know how much memory
it should free for each notification received, that is, its part?
Its easier if there is a goal, a hint of how many pages the process
should release.
> >> - Voluntary application's decisions, such as cleanup or avoiding allocations, at the application's convenience.
> >I am suggesting an additional field in the notification data so that the
> >freeing routine has a goal. But it is not mandatory.
>
> If you do want to support voluntary (notification less) app decisions, based on the current status, then why not satisfy with this API and only use the notifications to trigger this procedure?
>
> >
> >>- Iterative release loops, until there are enough free pages.
> >What is the advantage versus releasing the necessary amount of
> >memory in a given moment?
>
> The cleanup logic may be unaware of the page-level effects of its alloc and free, more so when freeing complex internal data structures (such as cached web pages), and this way you let it free until things settle down.
>
> Ronen.
>
> >
> >>I believe that the notification should only serve as a trigger to run the cleanup.
> >Agree.
> >
> >
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-25 10:12 ` Marcelo Tosatti
@ 2012-01-25 10:48 ` Ronen Hod
-1 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-25 10:48 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On 01/25/2012 12:12 PM, Marcelo Tosatti wrote:
> On Wed, Jan 25, 2012 at 10:52:24AM +0200, Ronen Hod wrote:
>> On 01/24/2012 08:10 PM, Marcelo Tosatti wrote:
>>> On Tue, Jan 24, 2012 at 06:08:31PM +0200, Ronen Hod wrote:
>>>> On 01/24/2012 05:38 PM, Marcelo Tosatti wrote:
>>>>> On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
>>>>>>> -----Original Message-----
>>>>>>> From: ext Ronen Hod [mailto:rhod@redhat.com]
>>>>>>> Sent: 19 January, 2012 11:20
>>>>>>> To: Pekka Enberg
>>>>>> ...
>>>>>>>>>> Isn't
>>>>>>>>>>
>>>>>>>>>> /proc/sys/vm/min_free_kbytes
>>>>>>>>>>
>>>>>>>>>> pretty much just that?
>>>>>>>>> Would you suggest to use min_free_kbytes as the threshold for sending
>>>>>>>>> low_memory_notifications to applications, and separately as a target
>>>>>>>>> value for the applications' memory giveaway?
>>>>>>>> I'm not saying that the kernel should use it directly but it seems
>>>>>>>> like the kind of "ideal number of free pages" threshold you're
>>>>>>>> suggesting. So userspace can read that value and use it as the "number
>>>>>>>> of free pages" threshold for VM events, no?
>>>>>>> Yes, I like it. The rules of the game are simple and consistent all over, be it the
>>>>>>> alert threshold, voluntary poling by the apps, and for concurrent work by
>>>>>>> several applications.
>>>>>>> Well, as long as it provides a good indication for low_mem_pressure.
>>>>>> For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
>>>>>> of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
>>>>>>
>>>>>>> From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
>>>>>> 1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
>>>>>> 2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
>>>>> Having userspace specify the "sample period" for low memory notification
>>>>> makes no sense. The frequency of notifications is a function of the
>>>>> memory pressure.
>>>>>
>>>>>> 3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
>>>>>> 4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
>>>>>> 5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
>>>>> It would make the interface easier to use if it provided the number of
>>>>> pages to free, in the notification (kernel can calculate that as the
>>>>> delta between current_free_pages -> comfortable_free_pages relative to
>>>>> process RSS).
>>>> If you rely on the notification's argument you lose several features:
>>>> - Handling of notifications by several applications in parallel
>>> Each application has its argument built in a custom fashion
>>> (pages_to_free = delta between current_free_pages ->
>>> comfortable_free_pages relative to process RSS), or something to that
>>> effect. It is compatible with parallel notifications.
>> Not sure that I got it. Do you suggest to ask all the applications to free say 3% of their memory?.
>> Some may be able to free more, and some cannot free any. Isn't it more practical to just notify them, and let each app contribute its part to the global moving target?
> The problem is, how is each process supposed to know how much memory
> it should free for each notification received, that is, its part?
>
> Its easier if there is a goal, a hint of how many pages the process
> should release.
I have to agree.
Still, the amount of memory that an app should free per memory-pressure-level can be best calculated inside the application (based on comfortable_free_pages relative to process RSS, as you suggested). Fairness is also an issue.
And, if in the meantime the memory pressure ended, would you recommend that the application will continue with its work?
Ronen.
>
>>>> - Voluntary application's decisions, such as cleanup or avoiding allocations, at the application's convenience.
>>> I am suggesting an additional field in the notification data so that the
>>> freeing routine has a goal. But it is not mandatory.
>> If you do want to support voluntary (notification less) app decisions, based on the current status, then why not satisfy with this API and only use the notifications to trigger this procedure?
>>
>>>> - Iterative release loops, until there are enough free pages.
>>> What is the advantage versus releasing the necessary amount of
>>> memory in a given moment?
>> The cleanup logic may be unaware of the page-level effects of its alloc and free, more so when freeing complex internal data structures (such as cached web pages), and this way you let it free until things settle down.
>>
>> Ronen.
>>
>>>> I believe that the notification should only serve as a trigger to run the cleanup.
>>> Agree.
>>>
>>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-25 10:48 ` Ronen Hod
0 siblings, 0 replies; 124+ messages in thread
From: Ronen Hod @ 2012-01-25 10:48 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On 01/25/2012 12:12 PM, Marcelo Tosatti wrote:
> On Wed, Jan 25, 2012 at 10:52:24AM +0200, Ronen Hod wrote:
>> On 01/24/2012 08:10 PM, Marcelo Tosatti wrote:
>>> On Tue, Jan 24, 2012 at 06:08:31PM +0200, Ronen Hod wrote:
>>>> On 01/24/2012 05:38 PM, Marcelo Tosatti wrote:
>>>>> On Thu, Jan 19, 2012 at 10:53:29AM +0000, leonid.moiseichuk@nokia.com wrote:
>>>>>>> -----Original Message-----
>>>>>>> From: ext Ronen Hod [mailto:rhod@redhat.com]
>>>>>>> Sent: 19 January, 2012 11:20
>>>>>>> To: Pekka Enberg
>>>>>> ...
>>>>>>>>>> Isn't
>>>>>>>>>>
>>>>>>>>>> /proc/sys/vm/min_free_kbytes
>>>>>>>>>>
>>>>>>>>>> pretty much just that?
>>>>>>>>> Would you suggest to use min_free_kbytes as the threshold for sending
>>>>>>>>> low_memory_notifications to applications, and separately as a target
>>>>>>>>> value for the applications' memory giveaway?
>>>>>>>> I'm not saying that the kernel should use it directly but it seems
>>>>>>>> like the kind of "ideal number of free pages" threshold you're
>>>>>>>> suggesting. So userspace can read that value and use it as the "number
>>>>>>>> of free pages" threshold for VM events, no?
>>>>>>> Yes, I like it. The rules of the game are simple and consistent all over, be it the
>>>>>>> alert threshold, voluntary poling by the apps, and for concurrent work by
>>>>>>> several applications.
>>>>>>> Well, as long as it provides a good indication for low_mem_pressure.
>>>>>> For me it doesn't look that have much sense. min_free_kbytes could be set from user-space (or auto-tuned by kernel) to keep some amount
>>>>>> of memory available for GFP_ATOMIC allocations. In case situation comes under pointed level kernel will reclaim memory from e.g. caches.
>>>>>>
>>>>>>> From potential user point of view the proposed API has number of lacks which would be nice to have implemented:
>>>>>> 1. rename this API from low_mem_pressure to something more related to notification and memory situation in system: memory_pressure, memnotify, memory_level etc. The word "low" is misleading here
>>>>>> 2. API must use deferred timers to prevent use-time impact. Deferred timer will be triggered only in case HW event or non-deferrable timer, so if device sleeps timer might be skipped and that is what expected for user-space
>>>>> Having userspace specify the "sample period" for low memory notification
>>>>> makes no sense. The frequency of notifications is a function of the
>>>>> memory pressure.
>>>>>
>>>>>> 3. API should be tunable for propagate changes when level is Up or Down, maybe both ways.
>>>>>> 4. to avoid triggering too much events probably has sense to filter according to amount of change but that is optional. If subscriber set timer to 1s the amount of events should not be very big.
>>>>>> 5. API must provide interface to request parameters e.g. available swap or free memory just to have some base.
>>>>> It would make the interface easier to use if it provided the number of
>>>>> pages to free, in the notification (kernel can calculate that as the
>>>>> delta between current_free_pages -> comfortable_free_pages relative to
>>>>> process RSS).
>>>> If you rely on the notification's argument you lose several features:
>>>> - Handling of notifications by several applications in parallel
>>> Each application has its argument built in a custom fashion
>>> (pages_to_free = delta between current_free_pages ->
>>> comfortable_free_pages relative to process RSS), or something to that
>>> effect. It is compatible with parallel notifications.
>> Not sure that I got it. Do you suggest to ask all the applications to free say 3% of their memory?.
>> Some may be able to free more, and some cannot free any. Isn't it more practical to just notify them, and let each app contribute its part to the global moving target?
> The problem is, how is each process supposed to know how much memory
> it should free for each notification received, that is, its part?
>
> Its easier if there is a goal, a hint of how many pages the process
> should release.
I have to agree.
Still, the amount of memory that an app should free per memory-pressure-level can be best calculated inside the application (based on comfortable_free_pages relative to process RSS, as you suggested). Fairness is also an issue.
And, if in the meantime the memory pressure ended, would you recommend that the application will continue with its work?
Ronen.
>
>>>> - Voluntary application's decisions, such as cleanup or avoiding allocations, at the application's convenience.
>>> I am suggesting an additional field in the notification data so that the
>>> freeing routine has a goal. But it is not mandatory.
>> If you do want to support voluntary (notification less) app decisions, based on the current status, then why not satisfy with this API and only use the notifications to trigger this procedure?
>>
>>>> - Iterative release loops, until there are enough free pages.
>>> What is the advantage versus releasing the necessary amount of
>>> memory in a given moment?
>> The cleanup logic may be unaware of the page-level effects of its alloc and free, more so when freeing complex internal data structures (such as cached web pages), and this way you let it free until things settle down.
>>
>> Ronen.
>>
>>>> I believe that the notification should only serve as a trigger to run the cleanup.
>>> Agree.
>>>
>>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-25 10:48 ` Ronen Hod
@ 2012-01-26 16:17 ` Marcelo Tosatti
-1 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-26 16:17 UTC (permalink / raw)
To: Ronen Hod
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
> >it should free for each notification received, that is, its part?
> >
> >Its easier if there is a goal, a hint of how many pages the process
> >should release.
>
> I have to agree.
> Still, the amount of memory that an app should free per memory-pressure-level can be best calculated inside the application (based on comfortable_free_pages relative to process RSS, as you suggested).
It is easier if the kernel calculates the target (the application is
free to ignore the hint, of course), because it depends on information
not readily available in userspace.
> Fairness is also an issue.
> And, if in the meantime the memory pressure ended, would you recommend that the application will continue with its work?
There appears to be interest in an event to notify that higher levels
of memory are available (see Leonid's email).
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-26 16:17 ` Marcelo Tosatti
0 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-26 16:17 UTC (permalink / raw)
To: Ronen Hod
Cc: leonid.moiseichuk, penberg, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
> >it should free for each notification received, that is, its part?
> >
> >Its easier if there is a goal, a hint of how many pages the process
> >should release.
>
> I have to agree.
> Still, the amount of memory that an app should free per memory-pressure-level can be best calculated inside the application (based on comfortable_free_pages relative to process RSS, as you suggested).
It is easier if the kernel calculates the target (the application is
free to ignore the hint, of course), because it depends on information
not readily available in userspace.
> Fairness is also an issue.
> And, if in the meantime the memory pressure ended, would you recommend that the application will continue with its work?
There appears to be interest in an event to notify that higher levels
of memory are available (see Leonid's email).
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-24 15:38 ` Marcelo Tosatti
@ 2012-01-24 16:10 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-24 16:10 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: leonid.moiseichuk, rhod, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On Tue, 2012-01-24 at 13:38 -0200, Marcelo Tosatti wrote:
> Having userspace specify the "sample period" for low memory notification
> makes no sense. The frequency of notifications is a function of the
> memory pressure.
Sure, it makes sense to autotune sample period. I don't see the problem
with letting userspace decide it for themselves if they want to.
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 16:10 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-24 16:10 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: leonid.moiseichuk, rhod, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On Tue, 2012-01-24 at 13:38 -0200, Marcelo Tosatti wrote:
> Having userspace specify the "sample period" for low memory notification
> makes no sense. The frequency of notifications is a function of the
> memory pressure.
Sure, it makes sense to autotune sample period. I don't see the problem
with letting userspace decide it for themselves if they want to.
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-24 16:10 ` Pekka Enberg
@ 2012-01-24 18:29 ` Marcelo Tosatti
-1 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-24 18:29 UTC (permalink / raw)
To: Pekka Enberg
Cc: leonid.moiseichuk, rhod, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On Tue, Jan 24, 2012 at 06:10:40PM +0200, Pekka Enberg wrote:
> On Tue, 2012-01-24 at 13:38 -0200, Marcelo Tosatti wrote:
> > Having userspace specify the "sample period" for low memory notification
> > makes no sense. The frequency of notifications is a function of the
> > memory pressure.
>
> Sure, it makes sense to autotune sample period. I don't see the problem
> with letting userspace decide it for themselves if they want to.
>
> Pekka
Application polls on a file descriptor waiting for asynchronous events,
particular conditions of memory reclaim upon which an action is
necessary.
These signalled conditions are not simply percentages of free memory,
but depend on the amount of freeable cache available, etc. Otherwise
applications could monitor /proc/mem_info and act on that.
What is the point of sampling in the interface as you have it?
Application can read() from the file descriptor to retrieve the current
status, if it wishes.
The objective in this argument is to make the API as simple and easy to
use as possible.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 18:29 ` Marcelo Tosatti
0 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-24 18:29 UTC (permalink / raw)
To: Pekka Enberg
Cc: leonid.moiseichuk, rhod, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, akpm,
kosaki.motohiro
On Tue, Jan 24, 2012 at 06:10:40PM +0200, Pekka Enberg wrote:
> On Tue, 2012-01-24 at 13:38 -0200, Marcelo Tosatti wrote:
> > Having userspace specify the "sample period" for low memory notification
> > makes no sense. The frequency of notifications is a function of the
> > memory pressure.
>
> Sure, it makes sense to autotune sample period. I don't see the problem
> with letting userspace decide it for themselves if they want to.
>
> Pekka
Application polls on a file descriptor waiting for asynchronous events,
particular conditions of memory reclaim upon which an action is
necessary.
These signalled conditions are not simply percentages of free memory,
but depend on the amount of freeable cache available, etc. Otherwise
applications could monitor /proc/mem_info and act on that.
What is the point of sampling in the interface as you have it?
Application can read() from the file descriptor to retrieve the current
status, if it wishes.
The objective in this argument is to make the API as simple and easy to
use as possible.
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
2012-01-24 16:10 ` Pekka Enberg
@ 2012-01-25 8:19 ` leonid.moiseichuk
-1 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-25 8:19 UTC (permalink / raw)
To: penberg, mtosatti
Cc: rhod, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, akpm, kosaki.motohiro
> -----Original Message-----
> From: ext Pekka Enberg [mailto:penberg@kernel.org]
> Sent: 24 January, 2012 18:11
> To: Marcelo Tosatti
....
> On Tue, 2012-01-24 at 13:38 -0200, Marcelo Tosatti wrote:
> > Having userspace specify the "sample period" for low memory
> > notification makes no sense. The frequency of notifications is a
> > function of the memory pressure.
>
> Sure, it makes sense to autotune sample period. I don't see the problem
> with letting userspace decide it for themselves if they want to.
>
> Pekka
Good point, but you must take into account that reaction time in user-space depends how SW stack is organized.
So for some components 1s is good enough update time, for another cases 10ms.
If changes on VM happened too often they had no sense for user-space.
Thus from practical point of view having sampling period is not a bad idea.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-25 8:19 ` leonid.moiseichuk
0 siblings, 0 replies; 124+ messages in thread
From: leonid.moiseichuk @ 2012-01-25 8:19 UTC (permalink / raw)
To: penberg, mtosatti
Cc: rhod, riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, akpm, kosaki.motohiro
> -----Original Message-----
> From: ext Pekka Enberg [mailto:penberg@kernel.org]
> Sent: 24 January, 2012 18:11
> To: Marcelo Tosatti
....
> On Tue, 2012-01-24 at 13:38 -0200, Marcelo Tosatti wrote:
> > Having userspace specify the "sample period" for low memory
> > notification makes no sense. The frequency of notifications is a
> > function of the memory pressure.
>
> Sure, it makes sense to autotune sample period. I don't see the problem
> with letting userspace decide it for themselves if they want to.
>
> Pekka
Good point, but you must take into account that reaction time in user-space depends how SW stack is organized.
So for some components 1s is good enough update time, for another cases 10ms.
If changes on VM happened too often they had no sense for user-space.
Thus from practical point of view having sampling period is not a bad idea.
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
2012-01-18 10:44 ` leonid.moiseichuk
@ 2012-01-19 7:34 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 7:34 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
On Wed, 18 Jan 2012, leonid.moiseichuk@nokia.com wrote:
> Paul Mundt noticed that and we stopped use percentage in 2006 for n770 update.
> He was right.
> Percents are useless and do not correlate with other kernel APIs like sysinfo().
I changed the code to use number of pages. Thanks!
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* RE: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-19 7:34 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-19 7:34 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: riel, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
On Wed, 18 Jan 2012, leonid.moiseichuk@nokia.com wrote:
> Paul Mundt noticed that and we stopped use percentage in 2006 for n770 update.
> He was right.
> Percents are useless and do not correlate with other kernel APIs like sysinfo().
I changed the code to use number of pages. Thanks!
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-18 9:15 ` Pekka Enberg
@ 2012-01-24 16:22 ` Arnd Bergmann
-1 siblings, 0 replies; 124+ messages in thread
From: Arnd Bergmann @ 2012-01-24 16:22 UTC (permalink / raw)
To: Pekka Enberg
Cc: leonid.moiseichuk, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, rhod, kosaki.motohiro
On Wednesday 18 January 2012, Pekka Enberg wrote:
> >> +struct vmnotify_event {
> >> + /* Size of the struct for ABI extensibility. */
> >> + __u32 size;
> >> +
> >> + __u64 nr_avail_pages;
> >> +
> >> + __u64 nr_swap_pages;
> >> +
> >> + __u64 nr_free_pages;
> >> +};
> >
> > Two fields here most likely session-constant, (nr_avail_pages and
> > nr_swap_pages), seems not much sense to report them in every event. If we
> > have memory/swap hotplug user-space can use sysinfo() call.
>
> I actually changed the ABI to look like this:
>
> struct vmnotify_event {
> /*
> * Size of the struct for ABI extensibility.
> */
> __u32 size;
>
> __u64 attrs;
>
> __u64 attr_values[];
> };
>
> So userspace can decide which fields to include in notifications.
Please make the first member a __u64 instead of a __u32. This will
avoid incompatibility between 32 and 64 bit processes, which have
different alignment rules on x86: x86-32 would implicitly pack the
struct while x86-64 would add padding with your layout.
Arnd
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 16:22 ` Arnd Bergmann
0 siblings, 0 replies; 124+ messages in thread
From: Arnd Bergmann @ 2012-01-24 16:22 UTC (permalink / raw)
To: Pekka Enberg
Cc: leonid.moiseichuk, riel, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, rhod, kosaki.motohiro
On Wednesday 18 January 2012, Pekka Enberg wrote:
> >> +struct vmnotify_event {
> >> + /* Size of the struct for ABI extensibility. */
> >> + __u32 size;
> >> +
> >> + __u64 nr_avail_pages;
> >> +
> >> + __u64 nr_swap_pages;
> >> +
> >> + __u64 nr_free_pages;
> >> +};
> >
> > Two fields here most likely session-constant, (nr_avail_pages and
> > nr_swap_pages), seems not much sense to report them in every event. If we
> > have memory/swap hotplug user-space can use sysinfo() call.
>
> I actually changed the ABI to look like this:
>
> struct vmnotify_event {
> /*
> * Size of the struct for ABI extensibility.
> */
> __u32 size;
>
> __u64 attrs;
>
> __u64 attr_values[];
> };
>
> So userspace can decide which fields to include in notifications.
Please make the first member a __u64 instead of a __u32. This will
avoid incompatibility between 32 and 64 bit processes, which have
different alignment rules on x86: x86-32 would implicitly pack the
struct while x86-64 would add padding with your layout.
Arnd
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-18 9:06 ` leonid.moiseichuk
@ 2012-01-18 14:30 ` Rik van Riel
-1 siblings, 0 replies; 124+ messages in thread
From: Rik van Riel @ 2012-01-18 14:30 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: penberg, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
On 01/18/2012 04:06 AM, leonid.moiseichuk@nokia.com wrote:
> Would be possible to use for threshold pointed value(s) e.g. according to enum zone_state_item, because kinds of memory to track could be different?
> E.g. to tracking paging activity NR_ACTIVE_ANON and NR_ACTIVE_FILE could be interesting, not only free.
That seems like a horrible idea, because there is no guarantee that
the kernel will continue to use NR_ACTIVE_ANON and NR_ACTIVE_FILE
internally in the future.
What is exported to userspace must be somewhat independent of the
specifics of how the kernel implements things internally.
--
All rights reversed
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-18 14:30 ` Rik van Riel
0 siblings, 0 replies; 124+ messages in thread
From: Rik van Riel @ 2012-01-18 14:30 UTC (permalink / raw)
To: leonid.moiseichuk
Cc: penberg, minchan, linux-mm, linux-kernel, kamezawa.hiroyu, mel,
rientjes, kosaki.motohiro, hannes, mtosatti, akpm, rhod,
kosaki.motohiro
On 01/18/2012 04:06 AM, leonid.moiseichuk@nokia.com wrote:
> Would be possible to use for threshold pointed value(s) e.g. according to enum zone_state_item, because kinds of memory to track could be different?
> E.g. to tracking paging activity NR_ACTIVE_ANON and NR_ACTIVE_FILE could be interesting, not only free.
That seems like a horrible idea, because there is no guarantee that
the kernel will continue to use NR_ACTIVE_ANON and NR_ACTIVE_FILE
internally in the future.
What is exported to userspace must be somewhat independent of the
specifics of how the kernel implements things internally.
--
All rights reversed
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-18 14:30 ` Rik van Riel
@ 2012-01-18 15:29 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-18 15:29 UTC (permalink / raw)
To: Rik van Riel
Cc: leonid.moiseichuk, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, rhod, kosaki.motohiro
On Wed, Jan 18, 2012 at 4:30 PM, Rik van Riel <riel@redhat.com> wrote:
> That seems like a horrible idea, because there is no guarantee that
> the kernel will continue to use NR_ACTIVE_ANON and NR_ACTIVE_FILE
> internally in the future.
>
> What is exported to userspace must be somewhat independent of the
> specifics of how the kernel implements things internally.
Exactly, that's what I'm also interested in.
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-18 15:29 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-18 15:29 UTC (permalink / raw)
To: Rik van Riel
Cc: leonid.moiseichuk, minchan, linux-mm, linux-kernel,
kamezawa.hiroyu, mel, rientjes, kosaki.motohiro, hannes, mtosatti,
akpm, rhod, kosaki.motohiro
On Wed, Jan 18, 2012 at 4:30 PM, Rik van Riel <riel@redhat.com> wrote:
> That seems like a horrible idea, because there is no guarantee that
> the kernel will continue to use NR_ACTIVE_ANON and NR_ACTIVE_FILE
> internally in the future.
>
> What is exported to userspace must be somewhat independent of the
> specifics of how the kernel implements things internally.
Exactly, that's what I'm also interested in.
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-17 18:51 ` Pekka Enberg
@ 2012-01-24 15:40 ` Marcelo Tosatti
-1 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-24 15:40 UTC (permalink / raw)
To: Pekka Enberg
Cc: Rik van Riel, Minchan Kim, linux-mm, LKML, leonid.moiseichuk,
kamezawa.hiroyu, mel, rientjes, KOSAKI Motohiro, Johannes Weiner,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, Jan 17, 2012 at 08:51:13PM +0200, Pekka Enberg wrote:
> Hello,
>
> Ok, so here's a proof of concept patch that implements sample-base
> per-process free threshold VM event watching using perf-like syscall
> ABI. I'd really like to see something like this that's much more
> extensible and clean than the /dev based ABIs that people have
> proposed so far.
>
> Pekka
What is the practical advantage of a syscall, again?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 15:40 ` Marcelo Tosatti
0 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-24 15:40 UTC (permalink / raw)
To: Pekka Enberg
Cc: Rik van Riel, Minchan Kim, linux-mm, LKML, leonid.moiseichuk,
kamezawa.hiroyu, mel, rientjes, KOSAKI Motohiro, Johannes Weiner,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, Jan 17, 2012 at 08:51:13PM +0200, Pekka Enberg wrote:
> Hello,
>
> Ok, so here's a proof of concept patch that implements sample-base
> per-process free threshold VM event watching using perf-like syscall
> ABI. I'd really like to see something like this that's much more
> extensible and clean than the /dev based ABIs that people have
> proposed so far.
>
> Pekka
What is the practical advantage of a syscall, again?
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-24 15:40 ` Marcelo Tosatti
@ 2012-01-24 16:01 ` Pekka Enberg
-1 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-24 16:01 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: Rik van Riel, Minchan Kim, linux-mm, LKML, leonid.moiseichuk,
kamezawa.hiroyu, mel, rientjes, KOSAKI Motohiro, Johannes Weiner,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, 2012-01-24 at 13:40 -0200, Marcelo Tosatti wrote:
> What is the practical advantage of a syscall, again?
Why do you ask? The advantage for this particular case is not needing to
add ioctls() for configuration and keeping the file read/write ABI
simple.
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 16:01 ` Pekka Enberg
0 siblings, 0 replies; 124+ messages in thread
From: Pekka Enberg @ 2012-01-24 16:01 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: Rik van Riel, Minchan Kim, linux-mm, LKML, leonid.moiseichuk,
kamezawa.hiroyu, mel, rientjes, KOSAKI Motohiro, Johannes Weiner,
Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, 2012-01-24 at 13:40 -0200, Marcelo Tosatti wrote:
> What is the practical advantage of a syscall, again?
Why do you ask? The advantage for this particular case is not needing to
add ioctls() for configuration and keeping the file read/write ABI
simple.
Pekka
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-24 16:01 ` Pekka Enberg
@ 2012-01-24 16:25 ` Arnd Bergmann
-1 siblings, 0 replies; 124+ messages in thread
From: Arnd Bergmann @ 2012-01-24 16:25 UTC (permalink / raw)
To: Pekka Enberg
Cc: Marcelo Tosatti, Rik van Riel, Minchan Kim, linux-mm, LKML,
leonid.moiseichuk, kamezawa.hiroyu, mel, rientjes,
KOSAKI Motohiro, Johannes Weiner, Andrew Morton, Ronen Hod,
KOSAKI Motohiro
On Tuesday 24 January 2012, Pekka Enberg wrote:
> On Tue, 2012-01-24 at 13:40 -0200, Marcelo Tosatti wrote:
> > What is the practical advantage of a syscall, again?
>
> Why do you ask? The advantage for this particular case is not needing to
> add ioctls() for configuration and keeping the file read/write ABI
> simple.
The two are obviously equivalent and there is no reason to avoid
ioctl in general. However I agree that the syscall would be better
in this case, because that is what we tend to use for core kernel
functionality, while character devices tend to be used for I/O device
drivers that need stuff like enumeration and permission management.
Arnd
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 16:25 ` Arnd Bergmann
0 siblings, 0 replies; 124+ messages in thread
From: Arnd Bergmann @ 2012-01-24 16:25 UTC (permalink / raw)
To: Pekka Enberg
Cc: Marcelo Tosatti, Rik van Riel, Minchan Kim, linux-mm, LKML,
leonid.moiseichuk, kamezawa.hiroyu, mel, rientjes,
KOSAKI Motohiro, Johannes Weiner, Andrew Morton, Ronen Hod,
KOSAKI Motohiro
On Tuesday 24 January 2012, Pekka Enberg wrote:
> On Tue, 2012-01-24 at 13:40 -0200, Marcelo Tosatti wrote:
> > What is the practical advantage of a syscall, again?
>
> Why do you ask? The advantage for this particular case is not needing to
> add ioctls() for configuration and keeping the file read/write ABI
> simple.
The two are obviously equivalent and there is no reason to avoid
ioctl in general. However I agree that the syscall would be better
in this case, because that is what we tend to use for core kernel
functionality, while character devices tend to be used for I/O device
drivers that need stuff like enumeration and permission management.
Arnd
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-24 16:25 ` Arnd Bergmann
@ 2012-01-24 18:32 ` Marcelo Tosatti
-1 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-24 18:32 UTC (permalink / raw)
To: Arnd Bergmann
Cc: Pekka Enberg, Rik van Riel, Minchan Kim, linux-mm, LKML,
leonid.moiseichuk, kamezawa.hiroyu, mel, rientjes,
KOSAKI Motohiro, Johannes Weiner, Andrew Morton, Ronen Hod,
KOSAKI Motohiro
On Tue, Jan 24, 2012 at 04:25:55PM +0000, Arnd Bergmann wrote:
> On Tuesday 24 January 2012, Pekka Enberg wrote:
> > On Tue, 2012-01-24 at 13:40 -0200, Marcelo Tosatti wrote:
> > > What is the practical advantage of a syscall, again?
> >
> > Why do you ask? The advantage for this particular case is not needing to
> > add ioctls() for configuration and keeping the file read/write ABI
> > simple.
>
> The two are obviously equivalent and there is no reason to avoid
> ioctl in general. However I agree that the syscall would be better
> in this case, because that is what we tend to use for core kernel
> functionality, while character devices tend to be used for I/O device
> drivers that need stuff like enumeration and permission management.
>
> Arnd
Makes sense.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 18:32 ` Marcelo Tosatti
0 siblings, 0 replies; 124+ messages in thread
From: Marcelo Tosatti @ 2012-01-24 18:32 UTC (permalink / raw)
To: Arnd Bergmann
Cc: Pekka Enberg, Rik van Riel, Minchan Kim, linux-mm, LKML,
leonid.moiseichuk, kamezawa.hiroyu, mel, rientjes,
KOSAKI Motohiro, Johannes Weiner, Andrew Morton, Ronen Hod,
KOSAKI Motohiro
On Tue, Jan 24, 2012 at 04:25:55PM +0000, Arnd Bergmann wrote:
> On Tuesday 24 January 2012, Pekka Enberg wrote:
> > On Tue, 2012-01-24 at 13:40 -0200, Marcelo Tosatti wrote:
> > > What is the practical advantage of a syscall, again?
> >
> > Why do you ask? The advantage for this particular case is not needing to
> > add ioctls() for configuration and keeping the file read/write ABI
> > simple.
>
> The two are obviously equivalent and there is no reason to avoid
> ioctl in general. However I agree that the syscall would be better
> in this case, because that is what we tend to use for core kernel
> functionality, while character devices tend to be used for I/O device
> drivers that need stuff like enumeration and permission management.
>
> Arnd
Makes sense.
^ permalink raw reply [flat|nested] 124+ messages in thread
* Re: [RFC 1/3] /dev/low_mem_notify
2012-01-17 18:51 ` Pekka Enberg
@ 2012-01-24 21:57 ` Jonathan Corbet
-1 siblings, 0 replies; 124+ messages in thread
From: Jonathan Corbet @ 2012-01-24 21:57 UTC (permalink / raw)
To: Pekka Enberg
Cc: Rik van Riel, Minchan Kim, linux-mm, LKML, leonid.moiseichuk,
kamezawa.hiroyu, mel, rientjes, KOSAKI Motohiro, Johannes Weiner,
Marcelo Tosatti, Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, 17 Jan 2012 20:51:13 +0200 (EET)
Pekka Enberg <penberg@kernel.org> wrote:
> Ok, so here's a proof of concept patch that implements sample-base
> per-process free threshold VM event watching using perf-like syscall ABI.
> I'd really like to see something like this that's much more extensible and
> clean than the /dev based ABIs that people have proposed so far.
OK, so I'm slow, but better late than never. I plead travel.
I guess the thing that surprises me is that nobody has said this yet: this
looks a lot like an event-reporting mechanism like perf. Is there a reason
these can't be perf-style events integrated with all the rest?
> +struct vmnotify_config {
> + /*
> + * Size of the struct for ABI extensibility.
> + */
> + __u32 size;
> +
> + /*
> + * Notification type bitmask
> + */
> + __u64 type;
> +
> + /*
> + * Free memory threshold in percentages [1..99]
> + */
> + __u32 free_threshold;
Is this an upper-bound threshold or a lower-bound threshold? From your
example, it looks like "free_threshold" is "the amount of memory that is
not free", which seems confusing.
[...]
> new file mode 100644
> index 0000000..6800450
> --- /dev/null
> +++ b/mm/vmnotify.c
> @@ -0,0 +1,235 @@
> +#include <linux/anon_inodes.h>
> +#include <linux/vmnotify.h>
> +#include <linux/syscalls.h>
> +#include <linux/file.h>
> +#include <linux/list.h>
> +#include <linux/poll.h>
> +#include <linux/slab.h>
> +#include <linux/swap.h>
> +
> +#define VMNOTIFY_MAX_FREE_THRESHOD 100
Did we run out of L's here? :)
> +static ssize_t vmnotify_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> + int ret = 0;
> +
> + mutex_lock(&watch->mutex);
> +
> + if (!watch->pending)
> + goto out_unlock;
> +
> + if (copy_to_user(buf, &watch->event, sizeof(struct vmnotify_event))) {
> + ret = -EFAULT;
> + goto out_unlock;
> + }
> +
> + ret = watch->event.size;
> +
> + watch->pending = false;
> +
> +out_unlock:
> + mutex_unlock(&watch->mutex);
> +
> + return ret;
> +}
So this is a nonblocking-only interface? That may surprise some
developers. You already have a wait queue, why not wait on it if need be?
> +static int vmnotify_copy_config(struct vmnotify_config __user *uconfig,
> + struct vmnotify_config *config)
> +{
> + int ret;
> +
> + ret = copy_from_user(config, uconfig, sizeof(struct vmnotify_config));
> + if (ret)
> + return -EFAULT;
> +
> + if (!config->type)
> + return -EINVAL;
> +
> + if (config->type & VMNOTIFY_TYPE_SAMPLE) {
> + if (config->sample_period_ns < NSEC_PER_MSEC)
> + return -EINVAL;
> + }
What happens if the sample period is zero?
jon
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 124+ messages in thread* Re: [RFC 1/3] /dev/low_mem_notify
@ 2012-01-24 21:57 ` Jonathan Corbet
0 siblings, 0 replies; 124+ messages in thread
From: Jonathan Corbet @ 2012-01-24 21:57 UTC (permalink / raw)
To: Pekka Enberg
Cc: Rik van Riel, Minchan Kim, linux-mm, LKML, leonid.moiseichuk,
kamezawa.hiroyu, mel, rientjes, KOSAKI Motohiro, Johannes Weiner,
Marcelo Tosatti, Andrew Morton, Ronen Hod, KOSAKI Motohiro
On Tue, 17 Jan 2012 20:51:13 +0200 (EET)
Pekka Enberg <penberg@kernel.org> wrote:
> Ok, so here's a proof of concept patch that implements sample-base
> per-process free threshold VM event watching using perf-like syscall ABI.
> I'd really like to see something like this that's much more extensible and
> clean than the /dev based ABIs that people have proposed so far.
OK, so I'm slow, but better late than never. I plead travel.
I guess the thing that surprises me is that nobody has said this yet: this
looks a lot like an event-reporting mechanism like perf. Is there a reason
these can't be perf-style events integrated with all the rest?
> +struct vmnotify_config {
> + /*
> + * Size of the struct for ABI extensibility.
> + */
> + __u32 size;
> +
> + /*
> + * Notification type bitmask
> + */
> + __u64 type;
> +
> + /*
> + * Free memory threshold in percentages [1..99]
> + */
> + __u32 free_threshold;
Is this an upper-bound threshold or a lower-bound threshold? From your
example, it looks like "free_threshold" is "the amount of memory that is
not free", which seems confusing.
[...]
> new file mode 100644
> index 0000000..6800450
> --- /dev/null
> +++ b/mm/vmnotify.c
> @@ -0,0 +1,235 @@
> +#include <linux/anon_inodes.h>
> +#include <linux/vmnotify.h>
> +#include <linux/syscalls.h>
> +#include <linux/file.h>
> +#include <linux/list.h>
> +#include <linux/poll.h>
> +#include <linux/slab.h>
> +#include <linux/swap.h>
> +
> +#define VMNOTIFY_MAX_FREE_THRESHOD 100
Did we run out of L's here? :)
> +static ssize_t vmnotify_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> + int ret = 0;
> +
> + mutex_lock(&watch->mutex);
> +
> + if (!watch->pending)
> + goto out_unlock;
> +
> + if (copy_to_user(buf, &watch->event, sizeof(struct vmnotify_event))) {
> + ret = -EFAULT;
> + goto out_unlock;
> + }
> +
> + ret = watch->event.size;
> +
> + watch->pending = false;
> +
> +out_unlock:
> + mutex_unlock(&watch->mutex);
> +
> + return ret;
> +}
So this is a nonblocking-only interface? That may surprise some
developers. You already have a wait queue, why not wait on it if need be?
> +static int vmnotify_copy_config(struct vmnotify_config __user *uconfig,
> + struct vmnotify_config *config)
> +{
> + int ret;
> +
> + ret = copy_from_user(config, uconfig, sizeof(struct vmnotify_config));
> + if (ret)
> + return -EFAULT;
> +
> + if (!config->type)
> + return -EINVAL;
> +
> + if (config->type & VMNOTIFY_TYPE_SAMPLE) {
> + if (config->sample_period_ns < NSEC_PER_MSEC)
> + return -EINVAL;
> + }
What happens if the sample period is zero?
jon
^ permalink raw reply [flat|nested] 124+ messages in thread