* [PATCH V8 2/4] mm: frontswap: core code
@ 2011-08-29 16:49 Dan Magenheimer
2011-09-07 23:25 ` Andrew Morton
0 siblings, 1 reply; 4+ messages in thread
From: Dan Magenheimer @ 2011-08-29 16:49 UTC (permalink / raw)
To: linux-kernel, linux-mm, jeremy, hughd, ngupta, konrad.wilk,
JBeulich, kurt.hackel, npiggin, akpm, riel, hannes, matthew,
chris.mason, dan.magenheimer, sjenning, kamezawa.hiroyu,
jackdachef, cyclonusj, levinsasha928
From: Dan Magenheimer <dan.magenheimer@oracle.com>
Subject: [PATCH V8 2/4] mm: frontswap: core code
This second patch of four in this frontswap series provides the core code
for frontswap that interfaces between the hooks in the swap subsystem and
a frontswap backend via frontswap_ops.
Two new files are added: mm/frontswap.c and include/linux/frontswap.h
Credits: Frontswap_ops design derived from Jeremy Fitzhardinge
design for tmem; sysfs code modelled after mm/ksm.c
[v8: rebase to 3.0-rc4]
[v8: kamezawa.hiroyu@jp.fujitsu.com: change count to atomic_t to avoid races]
[v7: rebase to 3.0-rc3]
[v7: JBeulich@novell.com: new static inlines resolve to no-ops if not config'd]
[v7: JBeulich@novell.com: avoid redundant shifts/divides for *_bit lib calls]
[v6: rebase to 3.1-rc1]
[v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
[v6: konrad.wilk@oracle.com: various checks and code clarifications/comments]
[v5: no change from v4]
[v4: rebase to 2.6.39]
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Reviewed-by: Konrad Wilk <konrad.wilk@oracle.com>
Acked-by: Jan Beulich <JBeulich@novell.com>
Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Rik Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
--- linux/include/linux/frontswap.h 1969-12-31 17:00:00.000000000 -0700
+++ frontswap/include/linux/frontswap.h 2011-08-29 09:52:14.304747064 -0600
@@ -0,0 +1,125 @@
+#ifndef _LINUX_FRONTSWAP_H
+#define _LINUX_FRONTSWAP_H
+
+#include <linux/swap.h>
+#include <linux/mm.h>
+
+struct frontswap_ops {
+ void (*init)(unsigned);
+ int (*put_page)(unsigned, pgoff_t, struct page *);
+ int (*get_page)(unsigned, pgoff_t, struct page *);
+ void (*flush_page)(unsigned, pgoff_t);
+ void (*flush_area)(unsigned);
+};
+
+extern int frontswap_enabled;
+extern struct frontswap_ops
+ frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_shrink(unsigned long);
+extern unsigned long frontswap_curr_pages(void);
+
+extern void __frontswap_init(unsigned type);
+extern int __frontswap_put_page(struct page *page);
+extern int __frontswap_get_page(struct page *page);
+extern void __frontswap_flush_page(unsigned, pgoff_t);
+extern void __frontswap_flush_area(unsigned);
+
+#ifdef CONFIG_FRONTSWAP
+
+static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+ int ret = 0;
+
+ if (frontswap_enabled && sis->frontswap_map)
+ ret = test_bit(offset, sis->frontswap_map);
+ return ret;
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+ if (frontswap_enabled && sis->frontswap_map)
+ set_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+ if (frontswap_enabled && sis->frontswap_map)
+ clear_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_map_set(struct swap_info_struct *p,
+ unsigned long *map)
+{
+ p->frontswap_map = map;
+}
+
+static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
+{
+ return p->frontswap_map;
+}
+#else
+/* all inline routines become no-ops and all externs are ignored */
+
+#define frontswap_enabled (0)
+
+static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+ return 0;
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+}
+
+static inline void frontswap_map_set(struct swap_info_struct *p,
+ unsigned long *map)
+{
+}
+
+static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
+{
+ return NULL;
+}
+#endif
+
+static inline int frontswap_put_page(struct page *page)
+{
+ int ret = -1;
+
+ if (frontswap_enabled)
+ ret = __frontswap_put_page(page);
+ return ret;
+}
+
+static inline int frontswap_get_page(struct page *page)
+{
+ int ret = -1;
+
+ if (frontswap_enabled)
+ ret = __frontswap_get_page(page);
+ return ret;
+}
+
+static inline void frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+ if (frontswap_enabled)
+ __frontswap_flush_page(type, offset);
+}
+
+static inline void frontswap_flush_area(unsigned type)
+{
+ if (frontswap_enabled)
+ __frontswap_flush_area(type);
+}
+
+static inline void frontswap_init(unsigned type)
+{
+ if (frontswap_enabled)
+ __frontswap_init(type);
+}
+
+#endif /* _LINUX_FRONTSWAP_H */
--- linux/mm/frontswap.c 1969-12-31 17:00:00.000000000 -0700
+++ frontswap/mm/frontswap.c 2011-08-29 10:04:05.455869059 -0600
@@ -0,0 +1,351 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap. See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+int frontswap_enabled;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/*
+ * Useful stats available in /sys/kernel/mm/frontswap. These are for
+ * information only so are not protected against increment/decrement races.
+ */
+static unsigned long frontswap_gets;
+static unsigned long frontswap_succ_puts;
+static unsigned long frontswap_failed_puts;
+static unsigned long frontswap_flushes;
+
+/*
+ * Register operations for frontswap, returning the previous ops to
+ * allow detection of multiple backends and possible nesting.
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+ struct frontswap_ops old = frontswap_ops;
+
+ frontswap_ops = *ops;
+ frontswap_enabled = 1;
+ return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/* Called when a swap device is swapon'd */
+void __frontswap_init(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ if (frontswap_enabled)
+ (*frontswap_ops.init)(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+/*
+ * "Put" data from a page to frontswap and associate it with the page's
+ * swaptype and offset. Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data
+ * and return success or flush the page from frontswap and return failure.
+ */
+int __frontswap_put_page(struct page *page)
+{
+ int ret = -1, dup = 0;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ dup = 1;
+ ret = (*frontswap_ops.put_page)(type, offset, page);
+ if (ret == 0) {
+ frontswap_set(sis, offset);
+ frontswap_succ_puts++;
+ if (!dup)
+ atomic_inc(&sis->frontswap_pages);
+ } else if (dup) {
+		/*
+		 * failed dup always results in automatic flush of
+		 * the (older) page from frontswap
+		 */
+ frontswap_clear(sis, offset);
+ atomic_dec(&sis->frontswap_pages);
+ frontswap_failed_puts++;
+ } else
+ frontswap_failed_puts++;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_put_page);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache
+ */
+int __frontswap_get_page(struct page *page)
+{
+ int ret = -1;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ ret = (*frontswap_ops.get_page)(type, offset, page);
+ if (ret == 0)
+ frontswap_gets++;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_get_page);
+
+/*
+ * Flush any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset)) {
+ (*frontswap_ops.flush_page)(type, offset);
+ atomic_dec(&sis->frontswap_pages);
+ frontswap_clear(sis, offset);
+ frontswap_flushes++;
+ }
+}
+EXPORT_SYMBOL(__frontswap_flush_page);
+
+/*
+ * Flush all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_flush_area(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ (*frontswap_ops.flush_area)(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_flush_area);
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" that works by calling try_to_unuse to unuse
+ * enough frontswap pages to -- subject to memory constraints --
+ * reduce the number of pages in frontswap.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+ int wrapped = 0;
+ bool locked = false;
+
+ /* try a few times to maximize chance of try_to_unuse success */
+ for (wrapped = 0; wrapped < 3; wrapped++) {
+
+ struct swap_info_struct *si = NULL;
+ int si_frontswap_pages;
+ unsigned long total_pages = 0, total_pages_to_unuse;
+ unsigned long pages = 0, pages_to_unuse = 0;
+ int type;
+
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+ * so restart scan from swap_list.head each time
+ */
+ spin_lock(&swap_lock);
+ locked = true;
+ total_pages = 0;
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ total_pages += atomic_read(&si->frontswap_pages);
+ }
+ if (total_pages <= target_pages)
+ goto out;
+ total_pages_to_unuse = total_pages - target_pages;
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages)
+ pages = pages_to_unuse = total_pages_to_unuse;
+ else {
+ pages = si_frontswap_pages;
+ pages_to_unuse = 0; /* unuse all */
+ }
+ if (security_vm_enough_memory_kern(pages))
+ continue;
+ vm_unacct_memory(pages);
+ break;
+ }
+ if (type < 0)
+ goto out;
+ locked = false;
+ spin_unlock(&swap_lock);
+ try_to_unuse(type, true, pages_to_unuse);
+ }
+
+out:
+ if (locked)
+ spin_unlock(&swap_lock);
+ return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices. This is exported so that a kernel module can
+ * determine current usage without reading sysfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+ int type;
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ spin_lock(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ if (si != NULL)
+ totalpages += atomic_read(&si->frontswap_pages);
+ }
+ spin_unlock(&swap_lock);
+ return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+#ifdef CONFIG_SYSFS
+
+/* see Documentation/ABI/xxx/sysfs-kernel-mm-frontswap */
+
+#define FRONTSWAP_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define FRONTSWAP_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t curr_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", frontswap_curr_pages());
+}
+
+static ssize_t curr_pages_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long target_pages;
+
+ if (strict_strtoul(buf, 10, &target_pages))
+ return -EINVAL;
+
+ frontswap_shrink(target_pages);
+
+ return count;
+}
+FRONTSWAP_ATTR(curr_pages);
+
+static ssize_t succ_puts_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", frontswap_succ_puts);
+}
+FRONTSWAP_ATTR_RO(succ_puts);
+
+static ssize_t failed_puts_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", frontswap_failed_puts);
+}
+FRONTSWAP_ATTR_RO(failed_puts);
+
+static ssize_t gets_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", frontswap_gets);
+}
+FRONTSWAP_ATTR_RO(gets);
+
+static ssize_t flushes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", frontswap_flushes);
+}
+FRONTSWAP_ATTR_RO(flushes);
+
+static struct attribute *frontswap_attrs[] = {
+ &curr_pages_attr.attr,
+ &succ_puts_attr.attr,
+ &failed_puts_attr.attr,
+ &gets_attr.attr,
+ &flushes_attr.attr,
+ NULL,
+};
+
+static struct attribute_group frontswap_attr_group = {
+ .attrs = frontswap_attrs,
+ .name = "frontswap",
+};
+
+#endif /* CONFIG_SYSFS */
+
+static int __init init_frontswap(void)
+{
+ int err = 0;
+
+#ifdef CONFIG_SYSFS
+ err = sysfs_create_group(mm_kobj, &frontswap_attr_group);
+#endif /* CONFIG_SYSFS */
+ return err;
+}
+
+static void __exit exit_frontswap(void)
+{
+ frontswap_shrink(0UL);
+}
+
+module_init(init_frontswap);
+module_exit(exit_frontswap);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
* Re: [PATCH V8 2/4] mm: frontswap: core code
2011-08-29 16:49 [PATCH V8 2/4] mm: frontswap: core code Dan Magenheimer
@ 2011-09-07 23:25 ` Andrew Morton
2011-09-08 15:00 ` Dan Magenheimer
0 siblings, 1 reply; 4+ messages in thread
From: Andrew Morton @ 2011-09-07 23:25 UTC (permalink / raw)
To: Dan Magenheimer
Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, konrad.wilk,
JBeulich, kurt.hackel, npiggin, riel, hannes, matthew,
chris.mason, sjenning, kamezawa.hiroyu, jackdachef, cyclonusj,
levinsasha928
On Mon, 29 Aug 2011 09:49:09 -0700
Dan Magenheimer <dan.magenheimer@oracle.com> wrote:
> From: Dan Magenheimer <dan.magenheimer@oracle.com>
> Subject: [PATCH V8 2/4] mm: frontswap: core code
>
> This second patch of four in this frontswap series provides the core code
> for frontswap that interfaces between the hooks in the swap subsystem and
> a frontswap backend via frontswap_ops.
>
> Two new files are added: mm/frontswap.c and include/linux/frontswap.h
>
> Credits: Frontswap_ops design derived from Jeremy Fitzhardinge
> design for tmem; sysfs code modelled after mm/ksm.c
>
>
> ...
>
> --- linux/include/linux/frontswap.h 1969-12-31 17:00:00.000000000 -0700
> +++ frontswap/include/linux/frontswap.h 2011-08-29 09:52:14.304747064 -0600
> @@ -0,0 +1,125 @@
> +#ifndef _LINUX_FRONTSWAP_H
> +#define _LINUX_FRONTSWAP_H
> +
> +#include <linux/swap.h>
> +#include <linux/mm.h>
> +
> +struct frontswap_ops {
> + void (*init)(unsigned);
> + int (*put_page)(unsigned, pgoff_t, struct page *);
> + int (*get_page)(unsigned, pgoff_t, struct page *);
> + void (*flush_page)(unsigned, pgoff_t);
> + void (*flush_area)(unsigned);
> +};
Please don't use the term "flush". In both the pagecache code and the
pte code it is interchangably used to refer to both writeback and
invalidation. The way to avoid this ambiguity and confusion is to use
the terms "writeback" and "invalidate" instead.
Here, you're referring to invalidation.
>
> ...
>
> +/*
> + * frontswap_ops is set by frontswap_register_ops to contain the pointers
> + * to the frontswap "backend" implementation functions.
> + */
> +static struct frontswap_ops frontswap_ops;
__read_mostly?
> +/*
> + * This global enablement flag reduces overhead on systems where frontswap_ops
> + * has not been registered, so is preferred to the slower alternative: a
> + * function call that checks a non-global.
> + */
> +int frontswap_enabled;
__read_mostly?
bool?
> +EXPORT_SYMBOL(frontswap_enabled);
> +
> +/*
> + * Useful stats available in /sys/kernel/mm/frontswap. These are for
> + * information only so are not protected against increment/decrement races.
> + */
> +static unsigned long frontswap_gets;
> +static unsigned long frontswap_succ_puts;
> +static unsigned long frontswap_failed_puts;
> +static unsigned long frontswap_flushes;
If they're in /sys/kernel/mm then they rather become permanent parts of
the exported kernel interface. We're stuck with them. Plus they're
inaccurate and updating them might be inefficient, so we don't want to
be stuck with them.
I suggest moving these to debugfs from where we can remove them if we
feel like doing so.
>
> ...
>
> +/*
> + * Frontswap, like a true swap device, may unnecessarily retain pages
> + * under certain circumstances; "shrink" frontswap is essentially a
> + * "partial swapoff" and works by calling try_to_unuse to attempt to
> + * unuse enough frontswap pages to attempt to -- subject to memory
> + * constraints -- reduce the number of pages in frontswap
> + */
> +void frontswap_shrink(unsigned long target_pages)
It's unclear whether `target_pages' refers to the number of pages to
remove or to the number of pages to retain. A comment is needed.
> +{
> + int wrapped = 0;
> + bool locked = false;
> +
> + /* try a few times to maximize chance of try_to_unuse success */
Why? Is this necessary? How often does try_to_unuse fail?
> + for (wrapped = 0; wrapped < 3; wrapped++) {
`wrapped' seems an inappropriate identifier.
> +
unneeded newline
> + struct swap_info_struct *si = NULL;
> + int si_frontswap_pages;
> + unsigned long total_pages = 0, total_pages_to_unuse;
> + unsigned long pages = 0, pages_to_unuse = 0;
> + int type;
> +
> + /*
> + * we don't want to hold swap_lock while doing a very
> + * lengthy try_to_unuse, but swap_list may change
> + * so restart scan from swap_list.head each time
> + */
> + spin_lock(&swap_lock);
> + locked = true;
> + total_pages = 0;
> + for (type = swap_list.head; type >= 0; type = si->next) {
> + si = swap_info[type];
> + total_pages += atomic_read(&si->frontswap_pages);
> + }
> + if (total_pages <= target_pages)
> + goto out;
> + total_pages_to_unuse = total_pages - target_pages;
> + for (type = swap_list.head; type >= 0; type = si->next) {
> + si = swap_info[type];
> + si_frontswap_pages = atomic_read(&si->frontswap_pages);
> + if (total_pages_to_unuse < si_frontswap_pages)
> + pages = pages_to_unuse = total_pages_to_unuse;
> + else {
> + pages = si_frontswap_pages;
> + pages_to_unuse = 0; /* unuse all */
> + }
> + if (security_vm_enough_memory_kern(pages))
What's this doing here? Needs a comment please.
> + continue;
> + vm_unacct_memory(pages);
hm, is that accurate? Or should we account for the pages which
try_to_unuse() actually unused?
> + break;
> + }
> + if (type < 0)
> + goto out;
> + locked = false;
> + spin_unlock(&swap_lock);
> + try_to_unuse(type, true, pages_to_unuse);
> + }
> +
> +out:
> + if (locked)
> + spin_unlock(&swap_lock);
> + return;
> +}
> +EXPORT_SYMBOL(frontswap_shrink);
> +
> +/*
> + * Count and return the number of pages frontswap pages across all
s/pages//
> + * swap devices. This is exported so that a kernel module can
> + * determine current usage without reading sysfs.
Which kernel module might want to do this?
> + */
> +unsigned long frontswap_curr_pages(void)
> +{
> + int type;
> + unsigned long totalpages = 0;
> + struct swap_info_struct *si = NULL;
> +
> + spin_lock(&swap_lock);
> + for (type = swap_list.head; type >= 0; type = si->next) {
> + si = swap_info[type];
> + if (si != NULL)
> + totalpages += atomic_read(&si->frontswap_pages);
> + }
> + spin_unlock(&swap_lock);
> + return totalpages;
> +}
> +EXPORT_SYMBOL(frontswap_curr_pages);
> +
> +#ifdef CONFIG_SYSFS
Has the code been tested with CONFIG_SYSFS=n?
>
> ...
>
* RE: [PATCH V8 2/4] mm: frontswap: core code
2011-09-07 23:25 ` Andrew Morton
@ 2011-09-08 15:00 ` Dan Magenheimer
2011-09-08 23:51 ` Andrew Morton
0 siblings, 1 reply; 4+ messages in thread
From: Dan Magenheimer @ 2011-09-08 15:00 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
JBeulich, Kurt Hackel, npiggin, riel, hannes, matthew,
Chris Mason, sjenning, kamezawa.hiroyu, jackdachef, cyclonusj,
levinsasha928
> From: Andrew Morton [mailto:akpm@linux-foundation.org]
> Subject: Re: [PATCH V8 2/4] mm: frontswap: core code
Thanks very much for taking the time for this feedback!
Please correct me if I am presumptuous or misreading
SubmittingPatches, but after making the changes below,
I am thinking this constitutes a "Reviewed-by"?
> > From: Dan Magenheimer <dan.magenheimer@oracle.com>
> > Subject: [PATCH V8 2/4] mm: frontswap: core code
> >
> > This second patch of four in this frontswap series provides the core code
> > for frontswap that interfaces between the hooks in the swap subsystem and
> > +
> > +struct frontswap_ops {
> > + void (*init)(unsigned);
> > + int (*put_page)(unsigned, pgoff_t, struct page *);
> > + int (*get_page)(unsigned, pgoff_t, struct page *);
> > + void (*flush_page)(unsigned, pgoff_t);
> > + void (*flush_area)(unsigned);
> > +};
>
> Please don't use the term "flush". In both the pagecache code and the
> pte code it is interchangably used to refer to both writeback and
> invalidation. The way to avoid this ambiguity and confusion is to use
> the terms "writeback" and "invalidate" instead.
>
> Here, you're referring to invalidation.
While the different name is OK, changing this consistently would now
require simultaneous patches in cleancache, zcache, and xen (not
to mention lots of docs inside and outside the kernel). I suspect
it would be cleaner to do this later across all affected code
with a single commit. Hope that's OK.
(Personally, I find "invalidate" to be inaccurate because common
usage of the term doesn't imply that the space used in the cache
is recovered, i.e. garbage collection, which is the case here.
To me, "flush" implies invalidate PLUS recover space.)
> > +static struct frontswap_ops frontswap_ops;
>
> __read_mostly?
Yep. Will fix.
> > +/*
> > + * This global enablement flag reduces overhead on systems where frontswap_ops
> > + * has not been registered, so is preferred to the slower alternative: a
> > + * function call that checks a non-global.
> > + */
> > +int frontswap_enabled;
>
> __read_mostly?
Yep. Will fix.
> > +/*
> > + * Useful stats available in /sys/kernel/mm/frontswap. These are for
> > + * information only so are not protected against increment/decrement races.
> > + */
> > +static unsigned long frontswap_gets;
> > +static unsigned long frontswap_succ_puts;
> > +static unsigned long frontswap_failed_puts;
> > +static unsigned long frontswap_flushes;
>
> If they're in /sys/kernel/mm then they rather become permanent parts of
> the exported kernel interface. We're stuck with them. Plus they're
> inaccurate and updating them might be inefficient, so we don't want to
> be stuck with them.
>
> I suggest moving these to debugfs from where we can remove them if we
> feel like doing so.
The style (and code) for this was mimicked from ksm and hugepages, which
expose the stats the same way... as does cleancache now. slub is also
similar. I'm OK with using a different approach (e.g. debugfs), but
think it would be inconsistent and confusing to expose these stats
differently than cleancache (or ksm and hugepages). I'd support
and help with a massive cleanup commit across all of mm later though.
Hope that's OK for now.
> > +/*
> > + * Frontswap, like a true swap device, may unnecessarily retain pages
> > + * under certain circumstances; "shrink" frontswap is essentially a
> > + * "partial swapoff" and works by calling try_to_unuse to attempt to
> > + * unuse enough frontswap pages to attempt to -- subject to memory
> > + * constraints -- reduce the number of pages in frontswap
> > + */
> > +void frontswap_shrink(unsigned long target_pages)
>
> It's unclear whether `target_pages' refers to the number of pages to
> remove or to the number of pages to retain. A comment is needed.
OK. Will fix.
> > +{
> > + int wrapped = 0;
> > + bool locked = false;
> > +
> > + /* try a few times to maximize chance of try_to_unuse success */
>
> Why? Is this necessary? How often does try_to_unuse fail?
>
> > + for (wrapped = 0; wrapped < 3; wrapped++) {
>
> `wrapped' seems an inappropriate identifier.
Hmmm... this loop was mimicking some swap code that now
seems to be long gone. I agree it's not really necessary
and will remove the loop (which also removes the identifier).
> > +
>
> unneeded newline
Doh! Will fix.
> > + if (security_vm_enough_memory_kern(pages))
>
> What's this doing here? Needs a comment please.
>
> > + continue;
> > + vm_unacct_memory(pages);
>
> hm, is that accurate? Or should we account for the pages which
> try_to_unuse() actually unused?
These are mimicking code in swapoff. I think the code is
correct but agree a clarifying comment or two is in order.
> > +/*
> > + * Count and return the number of pages frontswap pages across all
>
> s/pages//
Doh! Will fix.
> > + * swap devices. This is exported so that a kernel module can
> > + * determine current usage without reading sysfs.
>
> Which kernel module might want to do this?
The tmem backends (currently zcache and xen tmem). I'll change
the wording in the comment to clarify.
> > +#ifdef CONFIG_SYSFS
>
> Has the code been tested with CONFIG_SYSFS=n?
Yes, in a previous version. I'll doublecheck though.
Thanks again for the feedback! I'll publish a V9 with
the corrections no later than early next week.
Dan
* Re: [PATCH V8 2/4] mm: frontswap: core code
2011-09-08 15:00 ` Dan Magenheimer
@ 2011-09-08 23:51 ` Andrew Morton
0 siblings, 0 replies; 4+ messages in thread
From: Andrew Morton @ 2011-09-08 23:51 UTC (permalink / raw)
To: Dan Magenheimer
Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
JBeulich, Kurt Hackel, npiggin, riel, hannes, matthew,
Chris Mason, sjenning, kamezawa.hiroyu, jackdachef, cyclonusj,
levinsasha928
On Thu, 8 Sep 2011 08:00:36 -0700 (PDT)
Dan Magenheimer <dan.magenheimer@oracle.com> wrote:
> > From: Andrew Morton [mailto:akpm@linux-foundation.org]
> > Subject: Re: [PATCH V8 2/4] mm: frontswap: core code
>
> Thanks very much for taking the time for this feedback!
>
> Please correct me if I am presumptuous or misreading
> SubmittingPatches, but after making the changes below,
> I am thinking this constitutes a "Reviewed-by"?
Not really. More like Briefly-browsed-by:.
> > > From: Dan Magenheimer <dan.magenheimer@oracle.com>
> > > Subject: [PATCH V8 2/4] mm: frontswap: core code
> > >
> > > This second patch of four in this frontswap series provides the core code
> > > for frontswap that interfaces between the hooks in the swap subsystem and
> > > +
> > > +struct frontswap_ops {
> > > + void (*init)(unsigned);
> > > + int (*put_page)(unsigned, pgoff_t, struct page *);
> > > + int (*get_page)(unsigned, pgoff_t, struct page *);
> > > + void (*flush_page)(unsigned, pgoff_t);
> > > + void (*flush_area)(unsigned);
> > > +};
> >
> > Please don't use the term "flush". In both the pagecache code and the
> > pte code it is interchangably used to refer to both writeback and
> > invalidation. The way to avoid this ambiguity and confusion is to use
> > the terms "writeback" and "invalidate" instead.
> >
> > Here, you're referring to invalidation.
>
> While the different name is OK, changing this consistently would now
> require simultaneous patches in cleancache, zcache, and xen (not
> to mention lots of docs inside and outside the kernel). I suspect
> it would be cleaner to do this later across all affected code
> with a single commit. Hope that's OK.
Well, if you can make that happen...
> (Personally, I find "invalidate" to be inaccurate because common
> usage of the term doesn't imply that the space used in the cache
> is recovered, i.e. garbage collection, which is the case here.
> To me, "flush" implies invalidate PLUS recover space.)
invalidate is close enough. Consider block/blk-flush.c, sigh.
>
> > > +/*
> > > + * Useful stats available in /sys/kernel/mm/frontswap. These are for
> > > + * information only so are not protected against increment/decrement races.
> > > + */
> > > +static unsigned long frontswap_gets;
> > > +static unsigned long frontswap_succ_puts;
> > > +static unsigned long frontswap_failed_puts;
> > > +static unsigned long frontswap_flushes;
> >
> > If they're in /sys/kernel/mm then they rather become permanent parts of
> > the exported kernel interface. We're stuck with them. Plus they're
> > inaccurate and updating them might be inefficient, so we don't want to
> > be stuck with them.
> >
> > I suggest moving these to debugfs from where we can remove them if we
> > feel like doing so.
>
> The style (and code) for this was mimicked from ksm and hugepages, which
> expose the stats the same way... as does cleancache now. slub is also
> similar. I'm OK with using a different approach (e.g. debugfs), but
> think it would be inconsistent and confusing to expose these stats
> differently than cleancache (or ksm and hugepages). I'd support
> and help with a massive cleanup commit across all of mm later though.
> Hope that's OK for now.
These are boring internal counters for a few developers. They're so
uninteresting to end users that the developer didn't even bother to
document them ;)
They should be in debugfs. Probably some/all of the existing
cleancache/ksm/hugepage stats should be in debugfs too. This a mistake
we often make. Please let's be extremely miserly with the kernel API.