* [RFC v2 1/3] memcg: add page_cgroup_ino helper
2015-04-07 11:55 [RFC v2 0/3] idle memory tracking Vladimir Davydov
@ 2015-04-07 11:55 ` Vladimir Davydov
[not found] ` <cover.1428401673.git.vdavydov-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Vladimir Davydov @ 2015-04-07 11:55 UTC (permalink / raw)
To: Andrew Morton
Cc: Minchan Kim, Johannes Weiner, Michal Hocko, Greg Thelen,
Michel Lespinasse, David Rientjes, Pavel Emelyanov,
Cyrill Gorcunov, Jonathan Corbet, linux-api, linux-doc, linux-mm,
cgroups, linux-kernel
Hwpoison allows to filter pages by memory cgroup ino. To ahieve that, it
calls try_get_mem_cgroup_from_page(), then mem_cgroup_css(), and finally
cgroup_ino() on the cgroup returned. This looks bulky. Since in the next
patch I need to get the ino of the memory cgroup a page is charged to
too, in this patch I introduce the page_cgroup_ino() helper.
Note that page_cgroup_ino() only considers those pages that are charged
to mem_cgroup->memory (i.e. page->mem_cgroup != NULL), and for others it
returns 0, while try_get_mem_cgroup_page(), used by hwpoison before, may
extract the cgroup from a swapcache readahead page too. Ignoring
swapcache readahead pages allows to call page_cgroup_ino() on unlocked
pages, which is nice. Hwpoison users will hardly see any difference.
Another difference between try_get_mem_cgroup_page() and
page_cgroup_ino() is that the latter works on pages charged to offline
memory cgroups, returning the inode number of the closest online
ancestor in this case, while the former does not, which is crucial for
the next patch.
Since try_get_mem_cgroup_page() is not used by anyone else, this patch
removes this function. Also, it makes hwpoison memcg filter depend on
CONFIG_MEMCG instead of CONFIG_MEMCG_SWAP (I've no idea why it was made
dependant on CONFIG_MEMCG_SWAP initially).
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
---
include/linux/memcontrol.h | 8 ++---
mm/hwpoison-inject.c | 5 +--
mm/memcontrol.c | 73 ++++++++++++++++++++++----------------------
mm/memory-failure.c | 16 ++--------
4 files changed, 42 insertions(+), 60 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 72dff5fb0d0c..9262a8407af7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -91,7 +91,6 @@ bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
struct mem_cgroup *root);
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
-extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
@@ -192,6 +191,8 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
void mem_cgroup_split_huge_fixup(struct page *head);
#endif
+unsigned long page_cgroup_ino(struct page *page);
+
#else /* CONFIG_MEMCG */
struct mem_cgroup;
@@ -252,11 +253,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
return &zone->lruvec;
}
-static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
- return NULL;
-}
-
static inline bool mm_match_cgroup(struct mm_struct *mm,
struct mem_cgroup *memcg)
{
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 329caf56df22..df63c3133d70 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,12 +45,9 @@ static int hwpoison_inject(void *data, u64 val)
/*
* do a racy check with elevated page count, to make sure PG_hwpoison
* will only be set for the targeted owner (or on a free page).
- * We temporarily take page lock for try_get_mem_cgroup_from_page().
* memory_failure() will redo the check reliably inside page lock.
*/
- lock_page(hpage);
err = hwpoison_filter(hpage);
- unlock_page(hpage);
if (err)
return 0;
@@ -123,7 +120,7 @@ static int pfn_inject_init(void)
if (!dentry)
goto fail;
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
hwpoison_dir, &hwpoison_filter_memcg);
if (!dentry)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 14c2f2017e37..87c7f852d45b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2349,40 +2349,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
css_put_many(&memcg->css, nr_pages);
}
-/*
- * try_get_mem_cgroup_from_page - look up page's memcg association
- * @page: the page
- *
- * Look up, get a css reference, and return the memcg that owns @page.
- *
- * The page must be locked to prevent racing with swap-in and page
- * cache charges. If coming from an unlocked page table, the caller
- * must ensure the page is on the LRU or this can race with charging.
- */
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
- struct mem_cgroup *memcg;
- unsigned short id;
- swp_entry_t ent;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- memcg = page->mem_cgroup;
- if (memcg) {
- if (!css_tryget_online(&memcg->css))
- memcg = NULL;
- } else if (PageSwapCache(page)) {
- ent.val = page_private(page);
- id = lookup_swap_cgroup_id(ent);
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
- return memcg;
-}
-
static void lock_page_lru(struct page *page, int *isolated)
{
struct zone *zone = page_zone(page);
@@ -2774,6 +2740,31 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+/**
+ * page_cgroup_ino - return inode number of page's memcg
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number. It is safe to call this function without taking
+ * a reference to the page.
+ */
+unsigned long page_cgroup_ino(struct page *page)
+{
+ struct mem_cgroup *memcg;
+ unsigned long ino = 0;
+
+ rcu_read_lock();
+ memcg = READ_ONCE(page->mem_cgroup);
+ while (memcg && !css_tryget_online(&memcg->css))
+ memcg = parent_mem_cgroup(memcg);
+ rcu_read_unlock();
+ if (memcg) {
+ ino = cgroup_ino(memcg->css.cgroup);
+ css_put(&memcg->css);
+ }
+ return ino;
+}
+
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
@@ -5482,8 +5473,18 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
}
- if (do_swap_account && PageSwapCache(page))
- memcg = try_get_mem_cgroup_from_page(page);
+ if (do_swap_account && PageSwapCache(page)) {
+ swp_entry_t ent = { .val = page_private(page), };
+ unsigned short id = lookup_swap_cgroup_id(ent);
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
+ }
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 441eff52d099..824fa3b5aff3 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -128,27 +128,15 @@ static int hwpoison_filter_flags(struct page *p)
* can only guarantee that the page either belongs to the memcg tasks, or is
* a freed page.
*/
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
- struct mem_cgroup *mem;
- struct cgroup_subsys_state *css;
- unsigned long ino;
-
if (!hwpoison_filter_memcg)
return 0;
- mem = try_get_mem_cgroup_from_page(p);
- if (!mem)
- return -EINVAL;
-
- css = mem_cgroup_css(mem);
- ino = cgroup_ino(css->cgroup);
- css_put(css);
-
- if (ino != hwpoison_filter_memcg)
+ if (page_cgroup_ino(p) != hwpoison_filter_memcg)
return -EINVAL;
return 0;
--
1.7.10.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [RFC v2 3/3] proc: add kpageidle file
2015-04-07 11:55 [RFC v2 0/3] idle memory tracking Vladimir Davydov
2015-04-07 11:55 ` [RFC v2 1/3] memcg: add page_cgroup_ino helper Vladimir Davydov
[not found] ` <cover.1428401673.git.vdavydov-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
@ 2015-04-07 11:55 ` Vladimir Davydov
2015-04-07 12:08 ` [RFC v2 0/3] idle memory tracking Vladimir Davydov
3 siblings, 0 replies; 5+ messages in thread
From: Vladimir Davydov @ 2015-04-07 11:55 UTC (permalink / raw)
To: Andrew Morton
Cc: Minchan Kim, Johannes Weiner, Michal Hocko, Greg Thelen,
Michel Lespinasse, David Rientjes, Pavel Emelyanov,
Cyrill Gorcunov, Jonathan Corbet, linux-api, linux-doc, linux-mm,
cgroups, linux-kernel
Knowing the portion of memory that is not used by a certain application
or memory cgroup (idle memory) can be useful for partitioning the system
efficiently, e.g. by setting memory cgroup limits appropriately.
Currently, the only means to estimate the amount of idle memory provided
by the kernel is /proc/PID/{clear_refs,smaps}: the user can clear the
access bit for all pages mapped to a particular process by writing 1 to
clear_refs, wait for some time, and then count smaps:Referenced.
However, this method has two serious shortcomings:
- it does not count unmapped file pages
- it affects the reclaimer logic
To overcome these drawbacks, this patch introduces two new page flags,
Idle and Young, and a new proc file, /proc/kpageidle. A page's Idle flag
can only be set from userspace by writing to /proc/kpageidle at the
offset corresponding to the page, and it is cleared whenever the page is
accessed either through page tables (it is cleared in page_referenced()
in this case) or using the read(2) system call (mark_page_accessed()).
Thus by setting the Idle flag for pages of a particular workload, which
can be found e.g. by reading /proc/PID/pagemap, waiting for some time to
let the workload access its working set, and then reading the kpageidle
file, one can estimate the amount of pages that are not used by the
workload.
The Young page flag is used to avoid interference with the memory
reclaimer. A page's Young flag is set whenever the Access bit of a page
table entry pointing to the page is cleared by writing to kpageidle. If
page_referenced() is called on a Young page, it will add 1 to its return
value, therefore concealing the fact that the Access bit was cleared.
Since this new feature adds two extra page flags, it is made dependant
on 64BIT, where we have plenty of space for page flags. We could use
page_ext to accomodate new flags on 32BIT, but this is left for the
future work.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
---
Documentation/vm/pagemap.txt | 17 ++++-
fs/proc/page.c | 149 ++++++++++++++++++++++++++++++++++++++++++
fs/proc/task_mmu.c | 4 +-
include/linux/page-flags.h | 12 ++++
mm/Kconfig | 12 ++++
mm/debug.c | 4 ++
mm/rmap.c | 7 ++
mm/swap.c | 2 +
8 files changed, 205 insertions(+), 2 deletions(-)
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 1ddfa1367b03..2ab2d5b98e8d 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
userspace programs to examine the page tables and related information by
reading files in /proc.
-There are four components to pagemap:
+There are five components to pagemap:
* /proc/pid/pagemap. This file lets a userspace process find out which
physical frame each virtual page is mapped to. It contains one 64-bit
@@ -69,6 +69,21 @@ There are four components to pagemap:
memory cgroup each page is charged to, indexed by PFN. Only available when
CONFIG_MEMCG is set.
+ * /proc/kpageidle. For each page this file contains a 64-bit number, which
+ equals 1 if the page is idle or 0 otherwise. The file is indexed by PFN. To
+ set or clear a page's Idle flag, one can write 1 or 0 respectively to this
+ file at the offset corresponding to the page. It is only possible to modify
+ the Idle flag for user pages (pages that are on an LRU list, to be more
+ exact). For other page types, the input is silently ignored. Writing to this
+ file beyond max PFN results in the ENXIO error.
+
+ A page's Idle flag is automatically cleared whenever the page is accessed
+ (via a page table entry or using the read(2) system call). This makes this
+ file useful for tracking a workload's working set, i.e. the set of pages
+ that are actively used by the workload.
+
+ The file is only available when CONFIG_IDLE_PAGE_TRACKING is set.
+
Short descriptions to the page flags:
0. LOCKED
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 70d23245dd43..974498a4c4b4 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -275,6 +275,151 @@ static const struct file_operations proc_kpagecgroup_operations = {
};
#endif /* CONFIG_MEMCG */
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+static struct page *kpageidle_get_page(struct page *page)
+{
+ if (!page || page_count(page) == 0 || !PageLRU(page))
+ return NULL;
+ if (!get_page_unless_zero(page))
+ return NULL;
+ if (unlikely(!PageLRU(page))) {
+ put_page(page);
+ return NULL;
+ }
+ return page;
+}
+
+static void kpageidle_clear_refs(struct page *page)
+{
+ unsigned long dummy;
+
+ if (page_referenced(page, 0, NULL, &dummy, NULL))
+ SetPageYoung(page);
+}
+
+static u64 kpageidle_read_page_state(struct page *page)
+{
+ u64 state = 0;
+
+ page = kpageidle_get_page(page);
+ if (!page)
+ return 0;
+ if (PageIdle(page)) {
+ kpageidle_clear_refs(page);
+ if (PageIdle(page))
+ state = 1;
+ }
+ put_page(page);
+ return state;
+}
+
+static int kpageidle_write_page_state(struct page *page, u64 state)
+{
+ if (state != 0 && state != 1)
+ return -EINVAL;
+ page = kpageidle_get_page(page);
+ if (!page)
+ return 0;
+ if (state && !PageIdle(page)) {
+ kpageidle_clear_refs(page);
+ SetPageIdle(page);
+ } else if (!state && PageIdle(page))
+ ClearPageIdle(page);
+ put_page(page);
+ return 0;
+}
+
+static ssize_t kpageidle_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 __user *out = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ u64 val;
+
+ pfn = src / KPMSIZE;
+ count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+ if (src & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+
+ while (count > 0) {
+ if (pfn_valid(pfn))
+ ppage = pfn_to_page(pfn);
+ else
+ ppage = NULL;
+
+ val = kpageidle_read_page_state(ppage);
+
+ if (put_user(val, out)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ pfn++;
+ out++;
+ count -= KPMSIZE;
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret = (char __user *)out - buf;
+ return ret;
+}
+
+static ssize_t kpageidle_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ const u64 __user *in = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ u64 val;
+
+ pfn = src / KPMSIZE;
+ if (src & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+
+ while (count > 0) {
+ if (pfn >= max_pfn) {
+ if ((char __user *)in == buf)
+ ret = -ENXIO;
+ break;
+ }
+ if (pfn_valid(pfn))
+ ppage = pfn_to_page(pfn);
+ else
+ ppage = NULL;
+
+ if (get_user(val, in)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = kpageidle_write_page_state(ppage, val);
+ if (ret)
+ break;
+
+ pfn++;
+ in++;
+ count -= KPMSIZE;
+ }
+
+ *ppos += (char __user *)in - buf;
+ if (!ret)
+ ret = (char __user *)in - buf;
+ return ret;
+}
+
+static const struct file_operations proc_kpageidle_operations = {
+ .llseek = mem_lseek,
+ .read = kpageidle_read,
+ .write = kpageidle_write,
+};
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
static int __init proc_page_init(void)
{
proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
@@ -282,6 +427,10 @@ static int __init proc_page_init(void)
#ifdef CONFIG_MEMCG
proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
#endif
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+ proc_create("kpageidle", S_IRUSR | S_IWUSR, NULL,
+ &proc_kpageidle_operations);
+#endif
return 0;
}
fs_initcall(proc_page_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6dee68d013ff..5ed5f707cac3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -458,7 +458,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || PageReferenced(page))
+ if (young || PageYoung(page) || PageReferenced(page))
mss->referenced += size;
mapcount = page_mapcount(page);
if (mapcount >= 2) {
@@ -808,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
+ ClearPageYoung(page);
ClearPageReferenced(page);
out:
spin_unlock(ptl);
@@ -835,6 +836,7 @@ out:
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
+ ClearPageYoung(page);
ClearPageReferenced(page);
}
pte_unmap_unlock(pte - 1, ptl);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 91b7f9b2b774..e53afb2738f8 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -109,6 +109,10 @@ enum pageflags {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
PG_compound_lock,
#endif
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+ PG_young,
+ PG_idle,
+#endif
__NR_PAGEFLAGS,
/* Filesystems */
@@ -363,6 +367,14 @@ PAGEFLAG_FALSE(HWPoison)
#define __PG_HWPOISON 0
#endif
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+PAGEFLAG(Young, young, PF_HEAD)
+PAGEFLAG(Idle, idle, PF_HEAD)
+#else
+PAGEFLAG_FALSE(Young)
+PAGEFLAG_FALSE(Idle)
+#endif
+
/*
* On an anonymous page mapped into a user virtual memory area,
* page->mapping points to its anon_vma, not to a struct address_space;
diff --git a/mm/Kconfig b/mm/Kconfig
index 390214da4546..880dffd9fce1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -635,3 +635,15 @@ config MAX_STACK_SIZE_MB
changed to a smaller value in which case that is used.
A sane initial value is 80 MB.
+
+config IDLE_PAGE_TRACKING
+ bool "Enable idle page tracking"
+ depends on 64BIT
+ select PROC_PAGE_MONITOR
+ help
+ This feature allows to estimate the amount of user pages that have
+ not been touched during a given period of time. This information can
+ be useful to tune memory cgroup limits and/or for job placement
+ within a compute cluster.
+
+ See Documentation/vm/pagemap.txt for more details.
diff --git a/mm/debug.c b/mm/debug.c
index 3eb3ac2fcee7..25d58478f59b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
{1UL << PG_compound_lock, "compound_lock" },
#endif
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+ {1UL << PG_young, "young" },
+ {1UL << PG_idle, "idle" },
+#endif
};
static void dump_flags(unsigned long flags,
diff --git a/mm/rmap.c b/mm/rmap.c
index dad23a43e42c..b6ead8a13185 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -799,6 +799,13 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (referenced) {
pra->referenced++;
pra->vm_flags |= vma->vm_flags;
+ if (PageIdle(page))
+ ClearPageIdle(page);
+ }
+
+ if (PageYoung(page)) {
+ ClearPageYoung(page);
+ pra->referenced++;
}
if (dirty)
diff --git a/mm/swap.c b/mm/swap.c
index 8773de093171..bee91fab10fc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -624,6 +624,8 @@ void mark_page_accessed(struct page *page)
} else if (!PageReferenced(page)) {
SetPageReferenced(page);
}
+ if (PageIdle(page))
+ ClearPageIdle(page);
}
EXPORT_SYMBOL(mark_page_accessed);
--
1.7.10.4
^ permalink raw reply related [flat|nested] 5+ messages in thread