From: Johannes Weiner <hannes@cmpxchg.org>
To: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Andrew Morton <akpm@linux-foundation.org>,
Michal Hocko <mhocko@kernel.org>,
linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: [PATCH 3/3] mm: workingset: cgroup-aware
Date: Mon, 25 Jan 2016 11:42:41 -0500 [thread overview]
Message-ID: <20160125164241.GD29291@cmpxchg.org> (raw)
In-Reply-To: <20160125163907.GA29291@cmpxchg.org>
TODO:
- cgroup-aware shadow node reclaim
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
include/linux/memcontrol.h | 39 ++++++++++++++++++++++++++++++
include/linux/mmzone.h | 11 ++++-----
include/linux/swap.h | 1 +
mm/memcontrol.c | 25 --------------------
mm/vmscan.c | 18 +++++++-------
mm/workingset.c | 59 +++++++++++++++++++++++++++++++++++++---------
6 files changed, 102 insertions(+), 51 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 792c8981e633..705aba54a50d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -89,6 +89,10 @@ enum mem_cgroup_events_target {
};
#ifdef CONFIG_MEMCG
+
+#define MEM_CGROUP_ID_SHIFT 16
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
struct mem_cgroup_stat_cpu {
long count[MEMCG_NR_STAT];
unsigned long events[MEMCG_NR_EVENTS];
@@ -312,6 +316,25 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+ return memcg->css.id;
+}
+
+/**
+ * mem_cgroup_from_id - look up a memcg from an id
+ * @id: the id to look up
+ *
+ * Caller must hold rcu_read_lock() and use css_tryget() as necessary.
+ */
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_from_id(id, &memory_cgrp_subsys);
+ return mem_cgroup_from_css(css);
+}
+
/**
* parent_mem_cgroup - find the accounting parent of a memcg
* @memcg: memcg whose parent to find
@@ -496,6 +519,10 @@ void mem_cgroup_split_huge_fixup(struct page *head);
#endif
#else /* CONFIG_MEMCG */
+
+#define MEM_CGROUP_ID_SHIFT 0
+#define MEM_CGROUP_ID_MAX 0
+
struct mem_cgroup;
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
@@ -580,6 +607,18 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
{
}
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ WARN_ON_ONCE(id);
+ /* XXX: This should always return root_mem_cgroup */
+ return NULL;
+}
+
static inline bool mem_cgroup_disabled(void)
{
return true;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 33bb1b19273e..a7d8eeb6658a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -209,10 +209,12 @@ struct zone_reclaim_stat {
};
struct lruvec {
- struct list_head lists[NR_LRU_LISTS];
- struct zone_reclaim_stat reclaim_stat;
+ struct list_head lists[NR_LRU_LISTS];
+ struct zone_reclaim_stat reclaim_stat;
+ /* Evictions & activations on the inactive file list */
+ atomic_long_t inactive_age;
#ifdef CONFIG_MEMCG
- struct zone *zone;
+ struct zone *zone;
#endif
};
@@ -487,9 +489,6 @@ struct zone {
spinlock_t lru_lock;
struct lruvec lruvec;
- /* Evictions & activations on the inactive file list */
- atomic_long_t inactive_age;
-
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b14a2bb33514..1cf3065c143b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -317,6 +317,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2de783..4ea79f225fe8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
- return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
- struct cgroup_subsys_state *css;
-
- css = css_from_id(id, &memory_cgrp_subsys);
- return mem_cgroup_from_css(css);
-}
-
#ifndef CONFIG_SLOB
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 30e0cd7a0ceb..f4ac04c0d35a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -213,7 +213,7 @@ bool zone_reclaimable(struct zone *zone)
zone_reclaimable_pages(zone) * 6;
}
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);
@@ -1931,8 +1931,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
unsigned long inactive;
unsigned long active;
- inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+ inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
return active > inactive;
}
@@ -2071,7 +2071,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* system is under heavy pressure.
*/
if (!inactive_file_is_low(lruvec) &&
- get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2097,10 +2097,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anon in [0], file in [1]
*/
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2138,7 +2138,7 @@ out:
unsigned long size;
unsigned long scan;
- size = get_lru_size(lruvec, lru);
+ size = lruvec_lru_size(lruvec, lru);
scan = size >> sc->priority;
if (!scan && pass && force_scan)
diff --git a/mm/workingset.c b/mm/workingset.c
index ac6eb7bc1faa..fe69da29bbad 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,7 +152,8 @@
* refault distance will immediately activate the refaulting page.
*/
-#define EVICTION_SHIFT (NODES_SHIFT + ZONES_SHIFT + \
+#define EVICTION_SHIFT (MEM_CGROUP_ID_SHIFT + \
+ NODES_SHIFT + ZONES_SHIFT + \
RADIX_TREE_EXCEPTIONAL_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
@@ -166,9 +167,10 @@
*/
static unsigned int bucket_order;
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
{
eviction >>= bucket_order;
+ eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -176,21 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
-static void unpack_shadow(void *shadow, struct zone **zonep,
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
unsigned long *evictionp)
{
unsigned long entry = (unsigned long)shadow;
- int zid, nid;
+ int memcgid, nid, zid;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
zid = entry & ((1UL << ZONES_SHIFT) - 1);
entry >>= ZONES_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
+ memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+ entry >>= MEM_CGROUP_ID_SHIFT;
+ *memcgidp = memcgid;
*zonep = NODE_DATA(nid)->node_zones + zid;
*evictionp = entry << bucket_order;
-
}
/**
@@ -203,11 +207,20 @@ static void unpack_shadow(void *shadow, struct zone **zonep,
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
+ struct mem_cgroup *memcg = page_memcg(page);
struct zone *zone = page_zone(page);
+ int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
+ struct lruvec *lruvec;
+
+ /* Page is fully exclusive and pins page->mem_cgroup */
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
- eviction = atomic_long_inc_return(&zone->inactive_age);
- return pack_shadow(eviction, zone);
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ eviction = atomic_long_inc_return(&lruvec->inactive_age);
+ return pack_shadow(memcgid, zone, eviction);
}
/**
@@ -222,13 +235,32 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
bool workingset_refault(void *shadow)
{
unsigned long refault_distance;
+ unsigned long active_file;
+ struct mem_cgroup *memcg;
unsigned long eviction;
+ struct lruvec *lruvec;
unsigned long refault;
struct zone *zone;
+ int memcgid;
- unpack_shadow(shadow, &zone, &eviction);
+ unpack_shadow(shadow, &memcgid, &zone, &eviction);
- refault = atomic_long_read(&zone->inactive_age);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcgid);
+ /*
+ * Don't count a refault if the remembered memcg has been
+ * deleted since. XXX: On !CONFIG_MEMCG, this will always
+ * return NULL; it would be better if the root_mem_cgroup
+ * existed in all configurations instead.
+ */
+ if (!mem_cgroup_disabled() && !memcg) {
+ rcu_read_unlock();
+ return false;
+ }
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ refault = atomic_long_read(&lruvec->inactive_age);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ rcu_read_unlock();
/*
* The unsigned subtraction here gives an accurate distance
@@ -250,7 +282,7 @@ bool workingset_refault(void *shadow)
inc_zone_state(zone, WORKINGSET_REFAULT);
- if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+ if (refault_distance <= active_file) {
inc_zone_state(zone, WORKINGSET_ACTIVATE);
return true;
}
@@ -263,7 +295,12 @@ bool workingset_refault(void *shadow)
*/
void workingset_activation(struct page *page)
{
- atomic_long_inc(&page_zone(page)->inactive_age);
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+ atomic_long_inc(&lruvec->inactive_age);
+ rcu_read_unlock();
}
/*
--
2.7.0
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
WARNING: multiple messages have this Message-ID (diff)
From: Johannes Weiner <hannes@cmpxchg.org>
To: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Andrew Morton <akpm@linux-foundation.org>,
Michal Hocko <mhocko@kernel.org>,
linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: [PATCH 3/3] mm: workingset: cgroup-aware
Date: Mon, 25 Jan 2016 11:42:41 -0500 [thread overview]
Message-ID: <20160125164241.GD29291@cmpxchg.org> (raw)
In-Reply-To: <20160125163907.GA29291@cmpxchg.org>
TODO:
- cgroup-aware shadow node reclaim
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
include/linux/memcontrol.h | 39 ++++++++++++++++++++++++++++++
include/linux/mmzone.h | 11 ++++-----
include/linux/swap.h | 1 +
mm/memcontrol.c | 25 --------------------
mm/vmscan.c | 18 +++++++-------
mm/workingset.c | 59 +++++++++++++++++++++++++++++++++++++---------
6 files changed, 102 insertions(+), 51 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 792c8981e633..705aba54a50d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -89,6 +89,10 @@ enum mem_cgroup_events_target {
};
#ifdef CONFIG_MEMCG
+
+#define MEM_CGROUP_ID_SHIFT 16
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
struct mem_cgroup_stat_cpu {
long count[MEMCG_NR_STAT];
unsigned long events[MEMCG_NR_EVENTS];
@@ -312,6 +316,25 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+ return memcg->css.id;
+}
+
+/**
+ * mem_cgroup_from_id - look up a memcg from an id
+ * @id: the id to look up
+ *
+ * Caller must hold rcu_read_lock() and use css_tryget() as necessary.
+ */
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_from_id(id, &memory_cgrp_subsys);
+ return mem_cgroup_from_css(css);
+}
+
/**
* parent_mem_cgroup - find the accounting parent of a memcg
* @memcg: memcg whose parent to find
@@ -496,6 +519,10 @@ void mem_cgroup_split_huge_fixup(struct page *head);
#endif
#else /* CONFIG_MEMCG */
+
+#define MEM_CGROUP_ID_SHIFT 0
+#define MEM_CGROUP_ID_MAX 0
+
struct mem_cgroup;
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
@@ -580,6 +607,18 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
{
}
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ WARN_ON_ONCE(id);
+ /* XXX: This should always return root_mem_cgroup */
+ return NULL;
+}
+
static inline bool mem_cgroup_disabled(void)
{
return true;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 33bb1b19273e..a7d8eeb6658a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -209,10 +209,12 @@ struct zone_reclaim_stat {
};
struct lruvec {
- struct list_head lists[NR_LRU_LISTS];
- struct zone_reclaim_stat reclaim_stat;
+ struct list_head lists[NR_LRU_LISTS];
+ struct zone_reclaim_stat reclaim_stat;
+ /* Evictions & activations on the inactive file list */
+ atomic_long_t inactive_age;
#ifdef CONFIG_MEMCG
- struct zone *zone;
+ struct zone *zone;
#endif
};
@@ -487,9 +489,6 @@ struct zone {
spinlock_t lru_lock;
struct lruvec lruvec;
- /* Evictions & activations on the inactive file list */
- atomic_long_t inactive_age;
-
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b14a2bb33514..1cf3065c143b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -317,6 +317,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2de783..4ea79f225fe8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
- return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
- struct cgroup_subsys_state *css;
-
- css = css_from_id(id, &memory_cgrp_subsys);
- return mem_cgroup_from_css(css);
-}
-
#ifndef CONFIG_SLOB
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 30e0cd7a0ceb..f4ac04c0d35a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -213,7 +213,7 @@ bool zone_reclaimable(struct zone *zone)
zone_reclaimable_pages(zone) * 6;
}
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);
@@ -1931,8 +1931,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
unsigned long inactive;
unsigned long active;
- inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+ inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
return active > inactive;
}
@@ -2071,7 +2071,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* system is under heavy pressure.
*/
if (!inactive_file_is_low(lruvec) &&
- get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2097,10 +2097,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anon in [0], file in [1]
*/
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2138,7 +2138,7 @@ out:
unsigned long size;
unsigned long scan;
- size = get_lru_size(lruvec, lru);
+ size = lruvec_lru_size(lruvec, lru);
scan = size >> sc->priority;
if (!scan && pass && force_scan)
diff --git a/mm/workingset.c b/mm/workingset.c
index ac6eb7bc1faa..fe69da29bbad 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,7 +152,8 @@
* refault distance will immediately activate the refaulting page.
*/
-#define EVICTION_SHIFT (NODES_SHIFT + ZONES_SHIFT + \
+#define EVICTION_SHIFT (MEM_CGROUP_ID_SHIFT + \
+ NODES_SHIFT + ZONES_SHIFT + \
RADIX_TREE_EXCEPTIONAL_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
@@ -166,9 +167,10 @@
*/
static unsigned int bucket_order;
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
{
eviction >>= bucket_order;
+ eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -176,21 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
-static void unpack_shadow(void *shadow, struct zone **zonep,
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
unsigned long *evictionp)
{
unsigned long entry = (unsigned long)shadow;
- int zid, nid;
+ int memcgid, nid, zid;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
zid = entry & ((1UL << ZONES_SHIFT) - 1);
entry >>= ZONES_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
+ memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+ entry >>= MEM_CGROUP_ID_SHIFT;
+ *memcgidp = memcgid;
*zonep = NODE_DATA(nid)->node_zones + zid;
*evictionp = entry << bucket_order;
-
}
/**
@@ -203,11 +207,20 @@ static void unpack_shadow(void *shadow, struct zone **zonep,
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
+ struct mem_cgroup *memcg = page_memcg(page);
struct zone *zone = page_zone(page);
+ int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
+ struct lruvec *lruvec;
+
+ /* Page is fully exclusive and pins page->mem_cgroup */
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
- eviction = atomic_long_inc_return(&zone->inactive_age);
- return pack_shadow(eviction, zone);
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ eviction = atomic_long_inc_return(&lruvec->inactive_age);
+ return pack_shadow(memcgid, zone, eviction);
}
/**
@@ -222,13 +235,32 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
bool workingset_refault(void *shadow)
{
unsigned long refault_distance;
+ unsigned long active_file;
+ struct mem_cgroup *memcg;
unsigned long eviction;
+ struct lruvec *lruvec;
unsigned long refault;
struct zone *zone;
+ int memcgid;
- unpack_shadow(shadow, &zone, &eviction);
+ unpack_shadow(shadow, &memcgid, &zone, &eviction);
- refault = atomic_long_read(&zone->inactive_age);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcgid);
+ /*
+ * Don't count a refault if the remembered memcg has been
+ * deleted since. XXX: On !CONFIG_MEMCG, this will always
+ * return NULL; it would be better if the root_mem_cgroup
+ * existed in all configurations instead.
+ */
+ if (!mem_cgroup_disabled() && !memcg) {
+ rcu_read_unlock();
+ return false;
+ }
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ refault = atomic_long_read(&lruvec->inactive_age);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ rcu_read_unlock();
/*
* The unsigned subtraction here gives an accurate distance
@@ -250,7 +282,7 @@ bool workingset_refault(void *shadow)
inc_zone_state(zone, WORKINGSET_REFAULT);
- if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+ if (refault_distance <= active_file) {
inc_zone_state(zone, WORKINGSET_ACTIVATE);
return true;
}
@@ -263,7 +295,12 @@ bool workingset_refault(void *shadow)
*/
void workingset_activation(struct page *page)
{
- atomic_long_inc(&page_zone(page)->inactive_age);
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+ atomic_long_inc(&lruvec->inactive_age);
+ rcu_read_unlock();
}
/*
--
2.7.0
next prev parent reply other threads:[~2016-01-25 16:43 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-01-24 16:56 [PATCH v2] mm: workingset: make workingset detection logic memcg aware Vladimir Davydov
2016-01-24 16:56 ` Vladimir Davydov
2016-01-25 16:39 ` Johannes Weiner
2016-01-25 16:39 ` Johannes Weiner
2016-01-25 16:41 ` [PATCH 1/3] mm: workingset: eviction buckets for bigmem/lowbit machines Johannes Weiner
2016-01-25 16:41 ` Johannes Weiner
2016-01-25 16:41 ` [PATCH 2/3] mm: workingset: separate shadow unpacking and refault calculation Johannes Weiner
2016-01-25 16:41 ` Johannes Weiner
2016-01-25 16:42 ` Johannes Weiner [this message]
2016-01-25 16:42 ` [PATCH 3/3] mm: workingset: cgroup-aware Johannes Weiner
2016-01-26 8:27 ` [PATCH v2] mm: workingset: make workingset detection logic memcg aware Vladimir Davydov
2016-01-26 8:27 ` Vladimir Davydov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20160125164241.GD29291@cmpxchg.org \
--to=hannes@cmpxchg.org \
--cc=akpm@linux-foundation.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=vdavydov@virtuozzo.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.