* [PATCH RFC] mm: lru milestones, timestamps and ages
@ 2013-04-30 11:02 Konstantin Khlebnikov
From: Konstantin Khlebnikov @ 2013-04-30 11:02 UTC (permalink / raw)
To: linux-mm
This patch adds an engine for estimating the rotation time of pages in lru lists.

It adds a bunch of 'milestones' to each struct lruvec and inserts them into the
lru lists periodically. A milestone flows through the lru together with the
pages and carries its timestamp to the end of the lru. Because milestones are
embedded in the lruvec, they can easily be distinguished from pages by comparing
pointers; only a few functions need to care about that.

This machinery provides a discrete-time estimate of the age of pages at the
tail of each lru, and of the average age of each kind of evictable lru in each
zone. The numbers are shown in /proc/zoneinfo and in the memcg attribute
'memory.stat'.
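For a quick look at the new counters, here is a minimal userspace sketch (it
assumes only the avg_age_* names added below and is not part of the patch)
that prints them per zone:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256], zone[64] = "?";
	FILE *f = fopen("/proc/zoneinfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* remember which zone the following counters belong to */
		if (sscanf(line, "Node %*d, zone %63s", zone) == 1)
			continue;
		/* print only the fields introduced by this patch */
		if (strstr(line, "avg_age_"))
			printf("%s: %s", zone, line);
	}
	fclose(f);
	return 0;
}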
Overhead on the fast path is nearly zero: is_lru_milestone() in
isolate_lru_pages() distinguishes milestones from pages without touching any
extra cache lines.

Memory overhead is more noticeable: about 1.5k per struct lruvec for the
milestones (4 evictable lrus x 16 milestones x 24 bytes = 1536 bytes on
64-bit). struct mem_cgroup_per_node now requires an order-1 page on a 64-bit
system with 4 zones per node.
In our kernel we use a similar engine as the source of statistics for a
scheduler in the memory reclaimer. This is an O(1) scheduler which shifts
vmscan priorities for lru vectors depending on their sizes, limits and ages.
It tries to balance memory pressure among containers. I'll try to rework it
for the mainline kernel soon.

It seems these ages could also be used for optimal distribution of memory
pressure between file and anon pages, and probably for balancing pressure
among zones. Moreover, slab shrinkers could provide similar time-based
statistics; some of them could embed timestamps directly into their objects.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
---
include/linux/mmzone.h | 26 ++++++++++
mm/memcontrol.c | 32 ++++++++++++
mm/mmzone.c | 10 ++++
mm/vmscan.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++
mm/vmstat.c | 9 +++
5 files changed, 203 insertions(+)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c74092e..d8a6a43 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -7,6 +7,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/wait.h>
+#include <linux/workqueue.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/threads.h>
@@ -164,6 +165,7 @@ enum lru_list {
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
LRU_UNEVICTABLE,
+ NR_EVICTABLE_LRU_LISTS = LRU_UNEVICTABLE,
NR_LRU_LISTS
};
@@ -199,14 +201,35 @@ struct zone_reclaim_stat {
unsigned long recent_scanned[2];
};
+struct lru_milestone {
+ unsigned long timestamp;
+ struct list_head lru;
+};
+
+#define NR_LRU_MILESTONES 16
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
#ifdef CONFIG_MEMCG
struct zone *zone;
#endif
+ unsigned long age[NR_EVICTABLE_LRU_LISTS];
+ unsigned long next_timestamp[NR_EVICTABLE_LRU_LISTS];
+ unsigned char last_milestone[NR_EVICTABLE_LRU_LISTS];
+ struct lru_milestone milestones[NR_EVICTABLE_LRU_LISTS][NR_LRU_MILESTONES];
};
+static inline bool
+is_lru_milestone(struct lruvec *lruvec, struct list_head *list)
+{
+ return unlikely(list >= &lruvec->milestones[0][0].lru &&
+ list < &lruvec->milestones[NR_EVICTABLE_LRU_LISTS]
+ [NR_LRU_MILESTONES].lru);
+}
+
+void remove_lru_milestone(struct lruvec *lruvec, enum lru_list lru);
+
/* Mask used at gathering information at once (see memcontrol.c) */
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
@@ -487,6 +510,9 @@ struct zone {
* rarely used fields:
*/
const char *name;
+
+ struct delayed_work milestones_work;
+ unsigned long average_age[NR_EVICTABLE_LRU_LISTS];
} ____cacheline_internodealigned_in_smp;
typedef enum {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2b55222..07af4ce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -967,6 +967,27 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
return total;
}
+static unsigned mem_cgroup_avg_age(struct mem_cgroup *memcg, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz;
+ unsigned long size = 0, pages;
+ int nid, zid;
+ u64 age = 0;
+
+ for_each_node_state(nid, N_MEMORY) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ pages = mz->lru_size[lru];
+ size += pages;
+ age += (u64)pages * mz->lruvec.age[lru];
+ }
+ }
+
+ if (size)
+ do_div(age, size);
+ return jiffies_to_msecs(age);
+}
+
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
enum mem_cgroup_events_target target)
{
@@ -4707,6 +4728,11 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
break;
}
page = list_entry(list->prev, struct page, lru);
+ if (is_lru_milestone(lruvec, &page->lru)) {
+ remove_lru_milestone(lruvec, lru);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ continue;
+ }
if (busy == page) {
list_move(&page->lru, list);
busy = NULL;
@@ -5298,6 +5324,10 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+ for (i = 0; i < NR_EVICTABLE_LRU_LISTS; i++)
+ seq_printf(m, "avg_age_%s %u\n", mem_cgroup_lru_names[i],
+ mem_cgroup_avg_age(memcg, i));
+
/* Hierarchical information */
{
unsigned long long limit, memsw_limit;
@@ -5923,6 +5953,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
+ /* Try to decrease NR_LRU_MILESTONES if this happens */
+ BUILD_BUG_ON(sizeof(struct mem_cgroup_per_node) > 2 * PAGE_SIZE);
pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
if (!pn)
return 1;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 2ac0afb..64d59d6 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -89,12 +89,22 @@ int memmap_valid_within(unsigned long pfn,
void lruvec_init(struct lruvec *lruvec)
{
+ unsigned long now = jiffies;
enum lru_list lru;
+ int i;
memset(lruvec, 0, sizeof(struct lruvec));
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
+
+ for_each_evictable_lru(lru) {
+ for (i = 0; i < NR_LRU_MILESTONES; i++) {
+ INIT_LIST_HEAD(&lruvec->milestones[lru][i].lru);
+ lruvec->milestones[lru][i].timestamp = now;
+ }
+ lruvec->next_timestamp[lru] = now;
+ }
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 669fba3..caf5fee 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1101,6 +1101,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
+ if (is_lru_milestone(lruvec, &page->lru)) {
+ remove_lru_milestone(lruvec, lru);
+ continue;
+ }
+
VM_BUG_ON(!PageLRU(page));
switch (__isolate_lru_page(page, mode)) {
@@ -2489,6 +2494,113 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
+void remove_lru_milestone(struct lruvec *lruvec, enum lru_list lru)
+{
+ struct zone *zone = lruvec_zone(lruvec);
+ unsigned long now = jiffies, interval, next;
+ struct lru_milestone *ms;
+
+ ms = container_of(lruvec->lists[lru].prev, struct lru_milestone, lru);
+ list_del_init(&ms->lru);
+ lruvec->age[lru] = now - ms->timestamp;
+
+ pr_debug("lruvec:%p lru:%d remove:%02ld age:%lu\n", lruvec, lru,
+ ms - lruvec->milestones[lru], lruvec->age[lru]);
+
+ /* get new estimation for next milestone */
+ interval = lruvec->age[lru] / NR_LRU_MILESTONES;
+ ms = lruvec->milestones[lru] + lruvec->last_milestone[lru];
+ next = ms->timestamp + interval;
+ lruvec->next_timestamp[lru] = next;
+
+ if (time_before(next, zone->milestones_work.timer.expires))
+ mod_delayed_work(system_wq, &zone->milestones_work,
+ time_after(next, now) ? (next - now) : 0);
+}
+
+static void insert_lru_milestone(struct lruvec *lruvec, enum lru_list lru)
+{
+ unsigned long now = jiffies, interval;
+ struct lru_milestone *ms;
+
+ lruvec->last_milestone[lru]++;
+ lruvec->last_milestone[lru] %= NR_LRU_MILESTONES;
+ ms = lruvec->milestones[lru] + lruvec->last_milestone[lru];
+
+ /* get linear estimation of perfect interval between milestones */
+ interval = lruvec->age[lru] / NR_LRU_MILESTONES;
+
+ if (!list_empty(&ms->lru)) {
+ list_del(&ms->lru);
+ lruvec->age[lru] = now - ms->timestamp;
+ /* double interval if oldest milestone is still in lru */
+ interval += HZ/100 + lruvec->age[lru] / NR_LRU_MILESTONES;
+ }
+
+ /* Required for calculating average ages in u64 without overflows */
+ interval = min_t(unsigned long, interval, INT_MAX / NR_LRU_MILESTONES);
+
+ ms->timestamp = now;
+ list_add(&ms->lru, &lruvec->lists[lru]);
+ lruvec->next_timestamp[lru] = now + interval;
+
+ pr_debug("lruvec:%p lru:%d insert:%02ld age:%lu\n", lruvec, lru,
+ ms - lruvec->milestones[lru], lruvec->age[lru]);
+}
+
+static void lru_milestones_work(struct work_struct *work)
+{
+ unsigned long size[NR_EVICTABLE_LRU_LISTS] = {0,};
+ u64 age[NR_EVICTABLE_LRU_LISTS] = {0,};
+ struct mem_cgroup *memcg;
+ unsigned long next;
+ struct zone *zone;
+ enum lru_list lru;
+
+ zone = container_of(work, struct zone, milestones_work.work);
+ if (!populated_zone(zone) || !node_state(zone_to_nid(zone), N_MEMORY))
+ return;
+
+ next = jiffies + INT_MAX / NR_LRU_MILESTONES;
+ zone->milestones_work.timer.expires = next;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ unsigned long now = jiffies;
+ unsigned long pages;
+
+ for_each_evictable_lru(lru) {
+ if (time_after_eq(now, lruvec->next_timestamp[lru])) {
+ spin_lock_irq(&zone->lru_lock);
+ insert_lru_milestone(lruvec, lru);
+ spin_unlock_irq(&zone->lru_lock);
+ }
+ if (time_before(lruvec->next_timestamp[lru], next))
+ next = lruvec->next_timestamp[lru];
+
+ pages = get_lru_size(lruvec, lru);
+ size[lru] += pages;
+ age[lru] += (u64)pages * lruvec->age[lru];
+ }
+
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ } while (memcg);
+
+ for_each_evictable_lru(lru) {
+ if (size[lru])
+ do_div(age[lru], size[lru]);
+ zone->average_age[lru] = age[lru];
+ }
+
+ if (time_before_eq(next, zone->milestones_work.timer.expires)) {
+ unsigned long now = jiffies;
+
+ mod_delayed_work(system_wq, &zone->milestones_work,
+ time_after(next, now) ? (next - now) : 0);
+ }
+}
+
static bool zone_balanced(struct zone *zone, int order,
unsigned long balance_gap, int classzone_idx)
{
@@ -2958,6 +3070,7 @@ static int kswapd(void *p)
int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
+ int i;
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
@@ -2985,6 +3098,13 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ INIT_DELAYED_WORK(&zone->milestones_work, lru_milestones_work);
+ schedule_delayed_work(&zone->milestones_work, 0);
+ }
+
order = new_order = 0;
balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
@@ -3039,6 +3159,12 @@ static int kswapd(void *p)
}
}
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ cancel_delayed_work_sync(&zone->milestones_work);
+ }
+
current->reclaim_state = NULL;
return 0;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e1d8ed1..ed82165 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1053,6 +1053,15 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone->all_unreclaimable,
zone->zone_start_pfn,
zone->inactive_ratio);
+ seq_printf(m,
+ "\n avg_age_inactive_anon: %u"
+ "\n avg_age_active_anon: %u"
+ "\n avg_age_inactive_file: %u"
+ "\n avg_age_active_file: %u",
+ jiffies_to_msecs(zone->average_age[LRU_INACTIVE_ANON]),
+ jiffies_to_msecs(zone->average_age[LRU_ACTIVE_ANON]),
+ jiffies_to_msecs(zone->average_age[LRU_INACTIVE_FILE]),
+ jiffies_to_msecs(zone->average_age[LRU_ACTIVE_FILE]));
seq_putc(m, '\n');
}
* Re: [PATCH RFC] mm: lru milestones, timestamps and ages
From: Zlatko Calusic @ 2013-05-03 14:07 UTC (permalink / raw)
To: Konstantin Khlebnikov; +Cc: linux-mm
On 30.04.2013 13:02, Konstantin Khlebnikov wrote:
> This patch adds an engine for estimating the rotation time of pages in lru lists.
>
> It adds a bunch of 'milestones' to each struct lruvec and inserts them into the
> lru lists periodically. A milestone flows through the lru together with the
> pages and carries its timestamp to the end of the lru. Because milestones are
> embedded in the lruvec, they can easily be distinguished from pages by comparing
> pointers; only a few functions need to care about that.
>
> This machinery provides a discrete-time estimate of the age of pages at the
> tail of each lru, and of the average age of each kind of evictable lru in each zone.
Great stuff!
Believe it or not, I had an idea of writing something similar to this,
but of course having an idea and actually implementing it are two very
different things. Thank you for your work!
I will use this to prove (or not) that file pages in the normal zone on
a 4GB RAM machine are reused waaaay too soon. Actually, I already have
the patch applied and running on the desktop, but it should be much more
useful on server workloads. Desktops have erratic load and can go for a
long time with very little I/O activity. But, here are the current
numbers anyway:
Node 0, zone DMA32
pages free 5371
nr_inactive_anon 4257
nr_active_anon 139719
nr_inactive_file 617537
nr_active_file 51671
inactive_ratio: 5
avg_age_inactive_anon: 2514752
avg_age_active_anon: 2514752
avg_age_inactive_file: 876416
avg_age_active_file: 2514752
Node 0, zone Normal
pages free 424
nr_inactive_anon 253
nr_active_anon 54480
nr_inactive_file 63274
nr_active_file 44116
inactive_ratio: 1
avg_age_inactive_anon: 2531712
avg_age_active_anon: 2531712
avg_age_inactive_file: 901120
avg_age_active_file: 2531712
> In our kernel we use a similar engine as the source of statistics for a
> scheduler in the memory reclaimer. This is an O(1) scheduler which shifts
> vmscan priorities for lru vectors depending on their sizes, limits and ages.
> It tries to balance memory pressure among containers. I'll try to rework it
> for the mainline kernel soon.
>
> It seems these ages could also be used for optimal distribution of memory
> pressure between file and anon pages, and probably for balancing pressure among zones.
This all sounds very promising, especially because I currently observe quite
an imbalance among zones.
Regards,
--
Zlatko
* Re: [PATCH RFC] mm: lru milestones, timestamps and ages
From: Konstantin Khlebnikov @ 2013-05-04 11:53 UTC (permalink / raw)
To: Zlatko Calusic; +Cc: linux-mm
Zlatko Calusic wrote:
> On 30.04.2013 13:02, Konstantin Khlebnikov wrote:
>> [...]
>
> Great stuff!
Thanks!
>
> [...]
>
> This all sounds very promising, especially because I currently observe quite
> an imbalance among zones.
As I see it, the most likely reason for such imbalances is the 'break'
condition inside shrink_lruvec(). You can try disabling it and see what
happens.

But these numbers from your desktop don't actually prove this problem. It
seems the difference between zones is within the precision of this method. I
don't know how to describe this precisely. Probably the irregularity between
milestones should also be taken into account to describe the current situation
and the quality of measurement.
Here are the current numbers from my 8GB node. The main workload is a torrent client.
Node 0, zone DMA32
nr_inactive_anon 1
nr_active_anon 1494
nr_inactive_file 404028
nr_active_file 365525
nr_dirtied 855068
nr_written 854991
avg_age_inactive_anon: 64942528
avg_age_active_anon: 64942528
avg_age_inactive_file: 1281317
avg_age_active_file: 15813376
Node 0, zone Normal
nr_inactive_anon 376
nr_active_anon 13793
nr_inactive_file 542605
nr_active_file 542247
nr_dirtied 2746747
nr_written 2746266
avg_age_inactive_anon: 65064192
avg_age_active_anon: 65064192
avg_age_inactive_file: 1260611
avg_age_active_file: 8765240
So, there is a noticeable imbalance here in the ages of the active file lru
and in nr_dirtied/nr_written. I have no idea why, but the torrent client uses
the fadvise() syscall, which muddies the whole picture.
* Re: [PATCH RFC] mm: lru milestones, timestamps and ages
From: Konstantin Khlebnikov @ 2013-05-04 13:01 UTC (permalink / raw)
To: Zlatko Calusic; +Cc: linux-mm
Konstantin Khlebnikov wrote:
> [...]
> So, there is a noticeable imbalance here in the ages of the active file lru
> and in nr_dirtied/nr_written. I have no idea why, but the torrent client uses
> the fadvise() syscall, which muddies the whole picture.
Hey! I can reproduce this:
Node 0, zone DMA32
nr_inactive_anon 1
nr_active_anon 2368
nr_inactive_file 373642
nr_active_file 375462
nr_dirtied 2887369
nr_written 2887291
inactive_ratio: 5
avg_age_inactive_anon: 64942528
avg_age_active_anon: 64942528
avg_age_inactive_file: 389824
avg_age_active_file: 1330368
Node 0, zone Normal
nr_inactive_anon 376
nr_active_anon 17768
nr_inactive_file 534695
nr_active_file 533685
nr_dirtied 12071397
nr_written 11940007
inactive_ratio: 6
avg_age_inactive_anon: 65064192
avg_age_active_anon: 65064192
avg_age_inactive_file: 28074
avg_age_active_file: 1304800
I'm just copying huge files from one disk to another with rsync.

In /proc/vmstat, pgsteal_kswapd_normal and pgscan_kswapd_normal are rising
rapidly while the other pgscan_*/pgsteal_* counters are standing still. So
the bug is somewhere in kswapd.
* Re: [PATCH RFC] mm: lru milestones, timestamps and ages
From: Zlatko Calusic @ 2013-05-04 13:32 UTC (permalink / raw)
To: Konstantin Khlebnikov; +Cc: linux-mm
On 04.05.2013 13:53, Konstantin Khlebnikov wrote:
> [...]
>
> As I see it, the most likely reason for such imbalances is the 'break'
> condition inside shrink_lruvec(). You can try disabling it and see what happens.
Thanks for the hint. I will pay some more attention to this function the next
time I investigate the code.
>
> But these numbers from your desktop don't actually prove this problem. It
> seems the difference between zones is within the precision of this method. I
> don't know how to describe this precisely. Probably the irregularity between
> milestones should also be taken into account to describe the current
> situation and the quality of measurement.
>
Ah, no, the numbers were more like proof that your patch is running fine,
nothing specific about them. I was just making a quick check that your patch
is stable enough before I run it in production, and it seems to be working
just fine.

In the next hour or so I will patch the kernel on the server where I intend
to do much more analysis. I have also prepared a set of graphs based on the
numbers your code provides. Based on the preliminary tests, I believe I'll be
interested only in the aging of the inactive file lists. What I'm after is
the bug explained here: http://marc.info/?l=linux-mm&m=136571221426984 and if
I'm right, your patch will help to better reveal the extreme imbalance
observed between DMA32 and Normal zone file LRU aging. But only on 4GB nodes;
I haven't seen anything similar on 8GB nodes, where the DMA32 and Normal
zones are approximately the same size.
--
Zlatko
* Re: [PATCH RFC] mm: lru milestones, timestamps and ages
From: Zlatko Calusic @ 2013-05-04 21:36 UTC (permalink / raw)
To: Konstantin Khlebnikov; +Cc: linux-mm
On 04.05.2013 15:01, Konstantin Khlebnikov wrote:
> [...]
>
> I'm just copying huge files from one disk to another with rsync.
>
> In /proc/vmstat, pgsteal_kswapd_normal and pgscan_kswapd_normal are rising
> rapidly while the other pgscan_*/pgsteal_* counters are standing still. So
> the bug is somewhere in kswapd.
>
Not necessarily, because processes also do direct reclaim. Also, if you
continued the copying, I bet you would see that the DMA32 zone also gets to
play, just a bit later.

I can now see that effect nicely on the graphs I prepared. Attached is one
from the desktop. Where the red line suddenly drops, I copied a 2GB file from
the network to the machine. Half an hour later I copied another 1.6GB file;
that's when the blue line dropped. It all makes sense: about 3GB of I/O was
needed to expunge all the old inactive pages from both zones, and the first
2GB wasn't enough to push the old pages out of the DMA32 zone.

I'm of the opinion that your instrumentation will be of use only when there's
constant reclaim going on. Otherwise pages stay in memory for a long, long
time, and then it doesn't matter much whether it's one hour or two before
some of them are reclaimed. For the same reason I will cap graphs like these
at some useful value, to get precision for the important periods when reclaim
is really active.
--
Zlatko
[-- Attachment #2: memage-hourly.png --]
[-- Type: image/png, Size: 11890 bytes --]
* Re: [PATCH RFC] mm: lru milestones, timestamps and ages
From: Johannes Weiner @ 2013-05-06 19:08 UTC (permalink / raw)
To: Konstantin Khlebnikov; +Cc: Zlatko Calusic, Mel Gorman, Rik van Riel, linux-mm
Mel: we talked about this issue below in SFO; apparently I'm not the only one
who noticed :-)

Rik: a fix for the problem below is crucial for refault distance-based page
cache sizing. The unequal LRU aging is a problem in itself, but it's
compounded when we base reclaim decisions on the skewed non-resident times.
On Sat, May 04, 2013 at 05:01:14PM +0400, Konstantin Khlebnikov wrote:
> Hey! I can reproduce this:
>
> Node 0, zone DMA32
> nr_inactive_anon 1
> nr_active_anon 2368
> nr_inactive_file 373642
> nr_active_file 375462
> nr_dirtied 2887369
> nr_written 2887291
> inactive_ratio: 5
> avg_age_inactive_anon: 64942528
> avg_age_active_anon: 64942528
> avg_age_inactive_file: 389824
> avg_age_active_file: 1330368
> Node 0, zone Normal
> nr_inactive_anon 376
> nr_active_anon 17768
> nr_inactive_file 534695
> nr_active_file 533685
> nr_dirtied 12071397
> nr_written 11940007
> inactive_ratio: 6
> avg_age_inactive_anon: 65064192
> avg_age_active_anon: 65064192
> avg_age_inactive_file: 28074
> avg_age_active_file: 1304800
>
> I'm just copying huge files from one disk to another with rsync.
>
> In /proc/vmstat, pgsteal_kswapd_normal and pgscan_kswapd_normal are rising
> rapidly while the other pgscan_*/pgsteal_* counters are standing still. So
> the bug is somewhere in kswapd.
There is a window where a steady stream of allocations and kswapd cooperate
in perfect unison and keep the Normal zone always between the low and high
watermarks. Kswapd does not stop until the high watermark is met, and the
allocator does not fall back to lower zones until the low watermark is
breached. As a result, most allocations happen in the Normal zone.
I'm playing around with a round-robin scheme on the page allocator
side to spread out file pages more evenly, but I'm torn on whether the
fix should actually be on the kswapd side, to enforce reclaim instead
of allocation more evenly. Thoughts?
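To make the window concrete, here is a toy userspace simulation (all names
and numbers are invented for illustration; this is not kernel code) of an
allocator that takes from the highest zone still above its low watermark
while kswapd tops up any zone below its high watermark:

#include <stdio.h>

struct zone { const char *name; long free, low, high, reclaimed; };

int main(void)
{
	struct zone zones[2] = {
		{ "Normal", 1000, 500, 800, 0 },	/* preferred zone */
		{ "DMA32",  1000, 500, 800, 0 },	/* fallback zone */
	};
	long allocs[2] = { 0, 0 };

	for (int tick = 0; tick < 100000; tick++) {
		/* allocator: first zone above its low watermark */
		for (int i = 0; i < 2; i++) {
			if (zones[i].free > zones[i].low) {
				zones[i].free--;
				allocs[i]++;
				break;
			}
		}
		/* kswapd: reclaim in any zone below its high watermark */
		for (int i = 0; i < 2; i++) {
			if (zones[i].free < zones[i].high) {
				zones[i].free++;
				zones[i].reclaimed++;
			}
		}
	}
	for (int i = 0; i < 2; i++)
		printf("%s: allocs=%ld reclaimed=%ld\n",
		       zones[i].name, allocs[i], zones[i].reclaimed);
	return 0;
}

Run long enough, every allocation and every reclaimed page lands in Normal
while DMA32 sits idle, which matches the pgscan_*/pgsteal_* counters above.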
* Re: [PATCH RFC] mm: lru milestones, timestamps and ages
From: Mel Gorman @ 2013-05-10 10:28 UTC (permalink / raw)
To: Konstantin Khlebnikov; +Cc: linux-mm
On Tue, Apr 30, 2013 at 03:02:14PM +0400, Konstantin Khlebnikov wrote:
> +static inline bool
> +is_lru_milestone(struct lruvec *lruvec, struct list_head *list)
> +{
> + return unlikely(list >= &lruvec->milestones[0][0].lru &&
> + list < &lruvec->milestones[NR_EVICTABLE_LRU_LISTS]
> + [NR_LRU_MILESTONES].lru);
> +}
> +
Not reviewing properly yet, just taking a quick look out of interest, but
this check may be delicate. 32-bit x86 machines start the kernel direct
mapping at 0xC0000000, so milestones[0][0].lru will have some value between
0xC0000000 and 0xFFFFFFFF. HZ=250 in my distro config, so after 0xC0000000
jiffies, or a bit over 149 days of uptime, it looks like there will be a
window where LRU entries look like milestones. If I'm right, that is bound
to cause problems.
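For reference, a quick standalone check of that figure (assuming HZ=250):

#include <stdio.h>

int main(void)
{
	unsigned long j = 0xC0000000UL;		/* jiffies until the window */

	/* jiffies / HZ = seconds; / 86400 = days */
	printf("%.1f days\n", j / 250.0 / 86400.0);	/* prints 149.1 days */
	return 0;
}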
--
Mel Gorman
SUSE Labs
* Re: [PATCH RFC] mm: lru milestones, timestamps and ages
From: Konstantin Khlebnikov @ 2013-05-10 14:12 UTC (permalink / raw)
To: Mel Gorman; +Cc: linux-mm
Mel Gorman wrote:
> On Tue, Apr 30, 2013 at 03:02:14PM +0400, Konstantin Khlebnikov wrote:
>> +static inline bool
>> +is_lru_milestone(struct lruvec *lruvec, struct list_head *list)
>> +{
>> + return unlikely(list >= &lruvec->milestones[0][0].lru &&
>> + list < &lruvec->milestones[NR_EVICTABLE_LRU_LISTS]
>> + [NR_LRU_MILESTONES].lru);
>> +}
>> +
>
> Not reviewing properly yet, just taking a quick look out of interest but
> this check may be delicate. 32-bit x86 machines start the kernel direct
> mapping at 0xC0000000 so milestones[0][0].lru will have some value betewen
> 0xC0000000 and 0xFFFFFFFF. HZ=250 on my distro config so after 0xC0000000
> jiffies or a bit over 149 days of uptime, it looks like there will be a
> window where LRU entries look like milestones. If I'm right, that is
> bound to cause problems.
>
Nope, there is no such dangerous magic: this function compares only pointers.
A list head on a page LRU list is either a &page->lru or a
&lru_milestone->lru, and since the milestones are embedded in struct lruvec
we can tell the two apart by address alone.
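For illustration, a minimal standalone sketch of the same address-range test
(hypothetical types, not the kernel structures):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct item { int payload; };

struct vec {
	struct item items[16];	/* embedded, like lruvec->milestones */
};

/* True iff p points into v->items, decided purely by address range
 * (cast to uintptr_t to keep the comparison well defined in ISO C). */
static bool is_embedded(const struct vec *v, const struct item *p)
{
	uintptr_t a = (uintptr_t)p;

	return a >= (uintptr_t)&v->items[0] && a < (uintptr_t)&v->items[16];
}

int main(void)
{
	struct vec v;
	struct item outside;

	printf("%d %d\n", is_embedded(&v, &v.items[3]),
	       is_embedded(&v, &outside));	/* prints: 1 0 */
	return 0;
}

No timestamp or jiffies value ever enters the comparison, so uptime cannot
create the aliasing window described above.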