diff for duplicates of <20140904150846.GA10794@cmpxchg.org> diff --git a/a/1.txt b/N1/1.txt index dcee5a0..85acf81 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -29,3 +29,238 @@ Thanks for the clarification, that is truly horrible. Does the following revert restore performance in your case? --- +>From 6fa7599054868cd0df940d7b0973dd64f8acb0b5 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner <hannes@cmpxchg.org> +Date: Thu, 4 Sep 2014 10:04:34 -0400 +Subject: [patch] mm: memcontrol: revert use of root_mem_cgroup res_counter + +Dave Hansen reports a massive scalability regression in an uncontained +page fault benchmark with more than 30 concurrent threads, which he +bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup +res_counter") and pin-pointed on res_counter spinlock contention. + +That change relied on the per-cpu charge caches to mostly swallow the +res_counter costs, but it's apparent that the caches don't scale yet. + +Revert memcg back to bypassing res_counters on the root level in order +to restore performance for uncontained workloads. + +Reported-by: Dave Hansen <dave@sr71.net> +Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> +--- + mm/memcontrol.c | 103 ++++++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 78 insertions(+), 25 deletions(-) + +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index ec4dcf1b9562..085dc6d2f876 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2534,6 +2534,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned long long size; + int ret = 0; + ++ if (mem_cgroup_is_root(memcg)) ++ goto done; + retry: + if (consume_stock(memcg, nr_pages)) + goto done; +@@ -2611,9 +2613,7 @@ nomem: + if (!(gfp_mask & __GFP_NOFAIL)) + return -ENOMEM; + bypass: +- memcg = root_mem_cgroup; +- ret = -EINTR; +- goto retry; ++ return -EINTR; + + done_restock: + if (batch > nr_pages) +@@ -2626,6 +2626,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) + { + unsigned long bytes = nr_pages * PAGE_SIZE; + ++ if (mem_cgroup_is_root(memcg)) ++ return; ++ + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); +@@ -2640,6 +2643,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, + { + unsigned long bytes = nr_pages * PAGE_SIZE; + ++ if (mem_cgroup_is_root(memcg)) ++ return; ++ + res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); + if (do_swap_account) + res_counter_uncharge_until(&memcg->memsw, +@@ -4093,6 +4099,46 @@ out: + return retval; + } + ++static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, ++ enum mem_cgroup_stat_index idx) ++{ ++ struct mem_cgroup *iter; ++ long val = 0; ++ ++ /* Per-cpu values can be negative, use a signed accumulator */ ++ for_each_mem_cgroup_tree(iter, memcg) ++ val += mem_cgroup_read_stat(iter, idx); ++ ++ if (val < 0) /* race ? */ ++ val = 0; ++ return val; ++} ++ ++static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) ++{ ++ u64 val; ++ ++ if (!mem_cgroup_is_root(memcg)) { ++ if (!swap) ++ return res_counter_read_u64(&memcg->res, RES_USAGE); ++ else ++ return res_counter_read_u64(&memcg->memsw, RES_USAGE); ++ } ++ ++ /* ++ * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS ++ * as well as in MEM_CGROUP_STAT_RSS_HUGE. ++ */ ++ val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); ++ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); ++ ++ if (swap) ++ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); ++ ++ return val << PAGE_SHIFT; ++} ++ ++ + static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) + { +@@ -4102,8 +4148,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, + + switch (type) { + case _MEM: ++ if (name == RES_USAGE) ++ return mem_cgroup_usage(memcg, false); + return res_counter_read_u64(&memcg->res, name); + case _MEMSWAP: ++ if (name == RES_USAGE) ++ return mem_cgroup_usage(memcg, true); + return res_counter_read_u64(&memcg->memsw, name); + case _KMEM: + return res_counter_read_u64(&memcg->kmem, name); +@@ -4572,10 +4622,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) + if (!t) + goto unlock; + +- if (!swap) +- usage = res_counter_read_u64(&memcg->res, RES_USAGE); +- else +- usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); ++ usage = mem_cgroup_usage(memcg, swap); + + /* + * current_threshold points to threshold just below or equal to usage. +@@ -4673,10 +4720,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + + if (type == _MEM) { + thresholds = &memcg->thresholds; +- usage = res_counter_read_u64(&memcg->res, RES_USAGE); ++ usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { + thresholds = &memcg->memsw_thresholds; +- usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); ++ usage = mem_cgroup_usage(memcg, true); + } else + BUG(); + +@@ -4762,10 +4809,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + + if (type == _MEM) { + thresholds = &memcg->thresholds; +- usage = res_counter_read_u64(&memcg->res, RES_USAGE); ++ usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { + thresholds = &memcg->memsw_thresholds; +- usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); ++ usage = mem_cgroup_usage(memcg, true); + } else + BUG(); + +@@ -5525,9 +5572,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) + * core guarantees its existence. + */ + } else { +- res_counter_init(&memcg->res, &root_mem_cgroup->res); +- res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); +- res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); ++ res_counter_init(&memcg->res, NULL); ++ res_counter_init(&memcg->memsw, NULL); ++ res_counter_init(&memcg->kmem, NULL); + /* + * Deeper hierachy with use_hierarchy == false doesn't make + * much sense so let cgroup subsystem know about this +@@ -5969,8 +6016,9 @@ static void __mem_cgroup_clear_mc(void) + /* we must fixup refcnts and charges */ + if (mc.moved_swap) { + /* uncharge swap account from the old cgroup */ +- res_counter_uncharge(&mc.from->memsw, +- PAGE_SIZE * mc.moved_swap); ++ if (!mem_cgroup_is_root(mc.from)) ++ res_counter_uncharge(&mc.from->memsw, ++ PAGE_SIZE * mc.moved_swap); + + for (i = 0; i < mc.moved_swap; i++) + css_put(&mc.from->css); +@@ -5979,8 +6027,9 @@ static void __mem_cgroup_clear_mc(void) + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ +- res_counter_uncharge(&mc.to->res, +- PAGE_SIZE * mc.moved_swap); ++ if (!mem_cgroup_is_root(mc.to)) ++ res_counter_uncharge(&mc.to->res, ++ PAGE_SIZE * mc.moved_swap); + /* we've already done css_get(mc.to) */ + mc.moved_swap = 0; + } +@@ -6345,7 +6394,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); + if (memcg) { +- res_counter_uncharge(&memcg->memsw, PAGE_SIZE); ++ if (!mem_cgroup_is_root(memcg)) ++ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); + css_put(&memcg->css); + } +@@ -6509,12 +6559,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + { + unsigned long flags; + +- if (nr_mem) +- res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); +- if (nr_memsw) +- res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); +- +- memcg_oom_recover(memcg); ++ if (!mem_cgroup_is_root(memcg)) { ++ if (nr_mem) ++ res_counter_uncharge(&memcg->res, ++ nr_mem * PAGE_SIZE); ++ if (nr_memsw) ++ res_counter_uncharge(&memcg->memsw, ++ nr_memsw * PAGE_SIZE); ++ memcg_oom_recover(memcg); ++ } + + local_irq_save(flags); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); +-- +2.0.4 diff --git a/a/content_digest b/N1/content_digest index 0a2d0a0..f8446b8 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -47,6 +47,241 @@ "Thanks for the clarification, that is truly horrible. Does the\n" "following revert restore performance in your case?\n" "\n" - --- + "---\n" + ">From 6fa7599054868cd0df940d7b0973dd64f8acb0b5 Mon Sep 17 00:00:00 2001\n" + "From: Johannes Weiner <hannes@cmpxchg.org>\n" + "Date: Thu, 4 Sep 2014 10:04:34 -0400\n" + "Subject: [patch] mm: memcontrol: revert use of root_mem_cgroup res_counter\n" + "\n" + "Dave Hansen reports a massive scalability regression in an uncontained\n" + "page fault benchmark with more than 30 concurrent threads, which he\n" + "bisected down to 05b843012335 (\"mm: memcontrol: use root_mem_cgroup\n" + "res_counter\") and pin-pointed on res_counter spinlock contention.\n" + "\n" + "That change relied on the per-cpu charge caches to mostly swallow the\n" + "res_counter costs, but it's apparent that the caches don't scale yet.\n" + "\n" + "Revert memcg back to bypassing res_counters on the root level in order\n" + "to restore performance for uncontained workloads.\n" + "\n" + "Reported-by: Dave Hansen <dave@sr71.net>\n" + "Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>\n" + "---\n" + " mm/memcontrol.c | 103 ++++++++++++++++++++++++++++++++++++++++++--------------\n" + " 1 file changed, 78 insertions(+), 25 deletions(-)\n" + "\n" + "diff --git a/mm/memcontrol.c b/mm/memcontrol.c\n" + "index ec4dcf1b9562..085dc6d2f876 100644\n" + "--- a/mm/memcontrol.c\n" + "+++ b/mm/memcontrol.c\n" + "@@ -2534,6 +2534,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,\n" + " \tunsigned long long size;\n" + " \tint ret = 0;\n" + " \n" + "+\tif (mem_cgroup_is_root(memcg))\n" + "+\t\tgoto done;\n" + " retry:\n" + " \tif (consume_stock(memcg, nr_pages))\n" + " \t\tgoto done;\n" + "@@ -2611,9 +2613,7 @@ nomem:\n" + " \tif (!(gfp_mask & __GFP_NOFAIL))\n" + " \t\treturn -ENOMEM;\n" + " bypass:\n" + "-\tmemcg = root_mem_cgroup;\n" + "-\tret = -EINTR;\n" + "-\tgoto retry;\n" + "+\treturn -EINTR;\n" + " \n" + " done_restock:\n" + " \tif (batch > nr_pages)\n" + "@@ -2626,6 +2626,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)\n" + " {\n" + " \tunsigned long bytes = nr_pages * PAGE_SIZE;\n" + " \n" + "+\tif (mem_cgroup_is_root(memcg))\n" + "+\t\treturn;\n" + "+\n" + " \tres_counter_uncharge(&memcg->res, bytes);\n" + " \tif (do_swap_account)\n" + " \t\tres_counter_uncharge(&memcg->memsw, bytes);\n" + "@@ -2640,6 +2643,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,\n" + " {\n" + " \tunsigned long bytes = nr_pages * PAGE_SIZE;\n" + " \n" + "+\tif (mem_cgroup_is_root(memcg))\n" + "+\t\treturn;\n" + "+\n" + " \tres_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);\n" + " \tif (do_swap_account)\n" + " \t\tres_counter_uncharge_until(&memcg->memsw,\n" + "@@ -4093,6 +4099,46 @@ out:\n" + " \treturn retval;\n" + " }\n" + " \n" + "+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,\n" + "+\t\t\t\t\t enum mem_cgroup_stat_index idx)\n" + "+{\n" + "+\tstruct mem_cgroup *iter;\n" + "+\tlong val = 0;\n" + "+\n" + "+\t/* Per-cpu values can be negative, use a signed accumulator */\n" + "+\tfor_each_mem_cgroup_tree(iter, memcg)\n" + "+\t\tval += mem_cgroup_read_stat(iter, idx);\n" + "+\n" + "+\tif (val < 0) /* race ? */\n" + "+\t\tval = 0;\n" + "+\treturn val;\n" + "+}\n" + "+\n" + "+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)\n" + "+{\n" + "+\tu64 val;\n" + "+\n" + "+\tif (!mem_cgroup_is_root(memcg)) {\n" + "+\t\tif (!swap)\n" + "+\t\t\treturn res_counter_read_u64(&memcg->res, RES_USAGE);\n" + "+\t\telse\n" + "+\t\t\treturn res_counter_read_u64(&memcg->memsw, RES_USAGE);\n" + "+\t}\n" + "+\n" + "+\t/*\n" + "+\t * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS\n" + "+\t * as well as in MEM_CGROUP_STAT_RSS_HUGE.\n" + "+\t */\n" + "+\tval = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);\n" + "+\tval += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);\n" + "+\n" + "+\tif (swap)\n" + "+\t\tval += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);\n" + "+\n" + "+\treturn val << PAGE_SHIFT;\n" + "+}\n" + "+\n" + "+\n" + " static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,\n" + " \t\t\t struct cftype *cft)\n" + " {\n" + "@@ -4102,8 +4148,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,\n" + " \n" + " \tswitch (type) {\n" + " \tcase _MEM:\n" + "+\t\tif (name == RES_USAGE)\n" + "+\t\t\treturn mem_cgroup_usage(memcg, false);\n" + " \t\treturn res_counter_read_u64(&memcg->res, name);\n" + " \tcase _MEMSWAP:\n" + "+\t\tif (name == RES_USAGE)\n" + "+\t\t\treturn mem_cgroup_usage(memcg, true);\n" + " \t\treturn res_counter_read_u64(&memcg->memsw, name);\n" + " \tcase _KMEM:\n" + " \t\treturn res_counter_read_u64(&memcg->kmem, name);\n" + "@@ -4572,10 +4622,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)\n" + " \tif (!t)\n" + " \t\tgoto unlock;\n" + " \n" + "-\tif (!swap)\n" + "-\t\tusage = res_counter_read_u64(&memcg->res, RES_USAGE);\n" + "-\telse\n" + "-\t\tusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);\n" + "+\tusage = mem_cgroup_usage(memcg, swap);\n" + " \n" + " \t/*\n" + " \t * current_threshold points to threshold just below or equal to usage.\n" + "@@ -4673,10 +4720,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,\n" + " \n" + " \tif (type == _MEM) {\n" + " \t\tthresholds = &memcg->thresholds;\n" + "-\t\tusage = res_counter_read_u64(&memcg->res, RES_USAGE);\n" + "+\t\tusage = mem_cgroup_usage(memcg, false);\n" + " \t} else if (type == _MEMSWAP) {\n" + " \t\tthresholds = &memcg->memsw_thresholds;\n" + "-\t\tusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);\n" + "+\t\tusage = mem_cgroup_usage(memcg, true);\n" + " \t} else\n" + " \t\tBUG();\n" + " \n" + "@@ -4762,10 +4809,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,\n" + " \n" + " \tif (type == _MEM) {\n" + " \t\tthresholds = &memcg->thresholds;\n" + "-\t\tusage = res_counter_read_u64(&memcg->res, RES_USAGE);\n" + "+\t\tusage = mem_cgroup_usage(memcg, false);\n" + " \t} else if (type == _MEMSWAP) {\n" + " \t\tthresholds = &memcg->memsw_thresholds;\n" + "-\t\tusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);\n" + "+\t\tusage = mem_cgroup_usage(memcg, true);\n" + " \t} else\n" + " \t\tBUG();\n" + " \n" + "@@ -5525,9 +5572,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)\n" + " \t\t * core guarantees its existence.\n" + " \t\t */\n" + " \t} else {\n" + "-\t\tres_counter_init(&memcg->res, &root_mem_cgroup->res);\n" + "-\t\tres_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);\n" + "-\t\tres_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);\n" + "+\t\tres_counter_init(&memcg->res, NULL);\n" + "+\t\tres_counter_init(&memcg->memsw, NULL);\n" + "+\t\tres_counter_init(&memcg->kmem, NULL);\n" + " \t\t/*\n" + " \t\t * Deeper hierachy with use_hierarchy == false doesn't make\n" + " \t\t * much sense so let cgroup subsystem know about this\n" + "@@ -5969,8 +6016,9 @@ static void __mem_cgroup_clear_mc(void)\n" + " \t/* we must fixup refcnts and charges */\n" + " \tif (mc.moved_swap) {\n" + " \t\t/* uncharge swap account from the old cgroup */\n" + "-\t\tres_counter_uncharge(&mc.from->memsw,\n" + "-\t\t\t\t PAGE_SIZE * mc.moved_swap);\n" + "+\t\tif (!mem_cgroup_is_root(mc.from))\n" + "+\t\t\tres_counter_uncharge(&mc.from->memsw,\n" + "+\t\t\t\t\t PAGE_SIZE * mc.moved_swap);\n" + " \n" + " \t\tfor (i = 0; i < mc.moved_swap; i++)\n" + " \t\t\tcss_put(&mc.from->css);\n" + "@@ -5979,8 +6027,9 @@ static void __mem_cgroup_clear_mc(void)\n" + " \t\t * we charged both to->res and to->memsw, so we should\n" + " \t\t * uncharge to->res.\n" + " \t\t */\n" + "-\t\tres_counter_uncharge(&mc.to->res,\n" + "-\t\t\t\t PAGE_SIZE * mc.moved_swap);\n" + "+\t\tif (!mem_cgroup_is_root(mc.to))\n" + "+\t\t\tres_counter_uncharge(&mc.to->res,\n" + "+\t\t\t\t\t PAGE_SIZE * mc.moved_swap);\n" + " \t\t/* we've already done css_get(mc.to) */\n" + " \t\tmc.moved_swap = 0;\n" + " \t}\n" + "@@ -6345,7 +6394,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)\n" + " \trcu_read_lock();\n" + " \tmemcg = mem_cgroup_lookup(id);\n" + " \tif (memcg) {\n" + "-\t\tres_counter_uncharge(&memcg->memsw, PAGE_SIZE);\n" + "+\t\tif (!mem_cgroup_is_root(memcg))\n" + "+\t\t\tres_counter_uncharge(&memcg->memsw, PAGE_SIZE);\n" + " \t\tmem_cgroup_swap_statistics(memcg, false);\n" + " \t\tcss_put(&memcg->css);\n" + " \t}\n" + "@@ -6509,12 +6559,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,\n" + " {\n" + " \tunsigned long flags;\n" + " \n" + "-\tif (nr_mem)\n" + "-\t\tres_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);\n" + "-\tif (nr_memsw)\n" + "-\t\tres_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);\n" + "-\n" + "-\tmemcg_oom_recover(memcg);\n" + "+\tif (!mem_cgroup_is_root(memcg)) {\n" + "+\t\tif (nr_mem)\n" + "+\t\t\tres_counter_uncharge(&memcg->res,\n" + "+\t\t\t\t\t nr_mem * PAGE_SIZE);\n" + "+\t\tif (nr_memsw)\n" + "+\t\t\tres_counter_uncharge(&memcg->memsw,\n" + "+\t\t\t\t\t nr_memsw * PAGE_SIZE);\n" + "+\t\tmemcg_oom_recover(memcg);\n" + "+\t}\n" + " \n" + " \tlocal_irq_save(flags);\n" + " \t__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);\n" + "-- \n" + 2.0.4 -b250a1d17c59e4a173b040a61c81cacc1396def806cf05bd43610670ed21bb88 +5948ad12330a4b2e5384ce059e2f29b3a14cc073b8f5503fafbe84eb2d3ce592
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.