From: Gregory Price <gourry@gourry.net>
To: kaiyang2@cs.cmu.edu
Cc: linux-mm@kvack.org, cgroups@vger.kernel.org,
roman.gushchin@linux.dev, shakeel.butt@linux.dev,
muchun.song@linux.dev, akpm@linux-foundation.org,
mhocko@kernel.org, nehagholkar@meta.com, abhishekd@meta.com,
hannes@cmpxchg.org, weixugc@google.com, rientjes@google.com
Subject: Re: [RFC PATCH 2/4] calculate memory.low for the local node and track its usage
Date: Tue, 15 Oct 2024 18:05:05 -0400 [thread overview]
Message-ID: <Zw7nEfmkm36JZaae@PC2K9PVX.TheFacebook.com> (raw)
In-Reply-To: <20240920221202.1734227-3-kaiyang2@cs.cmu.edu>
On Fri, Sep 20, 2024 at 10:11:49PM +0000, kaiyang2@cs.cmu.edu wrote:
> From: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
>
> Add a memory.low for the top-tier node (locallow) and track its usage.
> locallow is set by scaling low by the ratio of node 0 capacity and
> node 0 + node 1 capacity.
>
> Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
> ---
> include/linux/page_counter.h | 16 ++++++++---
> mm/hugetlb_cgroup.c | 4 +--
> mm/memcontrol.c | 42 ++++++++++++++++++++++-------
> mm/page_counter.c | 52 ++++++++++++++++++++++++++++--------
> 4 files changed, 88 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
> index 79dbd8bc35a7..aa56c93415ef 100644
> --- a/include/linux/page_counter.h
> +++ b/include/linux/page_counter.h
> @@ -13,6 +13,7 @@ struct page_counter {
> * memcg->memory.usage is a hot member of struct mem_cgroup.
> */
> atomic_long_t usage;
> + struct mem_cgroup *memcg; /* memcg that owns this counter */
Can you make some comments on the lifetime of this new memcg reference?
How is it referenced, how is it cleaned up, etc.
Probably it's worth adding this in a separate patch so it's easier
to review the reference tracking.
> CACHELINE_PADDING(_pad1_);
>
> /* effective memory.min and memory.min usage tracking */
> @@ -25,6 +26,10 @@ struct page_counter {
> atomic_long_t low_usage;
> atomic_long_t children_low_usage;
>
> + unsigned long elocallow;
> + atomic_long_t locallow_usage;
per note on other email - probably want local_low_* instead of locallow.
> + atomic_long_t children_locallow_usage;
> +
> unsigned long watermark;
> /* Latest cg2 reset watermark */
> unsigned long local_watermark;
> @@ -36,6 +41,7 @@ struct page_counter {
> bool protection_support;
> unsigned long min;
> unsigned long low;
> + unsigned long locallow;
> unsigned long high;
> unsigned long max;
> struct page_counter *parent;
> @@ -52,12 +58,13 @@ struct page_counter {
> */
> static inline void page_counter_init(struct page_counter *counter,
> struct page_counter *parent,
> - bool protection_support)
> + bool protection_support, struct mem_cgroup *memcg)
> {
> counter->usage = (atomic_long_t)ATOMIC_LONG_INIT(0);
> counter->max = PAGE_COUNTER_MAX;
> counter->parent = parent;
> counter->protection_support = protection_support;
> + counter->memcg = memcg;
> }
>
> static inline unsigned long page_counter_read(struct page_counter *counter)
> @@ -72,7 +79,8 @@ bool page_counter_try_charge(struct page_counter *counter,
> struct page_counter **fail);
> void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
> void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
> -void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
> +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages,
> + unsigned long nr_pages_local);
>
> static inline void page_counter_set_high(struct page_counter *counter,
> unsigned long nr_pages)
> @@ -99,11 +107,11 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
> #ifdef CONFIG_MEMCG
> void page_counter_calculate_protection(struct page_counter *root,
> struct page_counter *counter,
> - bool recursive_protection);
> + bool recursive_protection, int is_local);
`bool is_local` is preferred
> #else
> static inline void page_counter_calculate_protection(struct page_counter *root,
> struct page_counter *counter,
> - bool recursive_protection) {}
> + bool recursive_protection, int is_local) {}
> #endif
>
> #endif /* _LINUX_PAGE_COUNTER_H */
> diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
> index d8d0e665caed..0e07a7a1d5b8 100644
> --- a/mm/hugetlb_cgroup.c
> +++ b/mm/hugetlb_cgroup.c
> @@ -114,10 +114,10 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
> }
> page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
> idx),
> - fault_parent, false);
> + fault_parent, false, NULL);
> page_counter_init(
> hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
> - rsvd_parent, false);
> + rsvd_parent, false, NULL);
>
> limit = round_down(PAGE_COUNTER_MAX,
> pages_per_huge_page(&hstates[idx]));
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 20b715441332..d7c5fff12105 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1497,6 +1497,9 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
> vm_event_name(memcg_vm_event_stat[i]),
> memcg_events(memcg, memcg_vm_event_stat[i]));
> }
> +
> + seq_buf_printf(s, "local_usage %lu\n",
> + get_cgroup_local_usage(memcg, true));
> }
>
> static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
> @@ -3597,8 +3600,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> if (parent) {
> WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
>
> - page_counter_init(&memcg->memory, &parent->memory, true);
> - page_counter_init(&memcg->swap, &parent->swap, false);
> + page_counter_init(&memcg->memory, &parent->memory, true, memcg);
> + page_counter_init(&memcg->swap, &parent->swap, false, NULL);
> #ifdef CONFIG_MEMCG_V1
> WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
> page_counter_init(&memcg->kmem, &parent->kmem, false);
> @@ -3607,8 +3610,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> } else {
> init_memcg_stats();
> init_memcg_events();
> - page_counter_init(&memcg->memory, NULL, true);
> - page_counter_init(&memcg->swap, NULL, false);
> + page_counter_init(&memcg->memory, NULL, true, memcg);
> + page_counter_init(&memcg->swap, NULL, false, NULL);
> #ifdef CONFIG_MEMCG_V1
> page_counter_init(&memcg->kmem, NULL, false);
> page_counter_init(&memcg->tcpmem, NULL, false);
> @@ -3677,7 +3680,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> memcg1_css_offline(memcg);
>
> page_counter_set_min(&memcg->memory, 0);
> - page_counter_set_low(&memcg->memory, 0);
> + page_counter_set_low(&memcg->memory, 0, 0);
>
> zswap_memcg_offline_cleanup(memcg);
>
> @@ -3748,7 +3751,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
> page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
> #endif
> page_counter_set_min(&memcg->memory, 0);
> - page_counter_set_low(&memcg->memory, 0);
> + page_counter_set_low(&memcg->memory, 0, 0);
> page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
> memcg1_soft_limit_reset(memcg);
> page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
> @@ -4051,6 +4054,12 @@ static ssize_t memory_min_write(struct kernfs_open_file *of,
> return nbytes;
> }
>
> +static int memory_locallow_show(struct seq_file *m, void *v)
> +{
> + return seq_puts_memcg_tunable(m,
> + READ_ONCE(mem_cgroup_from_seq(m)->memory.locallow));
> +}
> +
> static int memory_low_show(struct seq_file *m, void *v)
> {
> return seq_puts_memcg_tunable(m,
> @@ -4061,7 +4070,8 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
> char *buf, size_t nbytes, loff_t off)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> - unsigned long low;
> + struct sysinfo si;
> + unsigned long low, locallow, local_capacity, total_capacity;
> int err;
>
> buf = strstrip(buf);
> @@ -4069,7 +4079,15 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
> if (err)
> return err;
>
> - page_counter_set_low(&memcg->memory, low);
> + /* Hardcoded 0 for local node and 1 for remote. */
I know we've talked about this before, but this is obviously broken
for multi-socket systems. This needs at least a FIXME or a TODO so that
it's obvious that this patch isn't ready for upstream - even as an RFC.
Probably we can't move forward until we figure out how to solve this problem
out ahead of this patch set. Worth discussing this issue explicitly.
Maybe rather than guessing, a preferred node should be set for local and
remote if this mechanism is in use. Otherwise just guessing which node is
local and which is remote seems like it will be wrong - especially for
sufficiently large-threaded processes.
> + si_meminfo_node(&si, 0);
> + local_capacity = si.totalram; /* In pages. */
> + total_capacity = local_capacity;
> + si_meminfo_node(&si, 1);
> + total_capacity += si.totalram;
> + locallow = low * local_capacity / total_capacity;
> +
> + page_counter_set_low(&memcg->memory, low, locallow);
>
> return nbytes;
> }
> @@ -4394,6 +4412,11 @@ static struct cftype memory_files[] = {
> .seq_show = memory_low_show,
> .write = memory_low_write,
> },
> + {
> + .name = "locallow",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .seq_show = memory_locallow_show,
> + },
> {
> .name = "high",
> .flags = CFTYPE_NOT_ON_ROOT,
> @@ -4483,7 +4506,8 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
> if (!root)
> root = root_mem_cgroup;
>
> - page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection);
> + page_counter_calculate_protection(&root->memory, &memcg->memory,
> + recursive_protection, false);
> }
>
> static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
> diff --git a/mm/page_counter.c b/mm/page_counter.c
> index b249d15af9dd..97205aafab46 100644
> --- a/mm/page_counter.c
> +++ b/mm/page_counter.c
> @@ -18,8 +18,10 @@ static bool track_protection(struct page_counter *c)
> return c->protection_support;
> }
>
> +extern unsigned long get_cgroup_local_usage(struct mem_cgroup *memcg, bool flush);
> +
> static void propagate_protected_usage(struct page_counter *c,
> - unsigned long usage)
> + unsigned long usage, unsigned long local_usage)
> {
> unsigned long protected, old_protected;
> long delta;
> @@ -44,6 +46,15 @@ static void propagate_protected_usage(struct page_counter *c,
> if (delta)
> atomic_long_add(delta, &c->parent->children_low_usage);
> }
> +
> + protected = min(local_usage, READ_ONCE(c->locallow));
> + old_protected = atomic_long_read(&c->locallow_usage);
> + if (protected != old_protected) {
> + old_protected = atomic_long_xchg(&c->locallow_usage, protected);
> + delta = protected - old_protected;
> + if (delta)
> + atomic_long_add(delta, &c->parent->children_locallow_usage);
> + }
> }
>
> /**
> @@ -63,7 +74,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
> atomic_long_set(&counter->usage, new);
> }
> if (track_protection(counter))
> - propagate_protected_usage(counter, new);
> + propagate_protected_usage(counter, new,
> + get_cgroup_local_usage(counter->memcg, false));
> }
>
> /**
> @@ -83,7 +95,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
>
> new = atomic_long_add_return(nr_pages, &c->usage);
> if (protection)
> - propagate_protected_usage(c, new);
> + propagate_protected_usage(c, new,
> + get_cgroup_local_usage(counter->memcg, false));
> /*
> * This is indeed racy, but we can live with some
> * inaccuracy in the watermark.
> @@ -151,7 +164,8 @@ bool page_counter_try_charge(struct page_counter *counter,
> goto failed;
> }
> if (protection)
> - propagate_protected_usage(c, new);
> + propagate_protected_usage(c, new,
> + get_cgroup_local_usage(counter->memcg, false));
>
> /* see comment on page_counter_charge */
> if (new > READ_ONCE(c->local_watermark)) {
> @@ -238,7 +252,8 @@ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
> WRITE_ONCE(counter->min, nr_pages);
>
> for (c = counter; c; c = c->parent)
> - propagate_protected_usage(c, atomic_long_read(&c->usage));
> + propagate_protected_usage(c, atomic_long_read(&c->usage),
> + get_cgroup_local_usage(counter->memcg, false));
> }
>
> /**
> @@ -248,14 +263,17 @@ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
> *
> * The caller must serialize invocations on the same counter.
> */
> -void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
> +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages,
> + unsigned long nr_pages_local)
> {
> struct page_counter *c;
>
> WRITE_ONCE(counter->low, nr_pages);
> + WRITE_ONCE(counter->locallow, nr_pages_local);
>
> for (c = counter; c; c = c->parent)
> - propagate_protected_usage(c, atomic_long_read(&c->usage));
> + propagate_protected_usage(c, atomic_long_read(&c->usage),
> + get_cgroup_local_usage(counter->memcg, false));
> }
>
> /**
> @@ -421,9 +439,9 @@ static unsigned long effective_protection(unsigned long usage,
> */
> void page_counter_calculate_protection(struct page_counter *root,
> struct page_counter *counter,
> - bool recursive_protection)
> + bool recursive_protection, int is_local)
> {
> - unsigned long usage, parent_usage;
> + unsigned long usage, parent_usage, local_usage, parent_local_usage;
> struct page_counter *parent = counter->parent;
>
> /*
> @@ -437,16 +455,19 @@ void page_counter_calculate_protection(struct page_counter *root,
> return;
>
> usage = page_counter_read(counter);
> - if (!usage)
> + local_usage = get_cgroup_local_usage(counter->memcg, true);
> + if (!usage || !local_usage)
> return;
>
> if (parent == root) {
> counter->emin = READ_ONCE(counter->min);
> counter->elow = READ_ONCE(counter->low);
> + counter->elocallow = READ_ONCE(counter->locallow);
> return;
> }
>
> parent_usage = page_counter_read(parent);
> + parent_local_usage = get_cgroup_local_usage(parent->memcg, true);
>
> WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
> READ_ONCE(counter->min),
> @@ -454,7 +475,16 @@ void page_counter_calculate_protection(struct page_counter *root,
> atomic_long_read(&parent->children_min_usage),
> recursive_protection));
>
> - WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
> + if (is_local)
> + WRITE_ONCE(counter->elocallow,
> + effective_protection(local_usage, parent_local_usage,
> + READ_ONCE(counter->locallow),
> + READ_ONCE(parent->elocallow),
> + atomic_long_read(&parent->children_locallow_usage),
> + recursive_protection));
> + else
> + WRITE_ONCE(counter->elow,
> + effective_protection(usage, parent_usage,
> READ_ONCE(counter->low),
> READ_ONCE(parent->elow),
> atomic_long_read(&parent->children_low_usage),
> --
> 2.43.0
>
next prev parent reply other threads:[~2024-10-15 22:05 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-09-20 22:11 [RFC PATCH 0/4] memory tiering fairness by per-cgroup control of promotion and demotion kaiyang2
2024-09-20 22:11 ` [RFC PATCH 1/4] Add get_cgroup_local_usage for estimating the top-tier memory usage kaiyang2
2024-09-20 22:11 ` [RFC PATCH 2/4] calculate memory.low for the local node and track its usage kaiyang2
2024-09-22 8:39 ` kernel test robot
2024-10-15 22:05 ` Gregory Price [this message]
2024-09-20 22:11 ` [RFC PATCH 3/4] use memory.low local node protection for local node reclaim kaiyang2
2024-10-15 21:52 ` Gregory Price
2024-09-20 22:11 ` [RFC PATCH 4/4] reduce NUMA balancing scan size of cgroups over their local memory.low kaiyang2
2024-10-11 20:51 ` [RFC PATCH 0/4] memory tiering fairness by per-cgroup control of promotion and demotion Kaiyang Zhao
2024-11-08 19:01 ` kaiyang2
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=Zw7nEfmkm36JZaae@PC2K9PVX.TheFacebook.com \
--to=gourry@gourry.net \
--cc=abhishekd@meta.com \
--cc=akpm@linux-foundation.org \
--cc=cgroups@vger.kernel.org \
--cc=hannes@cmpxchg.org \
--cc=kaiyang2@cs.cmu.edu \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=muchun.song@linux.dev \
--cc=nehagholkar@meta.com \
--cc=rientjes@google.com \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=weixugc@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox