From: Yuanchu Xie <yuanchu@google.com>
To: Andrew Morton <akpm@linux-foundation.org>,
David Hildenbrand <david@redhat.com>,
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>,
Khalid Aziz <khalid.aziz@oracle.com>,
Henry Huang <henry.hj@antgroup.com>, Yu Zhao <yuzhao@google.com>,
Dan Williams <dan.j.williams@intel.com>,
Gregory Price <gregory.price@memverge.com>,
Huang Ying <ying.huang@intel.com>,
Lance Yang <ioworker0@gmail.com>,
Randy Dunlap <rdunlap@infradead.org>,
Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: "Tejun Heo" <tj@kernel.org>,
"Johannes Weiner" <hannes@cmpxchg.org>,
"Michal Koutný" <mkoutny@suse.com>,
"Jonathan Corbet" <corbet@lwn.net>,
"Greg Kroah-Hartman" <gregkh@linuxfoundation.org>,
"Rafael J. Wysocki" <rafael@kernel.org>,
"Michael S. Tsirkin" <mst@redhat.com>,
"Jason Wang" <jasowang@redhat.com>,
"Xuan Zhuo" <xuanzhuo@linux.alibaba.com>,
"Eugenio Pérez" <eperezma@redhat.com>,
"Michal Hocko" <mhocko@kernel.org>,
"Roman Gushchin" <roman.gushchin@linux.dev>,
"Shakeel Butt" <shakeel.butt@linux.dev>,
"Muchun Song" <muchun.song@linux.dev>,
"Mike Rapoport" <rppt@kernel.org>,
"Shuah Khan" <shuah@kernel.org>,
"Christian Brauner" <brauner@kernel.org>,
"Daniel Watson" <ozzloy@each.do>,
"Yuanchu Xie" <yuanchu@google.com>,
cgroups@vger.kernel.org, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, virtualization@lists.linux.dev,
linux-mm@kvack.org, linux-kselftest@vger.kernel.org
Subject: [PATCH v4 2/9] mm: use refresh interval to rate-limit workingset report aggregation
Date: Tue, 26 Nov 2024 18:57:21 -0800 [thread overview]
Message-ID: <20241127025728.3689245-3-yuanchu@google.com> (raw)
In-Reply-To: <20241127025728.3689245-1-yuanchu@google.com>
The refresh interval rate-limits how often the workingset page age
histogram is regenerated. When a workingset report is generated, the
oldest of the lruvecs' youngest-generation timestamps is stored as the
timestamp of the report. Reads return that same report until it has aged
beyond the refresh interval, at which point a new report is generated.
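Sysfs interface:
/sys/devices/system/node/nodeX/workingset_report/refresh_interval
	Time in milliseconds for which a generated report remains valid.
	Writing 0 frees the page age histogram; no report is generated
	until a non-zero interval is written again.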
Signed-off-by: Yuanchu Xie <yuanchu@google.com>
---
include/linux/workingset_report.h | 1 +
mm/workingset_report.c | 101 ++++++++++++++++++++++++------
2 files changed, 83 insertions(+), 19 deletions(-)
diff --git a/include/linux/workingset_report.h b/include/linux/workingset_report.h
index d7c2ee14ec87..8bae6a600410 100644
--- a/include/linux/workingset_report.h
+++ b/include/linux/workingset_report.h
@@ -37,6 +37,7 @@ struct wsr_page_age_histo {
};
struct wsr_state {
+ unsigned long refresh_interval;
/* breakdown of workingset by page age */
struct mutex page_age_lock;
struct wsr_page_age_histo *page_age;
diff --git a/mm/workingset_report.c b/mm/workingset_report.c
index a4dcf62fcd96..8678536ccfc7 100644
--- a/mm/workingset_report.c
+++ b/mm/workingset_report.c
@@ -174,9 +174,11 @@ static void collect_page_age_type(const struct lru_gen_folio *lrugen,
* Assume the heuristic that pages are in the MGLRU generation
* through uniform accesses, so we can aggregate them
* proportionally into bins.
+ *
+ * Returns the timestamp of the youngest gen in this lruvec.
*/
-static void collect_page_age(struct wsr_page_age_histo *page_age,
- const struct lruvec *lruvec)
+static unsigned long collect_page_age(struct wsr_page_age_histo *page_age,
+ const struct lruvec *lruvec)
{
int type;
const struct lru_gen_folio *lrugen = &lruvec->lrugen;
@@ -191,11 +193,14 @@ static void collect_page_age(struct wsr_page_age_histo *page_age,
for (type = 0; type < ANON_AND_FILE; type++)
collect_page_age_type(lrugen, bin, max_seq, min_seq[type],
curr_timestamp, type);
+
+ return READ_ONCE(lruvec->lrugen.timestamps[lru_gen_from_seq(max_seq)]);
}
/* First step: hierarchically scan child memcgs. */
static void refresh_scan(struct wsr_state *wsr, struct mem_cgroup *root,
- struct pglist_data *pgdat)
+ struct pglist_data *pgdat,
+ unsigned long refresh_interval)
{
struct mem_cgroup *memcg;
unsigned int flags;
@@ -208,12 +213,15 @@ static void refresh_scan(struct wsr_state *wsr, struct mem_cgroup *root,
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq);
+ int gen = lru_gen_from_seq(max_seq);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
/*
* setting can_swap=true and force_scan=true ensures
* proper workingset stats when the system cannot swap.
*/
- try_to_inc_max_seq(lruvec, max_seq, true, true);
+ if (time_is_before_jiffies(birth + refresh_interval))
+ try_to_inc_max_seq(lruvec, max_seq, true, true);
cond_resched();
} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
@@ -228,6 +236,7 @@ static void refresh_aggregate(struct wsr_page_age_histo *page_age,
{
struct mem_cgroup *memcg;
struct wsr_report_bin *bin;
+ unsigned long oldest_lruvec_time = jiffies;
for (bin = page_age->bins;
bin->idle_age != WORKINGSET_INTERVAL_MAX; bin++) {
@@ -241,11 +250,15 @@ static void refresh_aggregate(struct wsr_page_age_histo *page_age,
memcg = mem_cgroup_iter(root, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ unsigned long lruvec_time =
+ collect_page_age(page_age, lruvec);
+
+ if (time_before(lruvec_time, oldest_lruvec_time))
+ oldest_lruvec_time = lruvec_time;
- collect_page_age(page_age, lruvec);
cond_resched();
} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
- WRITE_ONCE(page_age->timestamp, jiffies);
+ WRITE_ONCE(page_age->timestamp, oldest_lruvec_time);
}
static void copy_node_bins(struct pglist_data *pgdat,
@@ -270,17 +283,25 @@ bool wsr_refresh_report(struct wsr_state *wsr, struct mem_cgroup *root,
struct pglist_data *pgdat)
{
struct wsr_page_age_histo *page_age;
+ unsigned long refresh_interval = READ_ONCE(wsr->refresh_interval);
if (!READ_ONCE(wsr->page_age))
return false;
- refresh_scan(wsr, root, pgdat);
+ if (!refresh_interval)
+ return false;
+
mutex_lock(&wsr->page_age_lock);
page_age = READ_ONCE(wsr->page_age);
- if (page_age) {
- copy_node_bins(pgdat, page_age);
- refresh_aggregate(page_age, root, pgdat);
- }
+ if (!page_age)
+ goto unlock;
+ if (page_age->timestamp &&
+ time_is_after_jiffies(page_age->timestamp + refresh_interval))
+ goto unlock;
+ refresh_scan(wsr, root, pgdat, refresh_interval);
+ copy_node_bins(pgdat, page_age);
+ refresh_aggregate(page_age, root, pgdat);
+unlock:
mutex_unlock(&wsr->page_age_lock);
return !!page_age;
}
@@ -299,6 +320,52 @@ static struct wsr_state *kobj_to_wsr(struct kobject *kobj)
return &mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj))->wsr;
}
+/*
+ * Show the report refresh interval. The value is stored in jiffies and is
+ * converted to milliseconds for output.
+ */
+static ssize_t refresh_interval_show(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *buf)
+{
+	struct wsr_state *wsr = kobj_to_wsr(kobj);
+	/*
+	 * refresh_interval is an unsigned long jiffies count; keep the full
+	 * width so the value is not truncated before the conversion.
+	 */
+	unsigned long interval = READ_ONCE(wsr->refresh_interval);
+
+	return sysfs_emit(buf, "%u\n", jiffies_to_msecs(interval));
+}
+
+/*
+ * Set the report refresh interval from a value given in milliseconds.
+ *
+ * The page age histogram is allocated when a non-zero interval is written
+ * and none exists yet, and is freed when the interval is set to 0.
+ *
+ * Returns len on success, the kstrtouint() error if buf cannot be parsed,
+ * or -ENOMEM if the histogram cannot be allocated (the previously stored
+ * interval is left unchanged in that case).
+ */
+static ssize_t refresh_interval_store(struct kobject *kobj,
+				      struct kobj_attribute *attr,
+				      const char *buf, size_t len)
+{
+	unsigned int interval;
+	int err;
+	struct wsr_state *wsr = kobj_to_wsr(kobj);
+
+	err = kstrtouint(buf, 0, &interval);
+	if (err)
+		return err;
+
+	mutex_lock(&wsr->page_age_lock);
+	if (interval && !wsr->page_age) {
+		struct wsr_page_age_histo *page_age =
+			kzalloc(sizeof(*page_age), GFP_KERNEL);
+
+		if (!page_age) {
+			err = -ENOMEM;
+			goto unlock;
+		}
+		/* Pairs with the lockless READ_ONCE() in wsr_refresh_report(). */
+		WRITE_ONCE(wsr->page_age, page_age);
+	}
+	if (!interval && wsr->page_age) {
+		kfree(wsr->page_age);
+		WRITE_ONCE(wsr->page_age, NULL);
+	}
+
+	WRITE_ONCE(wsr->refresh_interval, msecs_to_jiffies(interval));
+unlock:
+	mutex_unlock(&wsr->page_age_lock);
+	return err ?: len;
+}
+
+static struct kobj_attribute refresh_interval_attr =
+ __ATTR_RW(refresh_interval); /* rw: refresh_interval_show()/_store() */
+
static ssize_t page_age_intervals_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -382,13 +449,6 @@ static ssize_t page_age_show(struct kobject *kobj, struct kobj_attribute *attr,
int ret = 0;
struct wsr_state *wsr = kobj_to_wsr(kobj);
-
- mutex_lock(&wsr->page_age_lock);
- if (!wsr->page_age)
- wsr->page_age =
- kzalloc(sizeof(struct wsr_page_age_histo), GFP_KERNEL);
- mutex_unlock(&wsr->page_age_lock);
-
wsr_refresh_report(wsr, NULL, kobj_to_pgdat(kobj));
mutex_lock(&wsr->page_age_lock);
@@ -414,7 +474,10 @@ static ssize_t page_age_show(struct kobject *kobj, struct kobj_attribute *attr,
static struct kobj_attribute page_age_attr = __ATTR_RO(page_age);
static struct attribute *workingset_report_attrs[] = {
- &page_age_intervals_attr.attr, &page_age_attr.attr, NULL
+ &refresh_interval_attr.attr,
+ &page_age_intervals_attr.attr,
+ &page_age_attr.attr,
+ NULL
};
static const struct attribute_group workingset_report_attr_group = {
--
2.47.0.338.g60cca15819-goog
next prev parent reply other threads:[~2024-11-27 2:57 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-11-27 2:57 [PATCH v4 0/9] mm: workingset reporting Yuanchu Xie
2024-11-27 2:57 ` [PATCH v4 1/9] mm: aggregate workingset information into histograms Yuanchu Xie
2024-11-27 4:21 ` Matthew Wilcox
2024-11-27 17:47 ` Yuanchu Xie
2024-11-27 2:57 ` Yuanchu Xie [this message]
2024-11-27 2:57 ` [PATCH v4 3/9] mm: report workingset during memory pressure driven scanning Yuanchu Xie
2024-11-27 2:57 ` [PATCH v4 4/9] mm: extend workingset reporting to memcgs Yuanchu Xie
2024-11-27 2:57 ` [PATCH v4 5/9] mm: add kernel aging thread for workingset reporting Yuanchu Xie
2024-11-27 2:57 ` [PATCH v4 6/9] selftest: test system-wide " Yuanchu Xie
2024-11-27 2:57 ` [PATCH v4 7/9] Docs/admin-guide/mm/workingset_report: document sysfs and memcg interfaces Yuanchu Xie
2024-11-27 2:57 ` [PATCH v4 8/9] Docs/admin-guide/cgroup-v2: document workingset reporting Yuanchu Xie
2024-11-27 2:57 ` [PATCH v4 9/9] virtio-balloon: add " Yuanchu Xie
2024-11-27 23:14 ` Daniel Verkamp
2024-11-27 23:38 ` Yuanchu Xie
2024-11-27 7:26 ` [PATCH v4 0/9] mm: " Johannes Weiner
2024-11-27 19:40 ` SeongJae Park
2024-11-27 23:33 ` Yu Zhao
2024-12-06 19:57 ` Yuanchu Xie
2024-12-11 19:53 ` SeongJae Park
2025-01-30 2:02 ` Yuanchu Xie
2025-01-30 4:11 ` SeongJae Park
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241127025728.3689245-3-yuanchu@google.com \
--to=yuanchu@google.com \
--cc=akpm@linux-foundation.org \
--cc=aneesh.kumar@linux.ibm.com \
--cc=brauner@kernel.org \
--cc=cgroups@vger.kernel.org \
--cc=corbet@lwn.net \
--cc=dan.j.williams@intel.com \
--cc=david@redhat.com \
--cc=eperezma@redhat.com \
--cc=gregkh@linuxfoundation.org \
--cc=gregory.price@memverge.com \
--cc=hannes@cmpxchg.org \
--cc=henry.hj@antgroup.com \
--cc=ioworker0@gmail.com \
--cc=jasowang@redhat.com \
--cc=khalid.aziz@oracle.com \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=mkoutny@suse.com \
--cc=mst@redhat.com \
--cc=muchun.song@linux.dev \
--cc=ozzloy@each.do \
--cc=rafael@kernel.org \
--cc=rdunlap@infradead.org \
--cc=roman.gushchin@linux.dev \
--cc=rppt@kernel.org \
--cc=shakeel.butt@linux.dev \
--cc=shuah@kernel.org \
--cc=tj@kernel.org \
--cc=usama.anjum@collabora.com \
--cc=virtualization@lists.linux.dev \
--cc=xuanzhuo@linux.alibaba.com \
--cc=ying.huang@intel.com \
--cc=yuzhao@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).