From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
To: linux-kernel@vger.kernel.org
Cc: peterz@infradead.org, surenb@google.com, brauner@kernel.org,
chris@chrisdown.name, hannes@cmpxchg.org,
Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Subject: [PATCH 1/4] sched/psi: rearrange polling code in preparation
Date: Thu, 9 Mar 2023 18:07:53 +0100 [thread overview]
Message-ID: <20230309170756.52927-2-cerasuolodomenico@gmail.com> (raw)
In-Reply-To: <20230309170756.52927-1-cerasuolodomenico@gmail.com>
Move a few functions up in the file to avoid the forward declarations that
would otherwise be needed by the patch implementing unprivileged PSI triggers.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
---
kernel/sched/psi.c | 196 ++++++++++++++++++++++-----------------------
1 file changed, 98 insertions(+), 98 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 02e011cabe91..fe9269f1d2a4 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -384,92 +384,6 @@ static void collect_percpu_times(struct psi_group *group,
*pchanged_states = changed_states;
}
-static u64 update_averages(struct psi_group *group, u64 now)
-{
- unsigned long missed_periods = 0;
- u64 expires, period;
- u64 avg_next_update;
- int s;
-
- /* avgX= */
- expires = group->avg_next_update;
- if (now - expires >= psi_period)
- missed_periods = div_u64(now - expires, psi_period);
-
- /*
- * The periodic clock tick can get delayed for various
- * reasons, especially on loaded systems. To avoid clock
- * drift, we schedule the clock in fixed psi_period intervals.
- * But the deltas we sample out of the per-cpu buckets above
- * are based on the actual time elapsing between clock ticks.
- */
- avg_next_update = expires + ((1 + missed_periods) * psi_period);
- period = now - (group->avg_last_update + (missed_periods * psi_period));
- group->avg_last_update = now;
-
- for (s = 0; s < NR_PSI_STATES - 1; s++) {
- u32 sample;
-
- sample = group->total[PSI_AVGS][s] - group->avg_total[s];
- /*
- * Due to the lockless sampling of the time buckets,
- * recorded time deltas can slip into the next period,
- * which under full pressure can result in samples in
- * excess of the period length.
- *
- * We don't want to report non-sensical pressures in
- * excess of 100%, nor do we want to drop such events
- * on the floor. Instead we punt any overage into the
- * future until pressure subsides. By doing this we
- * don't underreport the occurring pressure curve, we
- * just report it delayed by one period length.
- *
- * The error isn't cumulative. As soon as another
- * delta slips from a period P to P+1, by definition
- * it frees up its time T in P.
- */
- if (sample > period)
- sample = period;
- group->avg_total[s] += sample;
- calc_avgs(group->avg[s], missed_periods, sample, period);
- }
-
- return avg_next_update;
-}
-
-static void psi_avgs_work(struct work_struct *work)
-{
- struct delayed_work *dwork;
- struct psi_group *group;
- u32 changed_states;
- u64 now;
-
- dwork = to_delayed_work(work);
- group = container_of(dwork, struct psi_group, avgs_work);
-
- mutex_lock(&group->avgs_lock);
-
- now = sched_clock();
-
- collect_percpu_times(group, PSI_AVGS, &changed_states);
- /*
- * If there is task activity, periodically fold the per-cpu
- * times and feed samples into the running averages. If things
- * are idle and there is no data to process, stop the clock.
- * Once restarted, we'll catch up the running averages in one
- * go - see calc_avgs() and missed_periods.
- */
- if (now >= group->avg_next_update)
- group->avg_next_update = update_averages(group, now);
-
- if (changed_states & PSI_STATE_RESCHEDULE) {
- schedule_delayed_work(dwork, nsecs_to_jiffies(
- group->avg_next_update - now) + 1);
- }
-
- mutex_unlock(&group->avgs_lock);
-}
-
/* Trigger tracking window manipulations */
static void window_reset(struct psi_window *win, u64 now, u64 value,
u64 prev_growth)
@@ -516,18 +430,6 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
return growth;
}
-static void init_triggers(struct psi_group *group, u64 now)
-{
- struct psi_trigger *t;
-
- list_for_each_entry(t, &group->triggers, node)
- window_reset(&t->win, now,
- group->total[PSI_POLL][t->state], 0);
- memcpy(group->polling_total, group->total[PSI_POLL],
- sizeof(group->polling_total));
- group->polling_next_update = now + group->poll_min_period;
-}
-
static u64 update_triggers(struct psi_group *group, u64 now)
{
struct psi_trigger *t;
@@ -590,6 +492,104 @@ static u64 update_triggers(struct psi_group *group, u64 now)
return now + group->poll_min_period;
}
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+ unsigned long missed_periods = 0;
+ u64 expires, period;
+ u64 avg_next_update;
+ int s;
+
+ /* avgX= */
+ expires = group->avg_next_update;
+ if (now - expires >= psi_period)
+ missed_periods = div_u64(now - expires, psi_period);
+
+ /*
+ * The periodic clock tick can get delayed for various
+ * reasons, especially on loaded systems. To avoid clock
+ * drift, we schedule the clock in fixed psi_period intervals.
+ * But the deltas we sample out of the per-cpu buckets above
+ * are based on the actual time elapsing between clock ticks.
+ */
+ avg_next_update = expires + ((1 + missed_periods) * psi_period);
+ period = now - (group->avg_last_update + (missed_periods * psi_period));
+ group->avg_last_update = now;
+
+ for (s = 0; s < NR_PSI_STATES - 1; s++) {
+ u32 sample;
+
+ sample = group->total[PSI_AVGS][s] - group->avg_total[s];
+ /*
+ * Due to the lockless sampling of the time buckets,
+ * recorded time deltas can slip into the next period,
+ * which under full pressure can result in samples in
+ * excess of the period length.
+ *
+ * We don't want to report non-sensical pressures in
+ * excess of 100%, nor do we want to drop such events
+ * on the floor. Instead we punt any overage into the
+ * future until pressure subsides. By doing this we
+ * don't underreport the occurring pressure curve, we
+ * just report it delayed by one period length.
+ *
+ * The error isn't cumulative. As soon as another
+ * delta slips from a period P to P+1, by definition
+ * it frees up its time T in P.
+ */
+ if (sample > period)
+ sample = period;
+ group->avg_total[s] += sample;
+ calc_avgs(group->avg[s], missed_periods, sample, period);
+ }
+
+ return avg_next_update;
+}
+
+static void psi_avgs_work(struct work_struct *work)
+{
+ struct delayed_work *dwork;
+ struct psi_group *group;
+ u32 changed_states;
+ u64 now;
+
+ dwork = to_delayed_work(work);
+ group = container_of(dwork, struct psi_group, avgs_work);
+
+ mutex_lock(&group->avgs_lock);
+
+ now = sched_clock();
+
+ collect_percpu_times(group, PSI_AVGS, &changed_states);
+ /*
+ * If there is task activity, periodically fold the per-cpu
+ * times and feed samples into the running averages. If things
+ * are idle and there is no data to process, stop the clock.
+ * Once restarted, we'll catch up the running averages in one
+ * go - see calc_avgs() and missed_periods.
+ */
+ if (now >= group->avg_next_update)
+ group->avg_next_update = update_averages(group, now);
+
+ if (changed_states & PSI_STATE_RESCHEDULE) {
+ schedule_delayed_work(dwork, nsecs_to_jiffies(
+ group->avg_next_update - now) + 1);
+ }
+
+ mutex_unlock(&group->avgs_lock);
+}
+
+static void init_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+
+ list_for_each_entry(t, &group->triggers, node)
+ window_reset(&t->win, now,
+ group->total[PSI_POLL][t->state], 0);
+ memcpy(group->polling_total, group->total[PSI_POLL],
+ sizeof(group->polling_total));
+ group->polling_next_update = now + group->poll_min_period;
+}
+
/* Schedule polling if it's not already scheduled or forced. */
static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
bool force)
--
2.34.1
next prev parent reply other threads:[~2023-03-09 17:11 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-03-09 17:07 [PATCH 0/4] sched/psi: Allow unprivileged PSI polling Domenico Cerasuolo
2023-03-09 17:07 ` Domenico Cerasuolo [this message]
2023-03-20 21:06 ` [PATCH 1/4] sched/psi: rearrange polling code in preparation Suren Baghdasaryan
2023-03-09 17:07 ` [PATCH 2/4] sched/psi: rename existing poll members " Domenico Cerasuolo
2023-03-20 21:19 ` Suren Baghdasaryan
2023-03-09 17:07 ` [PATCH 3/4] sched/psi: extract update_triggers side effect Domenico Cerasuolo
2023-03-20 23:00 ` Suren Baghdasaryan
[not found] ` <CA+CLi1g=70ot=YFL+xug3jC4OXG727NGo+NXxmC45WcwaFpo8g@mail.gmail.com>
2023-03-22 3:40 ` Suren Baghdasaryan
[not found] ` <CA+CLi1gjKFFgoeHQs-sNn1knqk1w9rb73kaOqP9z8TUwwiqqFQ@mail.gmail.com>
2023-03-22 16:41 ` Suren Baghdasaryan
2023-03-09 17:07 ` [PATCH 4/4] sched/psi: allow unprivileged polling of N*2s period Domenico Cerasuolo
2023-03-20 23:17 ` Suren Baghdasaryan
2023-03-13 15:29 ` [PATCH 0/4] sched/psi: Allow unprivileged PSI polling Suren Baghdasaryan
2023-03-14 16:10 ` Johannes Weiner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230309170756.52927-2-cerasuolodomenico@gmail.com \
--to=cerasuolodomenico@gmail.com \
--cc=brauner@kernel.org \
--cc=chris@chrisdown.name \
--cc=hannes@cmpxchg.org \
--cc=linux-kernel@vger.kernel.org \
--cc=peterz@infradead.org \
--cc=surenb@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.