From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 266F82288F4 for ; Fri, 25 Apr 2025 21:58:55 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1745618337; cv=none; b=EoEmX7X5+sLewh++L80fEIxj/yhOTKDnrYaF/v2ken/WA7lLAIP62cMIdYg9J26F5+rubX6yzKSCjFpT81hOHO3QFnGKFDdOhCQ/tcgxblQ1pNdx5NoO0skwVeK1YS31jMMU/apEDqY4NeHi9iC3wo5pT02OzxDZs7L2d5kw4Fs= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1745618337; c=relaxed/simple; bh=i35djFKE7u7cXMY/Zz7C7JTSM9N6tp3NpptpqLhcxeA=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=d8+8/RM8cIKcn9Kr8PlghJivEXRDU8BD7FdIdRy/D9E1lTGxoIWM5oX5wOXRWy/h+qXiDEUX9c+VGQJozTzJixr2Ixb6kFbStTbMEWPputMJW1lgQBrEYLaT6CaE1I91Ymy+WRan+438yHnO94S+OjrVc8/ZHJT76WzkRcnbzQU= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=iC8I7MhT; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="iC8I7MhT" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 85E6FC4CEE9; Fri, 25 Apr 2025 21:58:55 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1745618335; bh=i35djFKE7u7cXMY/Zz7C7JTSM9N6tp3NpptpqLhcxeA=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=iC8I7MhTMrk3Oh0CqvFXsMooLbHJs9zVxbiByFdlkwdlydo2sF3OU4Udm9uTygXh5 jf5vtbjJTiskCXfNgUUKKbX/v+OGtfkKpZxtxG+C4pZprVzfs30lt69QY2K3jXzGCY gJgVEW5qaF4iJ5VNfco1m3dK5/N9O+fLujR6H6xYNtaMp8SvdozulJMXac+d8EVFEH LcbJ0RJZShmprlKVKDIKDAXXuJt6hZZ3Assk79ZkZhXi1ciVTwUigX9A6Xp9NB5iye 897LQAxwA1G5ABirfqhGlaJzeysy5M32jFFWFuw2lEb9bddpZvCGXCLfU/PIt+D7SV krCc+7dFbgAYQ== From: Tejun Heo To: void@manifault.com, arighi@nvidia.com, multics69@gmail.com Cc: linux-kernel@vger.kernel.org, sched-ext@meta.com, Tejun Heo Subject: [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched Date: Fri, 25 Apr 2025 11:58:25 -1000 Message-ID: <20250425215840.2334972-11-tj@kernel.org> X-Mailer: git-send-email 2.49.0 In-Reply-To: <20250425215840.2334972-1-tj@kernel.org> References: <20250425215840.2334972-1-tj@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit The event counters are going to become per scheduler instance. Move event_stats_cpu into scx_sched. - [__]scx_add_event() are updated to take @sch. While at it, add missing parentheses around @cnt expansions. - scx_read_events() is updated to take @sch. - scx_bpf_events() accesses scx_root under RCU read lock. v2: - Replace stale scx_bpf_get_event_stat() reference in a comment with scx_bpf_events(). - Trivial goto label rename. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 100 ++++++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 42 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 885cd71a847e..154c354856fc 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -838,6 +838,13 @@ struct scx_sched { struct rhashtable dsq_hash; struct scx_dispatch_q **global_dsqs; + /* + * The event counters are in a per-CPU variable to minimize the + * accounting overhead. A system-wide view on the event counter is + * constructed when requested by scx_bpf_events(). + */ + struct scx_event_stats __percpu *event_stats_cpu; + bool warned_zero_slice; atomic_t exit_kind; @@ -1599,34 +1606,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return p; } -/* - * The event counter is organized by a per-CPU variable to minimize the - * accounting overhead without synchronization. A system-wide view on the - * event counter is constructed when requested by scx_bpf_get_event_stat(). - */ -static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); - /** * scx_add_event - Increase an event counter for 'name' by 'cnt' + * @sch: scx_sched to account events for * @name: an event name defined in struct scx_event_stats * @cnt: the number of the event occured * * This can be used when preemption is not disabled. */ -#define scx_add_event(name, cnt) do { \ - this_cpu_add(event_stats_cpu.name, cnt); \ - trace_sched_ext_event(#name, cnt); \ +#define scx_add_event(sch, name, cnt) do { \ + this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + trace_sched_ext_event(#name, (cnt)); \ } while(0) /** * __scx_add_event - Increase an event counter for 'name' by 'cnt' + * @sch: scx_sched to account events for * @name: an event name defined in struct scx_event_stats * @cnt: the number of the event occured * * This should be used only when preemption is disabled. */ -#define __scx_add_event(name, cnt) do { \ - __this_cpu_add(event_stats_cpu.name, cnt); \ +#define __scx_add_event(sch, name, cnt) do { \ + __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ trace_sched_ext_event(#name, cnt); \ } while(0) @@ -1651,7 +1653,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); } while (0) -static void scx_read_events(struct scx_event_stats *events); +static void scx_read_events(struct scx_sched *sch, + struct scx_event_stats *events); static enum scx_enable_state scx_enable_state(void) { @@ -1877,7 +1880,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) static void refill_task_slice_dfl(struct task_struct *p) { p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(SCX_EV_REFILL_SLICE_DFL, 1); + __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); } static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, @@ -2206,7 +2209,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, goto local; if (scx_rq_bypassing(rq)) { - __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); goto global; } @@ -2216,14 +2219,14 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, /* see %SCX_OPS_ENQ_EXITING */ if (!(scx_root->ops.flags & SCX_OPS_ENQ_EXITING) && unlikely(p->flags & PF_EXITING)) { - __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1); + __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); goto local; } /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ if (!(scx_root->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && is_migration_disabled(p)) { - __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); + __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); goto local; } @@ -2346,7 +2349,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags if ((enq_flags & SCX_ENQ_CPU_SELECTED) && unlikely(cpu_of(rq) != p->scx.selected_cpu)) - __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1); + __scx_add_event(scx_root, SCX_EV_SELECT_CPU_FALLBACK, 1); } static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) @@ -2574,7 +2577,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, if (!scx_rq_online(rq)) { if (enforce) - __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); + __scx_add_event(scx_root, + SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; } @@ -3096,7 +3100,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (prev_on_rq && (!(scx_root->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; - __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1); + __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; } rq->scx.flags &= ~SCX_RQ_IN_BALANCE; @@ -3427,6 +3431,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { + struct scx_sched *sch = scx_root; bool rq_bypass; /* @@ -3443,7 +3448,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag return prev_cpu; rq_bypass = scx_rq_bypassing(task_rq(p)); - if (likely(SCX_HAS_OP(scx_root, select_cpu)) && !rq_bypass) { + if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3472,7 +3477,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag p->scx.selected_cpu = cpu; if (rq_bypass) - __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); return cpu; } } @@ -4413,6 +4418,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work) struct scx_dispatch_q *dsq; int node; + free_percpu(sch->event_stats_cpu); + for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); kfree(sch->global_dsqs); @@ -4455,10 +4462,11 @@ SCX_ATTR(ops); static ssize_t scx_attr_events_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); struct scx_event_stats events; int at = 0; - scx_read_events(&events); + scx_read_events(sch, &events); at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); @@ -4591,25 +4599,29 @@ static void scx_bypass(bool bypass) { static DEFINE_RAW_SPINLOCK(bypass_lock); static unsigned long bypass_timestamp; - - int cpu; + struct scx_sched *sch; unsigned long flags; + int cpu; raw_spin_lock_irqsave(&bypass_lock, flags); + sch = rcu_dereference_bh(scx_root); + if (bypass) { scx_bypass_depth++; WARN_ON_ONCE(scx_bypass_depth <= 0); if (scx_bypass_depth != 1) goto unlock; bypass_timestamp = ktime_get_ns(); - scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1); + if (sch) + scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); } else { scx_bypass_depth--; WARN_ON_ONCE(scx_bypass_depth < 0); if (scx_bypass_depth != 0) goto unlock; - scx_add_event(SCX_EV_BYPASS_DURATION, - ktime_get_ns() - bypass_timestamp); + if (sch) + scx_add_event(sch, SCX_EV_BYPASS_DURATION, + ktime_get_ns() - bypass_timestamp); } atomic_inc(&scx_breather_depth); @@ -5182,7 +5194,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) dump_line(&s, "Event counters"); dump_line(&s, "--------------"); - scx_read_events(&events); + scx_read_events(scx_root, &events); scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); @@ -5290,6 +5302,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sch->global_dsqs[node] = dsq; } + sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); + if (!sch->event_stats_cpu) + goto err_free_gdsqs; + atomic_set(&sch->exit_kind, SCX_EXIT_NONE); sch->ops = *ops; ops->priv = sch; @@ -5297,10 +5313,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sch->kobj.kset = scx_kset; ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) - goto err_free_gdsqs; + goto err_free_event_stats; return sch; +err_free_event_stats: + free_percpu(sch->event_stats_cpu); err_free_gdsqs: for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -5376,15 +5394,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_enable_mutex); - /* - * Clear event counters so a new scx scheduler gets - * fresh event counter values. - */ - for_each_possible_cpu(cpu) { - struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu); - memset(e, 0, sizeof(*e)); - } - if (!scx_helper) { WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper")); if (!scx_helper) { @@ -7405,7 +7414,7 @@ __bpf_kfunc u64 scx_bpf_now(void) return clock; } -static void scx_read_events(struct scx_event_stats *events) +static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events) { struct scx_event_stats *e_cpu; int cpu; @@ -7413,7 +7422,7 @@ static void scx_read_events(struct scx_event_stats *events) /* Aggregate per-CPU event counters into @events. */ memset(events, 0, sizeof(*events)); for_each_possible_cpu(cpu) { - e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); + e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); @@ -7434,9 +7443,16 @@ static void scx_read_events(struct scx_event_stats *events) __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) { + struct scx_sched *sch; struct scx_event_stats e_sys; - scx_read_events(&e_sys); + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) + scx_read_events(sch, &e_sys); + else + memset(&e_sys, 0, sizeof(e_sys)); + rcu_read_unlock(); /* * We cannot entirely trust a BPF-provided size since a BPF program -- 2.49.0