All of lore.kernel.org
 help / color / mirror / Atom feed
From: Leonardo Temperanza <leonardo.temperanza@gmail.com>
To: Changwoo Min <changwoo@igalia.com>,
	tj@kernel.org, void@manifault.com, arighi@nvidia.com
Cc: kernel-dev@igalia.com, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 1/7] sched_ext: Implement event counter infrastructure and add an event
Date: Tue, 21 Jan 2025 16:00:12 +0100	[thread overview]
Message-ID: <5c1f6f0b-2b9d-4b16-8ec0-e3be876eb260@gmail.com> (raw)
In-Reply-To: <20250116151543.80163-2-changwoo@igalia.com>

Hello,

On 1/16/2025 4:15 PM, Changwoo Min wrote:
> Collect the statistics of specific types of behavior in the sched_ext core,
> which are not easily visible but still interesting to an scx scheduler.
>
> Also, add a core event, SCX_EVENT_INVAL_SELECT_CPU, which represents how
> many times ops.select_cpu() returns a CPU that the task can't use.
>
> Signed-off-by: Changwoo Min <changwoo@igalia.com>
> ---
>   kernel/sched/ext.c | 120 ++++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 118 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 0bcdd1a31676..7e12d5b8322e 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1458,6 +1458,66 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
>   	return p;
>   }
>   
> +/*
> + * Collection of event counters.
> + */
> +struct scx_event_stat {
> +	/*
> +	 * If ops.select_cpu() returns a CPU which can't be used by the task,
> +	 * the core scheduler code silently picks a fallback CPU.
> +	 */
> +	u64		INVAL_SELECT_CPU;
> +};
> +
> +#define SCX_EVENT_IDX(e)	(offsetof(struct scx_event_stat, e)/sizeof(u64))
> +#define SCX_EVENT_END_IDX()	(sizeof(struct scx_event_stat)/sizeof(u64))
> +#define SCX_EVENT_DEFINE(e)	SCX_EVENT_##e = SCX_EVENT_IDX(e)
> +


scx_event_stat fields are required to be u64, otherwise the macros below 
don't work correctly. Perhaps a comment could highlight this "hidden" 
constraint? As well as a BUILD_BUG_ON to verify that this constraint is 
verified at build time.


> +/*
> + * Types of event counters.
> + */
> +enum scx_event_kind {
> +	SCX_EVENT_BEGIN = 0,
> +	SCX_EVENT_DEFINE(INVAL_SELECT_CPU),
> +	SCX_EVENT_END = SCX_EVENT_END_IDX(),
> +};
> +
> +static const char *scx_event_stat_str[] = {
> +	[SCX_EVENT_INVAL_SELECT_CPU]	= "invalid_select_cpu",
> +};
> +


If an event is added to scx_event_kind but not scx_event_stat_str, the 
array can possibly be accessed out of bounds (or into a NULL string 
depending on the missing index). The GNU C extension could be used to 
initialize all elements to the empty string "", like this:

static const char *scx_event_stat_str[SCX_EVENT_END] = {
     [0 ... SCX_EVENT_END - 1] = "",
     [SCX_EVENT_INVAL_SELECT_CPU] = "invalid_select_cpu",
};

Alternatively a helper could be used:

static const char *scx_event_name(enum scx_event_kind kind)
{
     return scx_event_stat_str[kind] ? : "";
}


> +/*
> + * The event counter is organized by a per-CPU variable to minimize the
> + * accounting overhead without synchronization. A system-wide view on the
> + * event counter is constructed when requested by scx_bpf_get_event_stat().
> + */
> +static DEFINE_PER_CPU(struct scx_event_stat, event_stats);
> +
> +/**
> + * scx_add_event - Increase an event counter for 'name' by 'cnt'
> + * @name: an event name defined in struct scx_event_stat
> + * @cnt: the number of the event occured
> + */
> +#define scx_add_event(name, cnt) ({						\
> +	struct scx_event_stat *__e;						\
> +	__e = get_cpu_ptr(&event_stats);					\
> +	WRITE_ONCE(__e->name, __e->name+ (cnt));				\
> +	put_cpu_ptr(&event_stats);						\
> +})
> +
> +
> +/**
> + * scx_read_event_kind - Read an event from 'e' with 'kind'
> + * @e: a pointer to an event collected by scx_bpf_event_stat()
> + * @kine: an event type defined in scx_event_kind
> + */
> +#define scx_read_event_kind(e, kind) ({						\
> +	u64 *__e64 = (u64 *)(e);						\
> +	__e64[kind];								\
> +})
> +


nit: typo, "@kine" instead of "@kind"


> +static void scx_bpf_event_stat(struct scx_event_stat *event, size_t event__sz);
> +
>   static enum scx_ops_enable_state scx_ops_enable_state(void)
>   {
>   	return atomic_read(&scx_ops_enable_state_var);
> @@ -3607,8 +3667,10 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
>   		*ddsp_taskp = NULL;
>   		if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
>   			return cpu;
> -		else
> +		else {
> +			scx_add_event(INVAL_SELECT_CPU, 1);
>   			return prev_cpu;
> +		}
>   	} else {
>   		bool found;
>   		s32 cpu;
> @@ -5053,6 +5115,15 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
>   		scx_rq_clock_invalidate(rq);
>   	}
>   
> +	/*
> +	 * Clear event counters so the next scx scheduler always gets
> +	 * fresh event counter values.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		struct scx_event_stat *e = per_cpu_ptr(&event_stats, cpu);
> +		memset(e, 0, sizeof(*e));
> +	}
> +
>   	/* no task is on scx, turn off all the switches and flush in-progress calls */
>   	static_branch_disable(&__scx_ops_enabled);
>   	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
> @@ -5309,9 +5380,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
>   		.at_jiffies = jiffies,
>   	};
>   	struct seq_buf s;
> +	struct scx_event_stat event;
>   	unsigned long flags;
>   	char *buf;
> -	int cpu;
> +	int cpu, kind;
>   
>   	spin_lock_irqsave(&dump_lock, flags);
>   
> @@ -5417,6 +5489,16 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
>   		rq_unlock(rq, &rf);
>   	}
>   
> +	dump_newline(&s);
> +	dump_line(&s, "Event counters");
> +	dump_line(&s, "--------------");
> +
> +	scx_bpf_event_stat(&event, sizeof(event));
> +	for (kind = SCX_EVENT_BEGIN; kind < SCX_EVENT_END; kind++) {
> +		dump_line(&s, "%25s : %llu", scx_event_stat_str[kind],
> +			  scx_read_event_kind(&event, kind));
> +	}
> +
>   	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
>   		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
>   		       trunc_marker, sizeof(trunc_marker));
> @@ -7720,6 +7802,39 @@ __bpf_kfunc u64 scx_bpf_now(void)
>   	return clock;
>   }
>   
> +/*
> + * scx_bpf_event_stat - Get a system-wide event counter to
> + * @event: output buffer from a BPF program
> + * @event__sz: @event len, must end in '__sz'' for the verifier
> + */
> +__bpf_kfunc void scx_bpf_event_stat(struct scx_event_stat *event,
> +				    size_t event__sz)
> +{
> +	struct scx_event_stat *e;
> +	u64 *event64, *e64;
> +	int cpu, kind, event_end;
> +
> +	/*
> +	 * We cannot entirely trust a BPF-provided size since a BPF program
> +	 * might be compiled against a different vmlinux.h, of which
> +	 * scx_event_stat would be larger (a newer vmlinux.h) or smaller
> +	 * (an older vmlinux.h). Hence, we use the smaller size to avoid
> +	 * memory corruption.
> +	 */
> +	event__sz = min(event__sz, sizeof(*event));
> +	event_end = event__sz / sizeof(u64);
> +
> +	event64 = (u64 *)event;
> +	memset(event, 0, event__sz);
> +	for_each_possible_cpu(cpu) {
> +		e = per_cpu_ptr(&event_stats, cpu);
> +		e64 = (u64 *)e;
> +		for (kind = 0; kind < event_end; kind++) {
> +			event64[kind] += READ_ONCE(e64[kind]);
> +		}
> +	}
> +}
> +
>   __bpf_kfunc_end_defs();
>   
>   BTF_KFUNCS_START(scx_kfunc_ids_any)
> @@ -7752,6 +7867,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
>   BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
>   #endif
>   BTF_ID_FLAGS(func, scx_bpf_now)
> +BTF_ID_FLAGS(func, scx_bpf_event_stat, KF_TRUSTED_ARGS)
>   BTF_KFUNCS_END(scx_kfunc_ids_any)
>   
>   static const struct btf_kfunc_id_set scx_kfunc_set_any = {

  parent reply	other threads:[~2025-01-21 15:00 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-01-16 15:15 [PATCH 0/7] sched_ext: Implement core event counters Changwoo Min
2025-01-16 15:15 ` [PATCH 1/7] sched_ext: Implement event counter infrastructure and add an event Changwoo Min
2025-01-17  1:33   ` Tejun Heo
2025-01-17  7:08     ` Changwoo Min
2025-01-17 16:24       ` Tejun Heo
2025-01-18  0:27         ` Changwoo Min
2025-01-22  0:42           ` Tejun Heo
2025-01-22  1:37             ` Changwoo Min
2025-01-17  9:49   ` Andrea Righi
2025-01-17 16:26     ` Tejun Heo
2025-01-18  0:00     ` Changwoo Min
2025-01-21 15:00   ` Leonardo Temperanza [this message]
2025-01-22  1:45     ` Changwoo Min
2025-01-16 15:15 ` [PATCH 2/7] sched_ext: Add an event, SCX_EVENT_OFFLINE_LOCAL_DSQ Changwoo Min
2025-01-17  1:37   ` Tejun Heo
2025-01-17  7:11     ` Changwoo Min
2025-01-16 15:15 ` [PATCH 3/7] sched_ext: Add an event, SCX_EVENT_CNTD_RUN_WO_ENQ Changwoo Min
2025-01-17  1:39   ` Tejun Heo
2025-01-17  7:12     ` Changwoo Min
2025-01-16 15:15 ` [PATCH 4/7] sched_ext: Add an event, SCX_EVENT_ENQ_LOCAL_EXITING Changwoo Min
2025-01-17  1:40   ` Tejun Heo
2025-01-17  7:12     ` Changwoo Min
2025-01-16 15:15 ` [PATCH 5/7] sched_ext: Add an event, SCX_EVENT_RQ_BYPASSING_OPS Changwoo Min
2025-01-17  1:41   ` Tejun Heo
2025-01-17  7:31     ` Changwoo Min
2025-01-17 16:14       ` Tejun Heo
2025-01-16 15:15 ` [PATCH 6/7] sched_ext: Add scx_bpf_event_stat() and scx_read_event() for BPF schedulers Changwoo Min
2025-01-16 15:15 ` [PATCH 7/7] sched_ext: Print core event count in scx_central scheduler Changwoo Min

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5c1f6f0b-2b9d-4b16-8ec0-e3be876eb260@gmail.com \
    --to=leonardo.temperanza@gmail.com \
    --cc=arighi@nvidia.com \
    --cc=changwoo@igalia.com \
    --cc=kernel-dev@igalia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tj@kernel.org \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.