All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/2] sched: fix firing of sched:::on-cpu
@ 2024-06-28 17:16 Alan Maguire
  2024-06-28 17:16 ` [PATCH v2 2/2] unittest/sched: remove dtv2 xfail Alan Maguire
  2024-08-02 21:36 ` [DTrace-devel] [PATCH v2 1/2] sched: fix firing of sched:::on-cpu Kris Van Hees
  0 siblings, 2 replies; 9+ messages in thread
From: Alan Maguire @ 2024-06-28 17:16 UTC (permalink / raw)
  To: dtrace; +Cc: dtrace-devel, Alan Maguire

sched:::on-cpu fires far less often than sched:::off-cpu.  It appears
that - for recent kernels at least - the placement of
fbt::schedule_tail:entry is wrong.  The only way to efficiently ensure
firing in the right place - when the new task has just been scheduled
in - is to use fbt::__perf_event_task_sched_in:entry, as it

- fires at the right time
- is not static, so not subject to inlining or other optimizations
- is stable across kernel versions.

However, the downside is that it will not be called unless context
switch perf events are enabled.  So the most efficient method is to
perf_event_open() such an event but not attach anything to it.
Attaching to cpc:::sched_switch-all-1 and weeding out off-cpu events
was also explored, but that required copying in task state, a
comparison, etc., so in such a hot codepath a more precise attach
point is preferable.

With this in place we get sensible on/off cpu numbers:

$ dtrace -n 'sched:::*-cpu { @c[probename] = count();}'
dtrace: description 'sched:::*-cpu ' matched 2 probes
^C

  off-cpu                                                        1454
  on-cpu                                                         1454

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 libdtrace/dt_prov_sched.c | 48 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
index 2749385a..3e9d4f6b 100644
--- a/libdtrace/dt_prov_sched.c
+++ b/libdtrace/dt_prov_sched.c
@@ -9,6 +9,9 @@
 #include <assert.h>
 #include <errno.h>
 
+#include <linux/perf_event.h>
+#include <perfmon/pfmlib_perf_event.h>
+
 #include "dt_dctx.h"
 #include "dt_cg.h"
 #include "dt_provider_sdt.h"
@@ -25,7 +28,7 @@ static probe_dep_t	probes[] = {
 	{ "off-cpu",
 	  DTRACE_PROBESPEC_NAME,	"rawtp:sched::sched_switch" },
 	{ "on-cpu",
-	  DTRACE_PROBESPEC_NAME,	"fbt::schedule_tail:entry" },
+	  DTRACE_PROBESPEC_NAME,	"fbt::__perf_event_task_sched_in:entry" },
 	{ "surrender",
 	  DTRACE_PROBESPEC_NAME,	"fbt::do_sched_yield:entry" },
 	{ "tick",
@@ -141,13 +144,54 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
 	return 0;
 }
 
+/*
+ * Custom enabling: every sched probe is enabled through dt_sdt_enable(), but
+ * on-cpu also needs context switch perf events to be enabled, because the fbt
+ * function __perf_event_task_sched_in is not called otherwise.  We open a
+ * context switch count perf event but attach nothing to it, to minimize
+ * overhead.  The alternative - attaching to cpc:::context_switches-all-1 and
+ * weeding out on- versus off-cpu events via a trampoline - is too expensive.
+ * This works stably across kernels because __perf_event_task_sched_in() is not
+ * static, so not subject to inlining.  The fd is kept in prp->prv_data.
+ */
+static void enable(dtrace_hdl_t *dtp, dt_probe_t *prp)
+{
+	struct perf_event_attr attr;
+	int swfd;
+
+	if (strcmp(prp->desc->prb, "on-cpu") == 0) {
+		memset(&attr, 0, sizeof(attr));	/* zero every byte of attr */
+		attr.size = sizeof(attr);
+		attr.type = PERF_TYPE_SOFTWARE;
+		attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
+		attr.freq = 1;
+		attr.sample_freq = 1000;
+		attr.context_switch = 1;
+		swfd = dt_perf_event_open(&attr, -1, 0, -1, 0);
+		if (swfd < 0)	/* best effort: report it, still enable below */
+			dt_dprintf("open of context_switch perf event failed: %d\n",
+				   errno);
+		else
+			prp->prv_data = (void *)(long)swfd;
+	}
+
+	dt_sdt_enable(dtp, prp);	/* common enabling for all sched probes */
+}
+		
+static void detach(dtrace_hdl_t *dtp, const dt_probe_t *prp)
+{
+	if (prp->prv_data)	/* perf event fd stored by enable(); NOTE(review): fd 0 would look unset - confirm it cannot occur */
+		close((int)(long)prp->prv_data);	/* reverses the (void *)(long) cast used when storing the fd */
+}
+
 dt_provimpl_t	dt_sched = {
 	.name		= prvname,
 	.prog_type	= BPF_PROG_TYPE_UNSPEC,
 	.populate	= &populate,
-	.enable		= &dt_sdt_enable,
+	.enable		= &enable,	/* wraps dt_sdt_enable(); on-cpu also opens a perf event */
 	.load_prog	= &dt_bpf_prog_load,
 	.trampoline	= &trampoline,
 	.probe_info	= &dt_sdt_probe_info,
+	.detach		= &detach,	/* closes the perf event fd kept in prv_data */
 	.destroy	= &dt_sdt_destroy,
 };
-- 
2.43.5


^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2024-10-08 18:18 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-28 17:16 [PATCH v2 1/2] sched: fix firing of sched:::on-cpu Alan Maguire
2024-06-28 17:16 ` [PATCH v2 2/2] unittest/sched: remove dtv2 xfail Alan Maguire
2024-08-02 21:37   ` Kris Van Hees
2024-08-16 19:33     ` Kris Van Hees
2024-08-17  0:28       ` Kris Van Hees
2024-08-28 16:58         ` Alan Maguire
2024-10-07 19:10           ` Kris Van Hees
2024-10-08 18:17             ` Alan Maguire
2024-08-02 21:36 ` [DTrace-devel] [PATCH v2 1/2] sched: fix firing of sched:::on-cpu Kris Van Hees

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.