* [PATCH] perf/core: Fix sampling period inconsistency across CPU migration
@ 2026-04-28 11:53 Minwoo Ahn
2026-04-29 7:34 ` kernel test robot
0 siblings, 1 reply; 2+ messages in thread
From: Minwoo Ahn @ 2026-04-28 11:53 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim
Cc: Mark Rutland, Alexander Shishkin, Jiri Olsa, Ian Rogers,
Adrian Hunter, James Clark, Jinkyu Jeong, Minwoo Ahn,
linux-perf-users, linux-kernel
When per-task software events are sampled, period_left is not
managed consistently when task migration happens. The perf_event
may observe a different hw_perf_event::period_left on the new CPU,
breaking the sampling periodicity. Even if a task was near its
sampling point, it would use a stale period_left after migration.
Introduce struct perf_task_context as a per-task container to
preserve period_left across CPU migrations. A separate structure
is used rather than adding fields to hw_perf_event, because
hw_perf_event is a general-purpose structure shared by all event
types (hardware, software, tracepoint, breakpoint, etc.) and
embedding per-task sampling state there would bloat it for the
majority of events that do not need it. perf_task_context is only
allocated for per-task software sampling events.
Multiple per-CPU perf_event instances originating from the same
perf_event_open caller share a single perf_task_context via
refcounting. The perf_event owner field is used to distinguish events
from different perf_event_open callers, preventing unrelated sampling
sessions from interfering with each other. For inherited events
(where owner is NULL), the inherit flag relaxes the owner check
so that child events properly share perf_task_context. The
allocation condition also accounts for inherited events whose
attr.type has been remapped from PERF_TYPE_SOFTWARE to a dynamic
PMU type during initialization.
perf_task_context serves purely as a transport for period_left
across CPU migrations. On event removal (swevent_del for non-clock
events, cancel_hrtimer for clock events), hw_perf_event::period_left
is backed up to perf_task_context::period_left. On event addition
(swevent_add for non-clock events, start_hrtimer for clock events),
perf_task_context::period_left is restored to hw_perf_event::period_left.
During normal operation between migrations, hw_perf_event::period_left
remains the sole working copy, keeping existing code paths unaffected.
To reproduce, force CPU migration during task-clock sampling:
$ sysbench cpu --threads=1 --time=60 run &
$ sleep 0.1
$ TID=$(ls /proc/$!/task/ | grep -v "^$!$")
$ perf record -e task-clock -c 1000000000 -t $TID &
# Force migration across CPUs every 1.2 seconds
$ while kill -0 $TID 2>/dev/null; do
taskset -p -c 0 $TID; sleep 1.2
taskset -p -c 1 $TID; sleep 1.2
taskset -p -c 2 $TID; sleep 1.2
done
# Check sample intervals (expected: ~1.000s each)
$ perf script -F time | \
awk 'NR==1 {prev=$1; next} {print $1-prev; prev=$1}'
Without this patch, sample intervals show significant deviation
from the expected 1-second period after each migration. With this
patch, intervals remain consistent.
Co-developed-by: Jinkyu Jeong <jinkyu@yonsei.ac.kr>
Signed-off-by: Jinkyu Jeong <jinkyu@yonsei.ac.kr>
Signed-off-by: Minwoo Ahn <mwahn402@gmail.com>
---
include/linux/perf_event.h | 18 +++++++++++
kernel/events/core.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 48d851fbd8ea..84827f81cc9c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -829,6 +829,9 @@ struct perf_event {
u16 read_size;
struct hw_perf_event hw;
+ /* Per-task sampling state for sw events, survives CPU migration */
+ struct perf_task_context *perf_task_ctxp;
+
struct perf_event_context *ctx;
/*
* event->pmu_ctx points to perf_event_pmu_context in which the event
@@ -1148,6 +1151,21 @@ struct perf_cpu_context {
struct perf_event *heap_default[2];
};
+#define perf_event_equal_task_ctx(a1, a2) \
+ ((a1)->config == (a2)->config && \
+ (a1)->sample_period == (a2)->sample_period)
+
+/**
+ * struct perf_task_context - per-task software event context
+ *
+ * Shared across per-CPU perf_event instances of the same task to
+ * preserve period_left across CPU migrations.
+ */
+struct perf_task_context {
+ refcount_t refcount;
+ local64_t period_left;
+};
+
struct perf_output_handle {
struct perf_event *event;
struct perf_buffer *rb;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6d1f8bad7e1c..bd106e0b854a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5740,6 +5740,13 @@ static bool exclusive_event_installable(struct perf_event *event,
static void perf_free_addr_filters(struct perf_event *event);
+static void perf_put_task_ctxp(struct perf_event *event)
+{
+ if (event->perf_task_ctxp &&
+ refcount_dec_and_test(&event->perf_task_ctxp->refcount))
+ kfree(event->perf_task_ctxp);
+}
+
/* vs perf_event_alloc() error */
static void __free_event(struct perf_event *event)
{
@@ -5761,6 +5768,9 @@ static void __free_event(struct perf_event *event)
if (event->attach_state & PERF_ATTACH_TASK_DATA)
detach_perf_ctx_data(event);
+ if (event->perf_task_ctxp)
+ perf_put_task_ctxp(event);
+
if (event->destroy)
event->destroy(event);
@@ -11054,9 +11064,14 @@ static void perf_swevent_read(struct perf_event *event)
static int perf_swevent_add(struct perf_event *event, int flags)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+ struct perf_task_context *ctxp = event->perf_task_ctxp;
struct hw_perf_event *hwc = &event->hw;
struct hlist_head *head;
+ if (ctxp)
+ local64_set(&hwc->period_left,
+ local64_read(&ctxp->period_left));
+
if (is_sampling_event(event)) {
hwc->last_period = hwc->sample_period;
perf_swevent_set_period(event);
@@ -11076,7 +11091,13 @@ static int perf_swevent_add(struct perf_event *event, int flags)
static void perf_swevent_del(struct perf_event *event, int flags)
{
+ struct perf_task_context *ctxp = event->perf_task_ctxp;
+
hlist_del_rcu(&event->hlist_entry);
+
+ if (ctxp)
+ local64_set(&ctxp->period_left,
+ local64_read(&event->hw.period_left));
}
static void perf_swevent_start(struct perf_event *event, int flags)
@@ -12203,12 +12224,17 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
+ struct perf_task_context *ctxp = event->perf_task_ctxp;
struct hw_perf_event *hwc = &event->hw;
s64 period;
if (!is_sampling_event(event))
return;
+ if (ctxp)
+ local64_set(&hwc->period_left,
+ local64_read(&ctxp->period_left));
+
period = local64_read(&hwc->period_left);
if (period) {
if (period < 0)
@@ -12224,6 +12250,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
+ struct perf_task_context *ctxp = event->perf_task_ctxp;
struct hw_perf_event *hwc = &event->hw;
/*
@@ -12238,8 +12265,13 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
*/
if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+
local64_set(&hwc->period_left, ktime_to_ns(remaining));
+ if (ctxp)
+ local64_set(&ctxp->period_left,
+ ktime_to_ns(remaining));
+
hrtimer_try_to_cancel(&hwc->hrtimer);
}
}
@@ -13259,6 +13291,40 @@ static void account_event(struct perf_event *event)
account_pmu_sb_event(event);
}
+static struct perf_task_context *
+perf_get_task_ctxp(struct perf_event *event, struct task_struct *task,
+ bool inherit)
+{
+ struct perf_task_context *ctxp = NULL;
+ struct perf_event_context *ctx = task->perf_event_ctxp;
+ struct perf_event *iter;
+
+ if (ctx) {
+ raw_spin_lock(&ctx->lock);
+ list_for_each_entry(iter, &ctx->event_list, event_entry) {
+ if (iter->perf_task_ctxp &&
+ (iter->owner == current ||
+ (inherit && !iter->owner)) &&
+ perf_event_equal_task_ctx(&iter->attr,
+ &event->attr)) {
+ ctxp = iter->perf_task_ctxp;
+ refcount_inc(&ctxp->refcount);
+ break;
+ }
+ }
+ raw_spin_unlock(&ctx->lock);
+ }
+
+ if (!ctxp) {
+ ctxp = kzalloc_obj(struct perf_task_context);
+ if (!ctxp)
+ return NULL;
+ refcount_set(&ctxp->refcount, 1);
+ }
+
+ return ctxp;
+}
+
/*
* Allocate and initialize an event structure
*/
@@ -13344,6 +13410,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* pmu before we get a ctx.
*/
event->hw.target = get_task_struct(task);
+
+ if (attr->sample_period &&
+ attr->config < PERF_COUNT_SW_MAX &&
+ (attr->type == PERF_TYPE_SOFTWARE || parent_event)) {
+ event->perf_task_ctxp = perf_get_task_ctxp(event, task,
+ !!parent_event);
+ if (!event->perf_task_ctxp)
+ return ERR_PTR(-ENOMEM);
+ }
}
event->clock = &local_clock;
--
2.49.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH] perf/core: Fix sampling period inconsistency across CPU migration
2026-04-28 11:53 [PATCH] perf/core: Fix sampling period inconsistency across CPU migration Minwoo Ahn
@ 2026-04-29 7:34 ` kernel test robot
0 siblings, 0 replies; 2+ messages in thread
From: kernel test robot @ 2026-04-29 7:34 UTC (permalink / raw)
To: Minwoo Ahn, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim
Cc: oe-kbuild-all, Mark Rutland, Alexander Shishkin, Jiri Olsa,
Ian Rogers, Adrian Hunter, James Clark, Jinkyu Jeong, Minwoo Ahn,
linux-perf-users, linux-kernel
Hi Minwoo,
kernel test robot noticed the following build errors:
[auto build test ERROR on perf-tools-next/perf-tools-next]
[also build test ERROR on tip/perf/core perf-tools/perf-tools linus/master v7.1-rc1 next-20260428]
[cannot apply to acme/perf/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Minwoo-Ahn/perf-core-Fix-sampling-period-inconsistency-across-CPU-migration/20260429-115721
base: https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git perf-tools-next
patch link: https://lore.kernel.org/r/20260428115317.22839-1-mwahn402%40gmail.com
patch subject: [PATCH] perf/core: Fix sampling period inconsistency across CPU migration
config: arc-allnoconfig (https://download.01.org/0day-ci/archive/20260429/202604291520.xGDlzjFN-lkp@intel.com/config)
compiler: arc-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260429/202604291520.xGDlzjFN-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202604291520.xGDlzjFN-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from include/linux/trace_events.h:10,
from include/trace/syscall.h:7,
from include/linux/syscalls.h:95,
from include/linux/syscalls_api.h:1,
from kernel/sched/sched.h:64,
from kernel/sched/rq-offsets.c:5:
>> include/linux/perf_event.h:1166:9: error: unknown type name 'local64_t'; did you mean 'local_t'?
1166 | local64_t period_left;
| ^~~~~~~~~
| local_t
make[3]: *** [scripts/Makefile.build:184: kernel/sched/rq-offsets.s] Error 1
make[3]: Target 'prepare' not remade because of errors.
make[2]: *** [Makefile:1337: prepare0] Error 2
make[2]: Target 'prepare' not remade because of errors.
make[1]: *** [Makefile:248: __sub-make] Error 2
make[1]: Target 'prepare' not remade because of errors.
make: *** [Makefile:248: __sub-make] Error 2
make: Target 'prepare' not remade because of errors.
vim +1166 include/linux/perf_event.h
1153
1154 #define perf_event_equal_task_ctx(a1, a2) \
1155 ((a1)->config == (a2)->config && \
1156 (a1)->sample_period == (a2)->sample_period)
1157
1158 /**
1159 * struct perf_task_context - per-task software event context
1160 *
1161 * Shared across per-CPU perf_event instances of the same task to
1162 * preserve period_left across CPU migrations.
1163 */
1164 struct perf_task_context {
1165 refcount_t refcount;
> 1166 local64_t period_left;
1167 };
1168
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-04-29 7:35 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-04-28 11:53 [PATCH] perf/core: Fix sampling period inconsistency across CPU migration Minwoo Ahn
2026-04-29 7:34 ` kernel test robot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox