[LTP] [PATCH v3] thermal: add new test group

public inbox for ltp@lists.linux.it
 help / color / mirror / Atom feed

* [LTP] [PATCH v3] thermal: add new test group
@ 2025-11-19 11:45 Piotr Kubaj
  2025-11-20 16:30 ` Petr Vorel
  0 siblings, 1 reply; 9+ messages in thread
From: Piotr Kubaj @ 2025-11-19 11:45 UTC (permalink / raw)
  To: ltp; +Cc: helena.anna.dubel, tomasz.ossowski, rafael.j.wysocki,
	daniel.niestepski

This is a new test for checking thermal interrupt events.
stress-ng is used because genload doesn't seem to generate enough load.

This version adds min_runtime, switches echo to tst_res TINFO and TDEBUG and
removes usage of bc. It also adds --quiet to stress-ng's arguments.

Signed-off-by: Piotr Kubaj <piotr.kubaj@intel.com>
---
 runtest/thermal                       |   2 +
 scenario_groups/default               |   1 +
 testcases/kernel/Makefile             |   1 +
 testcases/kernel/thermal/Makefile     |  11 +++
 testcases/kernel/thermal/thermal01.sh | 102 ++++++++++++++++++++++++++
 5 files changed, 117 insertions(+)
 create mode 100644 runtest/thermal
 create mode 100644 testcases/kernel/thermal/Makefile
 create mode 100755 testcases/kernel/thermal/thermal01.sh

diff --git a/runtest/thermal b/runtest/thermal
new file mode 100644
index 000000000..804ef7d79
--- /dev/null
+++ b/runtest/thermal
@@ -0,0 +1,2 @@
+#THERMAL
+thermal_interrupt_events thermal01.sh
diff --git a/scenario_groups/default b/scenario_groups/default
index 0e76b2bee..ffdd7ff25 100644
--- a/scenario_groups/default
+++ b/scenario_groups/default
@@ -26,3 +26,4 @@ crypto
 kernel_misc
 uevent
 watchqueue
+thermal
diff --git a/testcases/kernel/Makefile b/testcases/kernel/Makefile
index 98fd45a9d..ac816e4e8 100644
--- a/testcases/kernel/Makefile
+++ b/testcases/kernel/Makefile
@@ -36,6 +36,7 @@ SUBDIRS			+= connectors \
 			   sched \
 			   security \
 			   sound \
+			   thermal \
 			   tracing \
 			   uevents \
 			   watchqueue \
diff --git a/testcases/kernel/thermal/Makefile b/testcases/kernel/thermal/Makefile
new file mode 100644
index 000000000..789db430d
--- /dev/null
+++ b/testcases/kernel/thermal/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2025, Intel Corporation. All rights reserved.
+# Author:Piotr Kubaj <piotr.kubaj@intel.com>
+
+top_srcdir             ?= ../../..
+
+include $(top_srcdir)/include/mk/env_pre.mk
+
+INSTALL_TARGETS                := thermal01.sh
+
+include $(top_srcdir)/include/mk/generic_leaf_target.mk
diff --git a/testcases/kernel/thermal/thermal01.sh b/testcases/kernel/thermal/thermal01.sh
new file mode 100755
index 000000000..91e4c02c5
--- /dev/null
+++ b/testcases/kernel/thermal/thermal01.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (C) 2025 Intel - http://www.intel.com/
+#
+# ---
+# doc
+# Tests the CPU package thermal sensor interface for Intel platforms.
+#
+# It works by checking the initial count of thermal interrupts. Then it
+# decreases the threshold for sending a thermal interrupt to just above
+# the current temperature and runs a workload on the CPU. Finally, it restores
+# the original thermal threshold and checks whether the number of thermal
+# interrupts increased.
+# ---
+#
+# ---
+# env
+# {
+#  "needs_root": true,
+#  "supported_archs": ["x86", "x86_64"],
+#  "needs_cmds": ["stress-ng"],
+#  "min_runtime": 180
+# }
+# ---
+
+. tst_loader.sh
+
+thermal_zone_numbers=""
+temp=""
+temp_high=""
+status=0
+
+tst_test()
+{
+	line=$(grep "Thermal event interrupts" /proc/interrupts)
+	if [ $? -eq 0 ]; then
+		interrupt_array_init=$(echo "$line" | tr -d "a-zA-Z:" | awk '{$1=$1;print}')
+		tst_res TDEBUG "Initial values of thermal interrupt counters: $interrupt_array_init"
+		num=$(nproc)
+		tst_res TDEBUG "Number of logical cores: $num"
+	else
+		tst_brk TCONF "Thermal event interrupts is not found."
+	fi
+
+	# Below we check for the thermal_zone which uses x86_pkg_temp driver
+	thermal_zone_numbers=$(grep -l x86_pkg_temp /sys/class/thermal/thermal_zone*/type | sed 's/[^0-9]//g' | tr -t '\n' ' ')
+	tst_res TINFO "x86_pkg_temp thermal zones: $thermal_zone_numbers"
+
+	if [ -z $thermal_zone_numbers ]; then
+		tst_brk TCONF "No x86_pkg_temp thermal zones found"
+	fi
+	for i in $thermal_zone_numbers; do
+		tst_res TINFO "Currently testing x86_pkg_temp thermal_zone$i"
+		TEMP=/sys/class/thermal/thermal_zone$i/temp
+		temp=$(cat "$TEMP")
+		tst_res TDEBUG "thermal_zone$i current temperature is $temp"
+		case $temp in
+		[0-9]*) ;;
+		*)
+			tst_brk TBROK "Unexpected zone temperature value $temp";;
+		esac
+		trip=$(cat /sys/class/thermal/thermal_zone$i/trip_point_1_temp)
+		# Setting trip_point_1_temp for termal_zone$i to $temp + 10 (0.001°C)
+		temp_high=$(( temp + 10 ))
+		echo $temp_high > /sys/class/thermal/thermal_zone$i/trip_point_1_temp
+		run_time=30
+		sleep_time=10
+		while [ $sleep_time -gt 0 ]; do
+			stress-ng --matrix 0 --timeout $run_time --quiet
+			temp_cur=$(cat "$TEMP")
+			tst_res TDEBUG "temp_cur: $temp_cur"
+			[ $temp_cur -gt $temp_high ] && break
+			sleep $sleep_time
+			run_time=$(( run_time - 3 ))
+			sleep_time=$(( sleep_time - 1 ))
+		done
+		[ $temp_cur -gt $temp_high ] || tst_res TFAIL "Zone temperature is not rising as expected"
+
+		# Restore the original trip_point_1_temp value
+		echo $trip > /sys/class/thermal/thermal_zone$i/trip_point_1_temp
+
+		# Check whether thermal interrupts count actually increased
+		interrupt_array_later=$(grep "Thermal event interrupts" /proc/interrupts | \
+			tr -d "a-zA-Z:" | awk '{$1=$1;print}')
+		tst_res TDEBUG "Current values of thermal interrupt counters: $interrupt_array_later"
+		for j in $(seq 1 "$num"); do
+			interrupt_later=$(echo "$interrupt_array_later" | cut -d " " -f  "$j")
+			interrupt_init=$(echo "$interrupt_array_init" | cut -d " " -f  "$j")
+			if [ $interrupt_later -le $interrupt_init ]; then
+				status=1
+			fi
+		done
+	done
+
+	if [ $status -eq 0 ]; then
+		tst_res TPASS "x86 package thermal interrupt triggered"
+	else
+		tst_res TFAIL "x86 package thermal interrupt did not trigger"
+	fi
+}
+
+. tst_run.sh
-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.

-- 
Mailing list info: https://lists.linux.it/listinfo/ltp

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [LTP] [PATCH v3] thermal: add new test group
  2025-11-19 11:45 Piotr Kubaj
@ 2025-11-20 16:30 ` Petr Vorel
  0 siblings, 0 replies; 9+ messages in thread
From: Petr Vorel @ 2025-11-20 16:30 UTC (permalink / raw)
  To: Piotr Kubaj
  Cc: helena.anna.dubel, tomasz.ossowski, rafael.j.wysocki, ltp,
	daniel.niestepski

Hi Piotr,

> This is a new test for checking thermal interrupt events.
> stress-ng is used because genload doesn't seem to generate enough load.

Thanks for info.  Maybe we should try to fix it later, because we really prefer
to use internal tools instead of external dependencies.

> This version adds min_runtime, switches echo to tst_res TINFO and TDEBUG and
> removes usage of bc. It also adds --quiet to stress-ng's arguments.

...
> diff --git a/runtest/thermal b/runtest/thermal
> new file mode 100644
> index 000000000..804ef7d79
> --- /dev/null
> +++ b/runtest/thermal
> @@ -0,0 +1,2 @@
> +#THERMAL
nit: '#THERMAL' does not bring much information value (the same name as the
file). Maybe:
# Thermal driver API
# https://docs.kernel.org/driver-api/thermal/

...
> +++ b/testcases/kernel/thermal/Makefile
> @@ -0,0 +1,11 @@
> +# SPDX-License-Identifier: GPL-2.0-or-later
> +# Copyright (c) 2025, Intel Corporation. All rights reserved.
> +# Author:Piotr Kubaj <piotr.kubaj@intel.com>
> +
> +top_srcdir             ?= ../../..
> +
> +include $(top_srcdir)/include/mk/env_pre.mk
> +
> +INSTALL_TARGETS                := thermal01.sh
nit: *.sh will help not update makefile in case more tests is added.

> +include $(top_srcdir)/include/mk/generic_leaf_target.mk
> diff --git a/testcases/kernel/thermal/thermal01.sh b/testcases/kernel/thermal/thermal01.sh
> new file mode 100755
> index 000000000..91e4c02c5
> --- /dev/null
> +++ b/testcases/kernel/thermal/thermal01.sh
> @@ -0,0 +1,102 @@
> +#!/usr/bin/env bash

Please, don't use env, don't expect bash.
#!/bin/sh

Please, test at least on the system where dash is /bin/sh

https://linux-test-project.readthedocs.io/en/latest/developers/writing_tests.html#shell-coding-style
(Doc does not mention busybox sh, but ideally it should be tested also on busybox sh.)

> +# SPDX-License-Identifier: GPL-2.0-or-later
> +# Copyright (C) 2025 Intel - http://www.intel.com/
> +#
> +# ---
> +# doc
> +# Tests the CPU package thermal sensor interface for Intel platforms.
> +#
> +# It works by checking the initial count of thermal interrupts. Then it
> +# decreases the threshold for sending a thermal interrupt to just above
> +# the current temperature and runs a workload on the CPU. Finally, it restores
> +# the original thermal threshold and checks whether the number of thermal
> +# interrupts increased.
> +# ---
> +#
> +# ---
> +# env
> +# {
> +#  "needs_root": true,
> +#  "supported_archs": ["x86", "x86_64"],
> +#  "needs_cmds": ["stress-ng"],
> +#  "min_runtime": 180
> +# }
> +# ---
> +
> +. tst_loader.sh
> +
> +thermal_zone_numbers=""
> +temp=""
> +temp_high=""
> +status=0

These variables do not need to be global, because you use them only in a single
function.  Could you please move them to tst_test()? And please use local?

tst_test()
{
	local thermal_zone_numbers temp temp_high
	local status=0

> +tst_test()
> +{
> +	line=$(grep "Thermal event interrupts" /proc/interrupts)
Actually any variable should use local.

> +	if [ $? -eq 0 ]; then
> +		interrupt_array_init=$(echo "$line" | tr -d "a-zA-Z:" | awk '{$1=$1;print}')
> +		tst_res TDEBUG "Initial values of thermal interrupt counters: $interrupt_array_init"
> +		num=$(nproc)

Could you please use: tst_getconf _NPROCESSORS_ONLN
Again, we try to avoid external dependencies (tst_getconf is LTP minimal getconf implementation).

> +		tst_res TDEBUG "Number of logical cores: $num"
> +	else
> +		tst_brk TCONF "Thermal event interrupts is not found."
nit: unneeded dot at the end.
> +	fi
> +
> +	# Below we check for the thermal_zone which uses x86_pkg_temp driver
> +	thermal_zone_numbers=$(grep -l x86_pkg_temp /sys/class/thermal/thermal_zone*/type | sed 's/[^0-9]//g' | tr -t '\n' ' ')

This will not work on busybox tr implementation:

tr: unrecognized option: t
BusyBox v1.36.1 (2023-11-07 18:53:09 UTC) multi-call binary.

Usage: tr [-cds] STRING1 [STRING2]

Yes, the time and pain trying to write portable shell is why I asked first to
rewrite into C which is more portable and reliable. (The other is to allow test
working without coreutils/busybox tools as external dependencies.) And using
more shell pipes increases the risk the code will fail.

First, "tr -t '\n' ' '" part is not needed at all, it will work without it:
thermal_zone_numbers=$(grep -l x86_pkg_temp /sys/class/thermal/thermal_zone*/type | sed 's/[^0-9]//g')

Do we need really to know the number? Can't we just store whole file and later
use dirname to get directory?

setup()
{
	...
	THERMAL_ZONE_NUMBERS="$(grep -l x86_pkg_temp /sys/class/thermal/thermal_zone*/type)"
}

tst_test()
{
	local i file tmp

	for i in $THERMAL_ZONE_NUMBERS; do
		file="$(dirname $i)/temp"
		temp=$(cat "$file")
	...
}

I checked on 2 x86_64 machines, with 8 and 2 thermal_zone files. Both had only
one x86_pkg_temp type, always the highest number. I wonder if we need loop at
all.

FYI C API has .save_restore and tst_run_shell.c supports it (see
testcases/kernel/mem/vma/vma05_vdso.c). Unfortunately we cannot use it, because
we are searching for one particular file from possible several.

NOTE: We usually try to do any preparation in test setup function (declared in
TST_SETUP=...  see testcases/lib/tests/shell_loader_setup_cleanup.sh) and do
cleanup in a a test cleanup function (TST_CLEANUP=...). Why? At least cleanup is
useful to have separated in case test quit during failure (e.g. tst_brk quits
testing earlier).

TST_SETUP="setup"
TST_CLEANUP="cleanup"

setup()
{
	local line

	line=$(grep "Thermal event interrupts" /proc/interrupts)

	if [ $? -ne 0 ]; then
		tst_brk TCONF "Thermal event interrupts is not found."
	fi

	tst_res TDEBUG "Number of logical cores: $(tst_getconf _NPROCESSORS_ONLN)"
	THERMAL_ZONE_NUMBERS="$(grep -l x86_pkg_temp /sys/class/thermal/thermal_zone*/type)"
}

cleanup()
{
	local i

	for i in $THERMAL_ZONE_NUMBERS; do
		file="$(dirname $i)/trip_point_1_temp"
		temp=$(cat "$file")
		echo "..." > $file
		=> but here we would need to reuse values previously stored in setup().
		Maybe not worth to complicate with setup+cleanup unless we test only
		single sysfs file.
	...
}

> +	tst_res TINFO "x86_pkg_temp thermal zones: $thermal_zone_numbers"
> +
> +	if [ -z $thermal_zone_numbers ]; then
> +		tst_brk TCONF "No x86_pkg_temp thermal zones found"
> +	fi
> +	for i in $thermal_zone_numbers; do
> +		tst_res TINFO "Currently testing x86_pkg_temp thermal_zone$i"
> +		TEMP=/sys/class/thermal/thermal_zone$i/temp
> +		temp=$(cat "$TEMP")
> +		tst_res TDEBUG "thermal_zone$i current temperature is $temp"
> +		case $temp in

> +		[0-9]*) ;;
> +		*)
> +			tst_brk TBROK "Unexpected zone temperature value $temp";;
> +		esac
> +		trip=$(cat /sys/class/thermal/thermal_zone$i/trip_point_1_temp)
> +		# Setting trip_point_1_temp for termal_zone$i to $temp + 10 (0.001°C)
> +		temp_high=$(( temp + 10 ))
> +		echo $temp_high > /sys/class/thermal/thermal_zone$i/trip_point_1_temp
> +		run_time=30
> +		sleep_time=10
> +		while [ $sleep_time -gt 0 ]; do
> +			stress-ng --matrix 0 --timeout $run_time --quiet

You happily ignore stress-ng exit code. IMHO it should be checked.

I sent a patch which adds ROD() and other functions
https://lore.kernel.org/ltp/20251120161957.331580-1-pvorel@suse.cz/

ROD stress-ng --matrix 0 --timeout $run_time --quiet

> +			temp_cur=$(cat "$TEMP")
> +			tst_res TDEBUG "temp_cur: $temp_cur"
> +			[ $temp_cur -gt $temp_high ] && break
> +			sleep $sleep_time
We have tst_sleep binary, please use it.

> +			run_time=$(( run_time - 3 ))
> +			sleep_time=$(( sleep_time - 1 ))
> +		done
> +		[ $temp_cur -gt $temp_high ] || tst_res TFAIL "Zone temperature is not rising as expected"
> +
> +		# Restore the original trip_point_1_temp value
> +		echo $trip > /sys/class/thermal/thermal_zone$i/trip_point_1_temp
nit: quoting echo is always safe

echo "$trip" > ...

In my case echo fails.

thermal01.sh:53: TINFO: Currently testing x86_pkg_temp thermal_zone8
/opt/ltp/testcases/bin/thermal01.sh: line 80: echo: write error: Invalid argument

> +
> +		# Check whether thermal interrupts count actually increased
> +		interrupt_array_later=$(grep "Thermal event interrupts" /proc/interrupts | \
> +			tr -d "a-zA-Z:" | awk '{$1=$1;print}')
Maybe, when we need to use awk, avoid tr? advantage: remove one pipe,
disadvantage: awk code less readable.

		interrupt_array_later=$(grep "Thermal event interrupts" /proc/interrupts | \
			awk '{ gsub(/[A-Za-z:]/, ""); $1 = $1 }1'

The rest LGTM.

Kind regards,
Petr

> +		tst_res TDEBUG "Current values of thermal interrupt counters: $interrupt_array_later"
> +		for j in $(seq 1 "$num"); do
> +			interrupt_later=$(echo "$interrupt_array_later" | cut -d " " -f  "$j")
> +			interrupt_init=$(echo "$interrupt_array_init" | cut -d " " -f  "$j")
> +			if [ $interrupt_later -le $interrupt_init ]; then
> +				status=1
> +			fi
> +		done
> +	done
> +
> +	if [ $status -eq 0 ]; then
> +		tst_res TPASS "x86 package thermal interrupt triggered"
> +	else
> +		tst_res TFAIL "x86 package thermal interrupt did not trigger"
> +	fi
> +}
> +
> +. tst_run.sh

-- 
Mailing list info: https://lists.linux.it/listinfo/ltp

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [LTP] [PATCH v3] thermal: add new test group
@ 2026-01-21 13:41 Piotr Kubaj
  2026-01-22 14:07 ` Petr Vorel
  0 siblings, 1 reply; 9+ messages in thread
From: Piotr Kubaj @ 2026-01-21 13:41 UTC (permalink / raw)
  To: ltp; +Cc: helena.anna.dubel, tomasz.ossowski, rafael.j.wysocki,
	daniel.niestepski

This is a new test for checking thermal interrupt events.
Addresses some review points from Petr.

Signed-off-by: Piotr Kubaj <piotr.kubaj@intel.com>
---
 runtest/thermal                               |   3 +
 scenario_groups/default                       |   1 +
 testcases/kernel/Makefile                     |   1 +
 testcases/kernel/thermal/Makefile             |  11 ++
 .../kernel/thermal/thermal_interrupt_events.c | 187 ++++++++++++++++++
 5 files changed, 203 insertions(+)
 create mode 100644 runtest/thermal
 create mode 100644 testcases/kernel/thermal/Makefile
 create mode 100644 testcases/kernel/thermal/thermal_interrupt_events.c

diff --git a/runtest/thermal b/runtest/thermal
new file mode 100644
index 000000000..57e3d29f8
--- /dev/null
+++ b/runtest/thermal
@@ -0,0 +1,3 @@
+# Thermal driver API
+# https://docs.kernel.org/driver-api/thermal/
+thermal_interrupt_events thermal_interrupt_events
diff --git a/scenario_groups/default b/scenario_groups/default
index 0e76b2bee..ffdd7ff25 100644
--- a/scenario_groups/default
+++ b/scenario_groups/default
@@ -26,3 +26,4 @@ crypto
 kernel_misc
 uevent
 watchqueue
+thermal
diff --git a/testcases/kernel/Makefile b/testcases/kernel/Makefile
index 98fd45a9d..ac816e4e8 100644
--- a/testcases/kernel/Makefile
+++ b/testcases/kernel/Makefile
@@ -36,6 +36,7 @@ SUBDIRS			+= connectors \
 			   sched \
 			   security \
 			   sound \
+			   thermal \
 			   tracing \
 			   uevents \
 			   watchqueue \
diff --git a/testcases/kernel/thermal/Makefile b/testcases/kernel/thermal/Makefile
new file mode 100644
index 000000000..04a4360d0
--- /dev/null
+++ b/testcases/kernel/thermal/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2025, Intel Corporation. All rights reserved.
+# Author:Piotr Kubaj <piotr.kubaj@intel.com>
+
+top_srcdir             ?= ../../..
+
+include $(top_srcdir)/include/mk/testcases.mk
+
+thermal_interrupt_events: LDLIBS    += -lpthread
+
+include $(top_srcdir)/include/mk/generic_leaf_target.mk
diff --git a/testcases/kernel/thermal/thermal_interrupt_events.c b/testcases/kernel/thermal/thermal_interrupt_events.c
new file mode 100644
index 000000000..037919f59
--- /dev/null
+++ b/testcases/kernel/thermal/thermal_interrupt_events.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Copyright (C) 2025-2026 Intel - http://www.intel.com/
+ */
+
+/*\
+ * Tests the CPU package thermal sensor interface for Intel platforms.
+
+ * Works by checking the initial count of thermal interrupts. Then it
+ * decreases the threshold for sending a thermal interrupt to just above
+ * the current temperature and runs a workload on the CPU. Finally, it restores
+ * the original thermal threshold and checks whether the number of thermal
+ * interrupts increased.
+ */
+
+#include "tst_safe_stdio.h"
+#include "tst_test.h"
+#include <ctype.h>
+#include <pthread.h>
+#define	PATH_LEN	69
+#define	STRING_LEN	23
+
+static void *cpu_workload(void *arg)
+{
+	time_t start_time = time(NULL);
+	int num = 2;
+
+	while (difftime(time(NULL), start_time) < *(double *)arg) {
+		for (int i = 2; i * i <= num; i++) {
+			if (num % i == 0)
+				break;
+		}
+		num++;
+	}
+	return NULL;
+}
+
+static void read_interrupts(uint64_t *interrupt_array, const uint16_t nproc)
+{
+	bool interrupts_found = 0;
+	char line[8192];
+
+	memset(interrupt_array, 0, nproc * sizeof(*interrupt_array));
+	FILE *fp = SAFE_FOPEN("/proc/interrupts", "r");
+
+	while (fgets(line, sizeof(line), fp)) {
+		if (strstr(line, "Thermal event interrupts")) {
+			interrupts_found = 1;
+			char *token = strtok(line, " ");
+
+			token = strtok(NULL, " ");
+			int i = 0;
+
+			while (!!strncmp(token, "Thermal", 7)) {
+				interrupt_array[i++] = atoll(token);
+				token = strtok(NULL, " ");
+				tst_res(TDEBUG, "Current value of interrupt_array[%d]: %ld", i - 1, interrupt_array[i - 1]);
+			}
+		}
+	}
+	SAFE_FCLOSE(fp);
+	if (!interrupts_found)
+		tst_brk(TCONF, "No Thermal event interrupts line in /proc/interrupts");
+}
+
+static void run(void)
+{
+	bool status = 1;
+	char line[8192];
+	int nproc = tst_ncpus();
+	uint64_t interrupt_init[nproc], interrupt_later[nproc];
+
+	tst_res(TDEBUG, "Number of logical cores: %d", nproc);
+	read_interrupts(interrupt_init, nproc);
+
+	DIR *dir = SAFE_OPENDIR("/sys/class/thermal/");
+	struct dirent *entry;
+	uint8_t tz_counter = 0;
+
+	while ((entry = SAFE_READDIR(dir))) {
+		if ((strncmp(entry->d_name, "thermal_zone", sizeof("thermal_zone"))) > 0)
+			tz_counter++;
+	}
+	SAFE_CLOSEDIR(dir);
+	tst_res(TDEBUG, "Found %d thermal zone(s)", tz_counter);
+
+	bool x86_pkg_temp_tz[tz_counter], x86_pkg_temp_tz_found = 0;
+
+	memset(x86_pkg_temp_tz, 0, sizeof(x86_pkg_temp_tz));
+
+	for (uint8_t i = 0; i < tz_counter; i++) {
+		char path[PATH_LEN];
+
+		snprintf(path, PATH_LEN, "/sys/class/thermal/thermal_zone%d/type", i);
+		tst_res(TDEBUG, "Checking whether %s is x86_pkg_temp", path);
+
+		SAFE_FILE_SCANF(path, "%s", line);
+		if (strstr(line, "x86_pkg_temp")) {
+			tst_res(TDEBUG, "Thermal zone %d uses x86_pkg_temp", i);
+			x86_pkg_temp_tz[i] = 1;
+			x86_pkg_temp_tz_found = 1;
+		}
+	}
+	if (!x86_pkg_temp_tz_found) {
+		tst_res(TINFO, "No thermal zone uses x86_pkg_temp");
+		status = 0;
+	}
+
+	for (uint8_t i = 0; i < tz_counter; i++) {
+		if (x86_pkg_temp_tz[i]) {
+			char path[PATH_LEN], temp_path[PATH_LEN], trip_path[PATH_LEN], temp_high[12], trip[12];
+			double run_time = 30;
+			uint8_t sleep_time = 10;
+			int temp;
+
+			snprintf(path, PATH_LEN, "/sys/class/thermal/thermal_zone%d/", i);
+			strncpy(temp_path, path, PATH_LEN);
+			strncat(temp_path, "temp", 4);
+			tst_res(TDEBUG, "Testing %s", temp_path);
+			SAFE_FILE_SCANF(temp_path, "%d", &temp);
+			if (temp < 0) {
+				tst_brk(TBROK, "Unexpected zone temperature value %d", temp);
+				status = 0;
+			}
+			tst_res(TDEBUG, "Current temperature for %s: %d", path, temp);
+
+			snprintf(temp_high, sizeof(temp_high), "%d", temp + 10);
+
+			strncpy(trip_path, path, PATH_LEN);
+			strncat(trip_path, "trip_point_1_temp", 17);
+
+			tst_res(TDEBUG, "Setting new trip_point_1_temp value: %s", temp_high);
+			SAFE_FILE_SCANF(trip_path, "%s", trip);
+			SAFE_FILE_PRINTF(trip_path, "%s", temp_high);
+
+			while (sleep_time > 0) {
+				tst_res(TDEBUG, "Running for %f seconds, then sleeping for %d seconds", run_time, sleep_time);
+				pthread_t threads[nproc];
+
+				for (uint16_t j = 0; j < nproc; j++)
+					pthread_create(&threads[j], NULL, cpu_workload, &run_time);
+				for (uint16_t j = 0; j < nproc; j++)
+					pthread_join(threads[j], NULL);
+
+				SAFE_FILE_SCANF(temp_path, "%d", &temp);
+				tst_res(TDEBUG, "Temperature for %s after a test: %d", path, temp);
+
+				if (temp > atol(temp_high))
+					break;
+				sleep(sleep_time--);
+				run_time -= 3;
+			}
+			if (temp <= atol(temp_high)) {
+				tst_res(TINFO, "Zone temperature is not rising as expected");
+				status = 0;
+			}
+
+			tst_res(TDEBUG, "Restoring original trip_point_1_temp value: %s", trip);
+			SAFE_FILE_PRINTF(trip_path, "%s", trip);
+		}
+	}
+	read_interrupts(interrupt_later, nproc);
+
+	for (uint16_t i = 0; i < nproc; i++) {
+		if (interrupt_later[i] < interrupt_init[i]) {
+			tst_res(TINFO, "For CPU %d interrupt counter is currently %ld, while it was %ld before the test", i, interrupt_later[i], interrupt_init[i]);
+			status = 0;
+		}
+	}
+
+	if (status)
+		tst_res(TPASS, "x86 package thermal interrupt triggered");
+	else
+		tst_res(TFAIL, "x86 package thermal interrupt did not trigger");
+}
+
+static struct tst_test test = {
+	.min_runtime = 180,
+	.needs_root = true,
+	.supported_archs = (const char *const []) {
+		"x86",
+		"x86_64",
+		NULL
+	},
+	.test_all = run
+};
-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.


-- 
Mailing list info: https://lists.linux.it/listinfo/ltp

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [LTP] [PATCH v3] thermal: add new test group
  2026-01-21 13:41 [LTP] [PATCH v3] thermal: add new test group Piotr Kubaj
@ 2026-01-22 14:07 ` Petr Vorel
  2026-01-23 13:12   ` Kubaj, Piotr
  0 siblings, 1 reply; 9+ messages in thread
From: Petr Vorel @ 2026-01-22 14:07 UTC (permalink / raw)
  To: Piotr Kubaj
  Cc: daniel.niestepski, tomasz.ossowski, helena.anna.dubel,
	rafael.j.wysocki, ltp

Hi Piotr,

not a complete review, just few thoughts.

> diff --git a/testcases/kernel/thermal/thermal_interrupt_events.c b/testcases/kernel/thermal/thermal_interrupt_events.c
...
> +/*\
> + * Tests the CPU package thermal sensor interface for Intel platforms.
> +
> + * Works by checking the initial count of thermal interrupts.
IMHO this part should be in setup function.

> Then it
> + * decreases the threshold for sending a thermal interrupt to just above
> + * the current temperature and runs a workload on the CPU.

First, why test needs to run for 30 sec and then sleep for 10 sec?
Is it possible to check the expected value from sysfs and quit testing before?
Also we have TST_RETRY_FUNC() and TST_RETRY_FN_EXP_BACKOFF() which could be used
for the loop (you create a function which does the check to quit before the
timeout.

> Finally, it restores
> + * the original thermal threshold and checks whether the number of thermal
> + * interrupts increased.
NOTE: the restore will not work if test exits before for some reason
(i.e. any of SAFE_*() macros fail). Therefore restore should be in a cleanup
function.

> + */
> +
> +#include "tst_safe_stdio.h"
> +#include "tst_test.h"
> +#include <ctype.h>
> +#include <pthread.h>
> +#define	PATH_LEN	69
> +#define	STRING_LEN	23
> +
> +static void *cpu_workload(void *arg)
> +{
> +	time_t start_time = time(NULL);
> +	int num = 2;
> +
> +	while (difftime(time(NULL), start_time) < *(double *)arg) {

difftime() is sufficient, but FYI we have include/tst_timer.h for
measuring time difference (which uses more precise struct timeval).

> +		for (int i = 2; i * i <= num; i++) {
> +			if (num % i == 0)
> +				break;
> +		}
> +		num++;
This is supposed to do some real workload on CPU? Is that really enough?
> +	}
> +	return NULL;
> +}
...
> +static void run(void)
> +{
> +	bool status = 1;
> +	char line[8192];
> +	int nproc = tst_ncpus();
> +	uint64_t interrupt_init[nproc], interrupt_later[nproc];
> +
> +	tst_res(TDEBUG, "Number of logical cores: %d", nproc);
> +	read_interrupts(interrupt_init, nproc);
> +
> +	DIR *dir = SAFE_OPENDIR("/sys/class/thermal/");
> +	struct dirent *entry;
> +	uint8_t tz_counter = 0;
I'm not sure if AI advices you to use it, but really we don't optimize that much
that we could not afford just to use plain int. Using exact-width types is
usually used only when needed, e.g. when mapping some struct from kernel.

> +	while ((entry = SAFE_READDIR(dir))) {
> +		if ((strncmp(entry->d_name, "thermal_zone", sizeof("thermal_zone"))) > 0)
> +			tz_counter++;
> +	}
> +	SAFE_CLOSEDIR(dir);
> +	tst_res(TDEBUG, "Found %d thermal zone(s)", tz_counter);

I still think this part can be done in setup and needed values stored in static
variables (outside function), i.e. static int tz_counter, nproc; Why? Because some
people can run test 1000x via "-i 1000", therefore we cache various preparation
results.
> +
> +	bool x86_pkg_temp_tz[tz_counter], x86_pkg_temp_tz_found = 0;
> +
> +	memset(x86_pkg_temp_tz, 0, sizeof(x86_pkg_temp_tz));
> +
> +	for (uint8_t i = 0; i < tz_counter; i++) {
> +		char path[PATH_LEN];
> +
> +		snprintf(path, PATH_LEN, "/sys/class/thermal/thermal_zone%d/type", i);
> +		tst_res(TDEBUG, "Checking whether %s is x86_pkg_temp", path);
> +
> +		SAFE_FILE_SCANF(path, "%s", line);
> +		if (strstr(line, "x86_pkg_temp")) {
> +			tst_res(TDEBUG, "Thermal zone %d uses x86_pkg_temp", i);
> +			x86_pkg_temp_tz[i] = 1;
> +			x86_pkg_temp_tz_found = 1;
> +		}
> +	}
> +	if (!x86_pkg_temp_tz_found) {
> +		tst_res(TINFO, "No thermal zone uses x86_pkg_temp");
> +		status = 0;
> +	}
> +
> +	for (uint8_t i = 0; i < tz_counter; i++) {
Again, please just use int.

> +		if (x86_pkg_temp_tz[i]) {
> +			char path[PATH_LEN], temp_path[PATH_LEN], trip_path[PATH_LEN], temp_high[12], trip[12];

You read and write integer values. Using char array is overcomplicated:
int trip, temp_high = temp + 10;

(But maybe have 10 defined as a constant with meaningful name. That is kind of
documentation (instead reader having to read the code to understand the meaning
of the number).

#define TEMP_INCREMENT 10

> +			double run_time = 30;
#define RUNTIME 30

> +			uint8_t sleep_time = 10;
#define SLEEP 10

> +			int temp;
> +
> +			snprintf(path, PATH_LEN, "/sys/class/thermal/thermal_zone%d/", i);
> +			strncpy(temp_path, path, PATH_LEN);
> +			strncat(temp_path, "temp", 4);
> +			tst_res(TDEBUG, "Testing %s", temp_path);
nit: I'd put this as TINFO to get at least some debug info without -D.

> +			SAFE_FILE_SCANF(temp_path, "%d", &temp);
All SAFE_*() macros quit testing, therefore the following check is not needed.

> +			if (temp < 0) {
> +				tst_brk(TBROK, "Unexpected zone temperature value %d", temp);
> +				status = 0;
> +			}
> +			tst_res(TDEBUG, "Current temperature for %s: %d", path, temp);
> +
This would not be needed:
> +			snprintf(temp_high, sizeof(temp_high), "%d", temp + 10);
> +
> +			strncpy(trip_path, path, PATH_LEN);
> +			strncat(trip_path, "trip_point_1_temp", 17);
> +
> +			tst_res(TDEBUG, "Setting new trip_point_1_temp value: %s", temp_high);
> +			SAFE_FILE_SCANF(trip_path, "%s", trip);
SAFE_FILE_SCANF(trip_path, "%d", &trip);

> +			SAFE_FILE_PRINTF(trip_path, "%s", temp_high);
SAFE_FILE_PRINTF(trip_path, "%d", temp_high);
> +
> +			while (sleep_time > 0) {

> +				tst_res(TDEBUG, "Running for %f seconds, then sleeping for %d seconds", run_time, sleep_time);
> +				pthread_t threads[nproc];
> +
> +				for (uint16_t j = 0; j < nproc; j++)
> +					pthread_create(&threads[j], NULL, cpu_workload, &run_time);
IMHO using pthread (and therefore -lpthread) is overkill.
Could you please use simple fork()?

testcases/cve/cve-2017-2618.c

	for (i = 0; i < nproc; i++) {
		if (!SAFE_FORK()) {
			cpu_workload();
			exit(0);
		}

		tst_reap_children();
	}

> +				for (uint16_t j = 0; j < nproc; j++)
> +					pthread_join(threads[j], NULL);
> +
> +				SAFE_FILE_SCANF(temp_path, "%d", &temp);
> +				tst_res(TDEBUG, "Temperature for %s after a test: %d", path, temp);
> +
> +				if (temp > atol(temp_high))
> +					break;
> +				sleep(sleep_time--);
> +				run_time -= 3;
> +			}
> +			if (temp <= atol(temp_high)) {
> +				tst_res(TINFO, "Zone temperature is not rising as expected");
> +				status = 0;
> +			}

> +
> +			tst_res(TDEBUG, "Restoring original trip_point_1_temp value: %s", trip);
> +			SAFE_FILE_PRINTF(trip_path, "%s", trip);
> +		}
> +	}
> +	read_interrupts(interrupt_later, nproc);
> +
> +	for (uint16_t i = 0; i < nproc; i++) {
> +		if (interrupt_later[i] < interrupt_init[i]) {
> +			tst_res(TINFO, "For CPU %d interrupt counter is currently %ld, while it was %ld before the test", i, interrupt_later[i], interrupt_init[i]);
> +			status = 0;
> +		}
> +	}
> +
> +	if (status)
> +		tst_res(TPASS, "x86 package thermal interrupt triggered");
> +	else
> +		tst_res(TFAIL, "x86 package thermal interrupt did not trigger");
> +}
> +
> +static struct tst_test test = {
> +	.min_runtime = 180,

> +	.needs_root = true,
nit: We always set it as 1.

Kind regards,
Petr

> +	.supported_archs = (const char *const []) {
> +		"x86",
> +		"x86_64",
> +		NULL
> +	},
> +	.test_all = run
> +};

-- 
Mailing list info: https://lists.linux.it/listinfo/ltp

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [LTP] [PATCH v3] thermal: add new test group
  2026-01-22 14:07 ` Petr Vorel
@ 2026-01-23 13:12   ` Kubaj, Piotr
  2026-01-23 18:28     ` Petr Vorel
  0 siblings, 1 reply; 9+ messages in thread
From: Kubaj, Piotr @ 2026-01-23 13:12 UTC (permalink / raw)
  To: pvorel@suse.cz
  Cc: Wysocki, Rafael J, Ossowski, Tomasz, Dubel, Helena Anna,
	Niestepski, Daniel, ltp@lists.linux.it

2026-01-22 (木) の 15:07 +0100 に Petr Vorel さんは書きました:
> Hi Piotr,
> 
> not a complete review, just few thoughts.
> 
> > diff --git a/testcases/kernel/thermal/thermal_interrupt_events.c
> > b/testcases/kernel/thermal/thermal_interrupt_events.c
> ...
> > +/*\
> > + * Tests the CPU package thermal sensor interface for Intel
> > platforms.
> > +
> > + * Works by checking the initial count of thermal interrupts.
> IMHO this part should be in setup function.
Done.


> 
> > Then it
> > + * decreases the threshold for sending a thermal interrupt to just
> > above
> > + * the current temperature and runs a workload on the CPU.
> 
> First, why test needs to run for 30 sec and then sleep for 10 sec?
> Is it possible to check the expected value from sysfs and quit
> testing before?
> Also we have TST_RETRY_FUNC() and TST_RETRY_FN_EXP_BACKOFF() which
> could be used
> for the loop (you create a function which does the check to quit
> before the
> timeout.
Both those macros are unsuitable for this test. TST_RETRY_FUNC() uses a
constant sleep time and TST_RETRY_FN_EXP_BACKOFF() increases its
timeout.

Here the point is to use a decreasing timeout. The test starts with 10s
cooldown to make sure that even pre-production CPU's, which might have
their thermal protections disabled, cool down properly. Once sleep time
reaches 0, the conclusion is that either there was not enough workload
or somehow interrupts are not triggered after all.

> 
> > Finally, it restores
> > + * the original thermal threshold and checks whether the number of
> > thermal
> > + * interrupts increased.
> NOTE: the restore will not work if test exits before for some reason
> (i.e. any of SAFE_*() macros fail). Therefore restore should be in a
> cleanup
> function.
Done.

> 
> > + */
> > +
> > +#include "tst_safe_stdio.h"
> > +#include "tst_test.h"
> > +#include <ctype.h>
> > +#include <pthread.h>
> > +#define	PATH_LEN	69
> > +#define	STRING_LEN	23
> > +
> > +static void *cpu_workload(void *arg)
> > +{
> > +	time_t start_time = time(NULL);
> > +	int num = 2;
> > +
> > +	while (difftime(time(NULL), start_time) < *(double *)arg)
> > {
> 
> difftime() is sufficient, but FYI we have include/tst_timer.h for
> measuring time difference (which uses more precise struct timeval).
> 
> > +		for (int i = 2; i * i <= num; i++) {
> > +			if (num % i == 0)
> > +				break;
> > +		}
> > +		num++;
> This is supposed to do some real workload on CPU? Is that really
> enough?
Yes, my work desktop currently runs at around 38C, when I start this
test it very quickly goes to around 80C, then when the fan starts
running faster, it decreases to about 65C.

> > +	}
> > +	return NULL;
> > +}
> ...
> > +static void run(void)
> > +{
> > +	bool status = 1;
> > +	char line[8192];
> > +	int nproc = tst_ncpus();
> > +	uint64_t interrupt_init[nproc], interrupt_later[nproc];
> > +
> > +	tst_res(TDEBUG, "Number of logical cores: %d", nproc);
> > +	read_interrupts(interrupt_init, nproc);
> > +
> > +	DIR *dir = SAFE_OPENDIR("/sys/class/thermal/");
> > +	struct dirent *entry;
> > +	uint8_t tz_counter = 0;
> I'm not sure if AI advices you to use it, but really we don't
> optimize that much
> that we could not afford just to use plain int. Using exact-width
> types is
> usually used only when needed, e.g. when mapping some struct from
> kernel.
Wrong address here :) I'm anti-FAMA, pro-selfhosting etc. and the
current AI fad goes very much against my principles.


> 
> > +	while ((entry = SAFE_READDIR(dir))) {
> > +		if ((strncmp(entry->d_name, "thermal_zone",
> > sizeof("thermal_zone"))) > 0)
> > +			tz_counter++;
> > +	}
> > +	SAFE_CLOSEDIR(dir);
> > +	tst_res(TDEBUG, "Found %d thermal zone(s)", tz_counter);
> 
> I still think this part can be done in setup and needed values stored
> in static
> variables (outside function), i.e. static int tz_counter, nproc; Why?
> Because some
> people can run test 1000x via "-i 1000", therefore we cache various
> preparation
> results.
Done.

> > +
> > +	bool x86_pkg_temp_tz[tz_counter], x86_pkg_temp_tz_found =
> > 0;
> > +
> > +	memset(x86_pkg_temp_tz, 0, sizeof(x86_pkg_temp_tz));
> > +
> > +	for (uint8_t i = 0; i < tz_counter; i++) {
> > +		char path[PATH_LEN];
> > +
> > +		snprintf(path, PATH_LEN,
> > "/sys/class/thermal/thermal_zone%d/type", i);
> > +		tst_res(TDEBUG, "Checking whether %s is
> > x86_pkg_temp", path);
> > +
> > +		SAFE_FILE_SCANF(path, "%s", line);
> > +		if (strstr(line, "x86_pkg_temp")) {
> > +			tst_res(TDEBUG, "Thermal zone %d uses
> > x86_pkg_temp", i);
> > +			x86_pkg_temp_tz[i] = 1;
> > +			x86_pkg_temp_tz_found = 1;
> > +		}
> > +	}
> > +	if (!x86_pkg_temp_tz_found) {
> > +		tst_res(TINFO, "No thermal zone uses
> > x86_pkg_temp");
> > +		status = 0;
> > +	}
> > +
> > +	for (uint8_t i = 0; i < tz_counter; i++) {
> Again, please just use int.
Done.

> 
> > +		if (x86_pkg_temp_tz[i]) {
> > +			char path[PATH_LEN], temp_path[PATH_LEN],
> > trip_path[PATH_LEN], temp_high[12], trip[12];
> 
> You read and write integer values. Using char array is
> overcomplicated:
> int trip, temp_high = temp + 10;
> 
> (But maybe have 10 defined as a constant with meaningful name. That
> is kind of
> documentation (instead reader having to read the code to understand
> the meaning
> of the number).
> 
> #define TEMP_INCREMENT 10
> 
> > +			double run_time = 30;
> #define RUNTIME 30
> 
> > +			uint8_t sleep_time = 10;
> #define SLEEP 10
Done.

> 
> > +			int temp;
> > +
> > +			snprintf(path, PATH_LEN,
> > "/sys/class/thermal/thermal_zone%d/", i);
> > +			strncpy(temp_path, path, PATH_LEN);
> > +			strncat(temp_path, "temp", 4);
> > +			tst_res(TDEBUG, "Testing %s", temp_path);
> nit: I'd put this as TINFO to get at least some debug info without -
> D.
> 
> > +			SAFE_FILE_SCANF(temp_path, "%d", &temp);
> All SAFE_*() macros quit testing, therefore the following check is
> not needed.
It's necessary because if the temperature is below 0, there's most
likely some kernel or sensor issue.

> 
> > +			if (temp < 0) {
> > +				tst_brk(TBROK, "Unexpected zone
> > temperature value %d", temp);
> > +				status = 0;
> > +			}
> > +			tst_res(TDEBUG, "Current temperature for
> > %s: %d", path, temp);
> > +
> This would not be needed:
Done.

> > +			snprintf(temp_high, sizeof(temp_high),
> > "%d", temp + 10);
> > +
> > +			strncpy(trip_path, path, PATH_LEN);
> > +			strncat(trip_path, "trip_point_1_temp",
> > 17);
> > +
> > +			tst_res(TDEBUG, "Setting new
> > trip_point_1_temp value: %s", temp_high);
> > +			SAFE_FILE_SCANF(trip_path, "%s", trip);
> SAFE_FILE_SCANF(trip_path, "%d", &trip);
> 
> > +			SAFE_FILE_PRINTF(trip_path, "%s",
> > temp_high);
> SAFE_FILE_PRINTF(trip_path, "%d", temp_high);
> > +
> > +			while (sleep_time > 0) {
> 
> > +				tst_res(TDEBUG, "Running for %f
> > seconds, then sleeping for %d seconds", run_time, sleep_time);
> > +				pthread_t threads[nproc];
> > +
> > +				for (uint16_t j = 0; j < nproc;
> > j++)
> > +					pthread_create(&threads[j]
> > , NULL, cpu_workload, &run_time);
> IMHO using pthread (and therefore -lpthread) is overkill.
> Could you please use simple fork()?
> 
> testcases/cve/cve-2017-2618.c
> 
> 	for (i = 0; i < nproc; i++) {
> 		if (!SAFE_FORK()) {
> 			cpu_workload();
> 			exit(0);
> 		}
> 
> 		tst_reap_children();
> 	}
> 
Done.

> > +				for (uint16_t j = 0; j < nproc;
> > j++)
> > +					pthread_join(threads[j],
> > NULL);
> > +
> > +				SAFE_FILE_SCANF(temp_path, "%d",
> > &temp);
> > +				tst_res(TDEBUG, "Temperature for
> > %s after a test: %d", path, temp);
> > +
> > +				if (temp > atol(temp_high))
> > +					break;
> > +				sleep(sleep_time--);
> > +				run_time -= 3;
> > +			}
> > +			if (temp <= atol(temp_high)) {
> > +				tst_res(TINFO, "Zone temperature
> > is not rising as expected");
> > +				status = 0;
> > +			}
> 
> > +
> > +			tst_res(TDEBUG, "Restoring original
> > trip_point_1_temp value: %s", trip);
> > +			SAFE_FILE_PRINTF(trip_path, "%s", trip);
> > +		}
> > +	}
> > +	read_interrupts(interrupt_later, nproc);
> > +
> > +	for (uint16_t i = 0; i < nproc; i++) {
> > +		if (interrupt_later[i] < interrupt_init[i]) {
> > +			tst_res(TINFO, "For CPU %d interrupt
> > counter is currently %ld, while it was %ld before the test", i,
> > interrupt_later[i], interrupt_init[i]);
> > +			status = 0;
> > +		}
> > +	}
> > +
> > +	if (status)
> > +		tst_res(TPASS, "x86 package thermal interrupt
> > triggered");
> > +	else
> > +		tst_res(TFAIL, "x86 package thermal interrupt did
> > not trigger");
> > +}
> > +
> > +static struct tst_test test = {
> > +	.min_runtime = 180,
> 
> > +	.needs_root = true,
> nit: We always set it as 1.
Done.

> 
> Kind regards,
> Petr
> 
> > +	.supported_archs = (const char *const []) {
> > +		"x86",
> > +		"x86_64",
> > +		NULL
> > +	},
> > +	.test_all = run
> > +};
---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.

-- 
Mailing list info: https://lists.linux.it/listinfo/ltp

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [LTP] [PATCH v3] thermal: add new test group
  2026-01-23 13:12   ` Kubaj, Piotr
@ 2026-01-23 18:28     ` Petr Vorel
  2026-01-29 21:40       ` Wysocki, Rafael J
  0 siblings, 1 reply; 9+ messages in thread
From: Petr Vorel @ 2026-01-23 18:28 UTC (permalink / raw)
  To: Kubaj, Piotr
  Cc: Wysocki, Rafael J, Ossowski, Tomasz, Dubel, Helena Anna,
	Niestepski, Daniel, ltp@lists.linux.it

Hi Piotr,

> > > Then it
> > > + * decreases the threshold for sending a thermal interrupt to just
> > > above
> > > + * the current temperature and runs a workload on the CPU.

> > First, why test needs to run for 30 sec and then sleep for 10 sec?
Maybe the most important of my questions / points.

> Here the point is to use a decreasing timeout. The test starts with 10s
> cooldown to make sure that even pre-production CPU's, which might have
> their thermal protections disabled, cool down properly. Once sleep time
> reaches 0, the conclusion is that either there was not enough workload
> or somehow interrupts are not triggered after all.

Why 30 sec and then sleep for 10 sec? Is it really needed to do it this way?
Aren't these times depending on the tested machine? Some of them will fail due
time not running enough, other will waste time (if they get interrupt e.g. in 10
sec). The usual approach would be to have the timeout safe enough for any type
of hardware but proactively check the temperature and stop testing once it's
done.

...
> > > +			int temp;
> > > +
> > > +			snprintf(path, PATH_LEN,
> > > "/sys/class/thermal/thermal_zone%d/", i);
> > > +			strncpy(temp_path, path, PATH_LEN);
> > > +			strncat(temp_path, "temp", 4);
> > > +			tst_res(TDEBUG, "Testing %s", temp_path);
> > nit: I'd put this as TINFO to get at least some debug info without -
> > D.

> > > +			SAFE_FILE_SCANF(temp_path, "%d", &temp);
> > All SAFE_*() macros quit testing, therefore the following check is
> > not needed.
> It's necessary because if the temperature is below 0, there's most
> likely some kernel or sensor issue.

I'm sorry, I was wrong here. I mixed up vfscanf() return value from LTP
function, but you check the scanned value.

Kind regards,
Petr

> > > +			if (temp < 0) {
> > > +				tst_brk(TBROK, "Unexpected zone
> > > temperature value %d", temp);
> > > +				status = 0;
> > > +			}
> > > +			tst_res(TDEBUG, "Current temperature for
> > > %s: %d", path, temp);

-- 
Mailing list info: https://lists.linux.it/listinfo/ltp

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [LTP] [PATCH v3] thermal: add new test group
  2026-01-23 18:28     ` Petr Vorel
@ 2026-01-29 21:40       ` Wysocki, Rafael J
  2026-01-29 23:24         ` Petr Vorel
  0 siblings, 1 reply; 9+ messages in thread
From: Wysocki, Rafael J @ 2026-01-29 21:40 UTC (permalink / raw)
  To: Petr Vorel, Kubaj, Piotr
  Cc: Ossowski, Tomasz, Dubel, Helena Anna, Niestepski, Daniel,
	ltp@lists.linux.it


On 1/23/2026 7:28 PM, Petr Vorel wrote:
> Hi Piotr,
>
>>>> Then it
>>>> + * decreases the threshold for sending a thermal interrupt to just
>>>> above
>>>> + * the current temperature and runs a workload on the CPU.
>>> First, why test needs to run for 30 sec and then sleep for 10 sec?
> Maybe the most important of my questions / points.
>
>> Here the point is to use a decreasing timeout. The test starts with 10s
>> cooldown to make sure that even pre-production CPU's, which might have
>> their thermal protections disabled, cool down properly. Once sleep time
>> reaches 0, the conclusion is that either there was not enough workload
>> or somehow interrupts are not triggered after all.
> Why 30 sec and then sleep for 10 sec? Is it really needed to do it this way?

Of course not.


> Aren't these times depending on the tested machine? Some of them will fail due
> time not running enough,

That's unexpected with the numbers that are used, so something is amiss 
if it fails (and so it should fail).


> other will waste time (if they get interrupt e.g. in 10 sec).

That very well may happen, but is it a big deal?


> The usual approach would be to have the timeout safe enough for any type
> of hardware but proactively check the temperature and stop testing once it's
> done.

We want to create conditions in which the temperature should rise and if 
it doesn't, then there is a problem.

That said, the temperature can of course be checked more proactively, at 
least in principle, like say run cpu_workload() for 1s, check the 
temperature, repeat that several times, then cool down etc.

BR, Rafael



-- 
Mailing list info: https://lists.linux.it/listinfo/ltp

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [LTP] [PATCH v3] thermal: add new test group
  2026-01-29 21:40       ` Wysocki, Rafael J
@ 2026-01-29 23:24         ` Petr Vorel
  2026-02-03 14:38           ` Wysocki, Rafael J
  0 siblings, 1 reply; 9+ messages in thread
From: Petr Vorel @ 2026-01-29 23:24 UTC (permalink / raw)
  To: Wysocki, Rafael J
  Cc: Dubel, Helena Anna, Ossowski, Tomasz, Niestepski, Daniel,
	ltp@lists.linux.it


> On 1/23/2026 7:28 PM, Petr Vorel wrote:
> > Hi Piotr,

> > > > > Then it
> > > > > + * decreases the threshold for sending a thermal interrupt to just
> > > > > above
> > > > > + * the current temperature and runs a workload on the CPU.
> > > > First, why test needs to run for 30 sec and then sleep for 10 sec?
> > Maybe the most important of my questions / points.

> > > Here the point is to use a decreasing timeout. The test starts with 10s
> > > cooldown to make sure that even pre-production CPU's, which might have
> > > their thermal protections disabled, cool down properly. Once sleep time
> > > reaches 0, the conclusion is that either there was not enough workload
> > > or somehow interrupts are not triggered after all.
> > Why 30 sec and then sleep for 10 sec? Is it really needed to do it this way?

> Of course not.


> > Aren't these times depending on the tested machine? Some of them will fail due
> > time not running enough,

> That's unexpected with the numbers that are used, so something is amiss if
> it fails (and so it should fail).

I tested on very old (~15 years) Thinkpad, quite powerful 3 years old Thinkpad
and some random machine kind of between these two. All detect the threshold with
less than 10% of time (heating CPU runtime 1s and sleep time 1s).

You go with 30s to smaller values. Wouldn't be faster to go the opposite
(start with small values and increase)? The diff below runs successfully on all
3 machines. What am I missing?

> > other will waste time (if they get interrupt e.g. in 10 sec).

> That very well may happen, but is it a big deal?

We try to cut down the test runtime, because LTP test collection is huge
and runtime for it is many hours [1]. For example, we have many CVE tests which
detect race condition. Instead of running each test for "safe long time enough"
which could be e.g. several minutes for many of them we have way to shorten the
time (see include/tst_fuzzy_sync.h).

[1] https://linux-test-project.readthedocs.io/en/latest/developers/ground_rules.html#why-is-sleep-in-tests-bad-then

> > The usual approach would be to have the timeout safe enough for any type
> > of hardware but proactively check the temperature and stop testing once it's
> > done.

> We want to create conditions in which the temperature should rise and if it
> doesn't, then there is a problem.

Sure.

> That said, the temperature can of course be checked more proactively, at
> least in principle, like say run cpu_workload() for 1s, check the
> temperature, repeat that several times, then cool down etc.

Yeah, that's kind of my proposal above.

Also, all of my 3 machines have only 1x x86_pkg_temp type, but I suppose there
are devices with more (I was not able to figure that out from
drivers/thermal/intel/x86_pkg_temp_thermal.c but otherwise the test would not
try to test them all). But why it is important to test them all? Isn't it enough
just to test a single one?

Kind regards,
Petr

> BR, Rafael


+++ testcases/kernel/thermal/thermal_interrupt_events.c
@@ -117,8 +117,8 @@ static void *cpu_workload(double run_time)
 static void test_zone(int i)
 {
 			char path[NAME_MAX], temp_path[NAME_MAX];
-			int sleep_time = SLEEP, temp_high, temp;
-			double run_time = RUNTIME;
+			int sleep_time = 1, temp_high, temp;
+			double run_time = 1;
 
 			snprintf(path, NAME_MAX, "/sys/class/thermal/thermal_zone%d/", i);
 			strncpy(temp_path, path, NAME_MAX);
@@ -138,7 +138,7 @@ static void test_zone(int i)
 			SAFE_FILE_SCANF(trip_path, "%d", &trip);
 			SAFE_FILE_PRINTF(trip_path, "%d", temp_high);
 
-			while (sleep_time > 0) {
+			while (sleep_time < SLEEP) {
 				tst_res(TDEBUG, "Running for %f seconds, then sleeping for %d seconds", run_time, sleep_time);
 
 				for (int j = 0; j < nproc; j++) {
@@ -155,8 +155,8 @@ static void test_zone(int i)
 
 				if (temp > temp_high)
 					break;
-				sleep(sleep_time--);
-				run_time -= 3;
+				sleep(sleep_time++);
+				run_time += 3;
 			}
 
 			if (temp <= temp_high)

-- 
Mailing list info: https://lists.linux.it/listinfo/ltp

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [LTP] [PATCH v3] thermal: add new test group
  2026-01-29 23:24         ` Petr Vorel
@ 2026-02-03 14:38           ` Wysocki, Rafael J
  0 siblings, 0 replies; 9+ messages in thread
From: Wysocki, Rafael J @ 2026-02-03 14:38 UTC (permalink / raw)
  To: Petr Vorel
  Cc: Dubel, Helena Anna, Ossowski, Tomasz, Niestepski, Daniel,
	ltp@lists.linux.it


On 1/30/2026 12:24 AM, Petr Vorel wrote:
>> On 1/23/2026 7:28 PM, Petr Vorel wrote:
>>> Hi Piotr,
>>>>>> Then it
>>>>>> + * decreases the threshold for sending a thermal interrupt to just
>>>>>> above
>>>>>> + * the current temperature and runs a workload on the CPU.
>>>>> First, why test needs to run for 30 sec and then sleep for 10 sec?
>>> Maybe the most important of my questions / points.
>>>> Here the point is to use a decreasing timeout. The test starts with 10s
>>>> cooldown to make sure that even pre-production CPU's, which might have
>>>> their thermal protections disabled, cool down properly. Once sleep time
>>>> reaches 0, the conclusion is that either there was not enough workload
>>>> or somehow interrupts are not triggered after all.
>>> Why 30 sec and then sleep for 10 sec? Is it really needed to do it this way?
>> Of course not.
>
>>> Aren't these times depending on the tested machine? Some of them will fail due
>>> time not running enough,
>> That's unexpected with the numbers that are used, so something is amiss if
>> it fails (and so it should fail).
> I tested on very old (~15 years) Thinkpad, quite powerful 3 years old Thinkpad
> and some random machine kind of between these two. All detect the threshold with
> less than 10% of time (heating CPU runtime 1s and sleep time 1s).
>
> You go with 30s to smaller values. Wouldn't be faster to go the opposite
> (start with small values and increase)? The diff below runs successfully on all
> 3 machines. What am I missing?

That can be done too, the question is when to decide that the thermal 
sensor does not work in the case of no response.

The 30 s of running the workload continuously is kind of a "worst case" 
choice, but a smaller value can be used to start with.  If a system that 
needs a longer time is found, the test can be updated I suppose.

>>> other will waste time (if they get interrupt e.g. in 10 sec).
>> That very well may happen, but is it a big deal?
> We try to cut down the test runtime, because LTP test collection is huge
> and runtime for it is many hours [1]. For example, we have many CVE tests which
> detect race condition. Instead of running each test for "safe long time enough"
> which could be e.g. several minutes for many of them we have way to shorten the
> time (see include/tst_fuzzy_sync.h).

Fair enough.


> [1] https://linux-test-project.readthedocs.io/en/latest/developers/ground_rules.html#why-is-sleep-in-tests-bad-then
>
>>> The usual approach would be to have the timeout safe enough for any type
>>> of hardware but proactively check the temperature and stop testing once it's
>>> done.
>> We want to create conditions in which the temperature should rise and if it
>> doesn't, then there is a problem.
> Sure.
>
>> That said, the temperature can of course be checked more proactively, at
>> least in principle, like say run cpu_workload() for 1s, check the
>> temperature, repeat that several times, then cool down etc.
> Yeah, that's kind of my proposal above.
>
> Also, all of my 3 machines have only 1x x86_pkg_temp type, but I suppose there
> are devices with more (I was not able to figure that out from
> drivers/thermal/intel/x86_pkg_temp_thermal.c but otherwise the test would not
> try to test them all). But why it is important to test them all? Isn't it enough
> just to test a single one?

The thermal sensors in question are per processor package.  With 
multiple packages in a system, if any of them does not work as expected, 
we want to know.

BR, Rafael


>
>
> +++ testcases/kernel/thermal/thermal_interrupt_events.c
> @@ -117,8 +117,8 @@ static void *cpu_workload(double run_time)
>   static void test_zone(int i)
>   {
>   			char path[NAME_MAX], temp_path[NAME_MAX];
> -			int sleep_time = SLEEP, temp_high, temp;
> -			double run_time = RUNTIME;
> +			int sleep_time = 1, temp_high, temp;
> +			double run_time = 1;
>   
>   			snprintf(path, NAME_MAX, "/sys/class/thermal/thermal_zone%d/", i);
>   			strncpy(temp_path, path, NAME_MAX);
> @@ -138,7 +138,7 @@ static void test_zone(int i)
>   			SAFE_FILE_SCANF(trip_path, "%d", &trip);
>   			SAFE_FILE_PRINTF(trip_path, "%d", temp_high);
>   
> -			while (sleep_time > 0) {
> +			while (sleep_time < SLEEP) {
>   				tst_res(TDEBUG, "Running for %f seconds, then sleeping for %d seconds", run_time, sleep_time);
>   
>   				for (int j = 0; j < nproc; j++) {
> @@ -155,8 +155,8 @@ static void test_zone(int i)
>   
>   				if (temp > temp_high)
>   					break;
> -				sleep(sleep_time--);
> -				run_time -= 3;
> +				sleep(sleep_time++);
> +				run_time += 3;
>   			}
>   
>   			if (temp <= temp_high)

-- 
Mailing list info: https://lists.linux.it/listinfo/ltp

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2026-02-03 14:39 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-01-21 13:41 [LTP] [PATCH v3] thermal: add new test group Piotr Kubaj
2026-01-22 14:07 ` Petr Vorel
2026-01-23 13:12   ` Kubaj, Piotr
2026-01-23 18:28     ` Petr Vorel
2026-01-29 21:40       ` Wysocki, Rafael J
2026-01-29 23:24         ` Petr Vorel
2026-02-03 14:38           ` Wysocki, Rafael J
  -- strict thread matches above, loose matches on Subject: below --
2025-11-19 11:45 Piotr Kubaj
2025-11-20 16:30 ` Petr Vorel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox