* [PATCH 1/4] tools/power turbostat: Add --no-msr option
2024-01-12 12:48 [PATCH 0/4] turbostat msr, perf controls and aperf/mperf via perf Patryk Wlazlyn
@ 2024-01-12 12:48 ` Patryk Wlazlyn
2024-01-13 1:00 ` Len Brown
2024-01-12 12:48 ` [PATCH 2/4] tools/power turbostat: Add --no-perf option Patryk Wlazlyn
` (2 subsequent siblings)
3 siblings, 1 reply; 11+ messages in thread
From: Patryk Wlazlyn @ 2024-01-12 12:48 UTC (permalink / raw)
To: len.brown; +Cc: linux-pm
Add --no-msr option to allow users to run turbostat without
accessing MSRs via the MSR driver.
Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Reviewed-by: Len Brown <len.brown@intel.com>
---
tools/power/x86/turbostat/turbostat.8 | 2 +
tools/power/x86/turbostat/turbostat.c | 213 +++++++++++++++++++-------
2 files changed, 160 insertions(+), 55 deletions(-)
diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 8f08c3fd498d..5575c947134d 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -67,6 +67,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
.PP
\fB--quiet\fP Do not decode and print the system configuration header information.
.PP
++\fB--no-msr\fP Disable all the uses of the MSR driver.
++.PP
\fB--interval seconds\fP overrides the default 5.0 second measurement interval.
.PP
\fB--num_iterations num\fP number of the measurement iterations.
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 7a334377f92b..f192d75d5977 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -36,6 +36,7 @@
#include <linux/perf_event.h>
#include <asm/unistd.h>
#include <stdbool.h>
+#include <assert.h>
#define UNUSED(x) (void)(x)
@@ -263,6 +264,7 @@ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */
unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */
unsigned int first_counter_read = 1;
int ignore_stdin;
+bool no_msr;
int get_msr(int cpu, off_t offset, unsigned long long *msr);
@@ -1280,13 +1282,45 @@ int get_msr_fd(int cpu)
sprintf(pathname, "/dev/cpu/%d/msr", cpu);
fd = open(pathname, O_RDONLY);
if (fd < 0)
- err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname);
+ err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, "
+ "or run with --no-msr, or run as root", pathname);
fd_percpu[cpu] = fd;
return fd;
}
+static void bic_disable_msr_access(void)
+{
+ const unsigned long bic_msrs =
+ BIC_Avg_MHz |
+ BIC_Busy |
+ BIC_Bzy_MHz |
+ BIC_IPC |
+ BIC_SMI |
+ BIC_CPU_c1 |
+ BIC_CPU_c3 |
+ BIC_CPU_c6 |
+ BIC_CPU_c7 |
+ BIC_Mod_c6 |
+ BIC_CoreTmp |
+ BIC_Totl_c0 |
+ BIC_Any_c0 |
+ BIC_GFX_c0 |
+ BIC_CPUGFX |
+ BIC_Pkgpc2 |
+ BIC_Pkgpc3 |
+ BIC_Pkgpc6 |
+ BIC_Pkgpc7 |
+ BIC_Pkgpc8 |
+ BIC_Pkgpc9 |
+ BIC_Pkgpc10 |
+ BIC_PkgTmp |
+ BIC_RAMWatt;
+
+ bic_enabled &= ~bic_msrs;
+}
+
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
{
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
@@ -1326,6 +1360,8 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr)
{
ssize_t retval;
+ assert(!no_msr);
+
retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset);
if (retval != sizeof *msr)
@@ -1369,6 +1405,7 @@ void help(void)
" Override default 5-second measurement interval\n"
" -J, --Joules displays energy in Joules instead of Watts\n"
" -l, --list list column headers only\n"
+ " -M, --no-msr Disable all uses of the MSR driver\n"
" -n, --num_iterations num\n"
" number of the measurement iterations\n"
" -N, --header_iterations num\n"
@@ -2578,6 +2615,7 @@ unsigned long long snapshot_sysfs_counter(char *path)
int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp)
{
if (mp->msr_num != 0) {
+ assert(!no_msr);
if (get_msr(cpu, mp->msr_num, counterp))
return -1;
} else {
@@ -2627,6 +2665,9 @@ int get_epb(int cpu)
return epb;
msr_fallback:
+ if (no_msr)
+ return -1;
+
get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr);
return msr & 0xf;
@@ -2846,7 +2887,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
if (DO_BIC(BIC_CORE_THROT_CNT))
get_core_throt_cnt(cpu, &c->core_throt_cnt);
- if (platform->rapl_msrs & RAPL_AMD_F17H) {
+ if ((platform->rapl_msrs & RAPL_AMD_F17H) && !no_msr) {
if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr))
return -14;
c->core_energy = msr & 0xFFFFFFFF;
@@ -2911,41 +2952,44 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
if (DO_BIC(BIC_SYS_LPI))
p->sys_lpi = cpuidle_cur_sys_lpi_us;
- if (platform->rapl_msrs & RAPL_PKG) {
- if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr))
- return -13;
- p->energy_pkg = msr;
- }
- if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) {
- if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr))
- return -14;
- p->energy_cores = msr;
- }
- if (platform->rapl_msrs & RAPL_DRAM) {
- if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr))
- return -15;
- p->energy_dram = msr;
- }
- if (platform->rapl_msrs & RAPL_GFX) {
- if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr))
- return -16;
- p->energy_gfx = msr;
- }
- if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) {
- if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr))
- return -16;
- p->rapl_pkg_perf_status = msr;
- }
- if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) {
- if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr))
- return -16;
- p->rapl_dram_perf_status = msr;
- }
- if (platform->rapl_msrs & RAPL_AMD_F17H) {
- if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr))
- return -13;
- p->energy_pkg = msr;
+ if (!no_msr) {
+ if (platform->rapl_msrs & RAPL_PKG) {
+ if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr))
+ return -13;
+ p->energy_pkg = msr;
+ }
+ if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) {
+ if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr))
+ return -14;
+ p->energy_cores = msr;
+ }
+ if (platform->rapl_msrs & RAPL_DRAM) {
+ if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr))
+ return -15;
+ p->energy_dram = msr;
+ }
+ if (platform->rapl_msrs & RAPL_GFX) {
+ if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr))
+ return -16;
+ p->energy_gfx = msr;
+ }
+ if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) {
+ if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr))
+ return -16;
+ p->rapl_pkg_perf_status = msr;
+ }
+ if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) {
+ if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr))
+ return -16;
+ p->rapl_dram_perf_status = msr;
+ }
+ if (platform->rapl_msrs & RAPL_AMD_F17H) {
+ if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr))
+ return -13;
+ p->energy_pkg = msr;
+ }
}
+
if (DO_BIC(BIC_PkgTmp)) {
if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
return -17;
@@ -3053,7 +3097,7 @@ void probe_cst_limit(void)
unsigned long long msr;
int *pkg_cstate_limits;
- if (!platform->has_nhm_msrs)
+ if (!platform->has_nhm_msrs || no_msr)
return;
switch (platform->cst_limit) {
@@ -3097,7 +3141,7 @@ static void dump_platform_info(void)
unsigned long long msr;
unsigned int ratio;
- if (!platform->has_nhm_msrs)
+ if (!platform->has_nhm_msrs || no_msr)
return;
get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
@@ -3115,7 +3159,7 @@ static void dump_power_ctl(void)
{
unsigned long long msr;
- if (!platform->has_nhm_msrs)
+ if (!platform->has_nhm_msrs || no_msr)
return;
get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
@@ -3321,7 +3365,7 @@ static void dump_cst_cfg(void)
{
unsigned long long msr;
- if (!platform->has_nhm_msrs)
+ if (!platform->has_nhm_msrs || no_msr)
return;
get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
@@ -3393,7 +3437,7 @@ void print_irtl(void)
{
unsigned long long msr;
- if (!platform->has_irtl_msrs)
+ if (!platform->has_irtl_msrs || no_msr)
return;
if (platform->supported_cstates & PC3) {
@@ -4173,6 +4217,8 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
int ret, idx;
unsigned long long msr_cur, msr_last;
+ assert(!no_msr);
+
if (!per_cpu_msr_sum)
return 1;
@@ -4201,6 +4247,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
UNUSED(c);
UNUSED(p);
+ assert(!no_msr);
+
for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
unsigned long long msr_cur, msr_last;
off_t offset;
@@ -4442,10 +4490,13 @@ void check_permissions(void)
do_exit += check_for_cap_sys_rawio();
/* test file permissions */
- sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
- if (euidaccess(pathname, R_OK)) {
- do_exit++;
- warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr");
+ if (!no_msr) {
+ sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
+ if (euidaccess(pathname, R_OK)) {
+ do_exit++;
+ warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr, "
+ "or run with --no-msr");
+ }
}
/* if all else fails, thell them to be root */
@@ -4462,7 +4513,7 @@ void probe_bclk(void)
unsigned long long msr;
unsigned int base_ratio;
- if (!platform->has_nhm_msrs)
+ if (!platform->has_nhm_msrs || no_msr)
return;
if (platform->bclk_freq == BCLK_100MHZ)
@@ -4502,7 +4553,7 @@ static void dump_turbo_ratio_info(void)
if (!has_turbo)
return;
- if (!platform->has_nhm_msrs)
+ if (!platform->has_nhm_msrs || no_msr)
return;
if (platform->trl_msrs & TRL_LIMIT2)
@@ -4783,6 +4834,9 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
UNUSED(c);
UNUSED(p);
+ if (no_msr)
+ return 0;
+
if (!has_hwp)
return 0;
@@ -4869,6 +4923,9 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
UNUSED(c);
UNUSED(p);
+ if (no_msr)
+ return 0;
+
cpu = t->cpu_id;
/* per-package */
@@ -5202,7 +5259,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
*/
void probe_rapl(void)
{
- if (!platform->rapl_msrs)
+ if (!platform->rapl_msrs || no_msr)
return;
if (genuine_intel)
@@ -5258,7 +5315,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk
}
/* Temperature Target MSR is Nehalem and newer only */
- if (!platform->has_nhm_msrs)
+ if (!platform->has_nhm_msrs || no_msr)
goto guess;
if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
@@ -5305,6 +5362,9 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
UNUSED(c);
UNUSED(p);
+ if (no_msr)
+ return 0;
+
if (!(do_dts || do_ptm))
return 0;
@@ -5402,6 +5462,9 @@ void decode_feature_control_msr(void)
{
unsigned long long msr;
+ if (no_msr)
+ return;
+
if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
@@ -5411,6 +5474,9 @@ void decode_misc_enable_msr(void)
{
unsigned long long msr;
+ if (no_msr)
+ return;
+
if (!genuine_intel)
return;
@@ -5428,6 +5494,9 @@ void decode_misc_feature_control(void)
{
unsigned long long msr;
+ if (no_msr)
+ return;
+
if (!platform->has_msr_misc_feature_control)
return;
@@ -5449,6 +5518,9 @@ void decode_misc_pwr_mgmt_msr(void)
{
unsigned long long msr;
+ if (no_msr)
+ return;
+
if (!platform->has_msr_misc_pwr_mgmt)
return;
@@ -5468,6 +5540,9 @@ void decode_c6_demotion_policy_msr(void)
{
unsigned long long msr;
+ if (no_msr)
+ return;
+
if (!platform->has_msr_c6_demotion_policy_config)
return;
@@ -5563,7 +5638,7 @@ void probe_cstates(void)
if (platform->has_msr_module_c6_res_ms)
BIC_PRESENT(BIC_Mod_c6);
- if (platform->has_ext_cst_msrs) {
+ if (platform->has_ext_cst_msrs && !no_msr) {
BIC_PRESENT(BIC_Totl_c0);
BIC_PRESENT(BIC_Any_c0);
BIC_PRESENT(BIC_GFX_c0);
@@ -5650,8 +5725,10 @@ void process_cpuid()
ecx_flags = ecx;
edx_flags = edx;
- if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
- warnx("get_msr(UCODE)");
+ if (!no_msr) {
+ if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
+ warnx("get_msr(UCODE)");
+ }
/*
* check max extended function levels of CPUID.
@@ -5814,7 +5891,7 @@ void probe_pm_features(void)
probe_thermal();
- if (platform->has_nhm_msrs)
+ if (platform->has_nhm_msrs && !no_msr)
BIC_PRESENT(BIC_SMI);
if (!quiet)
@@ -6291,6 +6368,9 @@ int add_counter(unsigned int msr_num, char *path, char *name,
{
struct msr_counter *msrp;
+ if (no_msr && msr_num)
+ errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num);
+
msrp = calloc(1, sizeof(struct msr_counter));
if (msrp == NULL) {
perror("calloc");
@@ -6595,6 +6675,7 @@ void cmdline(int argc, char **argv)
{ "list", no_argument, 0, 'l' },
{ "out", required_argument, 0, 'o' },
{ "quiet", no_argument, 0, 'q' },
+ { "no-msr", no_argument, 0, 'M' },
{ "show", required_argument, 0, 's' },
{ "Summary", no_argument, 0, 'S' },
{ "TCC", required_argument, 0, 'T' },
@@ -6604,7 +6685,22 @@ void cmdline(int argc, char **argv)
progname = argv[0];
- while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) {
+ /*
+ * Parse some options early, because they may make other options invalid,
+ * like adding the MSR counter with --add and at the same time using --no-msr.
+ */
+ while ((opt = getopt_long_only(argc, argv, "M", long_options, &option_index)) != -1) {
+ switch (opt) {
+ case 'M':
+ no_msr = 1;
+ break;
+ default:
+ break;
+ }
+ }
+ optind = 0;
+
+ while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) {
switch (opt) {
case 'a':
parse_add_command(optarg);
@@ -6662,6 +6758,9 @@ void cmdline(int argc, char **argv)
case 'q':
quiet = 1;
break;
+ case 'M':
+ /* Parsed earlier */
+ break;
case 'n':
num_iterations = strtod(optarg, NULL);
@@ -6722,6 +6821,9 @@ int main(int argc, char **argv)
outf = stderr;
cmdline(argc, argv);
+ if (no_msr)
+ bic_disable_msr_access();
+
if (!quiet) {
print_version();
print_bootcmd();
@@ -6731,7 +6833,8 @@ int main(int argc, char **argv)
turbostat_init();
- msr_sum_record();
+ if (!no_msr)
+ msr_sum_record();
/* dump counters and exit */
if (dump_only)
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH 1/4] tools/power turbostat: Add --no-msr option
2024-01-12 12:48 ` [PATCH 1/4] tools/power turbostat: Add --no-msr option Patryk Wlazlyn
@ 2024-01-13 1:00 ` Len Brown
2024-01-15 12:58 ` Patryk Wlazlyn
0 siblings, 1 reply; 11+ messages in thread
From: Len Brown @ 2024-01-13 1:00 UTC (permalink / raw)
To: Patryk Wlazlyn; +Cc: len.brown, linux-pm
> - if (platform->has_ext_cst_msrs) {
> + if (platform->has_ext_cst_msrs && !no_msr) {
6.7 added probe_platform_features().
Perhaps after it runs, we can simply update the result to disable
those impacted by no_msr?
platform->has_cst_msrs = 0;
platform->has_nhm_msrs = 0;
platform->rapl_msrs = 0;
etc.
that would avoid having to scatter no_msr in a lot of places.
of course that begs the question of what to do when a feature is
available both via MSR and via perf -- which is about to happen...
it also adds a place for us to make an error when we add a feature --
but even if we had a table and a bit to say whether the feature is
available via msr, we'd still have the opportunity to mess that up...
On Fri, Jan 12, 2024 at 6:49 AM Patryk Wlazlyn
<patryk.wlazlyn@linux.intel.com> wrote:
>
> Add --no-msr option to allow users to run turbostat without
> accessing MSRs via the MSR driver.
>
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
> Reviewed-by: Len Brown <len.brown@intel.com>
> ---
> tools/power/x86/turbostat/turbostat.8 | 2 +
> tools/power/x86/turbostat/turbostat.c | 213 +++++++++++++++++++-------
> 2 files changed, 160 insertions(+), 55 deletions(-)
>
> diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
> index 8f08c3fd498d..5575c947134d 100644
> --- a/tools/power/x86/turbostat/turbostat.8
> +++ b/tools/power/x86/turbostat/turbostat.8
> @@ -67,6 +67,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
> .PP
> \fB--quiet\fP Do not decode and print the system configuration header information.
> .PP
> ++\fB--no-msr\fP Disable all the uses of the MSR driver.
> ++.PP
> \fB--interval seconds\fP overrides the default 5.0 second measurement interval.
> .PP
> \fB--num_iterations num\fP number of the measurement iterations.
> diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
> index 7a334377f92b..f192d75d5977 100644
> --- a/tools/power/x86/turbostat/turbostat.c
> +++ b/tools/power/x86/turbostat/turbostat.c
> @@ -36,6 +36,7 @@
> #include <linux/perf_event.h>
> #include <asm/unistd.h>
> #include <stdbool.h>
> +#include <assert.h>
>
> #define UNUSED(x) (void)(x)
>
> @@ -263,6 +264,7 @@ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */
> unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */
> unsigned int first_counter_read = 1;
> int ignore_stdin;
> +bool no_msr;
>
> int get_msr(int cpu, off_t offset, unsigned long long *msr);
>
> @@ -1280,13 +1282,45 @@ int get_msr_fd(int cpu)
> sprintf(pathname, "/dev/cpu/%d/msr", cpu);
> fd = open(pathname, O_RDONLY);
> if (fd < 0)
> - err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname);
> + err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, "
> + "or run with --no-msr, or run as root", pathname);
>
> fd_percpu[cpu] = fd;
>
> return fd;
> }
>
> +static void bic_disable_msr_access(void)
> +{
> + const unsigned long bic_msrs =
> + BIC_Avg_MHz |
> + BIC_Busy |
> + BIC_Bzy_MHz |
> + BIC_IPC |
> + BIC_SMI |
> + BIC_CPU_c1 |
> + BIC_CPU_c3 |
> + BIC_CPU_c6 |
> + BIC_CPU_c7 |
> + BIC_Mod_c6 |
> + BIC_CoreTmp |
> + BIC_Totl_c0 |
> + BIC_Any_c0 |
> + BIC_GFX_c0 |
> + BIC_CPUGFX |
> + BIC_Pkgpc2 |
> + BIC_Pkgpc3 |
> + BIC_Pkgpc6 |
> + BIC_Pkgpc7 |
> + BIC_Pkgpc8 |
> + BIC_Pkgpc9 |
> + BIC_Pkgpc10 |
> + BIC_PkgTmp |
> + BIC_RAMWatt;
> +
> + bic_enabled &= ~bic_msrs;
> +}
> +
> static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
> {
> return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
> @@ -1326,6 +1360,8 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr)
> {
> ssize_t retval;
>
> + assert(!no_msr);
> +
> retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset);
>
> if (retval != sizeof *msr)
> @@ -1369,6 +1405,7 @@ void help(void)
> " Override default 5-second measurement interval\n"
> " -J, --Joules displays energy in Joules instead of Watts\n"
> " -l, --list list column headers only\n"
> + " -M, --no-msr Disable all uses of the MSR driver\n"
> " -n, --num_iterations num\n"
> " number of the measurement iterations\n"
> " -N, --header_iterations num\n"
> @@ -2578,6 +2615,7 @@ unsigned long long snapshot_sysfs_counter(char *path)
> int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp)
> {
> if (mp->msr_num != 0) {
> + assert(!no_msr);
> if (get_msr(cpu, mp->msr_num, counterp))
> return -1;
> } else {
> @@ -2627,6 +2665,9 @@ int get_epb(int cpu)
> return epb;
>
> msr_fallback:
> + if (no_msr)
> + return -1;
> +
> get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr);
>
> return msr & 0xf;
> @@ -2846,7 +2887,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
> if (DO_BIC(BIC_CORE_THROT_CNT))
> get_core_throt_cnt(cpu, &c->core_throt_cnt);
>
> - if (platform->rapl_msrs & RAPL_AMD_F17H) {
> + if ((platform->rapl_msrs & RAPL_AMD_F17H) && !no_msr) {
> if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr))
> return -14;
> c->core_energy = msr & 0xFFFFFFFF;
> @@ -2911,41 +2952,44 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
> if (DO_BIC(BIC_SYS_LPI))
> p->sys_lpi = cpuidle_cur_sys_lpi_us;
>
> - if (platform->rapl_msrs & RAPL_PKG) {
> - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr))
> - return -13;
> - p->energy_pkg = msr;
> - }
> - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) {
> - if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr))
> - return -14;
> - p->energy_cores = msr;
> - }
> - if (platform->rapl_msrs & RAPL_DRAM) {
> - if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr))
> - return -15;
> - p->energy_dram = msr;
> - }
> - if (platform->rapl_msrs & RAPL_GFX) {
> - if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr))
> - return -16;
> - p->energy_gfx = msr;
> - }
> - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) {
> - if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr))
> - return -16;
> - p->rapl_pkg_perf_status = msr;
> - }
> - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) {
> - if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr))
> - return -16;
> - p->rapl_dram_perf_status = msr;
> - }
> - if (platform->rapl_msrs & RAPL_AMD_F17H) {
> - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr))
> - return -13;
> - p->energy_pkg = msr;
> + if (!no_msr) {
> + if (platform->rapl_msrs & RAPL_PKG) {
> + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr))
> + return -13;
> + p->energy_pkg = msr;
> + }
> + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) {
> + if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr))
> + return -14;
> + p->energy_cores = msr;
> + }
> + if (platform->rapl_msrs & RAPL_DRAM) {
> + if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr))
> + return -15;
> + p->energy_dram = msr;
> + }
> + if (platform->rapl_msrs & RAPL_GFX) {
> + if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr))
> + return -16;
> + p->energy_gfx = msr;
> + }
> + if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) {
> + if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr))
> + return -16;
> + p->rapl_pkg_perf_status = msr;
> + }
> + if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) {
> + if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr))
> + return -16;
> + p->rapl_dram_perf_status = msr;
> + }
> + if (platform->rapl_msrs & RAPL_AMD_F17H) {
> + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr))
> + return -13;
> + p->energy_pkg = msr;
> + }
> }
> +
> if (DO_BIC(BIC_PkgTmp)) {
> if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
> return -17;
> @@ -3053,7 +3097,7 @@ void probe_cst_limit(void)
> unsigned long long msr;
> int *pkg_cstate_limits;
>
> - if (!platform->has_nhm_msrs)
> + if (!platform->has_nhm_msrs || no_msr)
> return;
>
> switch (platform->cst_limit) {
> @@ -3097,7 +3141,7 @@ static void dump_platform_info(void)
> unsigned long long msr;
> unsigned int ratio;
>
> - if (!platform->has_nhm_msrs)
> + if (!platform->has_nhm_msrs || no_msr)
> return;
>
> get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
> @@ -3115,7 +3159,7 @@ static void dump_power_ctl(void)
> {
> unsigned long long msr;
>
> - if (!platform->has_nhm_msrs)
> + if (!platform->has_nhm_msrs || no_msr)
> return;
>
> get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
> @@ -3321,7 +3365,7 @@ static void dump_cst_cfg(void)
> {
> unsigned long long msr;
>
> - if (!platform->has_nhm_msrs)
> + if (!platform->has_nhm_msrs || no_msr)
> return;
>
> get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
> @@ -3393,7 +3437,7 @@ void print_irtl(void)
> {
> unsigned long long msr;
>
> - if (!platform->has_irtl_msrs)
> + if (!platform->has_irtl_msrs || no_msr)
> return;
>
> if (platform->supported_cstates & PC3) {
> @@ -4173,6 +4217,8 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
> int ret, idx;
> unsigned long long msr_cur, msr_last;
>
> + assert(!no_msr);
> +
> if (!per_cpu_msr_sum)
> return 1;
>
> @@ -4201,6 +4247,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
> UNUSED(c);
> UNUSED(p);
>
> + assert(!no_msr);
> +
> for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
> unsigned long long msr_cur, msr_last;
> off_t offset;
> @@ -4442,10 +4490,13 @@ void check_permissions(void)
> do_exit += check_for_cap_sys_rawio();
>
> /* test file permissions */
> - sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
> - if (euidaccess(pathname, R_OK)) {
> - do_exit++;
> - warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr");
> + if (!no_msr) {
> + sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
> + if (euidaccess(pathname, R_OK)) {
> + do_exit++;
> + warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr, "
> + "or run with --no-msr");
> + }
> }
>
> /* if all else fails, thell them to be root */
> @@ -4462,7 +4513,7 @@ void probe_bclk(void)
> unsigned long long msr;
> unsigned int base_ratio;
>
> - if (!platform->has_nhm_msrs)
> + if (!platform->has_nhm_msrs || no_msr)
> return;
>
> if (platform->bclk_freq == BCLK_100MHZ)
> @@ -4502,7 +4553,7 @@ static void dump_turbo_ratio_info(void)
> if (!has_turbo)
> return;
>
> - if (!platform->has_nhm_msrs)
> + if (!platform->has_nhm_msrs || no_msr)
> return;
>
> if (platform->trl_msrs & TRL_LIMIT2)
> @@ -4783,6 +4834,9 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
> UNUSED(c);
> UNUSED(p);
>
> + if (no_msr)
> + return 0;
> +
> if (!has_hwp)
> return 0;
>
> @@ -4869,6 +4923,9 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
> UNUSED(c);
> UNUSED(p);
>
> + if (no_msr)
> + return 0;
> +
> cpu = t->cpu_id;
>
> /* per-package */
> @@ -5202,7 +5259,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
> */
> void probe_rapl(void)
> {
> - if (!platform->rapl_msrs)
> + if (!platform->rapl_msrs || no_msr)
> return;
>
> if (genuine_intel)
> @@ -5258,7 +5315,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk
> }
>
> /* Temperature Target MSR is Nehalem and newer only */
> - if (!platform->has_nhm_msrs)
> + if (!platform->has_nhm_msrs || no_msr)
> goto guess;
>
> if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
> @@ -5305,6 +5362,9 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
> UNUSED(c);
> UNUSED(p);
>
> + if (no_msr)
> + return 0;
> +
> if (!(do_dts || do_ptm))
> return 0;
>
> @@ -5402,6 +5462,9 @@ void decode_feature_control_msr(void)
> {
> unsigned long long msr;
>
> + if (no_msr)
> + return;
> +
> if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
> fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
> base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
> @@ -5411,6 +5474,9 @@ void decode_misc_enable_msr(void)
> {
> unsigned long long msr;
>
> + if (no_msr)
> + return;
> +
> if (!genuine_intel)
> return;
>
> @@ -5428,6 +5494,9 @@ void decode_misc_feature_control(void)
> {
> unsigned long long msr;
>
> + if (no_msr)
> + return;
> +
> if (!platform->has_msr_misc_feature_control)
> return;
>
> @@ -5449,6 +5518,9 @@ void decode_misc_pwr_mgmt_msr(void)
> {
> unsigned long long msr;
>
> + if (no_msr)
> + return;
> +
> if (!platform->has_msr_misc_pwr_mgmt)
> return;
>
> @@ -5468,6 +5540,9 @@ void decode_c6_demotion_policy_msr(void)
> {
> unsigned long long msr;
>
> + if (no_msr)
> + return;
> +
> if (!platform->has_msr_c6_demotion_policy_config)
> return;
>
> @@ -5563,7 +5638,7 @@ void probe_cstates(void)
> if (platform->has_msr_module_c6_res_ms)
> BIC_PRESENT(BIC_Mod_c6);
>
> - if (platform->has_ext_cst_msrs) {
> + if (platform->has_ext_cst_msrs && !no_msr) {
> BIC_PRESENT(BIC_Totl_c0);
> BIC_PRESENT(BIC_Any_c0);
> BIC_PRESENT(BIC_GFX_c0);
> @@ -5650,8 +5725,10 @@ void process_cpuid()
> ecx_flags = ecx;
> edx_flags = edx;
>
> - if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
> - warnx("get_msr(UCODE)");
> + if (!no_msr) {
> + if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
> + warnx("get_msr(UCODE)");
> + }
>
> /*
> * check max extended function levels of CPUID.
> @@ -5814,7 +5891,7 @@ void probe_pm_features(void)
>
> probe_thermal();
>
> - if (platform->has_nhm_msrs)
> + if (platform->has_nhm_msrs && !no_msr)
> BIC_PRESENT(BIC_SMI);
>
> if (!quiet)
> @@ -6291,6 +6368,9 @@ int add_counter(unsigned int msr_num, char *path, char *name,
> {
> struct msr_counter *msrp;
>
> + if (no_msr && msr_num)
> + errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num);
> +
> msrp = calloc(1, sizeof(struct msr_counter));
> if (msrp == NULL) {
> perror("calloc");
> @@ -6595,6 +6675,7 @@ void cmdline(int argc, char **argv)
> { "list", no_argument, 0, 'l' },
> { "out", required_argument, 0, 'o' },
> { "quiet", no_argument, 0, 'q' },
> + { "no-msr", no_argument, 0, 'M' },
> { "show", required_argument, 0, 's' },
> { "Summary", no_argument, 0, 'S' },
> { "TCC", required_argument, 0, 'T' },
> @@ -6604,7 +6685,22 @@ void cmdline(int argc, char **argv)
>
> progname = argv[0];
>
> - while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) {
> + /*
> + * Parse some options early, because they may make other options invalid,
> + * like adding the MSR counter with --add and at the same time using --no-msr.
> + */
> + while ((opt = getopt_long_only(argc, argv, "M", long_options, &option_index)) != -1) {
> + switch (opt) {
> + case 'M':
> + no_msr = 1;
> + break;
> + default:
> + break;
> + }
> + }
> + optind = 0;
> +
> + while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) {
> switch (opt) {
> case 'a':
> parse_add_command(optarg);
> @@ -6662,6 +6758,9 @@ void cmdline(int argc, char **argv)
> case 'q':
> quiet = 1;
> break;
> + case 'M':
> + /* Parsed earlier */
> + break;
> case 'n':
> num_iterations = strtod(optarg, NULL);
>
> @@ -6722,6 +6821,9 @@ int main(int argc, char **argv)
> outf = stderr;
> cmdline(argc, argv);
>
> + if (no_msr)
> + bic_disable_msr_access();
> +
> if (!quiet) {
> print_version();
> print_bootcmd();
> @@ -6731,7 +6833,8 @@ int main(int argc, char **argv)
>
> turbostat_init();
>
> - msr_sum_record();
> + if (!no_msr)
> + msr_sum_record();
>
> /* dump counters and exit */
> if (dump_only)
> --
> 2.43.0
>
>
--
Len Brown, Intel
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH 1/4] tools/power turbostat: Add --no-msr option
2024-01-13 1:00 ` Len Brown
@ 2024-01-15 12:58 ` Patryk Wlazlyn
0 siblings, 0 replies; 11+ messages in thread
From: Patryk Wlazlyn @ 2024-01-15 12:58 UTC (permalink / raw)
To: Len Brown; +Cc: len.brown, linux-pm
> 6.7 added probe_platform_features().
>
> Perhaps after it runs, we can simply update the result to disable
> those impacted by no_msr?
>
> platform->has_cst_msrs = 0;
> platform->has_nhm_msrs = 0;
> platform->rapl_msrs = 0;
> etc.
>
> that would avoid having to scatter no_msr in a lot of places.
>
> of course that begs the question of what to do when a feature is
> available both via MSR and via perf -- which is about to happen...
>
> it also adds a place for us to make an error when we add a feature --
> but even if we had a table and a bit to say whether the feature is
> available via msr, we'd still have the opportunity to mess that up...
Right. I thought about it, but I didn't like the idea of modifying the
platform information, because of the reasons you outlined above, but
also we might have some logic along the way that cares whether something
is present but does not necessarily use the MSR driver (or any other method) to get it.
With these changes, sure there are some no_msr checks scattered around,
but they are usually close to where the data is acquired, making it easy
to add an alternative path.
Modifying the platform_features would require us to "unconst" a lot of the
structures.
I think it's more clear to store a platform information in the
platform_features and leave no_msr just to store whether the user
requested the mode. This comes with a few extra checks, but leaves the
structures independent.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/4] tools/power turbostat: Add --no-perf option
2024-01-12 12:48 [PATCH 0/4] turbostat msr, perf controls and aperf/mperf via perf Patryk Wlazlyn
2024-01-12 12:48 ` [PATCH 1/4] tools/power turbostat: Add --no-msr option Patryk Wlazlyn
@ 2024-01-12 12:48 ` Patryk Wlazlyn
2024-01-13 1:03 ` Len Brown
2024-01-12 12:48 ` [PATCH 3/4] tools/power turbostat: Don't print invalid ucode revision Patryk Wlazlyn
2024-01-12 12:48 ` [PATCH 4/4] tools/power turbostat: Add reading aperf and mperf via perf API Patryk Wlazlyn
3 siblings, 1 reply; 11+ messages in thread
From: Patryk Wlazlyn @ 2024-01-12 12:48 UTC (permalink / raw)
To: len.brown; +Cc: linux-pm
Add the --no-perf option to allow users to run turbostat without
accessing perf.
Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Reviewed-by: Len Brown <len.brown@intel.com>
---
tools/power/x86/turbostat/turbostat.8 | 2 ++
tools/power/x86/turbostat/turbostat.c | 26 +++++++++++++++++++++++---
2 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 5575c947134d..8d3d9cac27e0 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -69,6 +69,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
.PP
+\fB--no-msr\fP Disable all the uses of the MSR driver.
+.PP
++\fB--no-perf\fP Disable all the uses of the perf API.
++.PP
\fB--interval seconds\fP overrides the default 5.0 second measurement interval.
.PP
\fB--num_iterations num\fP number of the measurement iterations.
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index f192d75d5977..ba10a10c5144 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -265,6 +265,7 @@ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */
unsigned int first_counter_read = 1;
int ignore_stdin;
bool no_msr;
+bool no_perf;
int get_msr(int cpu, off_t offset, unsigned long long *msr);
@@ -1321,8 +1322,17 @@ static void bic_disable_msr_access(void)
bic_enabled &= ~bic_msrs;
}
+static void bic_disable_perf_access(void)
+{
+ const unsigned long bic_perf = BIC_IPC;
+
+ bic_enabled &= ~bic_perf;
+}
+
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
{
+ assert(!no_perf);
+
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
@@ -1339,8 +1349,9 @@ static int perf_instr_count_open(int cpu_num)
/* counter for cpu_num, including user + kernel and all processes */
fd = perf_event_open(&pea, -1, cpu_num, -1, 0);
if (fd == -1) {
- warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"", progname);
- BIC_NOT_PRESENT(BIC_IPC);
+ warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\""
+ " or use --no-perf", progname);
+ bic_disable_perf_access();
}
return fd;
@@ -1406,6 +1417,7 @@ void help(void)
" -J, --Joules displays energy in Joules instead of Watts\n"
" -l, --list list column headers only\n"
" -M, --no-msr Disable all uses of the MSR driver\n"
+ " -P, --no-perf Disable all uses of the perf API\n"
" -n, --num_iterations num\n"
" number of the measurement iterations\n"
" -N, --header_iterations num\n"
@@ -6676,6 +6688,7 @@ void cmdline(int argc, char **argv)
{ "out", required_argument, 0, 'o' },
{ "quiet", no_argument, 0, 'q' },
{ "no-msr", no_argument, 0, 'M' },
+ { "no-perf", no_argument, 0, 'P' },
{ "show", required_argument, 0, 's' },
{ "Summary", no_argument, 0, 'S' },
{ "TCC", required_argument, 0, 'T' },
@@ -6689,11 +6702,14 @@ void cmdline(int argc, char **argv)
* Parse some options early, because they may make other options invalid,
* like adding the MSR counter with --add and at the same time using --no-msr.
*/
- while ((opt = getopt_long_only(argc, argv, "M", long_options, &option_index)) != -1) {
+ while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) {
switch (opt) {
case 'M':
no_msr = 1;
break;
+ case 'P':
+ no_perf = 1;
+ break;
default:
break;
}
@@ -6759,6 +6775,7 @@ void cmdline(int argc, char **argv)
quiet = 1;
break;
case 'M':
+ case 'P':
/* Parsed earlier */
break;
case 'n':
@@ -6824,6 +6841,9 @@ int main(int argc, char **argv)
if (no_msr)
bic_disable_msr_access();
+ if (no_perf)
+ bic_disable_perf_access();
+
if (!quiet) {
print_version();
print_bootcmd();
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH 2/4] tools/power turbostat: Add --no-perf option
2024-01-12 12:48 ` [PATCH 2/4] tools/power turbostat: Add --no-perf option Patryk Wlazlyn
@ 2024-01-13 1:03 ` Len Brown
0 siblings, 0 replies; 11+ messages in thread
From: Len Brown @ 2024-01-13 1:03 UTC (permalink / raw)
To: Patryk Wlazlyn; +Cc: len.brown, linux-pm
Looks good,
but it depends on 1/4, which you are about to change, so I'll wait for
the refresh.
thanks,
-Len
On Fri, Jan 12, 2024 at 6:49 AM Patryk Wlazlyn
<patryk.wlazlyn@linux.intel.com> wrote:
>
> Add the --no-perf option to allow users to run turbostat without
> accessing perf.
>
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
> Reviewed-by: Len Brown <len.brown@intel.com>
> ---
> tools/power/x86/turbostat/turbostat.8 | 2 ++
> tools/power/x86/turbostat/turbostat.c | 26 +++++++++++++++++++++++---
> 2 files changed, 25 insertions(+), 3 deletions(-)
>
> diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
> index 5575c947134d..8d3d9cac27e0 100644
> --- a/tools/power/x86/turbostat/turbostat.8
> +++ b/tools/power/x86/turbostat/turbostat.8
> @@ -69,6 +69,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
> .PP
> +\fB--no-msr\fP Disable all the uses of the MSR driver.
> +.PP
> ++\fB--no-perf\fP Disable all the uses of the perf API.
> ++.PP
> \fB--interval seconds\fP overrides the default 5.0 second measurement interval.
> .PP
> \fB--num_iterations num\fP number of the measurement iterations.
> diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
> index f192d75d5977..ba10a10c5144 100644
> --- a/tools/power/x86/turbostat/turbostat.c
> +++ b/tools/power/x86/turbostat/turbostat.c
> @@ -265,6 +265,7 @@ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */
> unsigned int first_counter_read = 1;
> int ignore_stdin;
> bool no_msr;
> +bool no_perf;
>
> int get_msr(int cpu, off_t offset, unsigned long long *msr);
>
> @@ -1321,8 +1322,17 @@ static void bic_disable_msr_access(void)
> bic_enabled &= ~bic_msrs;
> }
>
> +static void bic_disable_perf_access(void)
> +{
> + const unsigned long bic_perf = BIC_IPC;
> +
> + bic_enabled &= ~bic_perf;
> +}
> +
> static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
> {
> + assert(!no_perf);
> +
> return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
> }
>
> @@ -1339,8 +1349,9 @@ static int perf_instr_count_open(int cpu_num)
> /* counter for cpu_num, including user + kernel and all processes */
> fd = perf_event_open(&pea, -1, cpu_num, -1, 0);
> if (fd == -1) {
> - warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"", progname);
> - BIC_NOT_PRESENT(BIC_IPC);
> + warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\""
> + " or use --no-perf", progname);
> + bic_disable_perf_access();
> }
>
> return fd;
> @@ -1406,6 +1417,7 @@ void help(void)
> " -J, --Joules displays energy in Joules instead of Watts\n"
> " -l, --list list column headers only\n"
> " -M, --no-msr Disable all uses of the MSR driver\n"
> + " -P, --no-perf Disable all uses of the perf API\n"
> " -n, --num_iterations num\n"
> " number of the measurement iterations\n"
> " -N, --header_iterations num\n"
> @@ -6676,6 +6688,7 @@ void cmdline(int argc, char **argv)
> { "out", required_argument, 0, 'o' },
> { "quiet", no_argument, 0, 'q' },
> { "no-msr", no_argument, 0, 'M' },
> + { "no-perf", no_argument, 0, 'P' },
> { "show", required_argument, 0, 's' },
> { "Summary", no_argument, 0, 'S' },
> { "TCC", required_argument, 0, 'T' },
> @@ -6689,11 +6702,14 @@ void cmdline(int argc, char **argv)
> * Parse some options early, because they may make other options invalid,
> * like adding the MSR counter with --add and at the same time using --no-msr.
> */
> - while ((opt = getopt_long_only(argc, argv, "M", long_options, &option_index)) != -1) {
> + while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) {
> switch (opt) {
> case 'M':
> no_msr = 1;
> break;
> + case 'P':
> + no_perf = 1;
> + break;
> default:
> break;
> }
> @@ -6759,6 +6775,7 @@ void cmdline(int argc, char **argv)
> quiet = 1;
> break;
> case 'M':
> + case 'P':
> /* Parsed earlier */
> break;
> case 'n':
> @@ -6824,6 +6841,9 @@ int main(int argc, char **argv)
> if (no_msr)
> bic_disable_msr_access();
>
> + if (no_perf)
> + bic_disable_perf_access();
> +
> if (!quiet) {
> print_version();
> print_bootcmd();
> --
> 2.43.0
>
>
--
Len Brown, Intel
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 3/4] tools/power turbostat: Don't print invalid ucode revision
2024-01-12 12:48 [PATCH 0/4] turbostat msr, perf controls and aperf/mperf via perf Patryk Wlazlyn
2024-01-12 12:48 ` [PATCH 1/4] tools/power turbostat: Add --no-msr option Patryk Wlazlyn
2024-01-12 12:48 ` [PATCH 2/4] tools/power turbostat: Add --no-perf option Patryk Wlazlyn
@ 2024-01-12 12:48 ` Patryk Wlazlyn
2024-01-13 1:15 ` Len Brown
2024-01-12 12:48 ` [PATCH 4/4] tools/power turbostat: Add reading aperf and mperf via perf API Patryk Wlazlyn
3 siblings, 1 reply; 11+ messages in thread
From: Patryk Wlazlyn @ 2024-01-12 12:48 UTC (permalink / raw)
To: len.brown; +Cc: linux-pm
Earlier we printed "microcode 0x0" if we failed to obtain it via MSR.
Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Reviewed-by: Len Brown <len.brown@intel.com>
---
tools/power/x86/turbostat/turbostat.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index ba10a10c5144..bf733e7d73b5 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -5710,6 +5710,7 @@ void process_cpuid()
unsigned int eax, ebx, ecx, edx;
unsigned int fms, family, model, stepping, ecx_flags, edx_flags;
unsigned long long ucode_patch = 0;
+ bool ucode_patch_valid = false;
eax = ebx = ecx = edx = 0;
@@ -5740,6 +5741,8 @@ void process_cpuid()
if (!no_msr) {
if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
warnx("get_msr(UCODE)");
+ else
+ ucode_patch_valid = true;
}
/*
@@ -5751,9 +5754,12 @@ void process_cpuid()
__cpuid(0x80000000, max_extended_level, ebx, ecx, edx);
if (!quiet) {
- fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n",
- family, model, stepping, family, model, stepping,
- (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
+ fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)",
+ family, model, stepping, family, model, stepping);
+ if (ucode_patch_valid)
+ fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
+ fputc('\n', outf);
+
fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n",
ecx_flags & (1 << 0) ? "SSE3" : "-",
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH 3/4] tools/power turbostat: Don't print invalid ucode revision
2024-01-12 12:48 ` [PATCH 3/4] tools/power turbostat: Don't print invalid ucode revision Patryk Wlazlyn
@ 2024-01-13 1:15 ` Len Brown
0 siblings, 0 replies; 11+ messages in thread
From: Len Brown @ 2024-01-13 1:15 UTC (permalink / raw)
To: Patryk Wlazlyn; +Cc: len.brown, linux-pm
Applied.
This patch can be found on the latest development turbostat branch, here:
git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git turbostat
thanks!
-Len
On Fri, Jan 12, 2024 at 6:49 AM Patryk Wlazlyn
<patryk.wlazlyn@linux.intel.com> wrote:
>
> Earlier we printed "microcode 0x0" if we failed to obtain it via MSR.
>
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
> Reviewed-by: Len Brown <len.brown@intel.com>
> ---
> tools/power/x86/turbostat/turbostat.c | 12 +++++++++---
> 1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
> index ba10a10c5144..bf733e7d73b5 100644
> --- a/tools/power/x86/turbostat/turbostat.c
> +++ b/tools/power/x86/turbostat/turbostat.c
> @@ -5710,6 +5710,7 @@ void process_cpuid()
> unsigned int eax, ebx, ecx, edx;
> unsigned int fms, family, model, stepping, ecx_flags, edx_flags;
> unsigned long long ucode_patch = 0;
> + bool ucode_patch_valid = false;
>
> eax = ebx = ecx = edx = 0;
>
> @@ -5740,6 +5741,8 @@ void process_cpuid()
> if (!no_msr) {
> if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
> warnx("get_msr(UCODE)");
> + else
> + ucode_patch_valid = true;
> }
>
> /*
> @@ -5751,9 +5754,12 @@ void process_cpuid()
> __cpuid(0x80000000, max_extended_level, ebx, ecx, edx);
>
> if (!quiet) {
> - fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n",
> - family, model, stepping, family, model, stepping,
> - (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
> + fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)",
> + family, model, stepping, family, model, stepping);
> + if (ucode_patch_valid)
> + fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
> + fputc('\n', outf);
> +
> fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
> fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n",
> ecx_flags & (1 << 0) ? "SSE3" : "-",
> --
> 2.43.0
>
>
--
Len Brown, Intel
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 4/4] tools/power turbostat: Add reading aperf and mperf via perf API
2024-01-12 12:48 [PATCH 0/4] turbostat msr, perf controls and aperf/mperf via perf Patryk Wlazlyn
` (2 preceding siblings ...)
2024-01-12 12:48 ` [PATCH 3/4] tools/power turbostat: Don't print invalid ucode revision Patryk Wlazlyn
@ 2024-01-12 12:48 ` Patryk Wlazlyn
2024-01-13 1:42 ` Len Brown
3 siblings, 1 reply; 11+ messages in thread
From: Patryk Wlazlyn @ 2024-01-12 12:48 UTC (permalink / raw)
To: len.brown; +Cc: linux-pm
Reading the counters via perf API is *usually* faster than going through
the msr driver, mainly because we do less syscalls, which also helps
with narrowing the gap between the reads. Getting cache misses on the
perf path does cost more and this is where the "usually faster" comes
from.
We would fallback to the msr reads if the sysfs isn't there or when in
--no-perf mode.
Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
---
tools/power/x86/turbostat/turbostat.c | 345 +++++++++++++++++++++-----
1 file changed, 277 insertions(+), 68 deletions(-)
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index bf733e7d73b5..d85e38cbadcb 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -57,6 +57,7 @@
enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC };
enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT };
+enum xperf_source { XPERF_SOURCE_PERF, XPERF_SOURCE_MSR };
struct msr_counter {
unsigned int msr_num;
@@ -209,6 +210,7 @@ char *proc_stat = "/proc/stat";
FILE *outf;
int *fd_percpu;
int *fd_instr_count_percpu;
+int *fd_xperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. */
struct timeval interval_tv = { 5, 0 };
struct timespec interval_ts = { 5, 0 };
@@ -266,6 +268,7 @@ unsigned int first_counter_read = 1;
int ignore_stdin;
bool no_msr;
bool no_perf;
+enum xperf_source xperf_source;
int get_msr(int cpu, off_t offset, unsigned long long *msr);
@@ -1336,18 +1339,27 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
-static int perf_instr_count_open(int cpu_num)
+static long open_perf_counter(
+ int cpu,
+ unsigned type,
+ unsigned config,
+ int group_fd,
+ __u64 read_format)
{
- struct perf_event_attr pea;
- int fd;
+ struct perf_event_attr attr;
+ const pid_t pid = -1;
+ const unsigned long flags = 0;
+
+ memset(&attr, 0, sizeof(struct perf_event_attr));
- memset(&pea, 0, sizeof(struct perf_event_attr));
- pea.type = PERF_TYPE_HARDWARE;
- pea.size = sizeof(struct perf_event_attr);
- pea.config = PERF_COUNT_HW_INSTRUCTIONS;
+ attr.type = type;
+ attr.size = sizeof(struct perf_event_attr);
+ attr.config = config;
+ attr.disabled = 0;
+ attr.sample_type = PERF_SAMPLE_IDENTIFIER;
+ attr.read_format = read_format;
- /* counter for cpu_num, including user + kernel and all processes */
- fd = perf_event_open(&pea, -1, cpu_num, -1, 0);
+ const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags);
if (fd == -1) {
warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\""
" or use --no-perf", progname);
@@ -1362,7 +1374,7 @@ int get_instr_count_fd(int cpu)
if (fd_instr_count_percpu[cpu])
return fd_instr_count_percpu[cpu];
- fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu);
+ fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
return fd_instr_count_percpu[cpu];
}
@@ -2753,6 +2765,182 @@ int get_core_throt_cnt(int cpu, unsigned long long *cnt)
return 0;
}
+static unsigned read_perf_counter_info(
+ const char * const path,
+ const char * const parse_format)
+{
+ int fdmt;
+ char buf[16];
+ unsigned v;
+
+ fdmt = open(path, O_RDONLY, 0);
+ if (fdmt == -1)
+ errx(1, "Failed to read perf counter info %s\n", path);
+
+ if (read(fdmt, buf, sizeof(buf)) <= 0)
+ return 0;
+
+ buf[sizeof(buf)-1] = '\0';
+
+ if (sscanf(buf, parse_format, &v) != 1)
+ errx(1, "Failed to parse perf counter info %s\n", path);
+
+ close(fdmt);
+
+ return v;
+}
+
+static unsigned read_msr_type(void)
+{
+ const char * const path = "/sys/bus/event_source/devices/msr/type";
+ const char * const format = "%u";
+
+ return read_perf_counter_info(path, format);
+}
+
+static unsigned read_aperf_config(void)
+{
+ const char * const path = "/sys/bus/event_source/devices/msr/events/aperf";
+ const char * const format = "event=%x";
+
+ return read_perf_counter_info(path, format);
+}
+
+static unsigned read_mperf_config(void)
+{
+ const char * const path = "/sys/bus/event_source/devices/msr/events/mperf";
+ const char * const format = "event=%x";
+
+ return read_perf_counter_info(path, format);
+}
+
+static int open_xperf_fd(int cpu)
+{
+ const unsigned msr_type = read_msr_type();
+ const unsigned aperf_config = read_aperf_config();
+ const unsigned mperf_config = read_mperf_config();
+ int fd_aperf = -1, fd_mperf = -1;
+
+ fd_aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP);
+ if (fd_aperf == -1) {
+ perror("open_perf_counter aperf");
+ return 0;
+ }
+
+ fd_mperf = open_perf_counter(cpu, msr_type, mperf_config, fd_aperf, PERF_FORMAT_GROUP);
+ if (fd_mperf == -1) {
+ perror("open_perf_counter mperf");
+ close(fd_aperf);
+ return 0;
+ }
+
+ return fd_aperf;
+}
+
+static int get_xperf_fd(int cpu)
+{
+ assert(fd_xperf_percpu);
+
+ if (fd_xperf_percpu[cpu])
+ return fd_xperf_percpu[cpu];
+
+ fd_xperf_percpu[cpu] = open_xperf_fd(cpu);
+
+ return fd_xperf_percpu[cpu];
+}
+
+/* Read APERF, MPERF and TSC using the perf API. */
+static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu)
+{
+ union {
+ struct {
+ unsigned long nr_entries;
+ unsigned long aperf;
+ unsigned long mperf;
+ };
+
+ unsigned long as_array[3];
+ } cnt;
+
+ const int fd_xperf = get_xperf_fd(cpu);
+
+ /*
+ * Read the TSC with rdtsc, because we want the absolute value and not
+ * the offset from the start of the process.
+ */
+ t->tsc = rdtsc();
+
+ const int n = read(fd_xperf, &cnt.as_array[0], sizeof(cnt.as_array));
+ if (n != sizeof(cnt.as_array))
+ return 1;
+
+ t->aperf = cnt.aperf * aperf_mperf_multiplier;
+ t->mperf = cnt.mperf * aperf_mperf_multiplier;
+
+ return 0;
+}
+
+/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */
+static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu)
+{
+ unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
+ int aperf_mperf_retry_count = 0;
+
+ /*
+ * The TSC, APERF and MPERF must be read together for
+ * APERF/MPERF and MPERF/TSC to give accurate results.
+ *
+ * Unfortunately, APERF and MPERF are read by
+ * individual system call, so delays may occur
+ * between them. If the time to read them
+ * varies by a large amount, we re-read them.
+ */
+
+ /*
+ * This initial dummy APERF read has been seen to
+ * reduce jitter in the subsequent reads.
+ */
+
+ if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
+ return -3;
+
+retry:
+ t->tsc = rdtsc(); /* re-read close to APERF */
+
+ tsc_before = t->tsc;
+
+ if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
+ return -3;
+
+ tsc_between = rdtsc();
+
+ if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
+ return -4;
+
+ tsc_after = rdtsc();
+
+ aperf_time = tsc_between - tsc_before;
+ mperf_time = tsc_after - tsc_between;
+
+ /*
+ * If the system call latency to read APERF and MPERF
+ * differ by more than 2x, then try again.
+ */
+ if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
+ aperf_mperf_retry_count++;
+ if (aperf_mperf_retry_count < 5)
+ goto retry;
+ else
+ warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
+ }
+ aperf_mperf_retry_count = 0;
+
+ t->aperf = t->aperf * aperf_mperf_multiplier;
+ t->mperf = t->mperf * aperf_mperf_multiplier;
+
+ return 0;
+}
+
/*
* get_counters(...)
* migrate to cpu
@@ -2762,7 +2950,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
int cpu = t->cpu_id;
unsigned long long msr;
- int aperf_mperf_retry_count = 0;
struct msr_counter *mp;
int i;
@@ -2775,63 +2962,26 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
if (first_counter_read)
get_apic_id(t);
-retry:
+
t->tsc = rdtsc(); /* we are running on local CPU of interest */
if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
|| soft_c1_residency_display(BIC_Avg_MHz)) {
- unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
+ int status = -1;
- /*
- * The TSC, APERF and MPERF must be read together for
- * APERF/MPERF and MPERF/TSC to give accurate results.
- *
- * Unfortunately, APERF and MPERF are read by
- * individual system call, so delays may occur
- * between them. If the time to read them
- * varies by a large amount, we re-read them.
- */
+ assert(!no_perf || !no_msr);
- /*
- * This initial dummy APERF read has been seen to
- * reduce jitter in the subsequent reads.
- */
-
- if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
- return -3;
-
- t->tsc = rdtsc(); /* re-read close to APERF */
-
- tsc_before = t->tsc;
-
- if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
- return -3;
-
- tsc_between = rdtsc();
-
- if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
- return -4;
-
- tsc_after = rdtsc();
-
- aperf_time = tsc_between - tsc_before;
- mperf_time = tsc_after - tsc_between;
-
- /*
- * If the system call latency to read APERF and MPERF
- * differ by more than 2x, then try again.
- */
- if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
- aperf_mperf_retry_count++;
- if (aperf_mperf_retry_count < 5)
- goto retry;
- else
- warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
+ switch (xperf_source) {
+ case XPERF_SOURCE_PERF:
+ status = read_aperf_mperf_tsc_perf(t, cpu);
+ break;
+ case XPERF_SOURCE_MSR:
+ status = read_aperf_mperf_tsc_msr(t, cpu);
+ break;
}
- aperf_mperf_retry_count = 0;
- t->aperf = t->aperf * aperf_mperf_multiplier;
- t->mperf = t->mperf * aperf_mperf_multiplier;
+ if (status != 0)
+ return status;
}
if (DO_BIC(BIC_IPC))
@@ -5597,17 +5747,50 @@ void print_dev_latency(void)
*/
void linux_perf_init(void)
{
- if (!BIC_IS_ENABLED(BIC_IPC))
- return;
-
if (access("/proc/sys/kernel/perf_event_paranoid", F_OK))
return;
- fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
- if (fd_instr_count_percpu == NULL)
- err(-1, "calloc fd_instr_count_percpu");
+ if (BIC_IS_ENABLED(BIC_IPC)) {
+ fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
+ if (fd_instr_count_percpu == NULL)
+ err(-1, "calloc fd_instr_count_percpu");
- BIC_PRESENT(BIC_IPC);
+ BIC_PRESENT(BIC_IPC);
+ }
+
+ const bool aperf_required = DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) ||
+ DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC);
+ if (aperf_required && xperf_source == XPERF_SOURCE_PERF) {
+ fd_xperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
+ if (fd_xperf_percpu == NULL)
+ err(-1, "calloc fd_xperf_percpu");
+ }
+}
+
+static int has_xperf_access_via_perf(void)
+{
+ if (access("/sys/bus/event_source/devices/msr/type", F_OK))
+ return 0;
+
+ if (access("/sys/bus/event_source/devices/msr/events/aperf", F_OK))
+ return 0;
+
+ if (access("/sys/bus/event_source/devices/msr/events/mperf", F_OK))
+ return 0;
+
+ return 1;
+}
+
+/* Check if we can access APERF and MPERF */
+static int has_xperf_access(void)
+{
+ if (!no_msr)
+ return 1;
+
+ if (!no_perf && has_xperf_access_via_perf())
+ return 1;
+
+ return 0;
}
void probe_cstates(void)
@@ -5795,7 +5978,7 @@ void process_cpuid()
__cpuid(0x6, eax, ebx, ecx, edx);
has_aperf = ecx & (1 << 0);
- if (has_aperf) {
+ if (has_aperf && has_xperf_access()) {
BIC_PRESENT(BIC_Avg_MHz);
BIC_PRESENT(BIC_Busy);
BIC_PRESENT(BIC_Bzy_MHz);
@@ -6264,6 +6447,20 @@ void set_base_cpu(void)
err(-ENODEV, "No valid cpus found");
}
+static void set_xperf_source(void)
+{
+ xperf_source = XPERF_SOURCE_PERF;
+
+ if (no_perf || !has_xperf_access_via_perf())
+ xperf_source = XPERF_SOURCE_MSR;
+
+ if (quiet)
+ return;
+
+ fprintf(outf, "aperf/mperf source: %s\n",
+ xperf_source == XPERF_SOURCE_MSR ? "msr" : "perf");
+}
+
void turbostat_init()
{
setup_all_buffers(true);
@@ -6272,6 +6469,7 @@ void turbostat_init()
check_permissions();
process_cpuid();
probe_pm_features();
+ set_xperf_source();
linux_perf_init();
for_all_cpus(get_cpu_type, ODD_COUNTERS);
@@ -6859,6 +7057,17 @@ int main(int argc, char **argv)
turbostat_init();
+ /*
+ * We can't get TSC tweak in no-msr mode,
+ * so have to disable more BICs, since we can't report them accurately.
+ */
+ if (platform->enable_tsc_tweak && no_msr) {
+ bic_enabled &= ~BIC_Avg_MHz;
+ bic_enabled &= ~BIC_Busy;
+ bic_enabled &= ~BIC_Bzy_MHz;
+ bic_enabled &= ~BIC_IPC;
+ }
+
if (!no_msr)
msr_sum_record();
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH 4/4] tools/power turbostat: Add reading aperf and mperf via perf API
2024-01-12 12:48 ` [PATCH 4/4] tools/power turbostat: Add reading aperf and mperf via perf API Patryk Wlazlyn
@ 2024-01-13 1:42 ` Len Brown
2024-01-15 10:28 ` Patryk Wlazlyn
0 siblings, 1 reply; 11+ messages in thread
From: Len Brown @ 2024-01-13 1:42 UTC (permalink / raw)
To: Patryk Wlazlyn; +Cc: len.brown, linux-pm
When this patch is applied (on top of the --no-msr patch), then with
--no-msr, we should still see Busy%, Avg_MHz, Bzy_MHz, and IPC -- but
we do not.
Also, the reason we want to do this isn't because perf takes fewer
cycles than the MSR driver -- though that is an added bonus in the
scenario when that is true.
The reason we want to do this is because we can read APERF and MPERF
within the same system call, reducing "jitter" between the reads and
thus allowing more accurate frequency calculations under key
conditions -- such as when the machine is very busy and turbostat is
contending for the CPU, as well as when the sample interval is very
short.
thanks,
-Len
On Fri, Jan 12, 2024 at 6:49 AM Patryk Wlazlyn
<patryk.wlazlyn@linux.intel.com> wrote:
>
> Reading the counters via perf API is *usually* faster than going through
> the msr driver, mainly because we do less syscalls, which also helps
> with narrowing the gap between the reads. Getting cache misses on the
> perf path does cost more and this is where the "usually faster" comes
> from.
>
> We would fallback to the msr reads if the sysfs isn't there or when in
> --no-perf mode.
>
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
> ---
> tools/power/x86/turbostat/turbostat.c | 345 +++++++++++++++++++++-----
> 1 file changed, 277 insertions(+), 68 deletions(-)
>
> diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
> index bf733e7d73b5..d85e38cbadcb 100644
> --- a/tools/power/x86/turbostat/turbostat.c
> +++ b/tools/power/x86/turbostat/turbostat.c
> @@ -57,6 +57,7 @@
> enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
> enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC };
> enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT };
> +enum xperf_source { XPERF_SOURCE_PERF, XPERF_SOURCE_MSR };
>
> struct msr_counter {
> unsigned int msr_num;
> @@ -209,6 +210,7 @@ char *proc_stat = "/proc/stat";
> FILE *outf;
> int *fd_percpu;
> int *fd_instr_count_percpu;
> +int *fd_xperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. */
> struct timeval interval_tv = { 5, 0 };
> struct timespec interval_ts = { 5, 0 };
>
> @@ -266,6 +268,7 @@ unsigned int first_counter_read = 1;
> int ignore_stdin;
> bool no_msr;
> bool no_perf;
> +enum xperf_source xperf_source;
>
> int get_msr(int cpu, off_t offset, unsigned long long *msr);
>
> @@ -1336,18 +1339,27 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu
> return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
> }
>
> -static int perf_instr_count_open(int cpu_num)
> +static long open_perf_counter(
> + int cpu,
> + unsigned type,
> + unsigned config,
> + int group_fd,
> + __u64 read_format)
> {
> - struct perf_event_attr pea;
> - int fd;
> + struct perf_event_attr attr;
> + const pid_t pid = -1;
> + const unsigned long flags = 0;
> +
> + memset(&attr, 0, sizeof(struct perf_event_attr));
>
> - memset(&pea, 0, sizeof(struct perf_event_attr));
> - pea.type = PERF_TYPE_HARDWARE;
> - pea.size = sizeof(struct perf_event_attr);
> - pea.config = PERF_COUNT_HW_INSTRUCTIONS;
> + attr.type = type;
> + attr.size = sizeof(struct perf_event_attr);
> + attr.config = config;
> + attr.disabled = 0;
> + attr.sample_type = PERF_SAMPLE_IDENTIFIER;
> + attr.read_format = read_format;
>
> - /* counter for cpu_num, including user + kernel and all processes */
> - fd = perf_event_open(&pea, -1, cpu_num, -1, 0);
> + const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags);
> if (fd == -1) {
> warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\""
> " or use --no-perf", progname);
> @@ -1362,7 +1374,7 @@ int get_instr_count_fd(int cpu)
> if (fd_instr_count_percpu[cpu])
> return fd_instr_count_percpu[cpu];
>
> - fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu);
> + fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
>
> return fd_instr_count_percpu[cpu];
> }
> @@ -2753,6 +2765,182 @@ int get_core_throt_cnt(int cpu, unsigned long long *cnt)
> return 0;
> }
>
> +static unsigned read_perf_counter_info(
> + const char * const path,
> + const char * const parse_format)
> +{
> + int fdmt;
> + char buf[16];
> + unsigned v;
> +
> + fdmt = open(path, O_RDONLY, 0);
> + if (fdmt == -1)
> + errx(1, "Failed to read perf counter info %s\n", path);
> +
> + if (read(fdmt, buf, sizeof(buf)) <= 0)
> + return 0;
> +
> + buf[sizeof(buf)-1] = '\0';
> +
> + if (sscanf(buf, parse_format, &v) != 1)
> + errx(1, "Failed to parse perf counter info %s\n", path);
> +
> + close(fdmt);
> +
> + return v;
> +}
> +
> +static unsigned read_msr_type(void)
> +{
> + const char * const path = "/sys/bus/event_source/devices/msr/type";
> + const char * const format = "%u";
> +
> + return read_perf_counter_info(path, format);
> +}
> +
> +static unsigned read_aperf_config(void)
> +{
> + const char * const path = "/sys/bus/event_source/devices/msr/events/aperf";
> + const char * const format = "event=%x";
> +
> + return read_perf_counter_info(path, format);
> +}
> +
> +static unsigned read_mperf_config(void)
> +{
> + const char * const path = "/sys/bus/event_source/devices/msr/events/mperf";
> + const char * const format = "event=%x";
> +
> + return read_perf_counter_info(path, format);
> +}
> +
> +static int open_xperf_fd(int cpu)
> +{
> + const unsigned msr_type = read_msr_type();
> + const unsigned aperf_config = read_aperf_config();
> + const unsigned mperf_config = read_mperf_config();
> + int fd_aperf = -1, fd_mperf = -1;
> +
> + fd_aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP);
> + if (fd_aperf == -1) {
> + perror("open_perf_counter aperf");
> + return 0;
> + }
> +
> + fd_mperf = open_perf_counter(cpu, msr_type, mperf_config, fd_aperf, PERF_FORMAT_GROUP);
> + if (fd_mperf == -1) {
> + perror("open_perf_counter mperf");
> + close(fd_aperf);
> + return 0;
> + }
> +
> + return fd_aperf;
> +}
> +
> +static int get_xperf_fd(int cpu)
> +{
> + assert(fd_xperf_percpu);
> +
> + if (fd_xperf_percpu[cpu])
> + return fd_xperf_percpu[cpu];
> +
> + fd_xperf_percpu[cpu] = open_xperf_fd(cpu);
> +
> + return fd_xperf_percpu[cpu];
> +}
> +
> +/* Read APERF, MPERF and TSC using the perf API. */
> +static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu)
> +{
> + union {
> + struct {
> + unsigned long nr_entries;
> + unsigned long aperf;
> + unsigned long mperf;
> + };
> +
> + unsigned long as_array[3];
> + } cnt;
> +
> + const int fd_xperf = get_xperf_fd(cpu);
> +
> + /*
> + * Read the TSC with rdtsc, because we want the absolute value and not
> + * the offset from the start of the process.
> + */
> + t->tsc = rdtsc();
> +
> + const int n = read(fd_xperf, &cnt.as_array[0], sizeof(cnt.as_array));
> + if (n != sizeof(cnt.as_array))
> + return 1;
> +
> + t->aperf = cnt.aperf * aperf_mperf_multiplier;
> + t->mperf = cnt.mperf * aperf_mperf_multiplier;
> +
> + return 0;
> +}
> +
> +/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */
> +static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu)
> +{
> + unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
> + int aperf_mperf_retry_count = 0;
> +
> + /*
> + * The TSC, APERF and MPERF must be read together for
> + * APERF/MPERF and MPERF/TSC to give accurate results.
> + *
> + * Unfortunately, APERF and MPERF are read by
> + * individual system call, so delays may occur
> + * between them. If the time to read them
> + * varies by a large amount, we re-read them.
> + */
> +
> + /*
> + * This initial dummy APERF read has been seen to
> + * reduce jitter in the subsequent reads.
> + */
> +
> + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
> + return -3;
> +
> +retry:
> + t->tsc = rdtsc(); /* re-read close to APERF */
> +
> + tsc_before = t->tsc;
> +
> + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
> + return -3;
> +
> + tsc_between = rdtsc();
> +
> + if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
> + return -4;
> +
> + tsc_after = rdtsc();
> +
> + aperf_time = tsc_between - tsc_before;
> + mperf_time = tsc_after - tsc_between;
> +
> + /*
> + * If the system call latency to read APERF and MPERF
> + * differ by more than 2x, then try again.
> + */
> + if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
> + aperf_mperf_retry_count++;
> + if (aperf_mperf_retry_count < 5)
> + goto retry;
> + else
> + warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
> + }
> + aperf_mperf_retry_count = 0;
> +
> + t->aperf = t->aperf * aperf_mperf_multiplier;
> + t->mperf = t->mperf * aperf_mperf_multiplier;
> +
> + return 0;
> +}
> +
> /*
> * get_counters(...)
> * migrate to cpu
> @@ -2762,7 +2950,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
> {
> int cpu = t->cpu_id;
> unsigned long long msr;
> - int aperf_mperf_retry_count = 0;
> struct msr_counter *mp;
> int i;
>
> @@ -2775,63 +2962,26 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
>
> if (first_counter_read)
> get_apic_id(t);
> -retry:
> +
> t->tsc = rdtsc(); /* we are running on local CPU of interest */
>
> if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
> || soft_c1_residency_display(BIC_Avg_MHz)) {
> - unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
> + int status = -1;
>
> - /*
> - * The TSC, APERF and MPERF must be read together for
> - * APERF/MPERF and MPERF/TSC to give accurate results.
> - *
> - * Unfortunately, APERF and MPERF are read by
> - * individual system call, so delays may occur
> - * between them. If the time to read them
> - * varies by a large amount, we re-read them.
> - */
> + assert(!no_perf || !no_msr);
>
> - /*
> - * This initial dummy APERF read has been seen to
> - * reduce jitter in the subsequent reads.
> - */
> -
> - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
> - return -3;
> -
> - t->tsc = rdtsc(); /* re-read close to APERF */
> -
> - tsc_before = t->tsc;
> -
> - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
> - return -3;
> -
> - tsc_between = rdtsc();
> -
> - if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
> - return -4;
> -
> - tsc_after = rdtsc();
> -
> - aperf_time = tsc_between - tsc_before;
> - mperf_time = tsc_after - tsc_between;
> -
> - /*
> - * If the system call latency to read APERF and MPERF
> - * differ by more than 2x, then try again.
> - */
> - if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
> - aperf_mperf_retry_count++;
> - if (aperf_mperf_retry_count < 5)
> - goto retry;
> - else
> - warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
> + switch (xperf_source) {
> + case XPERF_SOURCE_PERF:
> + status = read_aperf_mperf_tsc_perf(t, cpu);
> + break;
> + case XPERF_SOURCE_MSR:
> + status = read_aperf_mperf_tsc_msr(t, cpu);
> + break;
> }
> - aperf_mperf_retry_count = 0;
>
> - t->aperf = t->aperf * aperf_mperf_multiplier;
> - t->mperf = t->mperf * aperf_mperf_multiplier;
> + if (status != 0)
> + return status;
> }
>
> if (DO_BIC(BIC_IPC))
> @@ -5597,17 +5747,50 @@ void print_dev_latency(void)
> */
> void linux_perf_init(void)
> {
> - if (!BIC_IS_ENABLED(BIC_IPC))
> - return;
> -
> if (access("/proc/sys/kernel/perf_event_paranoid", F_OK))
> return;
>
> - fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
> - if (fd_instr_count_percpu == NULL)
> - err(-1, "calloc fd_instr_count_percpu");
> + if (BIC_IS_ENABLED(BIC_IPC)) {
> + fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
> + if (fd_instr_count_percpu == NULL)
> + err(-1, "calloc fd_instr_count_percpu");
>
> - BIC_PRESENT(BIC_IPC);
> + BIC_PRESENT(BIC_IPC);
> + }
> +
> + const bool aperf_required = DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) ||
> + DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC);
> + if (aperf_required && xperf_source == XPERF_SOURCE_PERF) {
> + fd_xperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
> + if (fd_xperf_percpu == NULL)
> + err(-1, "calloc fd_xperf_percpu");
> + }
> +}
> +
> +static int has_xperf_access_via_perf(void)
> +{
> + if (access("/sys/bus/event_source/devices/msr/type", F_OK))
> + return 0;
> +
> + if (access("/sys/bus/event_source/devices/msr/events/aperf", F_OK))
> + return 0;
> +
> + if (access("/sys/bus/event_source/devices/msr/events/mperf", F_OK))
> + return 0;
> +
> + return 1;
> +}
> +
> +/* Check if we can access APERF and MPERF */
> +static int has_xperf_access(void)
> +{
> + if (!no_msr)
> + return 1;
> +
> + if (!no_perf && has_xperf_access_via_perf())
> + return 1;
> +
> + return 0;
> }
>
> void probe_cstates(void)
> @@ -5795,7 +5978,7 @@ void process_cpuid()
>
> __cpuid(0x6, eax, ebx, ecx, edx);
> has_aperf = ecx & (1 << 0);
> - if (has_aperf) {
> + if (has_aperf && has_xperf_access()) {
> BIC_PRESENT(BIC_Avg_MHz);
> BIC_PRESENT(BIC_Busy);
> BIC_PRESENT(BIC_Bzy_MHz);
> @@ -6264,6 +6447,20 @@ void set_base_cpu(void)
> err(-ENODEV, "No valid cpus found");
> }
>
> +static void set_xperf_source(void)
> +{
> + xperf_source = XPERF_SOURCE_PERF;
> +
> + if (no_perf || !has_xperf_access_via_perf())
> + xperf_source = XPERF_SOURCE_MSR;
> +
> + if (quiet)
> + return;
> +
> + fprintf(outf, "aperf/mperf source: %s\n",
> + xperf_source == XPERF_SOURCE_MSR ? "msr" : "perf");
> +}
> +
> void turbostat_init()
> {
> setup_all_buffers(true);
> @@ -6272,6 +6469,7 @@ void turbostat_init()
> check_permissions();
> process_cpuid();
> probe_pm_features();
> + set_xperf_source();
> linux_perf_init();
>
> for_all_cpus(get_cpu_type, ODD_COUNTERS);
> @@ -6859,6 +7057,17 @@ int main(int argc, char **argv)
>
> turbostat_init();
>
> + /*
> + * We can't get TSC tweak in no-msr mode,
> + * so have to disable more BICs, since we can't report them accurately.
> + */
> + if (platform->enable_tsc_tweak && no_msr) {
> + bic_enabled &= ~BIC_Avg_MHz;
> + bic_enabled &= ~BIC_Busy;
> + bic_enabled &= ~BIC_Bzy_MHz;
> + bic_enabled &= ~BIC_IPC;
> + }
> +
> if (!no_msr)
> msr_sum_record();
>
> --
> 2.43.0
>
>
--
Len Brown, Intel
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH 4/4] tools/power turbostat: Add reading aperf and mperf via perf API
2024-01-13 1:42 ` Len Brown
@ 2024-01-15 10:28 ` Patryk Wlazlyn
0 siblings, 0 replies; 11+ messages in thread
From: Patryk Wlazlyn @ 2024-01-15 10:28 UTC (permalink / raw)
To: Len Brown; +Cc: len.brown, linux-pm
> When this patch is applied (on top of the --no-msr patch), then with
> --no-msr, we should still see Busy%, Avg_Mhz, Bzy_MHz, and IPC -- but
> we do not.
That's true only on some platforms, unfortunately. It's because, to get
the tsc_tweak, we need to read an MSR. That being said, there are
platforms that do not require the tsc_tweak and I got it working, but
after rebasing I must have left too many bits in the
bic_disable_msr_access() and now we are being "too conservative" about it.
Will fix.
^ permalink raw reply [flat|nested] 11+ messages in thread