From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail.linutronix.de (146.0.238.70:993) by crypto-ml.lab.linutronix.de with IMAP4-SSL for ; 08 Feb 2019 10:53:24 -0000 Received: from merlin.infradead.org ([2001:8b0:10b:1231::1]) by Galois.linutronix.de with esmtps (TLS1.2:RSA_AES_256_CBC_SHA256:256) (Exim 4.80) (envelope-from ) id 1gs3md-0007Ue-5y for speck@linutronix.de; Fri, 08 Feb 2019 11:53:23 +0100 Received: from j217100.upc-j.chello.nl ([24.132.217.100] helo=hirez.programming.kicks-ass.net) by merlin.infradead.org with esmtpsa (Exim 4.90_1 #2 (Red Hat Linux)) id 1gs3mb-0002jJ-QJ for speck@linutronix.de; Fri, 08 Feb 2019 10:53:22 +0000 Date: Fri, 8 Feb 2019 11:53:18 +0100 From: Peter Zijlstra Subject: [MODERATED] [RFC][PATCH] performance walnuts Message-ID: <20190208105318.GE32534@hirez.programming.kicks-ass.net> References: <3dd5d6e2bc9ac53f826c251c68ce84fcc79a6872.1549582769.git.ak@linux.intel.com> <20190208090147.GK32477@hirez.programming.kicks-ass.net> <20190208093950.GD32534@hirez.programming.kicks-ass.net> MIME-Version: 1.0 In-Reply-To: <20190208093950.GD32534@hirez.programming.kicks-ass.net> Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 7bit To: speck@linutronix.de List-ID: On Fri, Feb 08, 2019 at 10:39:50AM +0100, Peter Zijlstra wrote: > Ah, I think I found a way to avoid having to rely on this. Let me try. Something like so. Can someone with access to a relevant machine test this? If it works, I'll write a Changelog and this'll be it. --- arch/x86/events/intel/core.c | 127 ++++++++++++++++++++++++++++++------- arch/x86/events/perf_event.h | 6 ++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 6 ++ 4 files changed, 116 insertions(+), 24 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index e0232bdb7aff..8352c2647a1b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1999,6 +1999,39 @@ static void intel_pmu_nhm_enable_all(int added) intel_pmu_enable_all(added); } +static void intel_set_tfa(struct cpu_hw_events *cpuc, bool on) +{ + u64 val = MSR_TFA_RTM_FORCE_ABORT * on; + + if (cpuc->tfa_shadow != val) { + cpuc->tfa_shadow = val; + wrmsrl(MSR_TSX_FORCE_ABORT, val); + } +} + +static void intel_skl_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) +{ + /* + * We're going to use PMC3, make sure TFA is set before we touch it. + */ + if (cntr == 3) + intel_set_tfa(cpuc, true); +} + +static void intel_skl_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * If we find PMC3 is no longer used when we enable the PMU, we can + * clear TFA. + */ + if (!test_bit(3, cpuc->active_mask)) + intel_set_tfa(cpuc, false); + + intel_pmu_enable_all(added); +} + static void enable_counter_freeze(void) { update_debugctlmsr(get_debugctlmsr() | @@ -2768,6 +2801,35 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc) raw_spin_unlock(&excl_cntrs->lock); } +static struct event_constraint * +dyn_constraint(struct cpu_hw_events *cpuc, struct event_constraint *c, int idx) +{ + WARN_ON_ONCE(!cpuc->constraint_list); + + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { + struct event_constraint *cx; + + /* + * grab pre-allocated constraint entry + */ + cx = &cpuc->constraint_list[idx]; + + /* + * initialize dynamic constraint + * with static constraint + */ + *cx = *c; + + /* + * mark constraint as dynamic + */ + cx->flags |= PERF_X86_EVENT_DYNAMIC; + c = cx; + } + + return c; +} + static struct event_constraint * intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, int idx, struct event_constraint *c) @@ -2798,27 +2860,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * only needed when constraint has not yet * been cloned (marked dynamic) */ - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { - struct event_constraint *cx; - - /* - * grab pre-allocated constraint entry - */ - cx = &cpuc->constraint_list[idx]; - - /* - * initialize dynamic constraint - * with static constraint - */ - *cx = *c; - - /* - * mark constraint as dynamic, so we - * can free it later on - */ - cx->flags |= PERF_X86_EVENT_DYNAMIC; - c = cx; - } + c = dyn_constraint(cpuc, c, idx); /* * From here on, the constraint is dynamic. @@ -3345,6 +3387,26 @@ glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return c; } +static bool allow_tsx_force_abort = true; + +static struct event_constraint * +skl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c = hsw_get_event_constraints(cpuc, idx, event); + + /* + * Without TFA we must not use PMC3. + */ + if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) { + c = dyn_constraint(cpuc, c, idx); + c->idxmsk64 &= ~(1ULL << 3); + c->weight = hweight64(c->idxmsk64); + } + + return c; +} + /* * Broadwell: * @@ -3440,13 +3502,15 @@ static int intel_pmu_cpu_prepare(int cpu) goto err; } - if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_WALNUT)) { size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); if (!cpuc->constraint_list) goto err_shared_regs; + } + if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { cpuc->excl_cntrs = allocate_excl_cntrs(cpu); if (!cpuc->excl_cntrs) goto err_constraint_list; @@ -3552,9 +3616,10 @@ static void free_excl_cntrs(int cpu) if (c->core_id == -1 || --c->refcnt == 0) kfree(c); cpuc->excl_cntrs = NULL; - kfree(cpuc->constraint_list); - cpuc->constraint_list = NULL; } + + kfree(cpuc->constraint_list); + cpuc->constraint_list = NULL; } static void intel_pmu_cpu_dying(int cpu) @@ -4061,9 +4126,12 @@ static struct attribute *intel_pmu_caps_attrs[] = { NULL }; +DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort); + static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, NULL, + NULL, }; static __init struct attribute ** @@ -4546,6 +4614,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? @@ -4557,6 +4626,16 @@ __init int intel_pmu_init(void) tsx_attr = hsw_tsx_events_attrs; intel_pmu_pebs_data_source_skl( boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); + + /* If our CPU haz a walnut */ + if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + x86_pmu.flags |= PMU_FL_WALNUT; + x86_pmu.get_event_constraints = skl_get_event_constraints; + x86_pmu.enable_all = intel_skl_pmu_enable_all; + x86_pmu.commit_scheduling = intel_skl_commit_scheduling; + intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr.attr; + } + pr_cont("Skylake events, "); name = "skylake"; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 78d7b7031bfc..44b3426c618e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -242,6 +242,11 @@ struct cpu_hw_events { struct intel_excl_cntrs *excl_cntrs; int excl_thread_id; /* 0 or 1 */ + /* + * SKL TSX_FORCE_ABORT shadow + */ + int tfa_shadow; + /* * AMD specific bits */ @@ -676,6 +681,7 @@ do { \ #define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ #define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ #define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */ +#define PMU_FL_WALNUT 0x20 /* deal with the walnut errata */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6d6122524711..981ff9479648 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -344,6 +344,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8e40c2446fd1..ca5bc0eacb95 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -666,6 +666,12 @@ #define MSR_IA32_TSC_DEADLINE 0x000006E0 + +#define MSR_TSX_FORCE_ABORT 0x0000010F + +#define MSR_TFA_RTM_FORCE_ABORT_BIT 0 +#define MSR_TFA_RTM_FORCE_ABORT BIT_ULL(MSR_TFA_RTM_FORCE_ABORT_BIT) + /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 #define MSR_IA32_MCG_EBX 0x00000181