LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v2] perf vendor events power10: Add metric events json file for power10 platform
From: Michael Ellerman @ 2021-10-25  3:23 UTC (permalink / raw)
  To: Paul A. Clarke, Kajol Jain
  Cc: maddy, rnsastry, linuxppc-dev, linux-kernel, acme,
	linux-perf-users, atrajeev, jolsa
In-Reply-To: <20211022144910.GC104437@li-24c3614c-2adc-11b2-a85c-85f334518bdb.ibm.com>

"Paul A. Clarke" <pc@us.ibm.com> writes:
> Thanks for the changes!
> More nits below (many left over from prior review)...
>
> On Fri, Oct 22, 2021 at 11:55:05AM +0530, Kajol Jain wrote:
>> Add pmu metric json file for power10 platform.
>> 
>> Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
>> ---
>> Changelog v1 -> v2:
>> - Did some nit changes in BriefDescription field
>>   as suggested by Paul A. Clarke
>> 
>> - Link to the v1 patch: https://lkml.org/lkml/2021/10/6/131
>> 
>>  .../arch/powerpc/power10/metrics.json         | 676 ++++++++++++++++++
>>  1 file changed, 676 insertions(+)
>>  create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/metrics.json
>> 
>> diff --git a/tools/perf/pmu-events/arch/powerpc/power10/metrics.json b/tools/perf/pmu-events/arch/powerpc/power10/metrics.json
>> new file mode 100644
>> index 000000000000..8adab5cd9934
>> --- /dev/null
>> +++ b/tools/perf/pmu-events/arch/powerpc/power10/metrics.json
>> @@ -0,0 +1,676 @@
>> +[
>> +    {
>> +        "BriefDescription": "Percentage of cycles that are run cycles",
>> +        "MetricExpr": "PM_RUN_CYC / PM_CYC * 100",
>> +        "MetricGroup": "General",
>> +        "MetricName": "RUN_CYCLES_RATE",
>> +        "ScaleUnit": "1%"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per completed instruction",
>> +        "MetricExpr": "PM_CYC / PM_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "CYCLES_PER_INSTRUCTION"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled for any reason",
>> +        "MetricExpr": "PM_DISP_STALL_CYC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled because there was a flush",
>> +        "MetricExpr": "PM_DISP_STALL_FLUSH / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_FLUSH_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled because the MMU was handling a translation miss",
>> +        "MetricExpr": "PM_DISP_STALL_TRANSLATION / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_TRANSLATION_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled waiting to resolve an instruction ERAT miss",
>> +        "MetricExpr": "PM_DISP_STALL_IERAT_ONLY_MISS / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_IERAT_ONLY_MISS_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled waiting to resolve an instruction TLB miss",
>> +        "MetricExpr": "PM_DISP_STALL_ITLB_MISS / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_ITLB_MISS_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled due to an icache miss",
>> +        "MetricExpr": "PM_DISP_STALL_IC_MISS / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_IC_MISS_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while the instruction was fetched from the local L2",
>> +        "MetricExpr": "PM_DISP_STALL_IC_L2 / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_IC_L2_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while the instruction was fetched from the local L3",
>> +        "MetricExpr": "PM_DISP_STALL_IC_L3 / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_IC_L3_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while the instruction was fetched from any source beyond the local L3",
>> +        "MetricExpr": "PM_DISP_STALL_IC_L3MISS / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_IC_L3MISS_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled due to an icache miss after a branch mispredict",
>> +        "MetricExpr": "PM_DISP_STALL_BR_MPRED_ICMISS / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_BR_MPRED_ICMISS_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while instruction was fetched from the local L2 after suffering a branch mispredict",
>> +        "MetricExpr": "PM_DISP_STALL_BR_MPRED_IC_L2 / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_BR_MPRED_IC_L2_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while instruction was fetched from the local L3 after suffering a branch mispredict",
>> +        "MetricExpr": "PM_DISP_STALL_BR_MPRED_IC_L3 / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_BR_MPRED_IC_L3_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while instruction was fetched from any source beyond the local L3 after suffering a branch mispredict",
>> +        "MetricExpr": "PM_DISP_STALL_BR_MPRED_IC_L3MISS / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_BR_MPRED_IC_L3MISS_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled due to a branch mispredict",
>> +        "MetricExpr": "PM_DISP_STALL_BR_MPRED / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_BR_MPRED_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch for any reason",
>> +        "MetricExpr": "PM_DISP_STALL_HELD_CYC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_HELD_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch because of a synchronizing instruction that requires the ICT to be empty before dispatch",
>> +        "MetricExpr": "PM_DISP_STALL_HELD_SYNC_CYC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISP_HELD_STALL_SYNC_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch while waiting on the scoreboard",
>> +        "MetricExpr": "PM_DISP_STALL_HELD_SCOREBOARD_CYC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISP_HELD_STALL_SCOREBOARD_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch due to issue queue full",
>> +        "MetricExpr": "PM_DISP_STALL_HELD_ISSQ_FULL_CYC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISP_HELD_STALL_ISSQ_FULL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch because the mapper/SRB was full",
>> +        "MetricExpr": "PM_DISP_STALL_HELD_RENAME_CYC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_HELD_RENAME_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch because the STF mapper/SRB was full",
>> +        "MetricExpr": "PM_DISP_STALL_HELD_STF_MAPPER_CYC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_HELD_STF_MAPPER_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch because the XVFC mapper/SRB was full",
>> +        "MetricExpr": "PM_DISP_STALL_HELD_XVFC_MAPPER_CYC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_HELD_XVFC_MAPPER_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch for any other reason",
>> +        "MetricExpr": "PM_DISP_STALL_HELD_OTHER_CYC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_HELD_OTHER_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction has been dispatched but not issued for any reason",
>> +        "MetricExpr": "PM_ISSUE_STALL / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "ISSUE_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting to be finished in one of the execution units",
>> +        "MetricExpr": "PM_EXEC_STALL / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "EXECUTION_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction spent executing an NTC instruction that gets flushed some time after dispatch",
>> +        "MetricExpr": "PM_EXEC_STALL_NTC_FLUSH / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "NTC_FLUSH_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTF instruction finishes at dispatch",
>> +        "MetricExpr": "PM_EXEC_STALL_FIN_AT_DISP / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "FIN_AT_DISP_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is executing in the branch unit",
>> +        "MetricExpr": "PM_EXEC_STALL_BRU / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "BRU_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a simple fixed point instruction that is executing in the LSU",
>> +        "MetricExpr": "PM_EXEC_STALL_SIMPLE_FX / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "SIMPLE_FX_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is executing in the VSU",
>> +        "MetricExpr": "PM_EXEC_STALL_VSU / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "VSU_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting to be finished in one of the execution units",
>> +        "MetricExpr": "PM_EXEC_STALL_TRANSLATION / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "TRANSLATION_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a load or store that suffered a translation miss",
>> +        "MetricExpr": "PM_EXEC_STALL_DERAT_ONLY_MISS / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DERAT_ONLY_MISS_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is recovering from a TLB miss",
>> +        "MetricExpr": "PM_EXEC_STALL_DERAT_DTLB_MISS / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DERAT_DTLB_MISS_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is executing in the LSU",
>> +        "MetricExpr": "PM_EXEC_STALL_LSU / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "LSU_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a load that is executing in the LSU",
>> +        "MetricExpr": "PM_EXEC_STALL_LOAD / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "LOAD_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from either the local L2 or local L3",
>> +        "MetricExpr": "PM_EXEC_STALL_DMISS_L2L3 / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DMISS_L2L3_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from either the local L2 or local L3, with an RC dispatch conflict",
>> +        "MetricExpr": "PM_EXEC_STALL_DMISS_L2L3_CONFLICT / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DMISS_L2L3_CONFLICT_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from either the local L2 or local L3, without an RC dispatch conflict",
>> +        "MetricExpr": "PM_EXEC_STALL_DMISS_L2L3_NOCONFLICT / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DMISS_L2L3_NOCONFLICT_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from a source beyond the local L2 and local L3",
>> +        "MetricExpr": "PM_EXEC_STALL_DMISS_L3MISS / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DMISS_L3MISS_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from a neighbor chiplet's L2 or L3 in the same chip",
>> +        "MetricExpr": "PM_EXEC_STALL_DMISS_L21_L31 / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DMISS_L21_L31_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from L4, local memory or OpenCapp chip",
>
> What is "OpenCapp"?  Is is different from OpenCAPI?
>
>> +        "MetricExpr": "PM_EXEC_STALL_DMISS_LMEM / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DMISS_LMEM_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from a remote chip (cache, L4, memory or OpenCapp) in the same group",
>> +        "MetricExpr": "PM_EXEC_STALL_DMISS_OFF_CHIP / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DMISS_OFF_CHIP_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from a distant chip (cache, L4, memory or OpenCapp chip)",
>> +        "MetricExpr": "PM_EXEC_STALL_DMISS_OFF_NODE / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DMISS_OFF_NODE_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is executing a TLBIEL instruction",
>> +        "MetricExpr": "PM_EXEC_STALL_TLBIEL / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "TLBIEL_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is finishing a load after its data has been reloaded from a data source beyond the local L1, OR when the LSU is processing an L1-hit, OR when the NTF instruction merged with another load in the LMQ",
>> +        "MetricExpr": "PM_EXEC_STALL_LOAD_FINISH / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "LOAD_FINISH_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a store that is executing in the LSU",
>> +        "MetricExpr": "PM_EXEC_STALL_STORE / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "STORE_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is in the store unit outside of handling store misses or other special store operations",
>
> Is "store unit" not the same as "LSU" ?  Use "LSU" uniformly if appropriate:
> s/store unit/LSU/
>
>> +        "MetricExpr": "PM_EXEC_STALL_STORE_PIPE / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "STORE_PIPE_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a store whose cache line was not resident in the L1 and had to wait for allocation of the missing line into the L1",
>> +        "MetricExpr": "PM_EXEC_STALL_STORE_MISS / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "STORE_MISS_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a TLBIE instruction waiting for a response from the L2",
>> +        "MetricExpr": "PM_EXEC_STALL_TLBIE / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "TLBIE_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is executing a PTESYNC instruction",
>> +        "MetricExpr": "PM_EXEC_STALL_PTESYNC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "PTESYNC_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction cannot complete because the thread was blocked",
>> +        "MetricExpr": "PM_CMPL_STALL / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "COMPLETION_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction cannot complete because it was interrupted by ANY exception",
>> +        "MetricExpr": "PM_CMPL_STALL_EXCEPTION / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "EXCEPTION_COMPLETION_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is stuck at finish waiting for the non-speculative finish of either a STCX instruction waiting for its result or a load waiting for non-critical sectors of data and ECC",
>> +        "MetricExpr": "PM_CMPL_STALL_MEM_ECC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "MEM_ECC_COMPLETION_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction cannot complete the instruction is a STCX instruction waiting for resolution from the nest",
>
> Need to reword this, I think.  I propose "Average cycles per instruction
> when the NTC instruction is a STCX instruction waiting for resolution
> from the nest", which follows the form used by HWSYNC_COMPLETION_STALL_CPI,
> below.
>
>> +        "MetricExpr": "PM_CMPL_STALL_STCX / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "STCX_COMPLETION_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a LWSYNC instruction waiting to complete",
>> +        "MetricExpr": "PM_CMPL_STALL_LWSYNC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "LWSYNC_COMPLETION_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a HWSYNC instruction stuck at finish waiting for a response from the L2",
>> +        "MetricExpr": "PM_CMPL_STALL_HWSYNC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "HWSYNC_COMPLETION_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction required special handling before completion",
>> +        "MetricExpr": "PM_CMPL_STALL_SPECIAL / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "SPECIAL_COMPLETION_STALL_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled because fetch was being held, so there was nothing in the pipeline for this thread",
>> +        "MetricExpr": "PM_DISP_STALL_FETCH / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_FETCH_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch because of power management",
>> +        "MetricExpr": "PM_DISP_STALL_HELD_HALT_CYC / PM_RUN_INST_CMPL",
>> +        "MetricGroup": "CPI",
>> +        "MetricName": "DISPATCHED_HELD_HALT_CPI"
>> +    },
>> +    {
>> +        "BriefDescription": "Percentage of flushes per completed run instruction",
>
> s/per completed run instruction/per instruction/

I'm not sure we want to drop "completed" from this and all the following
descriptions.

There is a meaningful distinction between completed and dispatched
instructions, I think it's useful to be explicit about which the event
is counting.

I agree dropping "run" is a good idea, most people won't understand that
"run" means "non-idle", and I think don't expect idle instructions to be
counted anyway.

...
>
>> +        "MetricExpr": "PM_RUN_INST_CMPL / PM_RUN_CYC",
>> +        "MetricGroup": "General",
>> +        "MetricName": "RUN_IPC"
>> +    },
>> +    {
>> +        "BriefDescription": "Average number of instructions completed per instruction group",
>
> s/completed//

And here the meaning is different if you drop "completed".

cheers

^ permalink raw reply

* Re: Stable backport request
From: Greg KH @ 2021-10-24 11:49 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, stable
In-Reply-To: <87zgqzbcx6.fsf@mpe.ellerman.id.au>

On Sun, Oct 24, 2021 at 09:21:09AM +1100, Michael Ellerman wrote:
> Hi Greg,
> 
> Please backport the following commit to v5.4 and v5.10:
> 
>   73287caa9210ded6066833195f4335f7f688a46b
>   ("powerpc64/idle: Fix SP offsets when saving GPRs")
> 
> 
> And please backport the following commits to v5.4, v5.10 and v5.14:
> 
>   9b4416c5095c20e110c82ae602c254099b83b72f
>   ("KVM: PPC: Book3S HV: Fix stack handling in idle_kvm_start_guest()")
> 
>   cdeb5d7d890e14f3b70e8087e745c4a6a7d9f337
>   ("KVM: PPC: Book3S HV: Make idle_kvm_start_guest() return 0 if it went to guest")
> 
>   496c5fe25c377ddb7815c4ce8ecfb676f051e9b6
>   ("powerpc/idle: Don't corrupt back chain when going idle")

All now queued up, thanks!

greg k-h

^ permalink raw reply

* Stable backport request
From: Michael Ellerman @ 2021-10-23 22:21 UTC (permalink / raw)
  To: gregkh; +Cc: linuxppc-dev, stable

Hi Greg,

Please backport the following commit to v5.4 and v5.10:

  73287caa9210ded6066833195f4335f7f688a46b
  ("powerpc64/idle: Fix SP offsets when saving GPRs")


And please backport the following commits to v5.4, v5.10 and v5.14:

  9b4416c5095c20e110c82ae602c254099b83b72f
  ("KVM: PPC: Book3S HV: Fix stack handling in idle_kvm_start_guest()")

  cdeb5d7d890e14f3b70e8087e745c4a6a7d9f337
  ("KVM: PPC: Book3S HV: Make idle_kvm_start_guest() return 0 if it went to guest")

  496c5fe25c377ddb7815c4ce8ecfb676f051e9b6
  ("powerpc/idle: Don't corrupt back chain when going idle")


cheers

^ permalink raw reply

* Re: [PATCH] locking: remove spin_lock_flags() etc
From: Arnd Bergmann @ 2021-10-23 16:04 UTC (permalink / raw)
  To: Waiman Long
  Cc: linux-ia64, Peter Zijlstra, James E.J. Bottomley, Paul Mackerras,
	Alexander Gordeev, Will Deacon, Jonas Bonn, linux-s390,
	Arnd Bergmann, Helge Deller, Christian Borntraeger, Ingo Molnar,
	Heiko Carstens, Vasily Gorbik, Boqun Feng, Stefan Kristiansson,
	Openrisc, Stafford Horne, Parisc List, Linux Kernel Mailing List,
	linuxppc-dev
In-Reply-To: <cc8e3c58-457d-fdf3-6a62-98bde0cefdea@redhat.com>

On Sat, Oct 23, 2021 at 3:37 AM Waiman Long <longman@redhat.com> wrote:
>> On 10/22/21 7:59 AM, Arnd Bergmann wrote:
> > From: Arnd Bergmann <arnd@arndb.de>
> >
> > As this is all dead code, just remove it and the helper functions built
> > around it. For arch/ia64, the inline asm could be cleaned up, but
> > it seems safer to leave it untouched.
> >
> > Signed-off-by: Arnd Bergmann <arnd@arndb.de>
>
> Does that mean we can also remove the GENERIC_LOCKBREAK config option
> from the Kconfig files as well?

 I couldn't figure this out.

What I see is that the only architectures setting GENERIC_LOCKBREAK are
nds32, parisc, powerpc, s390, sh and sparc64, while the only architectures
implementing arch_spin_is_contended() are arm32, csky and ia64.

The part I don't understand is whether the option actually does anything
useful any more after commit d89c70356acf ("locking/core: Remove break_lock
field when CONFIG_GENERIC_LOCKBREAK=y").

      Arnd

^ permalink raw reply

* Re: [PATCH] powerpc: Enhance pmem DMA bypass handling
From: Alexey Kardashevskiy @ 2021-10-23 12:18 UTC (permalink / raw)
  To: Brian King, linuxppc-dev
In-Reply-To: <e4120ddc-8cac-c722-2379-ecc44bd9ef89@linux.vnet.ibm.com>



On 23/10/2021 07:18, Brian King wrote:
> On 10/22/21 7:24 AM, Alexey Kardashevskiy wrote:
>>
>>
>> On 22/10/2021 04:44, Brian King wrote:
>>> If ibm,pmemory is installed in the system, it can appear anywhere
>>> in the address space. This patch enhances how we handle DMA for devices when
>>> ibm,pmemory is present. In the case where we have enough DMA space to
>>> direct map all of RAM, but not ibm,pmemory, we use direct DMA for
>>> I/O to RAM and use the default window to dynamically map ibm,pmemory.
>>> In the case where we only have a single DMA window, this won't work, > so if the window is not big enough to map the entire address range,
>>> we cannot direct map.
>>
>> but we want the pmem range to be mapped into the huge DMA window too if we can, why skip it?
> 
> This patch should simply do what the comment in this commit mentioned below suggests, which says that
> ibm,pmemory can appear anywhere in the address space. If the DMA window is large enough
> to map all of MAX_PHYSMEM_BITS, we will indeed simply do direct DMA for everything,
> including the pmem. If we do not have a big enough window to do that, we will do
> direct DMA for DRAM and dynamic mapping for pmem.


Right, and this is what we do already, do not we? I missing something here.

> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/powerpc/platforms/pseries/iommu.c?id=bf6e2d562bbc4d115cf322b0bca57fe5bbd26f48
> 
> 
> Thanks,
> 
> Brian
> 
> 
>>
>>
>>>
>>> Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
>>> ---
>>>    arch/powerpc/platforms/pseries/iommu.c | 19 ++++++++++---------
>>>    1 file changed, 10 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>>> index 269f61d519c2..d9ae985d10a4 100644
>>> --- a/arch/powerpc/platforms/pseries/iommu.c
>>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>>> @@ -1092,15 +1092,6 @@ static phys_addr_t ddw_memory_hotplug_max(void)
>>>        phys_addr_t max_addr = memory_hotplug_max();
>>>        struct device_node *memory;
>>>    -    /*
>>> -     * The "ibm,pmemory" can appear anywhere in the address space.
>>> -     * Assuming it is still backed by page structs, set the upper limit
>>> -     * for the huge DMA window as MAX_PHYSMEM_BITS.
>>> -     */
>>> -    if (of_find_node_by_type(NULL, "ibm,pmemory"))
>>> -        return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
>>> -            (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS);
>>> -
>>>        for_each_node_by_type(memory, "memory") {
>>>            unsigned long start, size;
>>>            int n_mem_addr_cells, n_mem_size_cells, len;
>>> @@ -1341,6 +1332,16 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>>>         */
>>>        len = max_ram_len;
>>>        if (pmem_present) {
>>> +        if (default_win_removed) {
>>> +            /*
>>> +             * If we only have one DMA window and have pmem present,
>>> +             * then we need to be able to map the entire address
>>> +             * range in order to be able to do direct DMA to RAM.
>>> +             */
>>> +            len = order_base_2((sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
>>> +                    (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS));
>>> +        }
>>> +
>>>            if (query.largest_available_block >=
>>>                (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
>>>                len = MAX_PHYSMEM_BITS;
>>>
>>
> 
> 

-- 
Alexey

^ permalink raw reply

* [PATCH v2 10/10] powerpc/fsl_booke: Enable STRICT_KERNEL_RWX
From: Christophe Leroy @ 2021-10-23 11:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <33e7fe0f6134c58e044eb63d3925cd34aa120104.1634983809.git.christophe.leroy@csgroup.eu>

Enable STRICT_KERNEL_RWX on fsl_booke.

For that, we need additional TLBCAMs dedicated to linear mapping,
based on the alignment of _sinittext.

By default, up to 768 Mbytes of memory are mapped.
It uses 3 TLBCAMs of size 256 Mbytes.

With a data alignment of 16, we need up to 9 TLBCAMs:
  16/16/16/16/64/64/64/256/256

With a data alignment of 4, we need up to 12 TLBCAMs:
  4/4/4/4/16/16/16/64/64/64/256/256

With a data alignment of 1, we need up to 15 TLBCAMs:
  1/1/1/1/4/4/4/16/16/16/64/64/64/256/256

By default, set a 16 Mbytes alignment as a compromise between memory
usage and number of TLBCAMs. This can be adjusted manually when needed.

For the time being, it doens't work when the base is randomised.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: No change
---
 arch/powerpc/Kconfig | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6b9f523882c5..939a47642a9c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -139,6 +139,7 @@ config PPC
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX	if (PPC_BOOK3S || PPC_8xx || 40x) && !HIBERNATION
+	select ARCH_HAS_STRICT_KERNEL_RWX	if FSL_BOOKE && !HIBERNATION && !RANDOMIZE_BASE
 	select ARCH_HAS_STRICT_MODULE_RWX	if ARCH_HAS_STRICT_KERNEL_RWX && !PPC_BOOK3S_32
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE
@@ -778,7 +779,8 @@ config DATA_SHIFT_BOOL
 	bool "Set custom data alignment"
 	depends on ADVANCED_OPTIONS
 	depends on STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE
-	depends on PPC_BOOK3S_32 || (PPC_8xx && !PIN_TLB_DATA && !STRICT_KERNEL_RWX)
+	depends on PPC_BOOK3S_32 || (PPC_8xx && !PIN_TLB_DATA && !STRICT_KERNEL_RWX) || \
+		   FSL_BOOKE
 	help
 	  This option allows you to set the kernel data alignment. When
 	  RAM is mapped by blocks, the alignment needs to fit the size and
@@ -791,11 +793,13 @@ config DATA_SHIFT
 	default 24 if STRICT_KERNEL_RWX && PPC64
 	range 17 28 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32
 	range 19 23 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_8xx
+	range 20 24 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_FSL_BOOKE
 	default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
 	default 18 if (DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32
 	default 23 if STRICT_KERNEL_RWX && PPC_8xx
 	default 23 if (DEBUG_PAGEALLOC || KFENCE) && PPC_8xx && PIN_TLB_DATA
 	default 19 if (DEBUG_PAGEALLOC || KFENCE) && PPC_8xx
+	default 24 if STRICT_KERNEL_RWX && FSL_BOOKE
 	default PPC_PAGE_SHIFT
 	help
 	  On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO.
@@ -1123,7 +1127,10 @@ config LOWMEM_CAM_NUM_BOOL
 config LOWMEM_CAM_NUM
 	depends on FSL_BOOKE
 	int "Number of CAMs to use to map low memory" if LOWMEM_CAM_NUM_BOOL
-	default 3
+	default 3 if !STRICT_KERNEL_RWX
+	default 9 if DATA_SHIFT >= 24
+	default 12 if DATA_SHIFT >= 22
+	default 15
 
 config DYNAMIC_MEMSTART
 	bool "Enable page aligned dynamic load address for kernel"
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 09/10] powerpc/fsl_booke: Update of TLBCAMs after init
From: Christophe Leroy @ 2021-10-23 11:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <33e7fe0f6134c58e044eb63d3925cd34aa120104.1634983809.git.christophe.leroy@csgroup.eu>

After init, set readonly memory as ROX and set readwrite
memory as RWX, if STRICT_KERNEL_RWX is enabled.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: No change
---
 arch/powerpc/mm/mmu_decl.h          |  2 +-
 arch/powerpc/mm/nohash/fsl_book3e.c | 32 +++++++++++++++++++++++++----
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index e13a3c0caa02..0dd4c18f8363 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -168,7 +168,7 @@ static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; }
 static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; }
 #endif
 
-#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx)
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_FSL_BOOK3E)
 void mmu_mark_initmem_nx(void);
 void mmu_mark_rodata_ro(void);
 #else
diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c
index 88132cab3442..b231a54f540c 100644
--- a/arch/powerpc/mm/nohash/fsl_book3e.c
+++ b/arch/powerpc/mm/nohash/fsl_book3e.c
@@ -182,7 +182,7 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 	/* Calculate CAM values */
 	for (i = 0; boundary && i < max_cam_idx; i++) {
 		unsigned long cam_sz;
-		pgprot_t prot = PAGE_KERNEL_X;
+		pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX;
 
 		cam_sz = calc_cam_sz(boundary, virt, phys);
 		if (!dryrun)
@@ -195,7 +195,7 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 	}
 	for (ram -= amount_mapped; ram && i < max_cam_idx; i++) {
 		unsigned long cam_sz;
-		pgprot_t prot = PAGE_KERNEL_X;
+		pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL;
 
 		cam_sz = calc_cam_sz(ram, virt, phys);
 		if (!dryrun)
@@ -210,8 +210,13 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 	if (dryrun)
 		return amount_mapped;
 
-	loadcam_multi(0, i, max_cam_idx);
-	tlbcam_index = i;
+	if (init) {
+		loadcam_multi(0, i, max_cam_idx);
+		tlbcam_index = i;
+	} else {
+		loadcam_multi(0, i, 0);
+		WARN_ON(i > tlbcam_index);
+	}
 
 #ifdef CONFIG_PPC64
 	get_paca()->tcd.esel_next = i;
@@ -280,6 +285,25 @@ void __init adjust_total_lowmem(void)
 	memblock_set_current_limit(memstart_addr + __max_low_memory);
 }
 
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void mmu_mark_rodata_ro(void)
+{
+	/* Everything is done in mmu_mark_initmem_nx() */
+}
+#endif
+
+void mmu_mark_initmem_nx(void)
+{
+	unsigned long remapped;
+
+	if (!strict_kernel_rwx_enabled())
+		return;
+
+	remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false);
+
+	WARN_ON(__max_low_memory != remapped);
+}
+
 void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 				phys_addr_t first_memblock_size)
 {
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 08/10] powerpc/fsl_booke: Allocate separate TLBCAMs for readonly memory
From: Christophe Leroy @ 2021-10-23 11:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <33e7fe0f6134c58e044eb63d3925cd34aa120104.1634983809.git.christophe.leroy@csgroup.eu>

Reorganise TLBCAM allocation so that when STRICT_KERNEL_RWX is
enabled, TLBCAMs are allocated such that readonly memory uses
different TLBCAMs.

This results in an allocation looking like:

Memory CAM mapping: 4/4/4/1/1/1/1/16/16/16/64/64/64/256/256 Mb, residual: 256Mb

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: No change
---
 arch/powerpc/mm/nohash/fsl_book3e.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c
index 8ae1ba7985df..88132cab3442 100644
--- a/arch/powerpc/mm/nohash/fsl_book3e.c
+++ b/arch/powerpc/mm/nohash/fsl_book3e.c
@@ -172,15 +172,34 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 {
 	int i;
 	unsigned long amount_mapped = 0;
+	unsigned long boundary;
+
+	if (strict_kernel_rwx_enabled())
+		boundary = (unsigned long)(_sinittext - _stext);
+	else
+		boundary = ram;
 
 	/* Calculate CAM values */
-	for (i = 0; ram && i < max_cam_idx; i++) {
+	for (i = 0; boundary && i < max_cam_idx; i++) {
+		unsigned long cam_sz;
+		pgprot_t prot = PAGE_KERNEL_X;
+
+		cam_sz = calc_cam_sz(boundary, virt, phys);
+		if (!dryrun)
+			settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0);
+
+		boundary -= cam_sz;
+		amount_mapped += cam_sz;
+		virt += cam_sz;
+		phys += cam_sz;
+	}
+	for (ram -= amount_mapped; ram && i < max_cam_idx; i++) {
 		unsigned long cam_sz;
+		pgprot_t prot = PAGE_KERNEL_X;
 
 		cam_sz = calc_cam_sz(ram, virt, phys);
 		if (!dryrun)
-			settlbcam(i, virt, phys, cam_sz,
-				  pgprot_val(PAGE_KERNEL_X), 0);
+			settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0);
 
 		ram -= cam_sz;
 		amount_mapped += cam_sz;
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 07/10] powerpc/fsl_booke: Tell map_mem_in_cams() if init is done
From: Christophe Leroy @ 2021-10-23 11:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <33e7fe0f6134c58e044eb63d3925cd34aa120104.1634983809.git.christophe.leroy@csgroup.eu>

In order to be able to call map_mem_in_cams() once more
after init for STRICT_KERNEL_RWX, add an argument.

For now, map_mem_in_cams() is always called only during init.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: No change
---
 arch/powerpc/mm/mmu_decl.h           |  2 +-
 arch/powerpc/mm/nohash/fsl_book3e.c  | 12 ++++++------
 arch/powerpc/mm/nohash/kaslr_booke.c |  2 +-
 arch/powerpc/mm/nohash/tlb.c         |  4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index dd1cabc2ea0f..e13a3c0caa02 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -126,7 +126,7 @@ unsigned long mmu_mapin_ram(unsigned long base, unsigned long top);
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
 extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx,
-				     bool dryrun);
+				     bool dryrun, bool init);
 extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
 				 phys_addr_t phys);
 #ifdef CONFIG_PPC32
diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c
index 2668bb06e4fa..8ae1ba7985df 100644
--- a/arch/powerpc/mm/nohash/fsl_book3e.c
+++ b/arch/powerpc/mm/nohash/fsl_book3e.c
@@ -168,7 +168,7 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
 
 static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 					unsigned long ram, int max_cam_idx,
-					bool dryrun)
+					bool dryrun, bool init)
 {
 	int i;
 	unsigned long amount_mapped = 0;
@@ -203,12 +203,12 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 	return amount_mapped;
 }
 
-unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun)
+unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init)
 {
 	unsigned long virt = PAGE_OFFSET;
 	phys_addr_t phys = memstart_addr;
 
-	return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun);
+	return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init);
 }
 
 #ifdef CONFIG_PPC32
@@ -249,7 +249,7 @@ void __init adjust_total_lowmem(void)
 	ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
 
 	i = switch_to_as1();
-	__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false);
+	__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, true);
 	restore_to_as0(i, 0, 0, 1);
 
 	pr_info("Memory CAM mapping: ");
@@ -320,11 +320,11 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
 		/* map a 64M area for the second relocation */
 		if (memstart_addr > start)
 			map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM,
-					false);
+					false, true);
 		else
 			map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
 					0x4000000, CONFIG_LOWMEM_CAM_NUM,
-					false);
+					false, true);
 		restore_to_as0(n, offset, __va(dt_ptr), 1);
 		/* We should never reach here */
 		panic("Relocation error");
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index 4c74e8a5482b..8fc49b1b4a91 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -314,7 +314,7 @@ static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size
 		pr_warn("KASLR: No safe seed for randomizing the kernel base.\n");
 
 	ram = min_t(phys_addr_t, __max_low_memory, size);
-	ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true);
+	ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true, false);
 	linear_sz = min_t(unsigned long, ram, SZ_512M);
 
 	/* If the linear size is smaller than 64M, do not randmize */
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index 5872f69141d5..fc195b9f524b 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -643,7 +643,7 @@ static void early_init_this_mmu(void)
 
 		if (map)
 			linear_map_top = map_mem_in_cams(linear_map_top,
-							 num_cams, false);
+							 num_cams, true, true);
 	}
 #endif
 
@@ -764,7 +764,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 		num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
 
 		linear_sz = map_mem_in_cams(first_memblock_size, num_cams,
-					    true);
+					    false, true);
 
 		ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
 	} else
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 06/10] powerpc/fsl_booke: Enable reloading of TLBCAM without switching to AS1
From: Christophe Leroy @ 2021-10-23 11:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <33e7fe0f6134c58e044eb63d3925cd34aa120104.1634983809.git.christophe.leroy@csgroup.eu>

Avoid switching to AS1 when reloading TLBCAM after init for
STRICT_KERNEL_RWX.

When we setup AS1 we expect the entire accessible memory to be mapped
through one entry, this is not the case anymore at the end of init.

We are not changing the size of TLBCAMs, only flags, so no need to
switch to AS1.

So change loadcam_multi() to not switch to AS1 when the given
temporary tlb entry in 0.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: No change
---
 arch/powerpc/mm/nohash/tlb_low.S | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S
index 5add4a51e51f..dd39074de9af 100644
--- a/arch/powerpc/mm/nohash/tlb_low.S
+++ b/arch/powerpc/mm/nohash/tlb_low.S
@@ -369,7 +369,7 @@ _GLOBAL(_tlbivax_bcast)
  * extern void loadcam_entry(unsigned int index)
  *
  * Load TLBCAM[index] entry in to the L2 CAM MMU
- * Must preserve r7, r8, r9, r10 and r11
+ * Must preserve r7, r8, r9, r10, r11, r12
  */
 _GLOBAL(loadcam_entry)
 	mflr	r5
@@ -401,7 +401,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
  *
  * r3 = first entry to write
  * r4 = number of entries to write
- * r5 = temporary tlb entry
+ * r5 = temporary tlb entry (0 means no switch to AS1)
  */
 _GLOBAL(loadcam_multi)
 	mflr	r8
@@ -409,6 +409,8 @@ _GLOBAL(loadcam_multi)
 	mfmsr	r11
 	andi.	r11,r11,MSR_IS
 	bne	10f
+	mr.	r12, r5
+	beq	10f
 
 	/*
 	 * Set up temporary TLB entry that is the same as what we're
@@ -446,6 +448,8 @@ _GLOBAL(loadcam_multi)
 	/* Don't return to AS=0 if we were in AS=1 at function start */
 	andi.	r11,r11,MSR_IS
 	bne	3f
+	cmpwi	r12, 0
+	beq	3f
 
 	/* Return to AS=0 and clear the temporary entry */
 	mfmsr	r6
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 05/10] powerpc/fsl_booke: Take exec flag into account when setting TLBCAMs
From: Christophe Leroy @ 2021-10-23 11:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <33e7fe0f6134c58e044eb63d3925cd34aa120104.1634983809.git.christophe.leroy@csgroup.eu>

Don't force MAS3_SX and MAS3_UX at all time. Take into account the
exec flag.

While at it, fix a couple of closeby style problems (indent with space
and unnecessary parenthesis), it keeps more readability.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: Use the new _PAGE_EXEC to check executability of flags instead of _PAGE_BAP_SX (Error reported by robot with tqm8541_defconfig)
---
 arch/powerpc/mm/nohash/fsl_book3e.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c
index 03dacbe940e5..2668bb06e4fa 100644
--- a/arch/powerpc/mm/nohash/fsl_book3e.c
+++ b/arch/powerpc/mm/nohash/fsl_book3e.c
@@ -122,15 +122,18 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
 	TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
 	TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
 
-	TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR;
-	TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
+	TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR;
+	TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_SW : 0;
 	if (mmu_has_feature(MMU_FTR_BIG_PHYS))
 		TLBCAM[index].MAS7 = (u64)phys >> 32;
 
 	/* Below is unlikely -- only for large user pages or similar */
 	if (pte_user(__pte(flags))) {
-	   TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
-	   TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
+		TLBCAM[index].MAS3 |= MAS3_UR;
+		TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0;
+		TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_UW : 0;
+	} else {
+		TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_SX : 0;
 	}
 
 	tlbcam_addrs[index].start = virt;
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 04/10] powerpc/fsl_booke: Rename fsl_booke.c to fsl_book3e.c
From: Christophe Leroy @ 2021-10-23 11:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <33e7fe0f6134c58e044eb63d3925cd34aa120104.1634983809.git.christophe.leroy@csgroup.eu>

We have a myriad of CONFIG symbols around different variants
of BOOKEs, which would be worth tidying up one day.

But at least, make file names and CONFIG option match:

We have CONFIG_FSL_BOOKE and CONFIG_PPC_FSL_BOOK3E.

fsl_booke.c is selected by and only by CONFIG_PPC_FSL_BOOK3E.
So rename it fsl_book3e to reduce confusion.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: No change
---
 arch/powerpc/mm/nohash/Makefile                      | 4 ++--
 arch/powerpc/mm/nohash/{fsl_booke.c => fsl_book3e.c} | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename arch/powerpc/mm/nohash/{fsl_booke.c => fsl_book3e.c} (100%)

diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
index 0424f6ce5bd8..b1f630d423d8 100644
--- a/arch/powerpc/mm/nohash/Makefile
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_PPC_BOOK3E_64)  	+= tlb_low_64e.o book3e_pgtable.o
 obj-$(CONFIG_40x)		+= 40x.o
 obj-$(CONFIG_44x)		+= 44x.o
 obj-$(CONFIG_PPC_8xx)		+= 8xx.o
-obj-$(CONFIG_PPC_FSL_BOOK3E)	+= fsl_booke.o
+obj-$(CONFIG_PPC_FSL_BOOK3E)	+= fsl_book3e.o
 obj-$(CONFIG_RANDOMIZE_BASE)	+= kaslr_booke.o
 ifdef CONFIG_HUGETLB_PAGE
 obj-$(CONFIG_PPC_FSL_BOOK3E)	+= book3e_hugetlbpage.o
@@ -16,4 +16,4 @@ endif
 # Disable kcov instrumentation on sensitive code
 # This is necessary for booting with kcov enabled on book3e machines
 KCOV_INSTRUMENT_tlb.o := n
-KCOV_INSTRUMENT_fsl_booke.o := n
+KCOV_INSTRUMENT_fsl_book3e.o := n
diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/fsl_book3e.c
similarity index 100%
rename from arch/powerpc/mm/nohash/fsl_booke.c
rename to arch/powerpc/mm/nohash/fsl_book3e.c
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 03/10] powerpc/booke: Disable STRICT_KERNEL_RWX, DEBUG_PAGEALLOC and KFENCE
From: Christophe Leroy @ 2021-10-23 11:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <33e7fe0f6134c58e044eb63d3925cd34aa120104.1634983809.git.christophe.leroy@csgroup.eu>

fsl_booke and 44x are not able to map kernel linear memory with
pages, so they can't support DEBUG_PAGEALLOC and KFENCE, and
STRICT_KERNEL_RWX is also a problem for now.

Enable those only on book3s (both 32 and 64 except KFENCE), 8xx and 40x.

Fixes: 88df6e90fa97 ("[POWERPC] DEBUG_PAGEALLOC for 32-bit")
Fixes: 95902e6c8864 ("powerpc/mm: Implement STRICT_KERNEL_RWX on PPC32")
Fixes: 90cbac0e995d ("powerpc: Enable KFENCE for PPC32")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: No change
---
 arch/powerpc/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index ba5b66189358..6b9f523882c5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,7 +138,7 @@ config PPC
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
 	select ARCH_HAS_SET_MEMORY
-	select ARCH_HAS_STRICT_KERNEL_RWX	if ((PPC_BOOK3S_64 || PPC32) && !HIBERNATION)
+	select ARCH_HAS_STRICT_KERNEL_RWX	if (PPC_BOOK3S || PPC_8xx || 40x) && !HIBERNATION
 	select ARCH_HAS_STRICT_MODULE_RWX	if ARCH_HAS_STRICT_KERNEL_RWX && !PPC_BOOK3S_32
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE
@@ -150,7 +150,7 @@ config PPC
 	select ARCH_OPTIONAL_KERNEL_RWX		if ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
-	select ARCH_SUPPORTS_DEBUG_PAGEALLOC	if PPC32 || PPC_BOOK3S_64
+	select ARCH_SUPPORTS_DEBUG_PAGEALLOC	if PPC_BOOK3S || PPC_8xx || 40x
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
 	select ARCH_USE_MEMTEST
@@ -190,7 +190,7 @@ config PPC
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
 	select HAVE_ARCH_KASAN			if PPC32 && PPC_PAGE_SHIFT <= 14
 	select HAVE_ARCH_KASAN_VMALLOC		if PPC32 && PPC_PAGE_SHIFT <= 14
-	select HAVE_ARCH_KFENCE			if PPC32
+	select HAVE_ARCH_KFENCE			if PPC_BOOK3S_32 || PPC_8xx || 40x
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if COMPAT
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 02/10] powerpc/book3e: Fix set_memory_x() and set_memory_nx()
From: Christophe Leroy @ 2021-10-23 11:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <33e7fe0f6134c58e044eb63d3925cd34aa120104.1634983809.git.christophe.leroy@csgroup.eu>

set_memory_x() calls pte_mkexec() which sets _PAGE_EXEC.
set_memory_nx() calls pte_exprotec() which clears _PAGE_EXEC.

Book3e has 2 bits, UX and SX, which defines the exec rights
resp. for user (PR=1) and for kernel (PR=0).

_PAGE_EXEC is defined as UX only.

An executable kernel page is set with either _PAGE_KERNEL_RWX
or _PAGE_KERNEL_ROX, which both have SX set and UX cleared.

So set_memory_nx() call for an executable kernel page does
nothing because UX is already cleared.

And set_memory_x() on a non-executable kernel page makes it
executable for the user and keeps it non-executable for kernel.

Also, pte_exec() always returns 'false' on kernel pages, because
it checks _PAGE_EXEC which doesn't include SX, so for instance
the W+X check doesn't work.

To fix this:
- change tlb_low_64e.S to use _PAGE_BAP_UX instead of _PAGE_USER
- sets both UX and SX in _PAGE_EXEC so that pte_user() returns
true whenever one of the two bits is set and pte_exprotect()
clears both bits.
- Define a book3e specific version of pte_mkexec() which sets
either SX or UX based on UR.

Fixes: 1f9ad21c3b38 ("powerpc/mm: Implement set_memory() routines")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: New
---
 arch/powerpc/include/asm/nohash/32/pgtable.h |  2 ++
 arch/powerpc/include/asm/nohash/pte-book3e.h | 18 ++++++++++++++----
 arch/powerpc/mm/nohash/tlb_low_64e.S         |  8 ++++----
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index ac0a5ff48c3a..d6ba821a56ce 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -193,10 +193,12 @@ static inline pte_t pte_wrprotect(pte_t pte)
 }
 #endif
 
+#ifndef pte_mkexec
 static inline pte_t pte_mkexec(pte_t pte)
 {
 	return __pte(pte_val(pte) | _PAGE_EXEC);
 }
+#endif
 
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(pmd_val(pmd) & _PMD_BAD)
diff --git a/arch/powerpc/include/asm/nohash/pte-book3e.h b/arch/powerpc/include/asm/nohash/pte-book3e.h
index 813918f40765..f798640422c2 100644
--- a/arch/powerpc/include/asm/nohash/pte-book3e.h
+++ b/arch/powerpc/include/asm/nohash/pte-book3e.h
@@ -48,7 +48,7 @@
 #define _PAGE_WRITETHRU	0x800000 /* W: cache write-through */
 
 /* "Higher level" linux bit combinations */
-#define _PAGE_EXEC		_PAGE_BAP_UX /* .. and was cache cleaned */
+#define _PAGE_EXEC		(_PAGE_BAP_SX | _PAGE_BAP_UX) /* .. and was cache cleaned */
 #define _PAGE_RW		(_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */
 #define _PAGE_KERNEL_RW		(_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY)
 #define _PAGE_KERNEL_RO		(_PAGE_BAP_SR)
@@ -93,11 +93,11 @@
 /* Permission masks used to generate the __P and __S table */
 #define PAGE_NONE	__pgprot(_PAGE_BASE)
 #define PAGE_SHARED	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW)
-#define PAGE_SHARED_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC)
+#define PAGE_SHARED_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_BAP_UX)
 #define PAGE_COPY	__pgprot(_PAGE_BASE | _PAGE_USER)
-#define PAGE_COPY_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
+#define PAGE_COPY_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_BAP_UX)
 #define PAGE_READONLY	__pgprot(_PAGE_BASE | _PAGE_USER)
-#define PAGE_READONLY_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
+#define PAGE_READONLY_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_BAP_UX)
 
 #ifndef __ASSEMBLY__
 static inline pte_t pte_mkprivileged(pte_t pte)
@@ -113,6 +113,16 @@ static inline pte_t pte_mkuser(pte_t pte)
 }
 
 #define pte_mkuser pte_mkuser
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+	if (pte_val(pte) & _PAGE_BAP_UR)
+		return __pte((pte_val(pte) & ~_PAGE_BAP_SX) | _PAGE_BAP_UX);
+	else
+		return __pte((pte_val(pte) & ~_PAGE_BAP_UX) | _PAGE_BAP_SX);
+}
+#define pte_mkexec pte_mkexec
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S
index bf24451f3e71..9235e720e357 100644
--- a/arch/powerpc/mm/nohash/tlb_low_64e.S
+++ b/arch/powerpc/mm/nohash/tlb_low_64e.S
@@ -222,7 +222,7 @@ tlb_miss_kernel_bolted:
 
 tlb_miss_fault_bolted:
 	/* We need to check if it was an instruction miss */
-	andi.	r10,r11,_PAGE_EXEC|_PAGE_BAP_SX
+	andi.	r10,r11,_PAGE_BAP_UX|_PAGE_BAP_SX
 	bne	itlb_miss_fault_bolted
 dtlb_miss_fault_bolted:
 	tlb_epilog_bolted
@@ -239,7 +239,7 @@ itlb_miss_fault_bolted:
 	srdi	r15,r16,60		/* get region */
 	bne-	itlb_miss_fault_bolted
 
-	li	r11,_PAGE_PRESENT|_PAGE_EXEC	/* Base perm */
+	li	r11,_PAGE_PRESENT|_PAGE_BAP_UX	/* Base perm */
 
 	/* We do the user/kernel test for the PID here along with the RW test
 	 */
@@ -614,7 +614,7 @@ itlb_miss_fault_e6500:
 
 	/* We do the user/kernel test for the PID here along with the RW test
 	 */
-	li	r11,_PAGE_PRESENT|_PAGE_EXEC	/* Base perm */
+	li	r11,_PAGE_PRESENT|_PAGE_BAP_UX	/* Base perm */
 	oris	r11,r11,_PAGE_ACCESSED@h
 
 	cmpldi	cr0,r15,0			/* Check for user region */
@@ -734,7 +734,7 @@ normal_tlb_miss_done:
 
 normal_tlb_miss_access_fault:
 	/* We need to check if it was an instruction miss */
-	andi.	r10,r11,_PAGE_EXEC
+	andi.	r10,r11,_PAGE_BAP_UX
 	bne	1f
 	ld	r14,EX_TLB_DEAR(r12)
 	ld	r15,EX_TLB_ESR(r12)
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 01/10] powerpc/nohash: Fix __ptep_set_access_flags() and ptep_set_wrprotect()
From: Christophe Leroy @ 2021-10-23 11:47 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

Commit 26973fa5ac0e ("powerpc/mm: use pte helpers in generic code")
changed those two functions to use pte helpers to determine which
bits to clear and which bits to set.

This change was based on the assumption that bits to be set/cleared
are always the same and can be determined by applying the pte
manipulation helpers on __pte(0).

But on platforms like book3e, the bits depend on whether the page
is a user page or not.

For the time being it more or less works because of _PAGE_EXEC being
used for user pages only and exec right being set at all time on
kernel page. But following patch will clean that and output of
pte_mkexec() will depend on the page being a user or kernel page.

Instead of trying to make an even more complicated helper where bits
would become dependent on the final pte value, come back to a more
static situation like before commit 26973fa5ac0e ("powerpc/mm: use
pte helpers in generic code"), by introducing an 8xx specific
version of __ptep_set_access_flags() and ptep_set_wrprotect().

Fixes: 26973fa5ac0e ("powerpc/mm: use pte helpers in generic code")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: New
---
 arch/powerpc/include/asm/nohash/32/pgtable.h | 17 +++++++--------
 arch/powerpc/include/asm/nohash/32/pte-8xx.h | 22 ++++++++++++++++++++
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index f06ae00f2a65..ac0a5ff48c3a 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -306,30 +306,29 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 }
 
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
+#ifndef ptep_set_wrprotect
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 				      pte_t *ptep)
 {
-	unsigned long clr = ~pte_val(pte_wrprotect(__pte(~0)));
-	unsigned long set = pte_val(pte_wrprotect(__pte(0)));
-
-	pte_update(mm, addr, ptep, clr, set, 0);
+	pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
 }
+#endif
 
+#ifndef __ptep_set_access_flags
 static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
 					   pte_t *ptep, pte_t entry,
 					   unsigned long address,
 					   int psize)
 {
-	pte_t pte_set = pte_mkyoung(pte_mkdirty(pte_mkwrite(pte_mkexec(__pte(0)))));
-	pte_t pte_clr = pte_mkyoung(pte_mkdirty(pte_mkwrite(pte_mkexec(__pte(~0)))));
-	unsigned long set = pte_val(entry) & pte_val(pte_set);
-	unsigned long clr = ~pte_val(entry) & ~pte_val(pte_clr);
+	unsigned long set = pte_val(entry) &
+			    (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
 	int huge = psize > mmu_virtual_psize ? 1 : 0;
 
-	pte_update(vma->vm_mm, address, ptep, clr, set, huge);
+	pte_update(vma->vm_mm, address, ptep, 0, set, huge);
 
 	flush_tlb_page(vma, address);
 }
+#endif
 
 static inline int pte_young(pte_t pte)
 {
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index fcc48d590d88..1a89ebdc3acc 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -136,6 +136,28 @@ static inline pte_t pte_mkhuge(pte_t pte)
 
 #define pte_mkhuge pte_mkhuge
 
+static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p,
+				     unsigned long clr, unsigned long set, int huge);
+
+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	pte_update(mm, addr, ptep, 0, _PAGE_RO, 0);
+}
+#define ptep_set_wrprotect ptep_set_wrprotect
+
+static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
+					   pte_t entry, unsigned long address, int psize)
+{
+	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_EXEC);
+	unsigned long clr = ~pte_val(entry) & _PAGE_RO;
+	int huge = psize > mmu_virtual_psize ? 1 : 0;
+
+	pte_update(vma->vm_mm, address, ptep, clr, set, huge);
+
+	flush_tlb_page(vma, address);
+}
+#define __ptep_set_access_flags __ptep_set_access_flags
+
 static inline unsigned long pgd_leaf_size(pgd_t pgd)
 {
 	if (pgd_val(pgd) & _PMD_PAGE_8M)
-- 
2.31.1


^ permalink raw reply related

* Re: [PATCH] locking: remove spin_lock_flags() etc
From: Waiman Long @ 2021-10-23  1:37 UTC (permalink / raw)
  To: Arnd Bergmann, Peter Zijlstra, Ingo Molnar, Will Deacon
  Cc: Jonas Bonn, linux-s390, Alexander Gordeev, linux-ia64,
	Vasily Gorbik, Arnd Bergmann, Boqun Feng, linux-kernel,
	Stefan Kristiansson, James E.J. Bottomley, Christian Borntraeger,
	openrisc, Paul Mackerras, linux-parisc, Stafford Horne,
	linuxppc-dev, Helge Deller, Heiko Carstens
In-Reply-To: <20211022120058.1031690-1-arnd@kernel.org>

On 10/22/21 7:59 AM, Arnd Bergmann wrote:
> From: Arnd Bergmann <arnd@arndb.de>
>
> parisc, ia64 and powerpc32 are the only remaining architectures that
> provide custom arch_{spin,read,write}_lock_flags() functions, which are
> meant to re-enable interrupts while waiting for a spinlock.
>
> However, none of these can actually run into this codepath, because
> it is only called on architectures without CONFIG_GENERIC_LOCKBREAK,
> or when CONFIG_DEBUG_LOCK_ALLOC is set without CONFIG_LOCKDEP, and none
> of those combinations are possible on the three architectures.
>
> Going back in the git history, it appears that arch/mn10300 may have
> been able to run into this code path, but there is a good chance that
> it never worked. On the architectures that still exist, it was
> already impossible to hit back in 2008 after the introduction of
> CONFIG_GENERIC_LOCKBREAK, and possibly earlier.
>
> As this is all dead code, just remove it and the helper functions built
> around it. For arch/ia64, the inline asm could be cleaned up, but
> it seems safer to leave it untouched.
>
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Does that mean we can also remove the GENERIC_LOCKBREAK config option 
from the Kconfig files as well?

Cheers,
Longman



^ permalink raw reply

* Re: [PATCH] powerpc: Enhance pmem DMA bypass handling
From: Brian King @ 2021-10-22 20:18 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev
In-Reply-To: <84b82d2b-1263-39bb-d966-b432af530ca8@ozlabs.ru>

On 10/22/21 7:24 AM, Alexey Kardashevskiy wrote:
> 
> 
> On 22/10/2021 04:44, Brian King wrote:
>> If ibm,pmemory is installed in the system, it can appear anywhere
>> in the address space. This patch enhances how we handle DMA for devices when
>> ibm,pmemory is present. In the case where we have enough DMA space to
>> direct map all of RAM, but not ibm,pmemory, we use direct DMA for
>> I/O to RAM and use the default window to dynamically map ibm,pmemory.
>> In the case where we only have a single DMA window, this won't work, > so if the window is not big enough to map the entire address range,
>> we cannot direct map.
> 
> but we want the pmem range to be mapped into the huge DMA window too if we can, why skip it?

This patch should simply do what the comment in this commit mentioned below suggests, which says that
ibm,pmemory can appear anywhere in the address space. If the DMA window is large enough
to map all of MAX_PHYSMEM_BITS, we will indeed simply do direct DMA for everything,
including the pmem. If we do not have a big enough window to do that, we will do
direct DMA for DRAM and dynamic mapping for pmem.


https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/powerpc/platforms/pseries/iommu.c?id=bf6e2d562bbc4d115cf322b0bca57fe5bbd26f48


Thanks,

Brian


> 
> 
>>
>> Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
>> ---
>>   arch/powerpc/platforms/pseries/iommu.c | 19 ++++++++++---------
>>   1 file changed, 10 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index 269f61d519c2..d9ae985d10a4 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -1092,15 +1092,6 @@ static phys_addr_t ddw_memory_hotplug_max(void)
>>       phys_addr_t max_addr = memory_hotplug_max();
>>       struct device_node *memory;
>>   -    /*
>> -     * The "ibm,pmemory" can appear anywhere in the address space.
>> -     * Assuming it is still backed by page structs, set the upper limit
>> -     * for the huge DMA window as MAX_PHYSMEM_BITS.
>> -     */
>> -    if (of_find_node_by_type(NULL, "ibm,pmemory"))
>> -        return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
>> -            (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS);
>> -
>>       for_each_node_by_type(memory, "memory") {
>>           unsigned long start, size;
>>           int n_mem_addr_cells, n_mem_size_cells, len;
>> @@ -1341,6 +1332,16 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>>        */
>>       len = max_ram_len;
>>       if (pmem_present) {
>> +        if (default_win_removed) {
>> +            /*
>> +             * If we only have one DMA window and have pmem present,
>> +             * then we need to be able to map the entire address
>> +             * range in order to be able to do direct DMA to RAM.
>> +             */
>> +            len = order_base_2((sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
>> +                    (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS));
>> +        }
>> +
>>           if (query.largest_available_block >=
>>               (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
>>               len = MAX_PHYSMEM_BITS;
>>
> 


-- 
Brian King
Power Linux I/O
IBM Linux Technology Center


^ permalink raw reply

* Re: [PATCH] soc: fsl: guts: Fix a resource leak in the error handling path of 'fsl_guts_probe()'
From: Tyrel Datwyler @ 2021-10-22 20:16 UTC (permalink / raw)
  To: Li Yang, Christophe JAILLET
  Cc: linuxppc-dev, kernel-janitors, lkml,
	moderated list:ARM/FREESCALE IMX / MXC ARM ARCHITECTURE
In-Reply-To: <CADRPPNTHKuV9eernJS6ZV_+i-xtPXHQnS64GSx=ubwWE+nbLYw@mail.gmail.com>

On 10/21/21 5:26 PM, Li Yang wrote:
> On Wed, Aug 18, 2021 at 4:23 PM Christophe JAILLET
> <christophe.jaillet@wanadoo.fr> wrote:
>>
>> If an error occurs after 'of_find_node_by_path()', the reference taken for
>> 'root' will never be released and some memory will leak.
> 
> Thanks for finding this.  This truly is a problem.
> 
>>
>> Instead of adding an error handling path and modifying all the
>> 'return -SOMETHING' into 'goto errorpath', use 'devm_add_action_or_reset()'
>> to release the reference when needed.
>>
>> Simplify the remove function accordingly.
>>
>> As an extra benefit, the 'root' global variable can now be removed as well.
>>
>> Fixes: 3c0d64e867ed ("soc: fsl: guts: reuse machine name from device tree")
>> Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
>> ---
>> Compile tested only
>> ---
>>  drivers/soc/fsl/guts.c | 16 ++++++++++++++--
>>  1 file changed, 14 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/soc/fsl/guts.c b/drivers/soc/fsl/guts.c
>> index d5e9a5f2c087..4d9476c7b87c 100644
>> --- a/drivers/soc/fsl/guts.c
>> +++ b/drivers/soc/fsl/guts.c
>> @@ -28,7 +28,6 @@ struct fsl_soc_die_attr {
>>  static struct guts *guts;
>>  static struct soc_device_attribute soc_dev_attr;
>>  static struct soc_device *soc_dev;
>> -static struct device_node *root;
>>
>>
>>  /* SoC die attribute definition for QorIQ platform */
>> @@ -136,14 +135,23 @@ static u32 fsl_guts_get_svr(void)
>>         return svr;
>>  }
>>
>> +static void fsl_guts_put_root(void *data)
>> +{
>> +       struct device_node *root = data;
>> +
>> +       of_node_put(root);
>> +}
>> +
>>  static int fsl_guts_probe(struct platform_device *pdev)
>>  {
>>         struct device_node *np = pdev->dev.of_node;
>>         struct device *dev = &pdev->dev;
>> +       struct device_node *root;
>>         struct resource *res;
>>         const struct fsl_soc_die_attr *soc_die;
>>         const char *machine;
>>         u32 svr;
>> +       int ret;
>>
>>         /* Initialize guts */
>>         guts = devm_kzalloc(dev, sizeof(*guts), GFP_KERNEL);
>> @@ -159,6 +167,10 @@ static int fsl_guts_probe(struct platform_device *pdev)
>>
>>         /* Register soc device */
>>         root = of_find_node_by_path("/");
>> +       ret = devm_add_action_or_reset(dev, fsl_guts_put_root, root);
>> +       if (ret)
>> +               return ret;
> 
> We probably only need to hold the reference when we do get "machine"
> from the device tree, otherwise we can put it directly.

To be pedantic since you are using the a properties string value of the root
node directly you need to hold the reference of that node for the lifetime of
the device. Realistically, its not like the root node and its model/compatible
properties are going to need to get modified at runtime so you could also argue
that holding the reference is unnecessary.

> 
> Or maybe we just maintain a local copy of string machine which means
> we can release the reference right away?

Looks like that is the original behavior that commit 3c0d64e867ed changed.
Although it looks like that behavior neglected to handle a memory allocation
failure during the string copy. How much memory is really being saved by not
keeping a local copy? Maybe revert 3c0d64e867ed and add the memory allocation check?

-Tyrel

> 
>> +
>>         if (of_property_read_string(root, "model", &machine))
>>                 of_property_read_string_index(root, "compatible", 0, &machine);
>>         if (machine)
>> @@ -197,7 +209,7 @@ static int fsl_guts_probe(struct platform_device *pdev)
>>  static int fsl_guts_remove(struct platform_device *dev)
>>  {
>>         soc_device_unregister(soc_dev);
>> -       of_node_put(root);
>> +
>>         return 0;
>>  }
>>
>> --
>> 2.30.2
>>


^ permalink raw reply

* Re: Fwd: Fwd: X stopped working with 5.14 on iBook
From: Christophe Leroy @ 2021-10-22 18:40 UTC (permalink / raw)
  To: Stan Johnson, Finn Thain
  Cc: Christopher M. Riedl, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <27ad38f3-c1a8-ac5c-8467-f311b5882a00@yahoo.com>

Copied to Christopher M. Riedl <cmr@linux.ibm.com> and linuxppc list.

Le 22/10/2021 à 19:40, Stan Johnson a écrit :
> Hello Christophe and Finn,
> 
> My message to Christopher Riedl bounced:
> 
> <cmr@codefail.de>:
> 554: 5.7.1 <cmr@codefail.de>: Relay access denied
> 
> I'm not sure how to proceed; thanks for any help.

You should always copy the list, as other people might be interested by 
your problem and/or may help you.

Christophe

> 
> -Stan
> 
> -------- Forwarded Message --------
> Subject: Fwd: X stopped working with 5.14 on iBook
> Date: Fri, 22 Oct 2021 11:35:21 -0600
> From: Stan Johnson <userm57@yahoo.com>
> To: Christopher M. Riedl <cmr@codefail.de>
> CC: Finn Thain <fthain@fastmail.com.au>
> 
> Hello Christopher Riedl,
> 
> Please see the message below, in which a git bisect identifies a commit
> which may have stopped X from working on some PowerPC G4 systems
> (specifically the G4 PowerBook and Cube, possibly others).
> 
> I'm not sure how to proceed with further tests. If the identified commit
> could not have caused the problem, then further testing may be needed.
> Please let me know if you need any additional information.
> 
> Hopefully your e-mail filter will allow messages from yahoo.com addresses.
> 
> thanks for your help
> 
> -Stan Johnson
>   userm57@yahoo.com
> 
> -------- Forwarded Message --------
> Subject: Re: X stopped working with 5.14 on iBook
> Date: Fri, 22 Oct 2021 11:25:14 -0600
> From: Stan Johnson <userm57@yahoo.com>
> To: debian-powerpc@lists.debian.org
> CC: Riccardo Mottola <riccardo.mottola@libero.it>
> 
> On 10/14/21 9:21 PM, Stan Johnson wrote:
>> ...
>> Debian's 5.10.0-8 config file works (as expected) with Debian's 5.10.0-8
>> kernel source.
>> ...
>> X works with 5.14 using a tuned config file derived from 5.13 testing.
>> ...
> 
> Update:
> 
> The issue originally reported by Riccardo Mottola was that X wasn't
> working on a PowerBook G4 using Debian's default
> vmlinux-5.14.0-2-powerpc kernel. I was able to confirm that the X
> failure also occurs on a G4 Cube. My G4 Cube has Debian SID,
> sysvinit-core, Xfce and wdm installed. To test whether X works, I
> disabled wdm, then I log in at the text console and run "startx". When X
> fails, the screen goes blank and the backlight stays on; when X works,
> the normal desktop comes up.
> 
> X works in mainline v5.12 built using a config file based on Debian's
> config-5.10.0-8-powerpc.
> 
> X fails in mainline v5.13 built using a config file based on Debian's
> config-5.10.0-8-powerpc.
> 
> With much help and advice from Finn Thain, I was able to run a bisect
> using a config file based on Debian's config-5.10.0-8-powerpc, with
> v5.12 "good" and v5.13 "bad".
> 
> $ git reset --hard
> HEAD is now at 62fb9874f5da Linux 5.13
> $ git bisect start v5.13
> Updating files: 100% (12992/12992), done.
> Previous HEAD position was 62fb9874f5da Linux 5.13
> HEAD is now at 9f4ad9e425a1 Linux 5.12
> $ git bisect bad v5.13
> $ git bisect good v5.12
> Bisecting: 8739 revisions left to test after this (roughly 13 steps)
>> 85f3f17b5db2dd9f8a094a0ddc665555135afd22] Merge branch 'md-fixes' of
> https://git.kernel.org/pub/scm/linux/kernel/git/song/md into block-5.13
> 
> After the bisect, git reports this:
> 
> ----------
> 
> d3ccc9781560af051554017c702631560bdc0811 is the first bad commit
> commit d3ccc9781560af051554017c702631560bdc0811
> Author: Christopher M. Riedl <cmr@codefail.de>
> Date:   Fri Feb 26 19:12:59 2021 -0600
> 
>      powerpc/signal: Use __get_user() to copy sigset_t
> 
>      Usually sigset_t is exactly 8B which is a "trivial" size and does not
>      warrant using __copy_from_user(). Use __get_user() directly in
>      anticipation of future work to remove the trivial size optimizations
>      from __copy_from_user().
> 
>      The ppc32 implementation of get_sigset_t() previously called
>      copy_from_user() which, unlike __copy_from_user(), calls access_ok().
>      Replacing this w/ __get_user() (no access_ok()) is fine here since both
>      callsites in signal_32.c are preceded by an earlier access_ok().
> 
>      Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
>      Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
>      Link: https://lore.kernel.org/r/20210227011259.11992-11-cmr@codefail.de
> 
>   arch/powerpc/kernel/signal.h    | 7 +++++++
>   arch/powerpc/kernel/signal_32.c | 2 +-
>   arch/powerpc/kernel/signal_64.c | 4 ++--
>   3 files changed, 10 insertions(+), 3 deletions(-)
> 

^ permalink raw reply

* Re: [PATCH v2] perf vendor events power10: Add metric events json file for power10 platform
From: Paul A. Clarke @ 2021-10-22 14:49 UTC (permalink / raw)
  To: Kajol Jain
  Cc: maddy, rnsastry, linuxppc-dev, linux-kernel, acme,
	linux-perf-users, atrajeev, jolsa
In-Reply-To: <20211022062505.78767-1-kjain@linux.ibm.com>

Thanks for the changes!
More nits below (many left over from prior review)...

On Fri, Oct 22, 2021 at 11:55:05AM +0530, Kajol Jain wrote:
> Add pmu metric json file for power10 platform.
> 
> Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
> ---
> Changelog v1 -> v2:
> - Did some nit changes in BriefDescription field
>   as suggested by Paul A. Clarke
> 
> - Link to the v1 patch: https://lkml.org/lkml/2021/10/6/131
> 
>  .../arch/powerpc/power10/metrics.json         | 676 ++++++++++++++++++
>  1 file changed, 676 insertions(+)
>  create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/metrics.json
> 
> diff --git a/tools/perf/pmu-events/arch/powerpc/power10/metrics.json b/tools/perf/pmu-events/arch/powerpc/power10/metrics.json
> new file mode 100644
> index 000000000000..8adab5cd9934
> --- /dev/null
> +++ b/tools/perf/pmu-events/arch/powerpc/power10/metrics.json
> @@ -0,0 +1,676 @@
> +[
> +    {
> +        "BriefDescription": "Percentage of cycles that are run cycles",
> +        "MetricExpr": "PM_RUN_CYC / PM_CYC * 100",
> +        "MetricGroup": "General",
> +        "MetricName": "RUN_CYCLES_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per completed instruction",
> +        "MetricExpr": "PM_CYC / PM_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "CYCLES_PER_INSTRUCTION"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled for any reason",
> +        "MetricExpr": "PM_DISP_STALL_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled because there was a flush",
> +        "MetricExpr": "PM_DISP_STALL_FLUSH / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_FLUSH_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled because the MMU was handling a translation miss",
> +        "MetricExpr": "PM_DISP_STALL_TRANSLATION / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_TRANSLATION_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled waiting to resolve an instruction ERAT miss",
> +        "MetricExpr": "PM_DISP_STALL_IERAT_ONLY_MISS / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_IERAT_ONLY_MISS_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled waiting to resolve an instruction TLB miss",
> +        "MetricExpr": "PM_DISP_STALL_ITLB_MISS / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_ITLB_MISS_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled due to an icache miss",
> +        "MetricExpr": "PM_DISP_STALL_IC_MISS / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_IC_MISS_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while the instruction was fetched from the local L2",
> +        "MetricExpr": "PM_DISP_STALL_IC_L2 / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_IC_L2_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while the instruction was fetched from the local L3",
> +        "MetricExpr": "PM_DISP_STALL_IC_L3 / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_IC_L3_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while the instruction was fetched from any source beyond the local L3",
> +        "MetricExpr": "PM_DISP_STALL_IC_L3MISS / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_IC_L3MISS_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled due to an icache miss after a branch mispredict",
> +        "MetricExpr": "PM_DISP_STALL_BR_MPRED_ICMISS / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_BR_MPRED_ICMISS_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while instruction was fetched from the local L2 after suffering a branch mispredict",
> +        "MetricExpr": "PM_DISP_STALL_BR_MPRED_IC_L2 / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_BR_MPRED_IC_L2_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while instruction was fetched from the local L3 after suffering a branch mispredict",
> +        "MetricExpr": "PM_DISP_STALL_BR_MPRED_IC_L3 / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_BR_MPRED_IC_L3_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled while instruction was fetched from any source beyond the local L3 after suffering a branch mispredict",
> +        "MetricExpr": "PM_DISP_STALL_BR_MPRED_IC_L3MISS / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_BR_MPRED_IC_L3MISS_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled due to a branch mispredict",
> +        "MetricExpr": "PM_DISP_STALL_BR_MPRED / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_BR_MPRED_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch for any reason",
> +        "MetricExpr": "PM_DISP_STALL_HELD_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_HELD_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch because of a synchronizing instruction that requires the ICT to be empty before dispatch",
> +        "MetricExpr": "PM_DISP_STALL_HELD_SYNC_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISP_HELD_STALL_SYNC_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch while waiting on the scoreboard",
> +        "MetricExpr": "PM_DISP_STALL_HELD_SCOREBOARD_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISP_HELD_STALL_SCOREBOARD_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch due to issue queue full",
> +        "MetricExpr": "PM_DISP_STALL_HELD_ISSQ_FULL_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISP_HELD_STALL_ISSQ_FULL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch because the mapper/SRB was full",
> +        "MetricExpr": "PM_DISP_STALL_HELD_RENAME_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_HELD_RENAME_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch because the STF mapper/SRB was full",
> +        "MetricExpr": "PM_DISP_STALL_HELD_STF_MAPPER_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_HELD_STF_MAPPER_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch because the XVFC mapper/SRB was full",
> +        "MetricExpr": "PM_DISP_STALL_HELD_XVFC_MAPPER_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_HELD_XVFC_MAPPER_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch for any other reason",
> +        "MetricExpr": "PM_DISP_STALL_HELD_OTHER_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_HELD_OTHER_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction has been dispatched but not issued for any reason",
> +        "MetricExpr": "PM_ISSUE_STALL / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "ISSUE_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting to be finished in one of the execution units",
> +        "MetricExpr": "PM_EXEC_STALL / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "EXECUTION_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction spent executing an NTC instruction that gets flushed some time after dispatch",
> +        "MetricExpr": "PM_EXEC_STALL_NTC_FLUSH / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "NTC_FLUSH_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTF instruction finishes at dispatch",
> +        "MetricExpr": "PM_EXEC_STALL_FIN_AT_DISP / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "FIN_AT_DISP_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is executing in the branch unit",
> +        "MetricExpr": "PM_EXEC_STALL_BRU / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "BRU_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a simple fixed point instruction that is executing in the LSU",
> +        "MetricExpr": "PM_EXEC_STALL_SIMPLE_FX / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "SIMPLE_FX_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is executing in the VSU",
> +        "MetricExpr": "PM_EXEC_STALL_VSU / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "VSU_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting to be finished in one of the execution units",
> +        "MetricExpr": "PM_EXEC_STALL_TRANSLATION / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "TRANSLATION_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a load or store that suffered a translation miss",
> +        "MetricExpr": "PM_EXEC_STALL_DERAT_ONLY_MISS / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DERAT_ONLY_MISS_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is recovering from a TLB miss",
> +        "MetricExpr": "PM_EXEC_STALL_DERAT_DTLB_MISS / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DERAT_DTLB_MISS_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is executing in the LSU",
> +        "MetricExpr": "PM_EXEC_STALL_LSU / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "LSU_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a load that is executing in the LSU",
> +        "MetricExpr": "PM_EXEC_STALL_LOAD / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "LOAD_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from either the local L2 or local L3",
> +        "MetricExpr": "PM_EXEC_STALL_DMISS_L2L3 / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DMISS_L2L3_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from either the local L2 or local L3, with an RC dispatch conflict",
> +        "MetricExpr": "PM_EXEC_STALL_DMISS_L2L3_CONFLICT / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DMISS_L2L3_CONFLICT_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from either the local L2 or local L3, without an RC dispatch conflict",
> +        "MetricExpr": "PM_EXEC_STALL_DMISS_L2L3_NOCONFLICT / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DMISS_L2L3_NOCONFLICT_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from a source beyond the local L2 and local L3",
> +        "MetricExpr": "PM_EXEC_STALL_DMISS_L3MISS / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DMISS_L3MISS_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from a neighbor chiplet's L2 or L3 in the same chip",
> +        "MetricExpr": "PM_EXEC_STALL_DMISS_L21_L31 / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DMISS_L21_L31_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from L4, local memory or OpenCapp chip",

What is "OpenCapp"?  Is is different from OpenCAPI?

> +        "MetricExpr": "PM_EXEC_STALL_DMISS_LMEM / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DMISS_LMEM_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from a remote chip (cache, L4, memory or OpenCapp) in the same group",
> +        "MetricExpr": "PM_EXEC_STALL_DMISS_OFF_CHIP / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DMISS_OFF_CHIP_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is waiting for a load miss to resolve from a distant chip (cache, L4, memory or OpenCapp chip)",
> +        "MetricExpr": "PM_EXEC_STALL_DMISS_OFF_NODE / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DMISS_OFF_NODE_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is executing a TLBIEL instruction",
> +        "MetricExpr": "PM_EXEC_STALL_TLBIEL / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "TLBIEL_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is finishing a load after its data has been reloaded from a data source beyond the local L1, OR when the LSU is processing an L1-hit, OR when the NTF instruction merged with another load in the LMQ",
> +        "MetricExpr": "PM_EXEC_STALL_LOAD_FINISH / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "LOAD_FINISH_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a store that is executing in the LSU",
> +        "MetricExpr": "PM_EXEC_STALL_STORE / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "STORE_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is in the store unit outside of handling store misses or other special store operations",

Is "store unit" not the same as "LSU" ?  Use "LSU" uniformly if appropriate:
s/store unit/LSU/

> +        "MetricExpr": "PM_EXEC_STALL_STORE_PIPE / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "STORE_PIPE_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a store whose cache line was not resident in the L1 and had to wait for allocation of the missing line into the L1",
> +        "MetricExpr": "PM_EXEC_STALL_STORE_MISS / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "STORE_MISS_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a TLBIE instruction waiting for a response from the L2",
> +        "MetricExpr": "PM_EXEC_STALL_TLBIE / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "TLBIE_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is executing a PTESYNC instruction",
> +        "MetricExpr": "PM_EXEC_STALL_PTESYNC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "PTESYNC_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction cannot complete because the thread was blocked",
> +        "MetricExpr": "PM_CMPL_STALL / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "COMPLETION_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction cannot complete because it was interrupted by ANY exception",
> +        "MetricExpr": "PM_CMPL_STALL_EXCEPTION / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "EXCEPTION_COMPLETION_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is stuck at finish waiting for the non-speculative finish of either a STCX instruction waiting for its result or a load waiting for non-critical sectors of data and ECC",
> +        "MetricExpr": "PM_CMPL_STALL_MEM_ECC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "MEM_ECC_COMPLETION_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction cannot complete the instruction is a STCX instruction waiting for resolution from the nest",

Need to reword this, I think.  I propose "Average cycles per instruction
when the NTC instruction is a STCX instruction waiting for resolution
from the nest", which follows the form used by HWSYNC_COMPLETION_STALL_CPI,
below.

> +        "MetricExpr": "PM_CMPL_STALL_STCX / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "STCX_COMPLETION_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a LWSYNC instruction waiting to complete",
> +        "MetricExpr": "PM_CMPL_STALL_LWSYNC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "LWSYNC_COMPLETION_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction is a HWSYNC instruction stuck at finish waiting for a response from the L2",
> +        "MetricExpr": "PM_CMPL_STALL_HWSYNC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "HWSYNC_COMPLETION_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction required special handling before completion",
> +        "MetricExpr": "PM_CMPL_STALL_SPECIAL / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "SPECIAL_COMPLETION_STALL_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when dispatch was stalled because fetch was being held, so there was nothing in the pipeline for this thread",
> +        "MetricExpr": "PM_DISP_STALL_FETCH / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_FETCH_CPI"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTC instruction was held at dispatch because of power management",
> +        "MetricExpr": "PM_DISP_STALL_HELD_HALT_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "CPI",
> +        "MetricName": "DISPATCHED_HELD_HALT_CPI"
> +    },
> +    {
> +        "BriefDescription": "Percentage of flushes per completed run instruction",

s/per completed run instruction/per instruction/

> +        "MetricExpr": "PM_FLUSH / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Others",
> +        "MetricName": "FLUSH_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of flushes due to a branch mispredict per instruction",
> +        "MetricExpr": "PM_FLUSH_MPRED / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Others",
> +        "MetricName": "BR_MPRED_FLUSH_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of branch mispredictions per completed run instruction",

s/per completed run instruction/per instruction/

> +        "MetricExpr": "PM_BR_MPRED_CMPL / PM_RUN_INST_CMPL",
> +        "MetricGroup": "Others",
> +        "MetricName": "BRANCH_MISPREDICTION_RATE"
> +    },
> +    {
> +        "BriefDescription": "Percentage of finished loads that missed in the L1",
> +        "MetricExpr": "PM_LD_MISS_L1 / PM_LD_REF_L1 * 100",
> +        "MetricGroup": "Others",
> +        "MetricName": "L1_LD_MISS_RATIO",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of completed run instructions that were loads that missed the L1",

s/completed run instructions/instructions/

> +        "MetricExpr": "PM_LD_MISS_L1 / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Others",
> +        "MetricName": "L1_LD_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of instructions when the DPTEG required for the load/store instruction in execution was missing from the TLB",
> +        "MetricExpr": "PM_DTLB_MISS / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Others",
> +        "MetricName": "DTLB_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Average number of instructions dispatched per instruction completed",
> +        "MetricExpr": "PM_INST_DISP / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "DISPATCH_PER_INST_CMPL"
> +    },
> +    {
> +        "BriefDescription": "Percentage of completed run instructions that were a demand load that did not hit in the L1 or L2",

s/completed run instructions/instructions/

> +        "MetricExpr": "PM_DATA_FROM_L2MISS / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "General",
> +        "MetricName": "L2_LD_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of completed run instructions that were demand fetches that missed the L1 instruction cache",

s/completed run instructions/instructions/
s/instruction cache/icache/ to be consistent with the rest of the file

> +        "MetricExpr": "PM_L1_ICACHE_MISS / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Instruction_Misses",
> +        "MetricName": "L1_INST_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of completed run instructions that were demand fetches that reloaded from beyond the L3 instruction cache",

s/completed run instructions/instructions/
s/instruction cache/icache/ to be consistent with the rest of the file

> +        "MetricExpr": "PM_INST_FROM_L3MISS / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "General",
> +        "MetricName": "L3_INST_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Average number of completed instructions per cycle",
> +        "MetricExpr": "PM_INST_CMPL / PM_CYC",
> +        "MetricGroup": "General",
> +        "MetricName": "IPC"
> +    },
> +    {
> +        "BriefDescription": "Average number of cycles per completed instruction group",
> +        "MetricExpr": "PM_CYC / PM_1PLUS_PPC_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "CYCLES_PER_COMPLETED_INSTRUCTIONS_SET"
> +    },
> +    {
> +        "BriefDescription": "Percentage of cycles when at least 1 instruction dispatched",
> +        "MetricExpr": "PM_1PLUS_PPC_DISP / PM_RUN_CYC * 100",
> +        "MetricGroup": "General",
> +        "MetricName": "CYCLES_ATLEAST_ONE_INST_DISPATCHED",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Average number of finished loads per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_LD_REF_L1 / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "LOADS_PER_INST"
> +    },
> +    {
> +        "BriefDescription": "Average number of finished stores per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_ST_FIN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "STORES_PER_INST"
> +    },
> +    {
> +        "BriefDescription": "Percentage of demand loads that reloaded from beyond the L2 per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_DATA_FROM_L2MISS / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "dL1_Reloads",
> +        "MetricName": "DL1_RELOAD_FROM_L2_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of demand loads that reloaded from beyond the L3 per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_DATA_FROM_L3MISS / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "dL1_Reloads",
> +        "MetricName": "DL1_RELOAD_FROM_L3_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of DERAT misses with 4k page size per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_DERAT_MISS_4K / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Translation",
> +        "MetricName": "DERAT_4K_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of DERAT misses with 64k page size per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_DERAT_MISS_64K / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Translation",
> +        "MetricName": "DERAT_64K_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Average number of run cycles per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_RUN_CYC / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "RUN_CPI"
> +    },
> +    {
> +        "BriefDescription": "Percentage of DERAT misses per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_DERAT_MISS / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Translation",
> +        "MetricName": "DERAT_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Average number of completed run instructions per run cycle",

s/completed run instructions/instructions/

> +        "MetricExpr": "PM_RUN_INST_CMPL / PM_RUN_CYC",
> +        "MetricGroup": "General",
> +        "MetricName": "RUN_IPC"
> +    },
> +    {
> +        "BriefDescription": "Average number of instructions completed per instruction group",

s/completed//

> +        "MetricExpr": "PM_RUN_INST_CMPL / PM_1PLUS_PPC_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "AVERAGE_COMPLETED_INSTRUCTION_SET_SIZE"
> +    },
> +    {
> +        "BriefDescription": "Average number of finished instructions per completed run instructions",

s/completed run instructions/instruction/

> +        "MetricExpr": "PM_INST_FIN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "INST_FIN_PER_CMPL"
> +    },
> +    {
> +        "BriefDescription": "Average cycles per instruction when the NTF instruction is completing and the finish was overlooked",
> +        "MetricExpr": "PM_EXEC_STALL_UNKNOWN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "EXEC_STALL_UNKOWN_CPI"
> +    },
> +    {
> +        "BriefDescription": "Percentage of finished branches that were taken",
> +        "MetricExpr": "PM_BR_TAKEN_CMPL / PM_BR_FIN * 100",
> +        "MetricGroup": "General",
> +        "MetricName": "TAKEN_BRANCHES",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of completed run instructions that were a demand load that did not hit in the L1, L2, or the L3",

s/completed run instructions/instructions/

> +        "MetricExpr": "PM_DATA_FROM_L3MISS / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "General",
> +        "MetricName": "L3_LD_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Average number of finished branches per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_BR_FIN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "BRANCHES_PER_INST"
> +    },
> +    {
> +        "BriefDescription": "Average number of instructions finished in the LSU per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_LSU_FIN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "LSU_PER_INST"
> +    },
> +    {
> +        "BriefDescription": "Average number of instructions finished in the VSU per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_VSU_FIN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "VSU_PER_INST"
> +    },
> +    {
> +        "BriefDescription": "Average number of TLBIE instructions finished in the LSU per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_TLBIE_FIN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "TLBIE_PER_INST"
> +    },
> +    {
> +        "BriefDescription": "Average number of STCX instructions finshed per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_STCX_FIN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "STXC_PER_INST"
> +    },
> +    {
> +        "BriefDescription": "Average number of LARX instructions finshed per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_LARX_FIN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "LARX_PER_INST"
> +    },
> +    {
> +        "BriefDescription": "Average number of PTESYNC instructions finshed per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_PTESYNC_FIN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "PTESYNC_PER_INST"
> +    },
> +    {
> +        "BriefDescription": "Average number of simple fixed-point instructions finshed in the store unit per completed run instruction",

s/completed run instruction/instruction/
s/store unit/LSU/

> +        "MetricExpr": "PM_FX_LSU_FIN / PM_RUN_INST_CMPL",
> +        "MetricGroup": "General",
> +        "MetricName": "FX_PER_INST"
> +    },
> +    {
> +        "BriefDescription": "Percentage of demand load misses that reloaded the L1 cache",
> +        "MetricExpr": "PM_LD_DEMAND_MISS_L1 / PM_LD_MISS_L1 * 100",
> +        "MetricGroup": "General",
> +        "MetricName": "DL1_MISS_RELOADS",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of demand load misses that reloaded from beyond the local L2",
> +        "MetricExpr": "PM_DATA_FROM_L2MISS / PM_LD_DEMAND_MISS_L1 * 100",
> +        "MetricGroup": "dL1_Reloads",
> +        "MetricName": "DL1_RELOAD_FROM_L2_MISS",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of demand load misses that reloaded from beyond the local L3",
> +        "MetricExpr": "PM_DATA_FROM_L3MISS / PM_LD_DEMAND_MISS_L1 * 100",
> +        "MetricGroup": "dL1_Reloads",
> +        "MetricName": "DL1_RELOAD_FROM_L3_MISS",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of cycles stalled due to the NTC instruction waiting for a load miss to resolve from a source beyond the local L2 and local L3",
> +        "MetricExpr": "DMISS_L3MISS_STALL_CPI / RUN_CPI * 100",
> +        "MetricGroup": "General",
> +        "MetricName": "DCACHE_MISS_CPI",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of DERAT misses with 2M page size per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_DERAT_MISS_2M / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Translation",
> +        "MetricName": "DERAT_2M_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of DERAT misses with 16M page size per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_DERAT_MISS_16M / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Translation",
> +        "MetricName": "DERAT_16M_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "DERAT miss ratio for 4K page size",
> +        "MetricExpr": "PM_DERAT_MISS_4K / PM_DERAT_MISS",
> +        "MetricGroup": "Translation",
> +        "MetricName": "DERAT_4K_MISS_RATIO"
> +    },
> +    {
> +        "BriefDescription": "DERAT miss ratio for 2M page size",
> +        "MetricExpr": "PM_DERAT_MISS_2M / PM_DERAT_MISS",
> +        "MetricGroup": "Translation",
> +        "MetricName": "DERAT_2M_MISS_RATIO"
> +    },
> +    {
> +        "BriefDescription": "DERAT miss ratio for 16M page size",
> +        "MetricExpr": "PM_DERAT_MISS_16M / PM_DERAT_MISS",
> +        "MetricGroup": "Translation",
> +        "MetricName": "DERAT_16M_MISS_RATIO"
> +    },
> +    {
> +        "BriefDescription": "DERAT miss ratio for 64K page size",
> +        "MetricExpr": "PM_DERAT_MISS_64K / PM_DERAT_MISS",
> +        "MetricGroup": "Translation",
> +        "MetricName": "DERAT_64K_MISS_RATIO"
> +    },
> +    {
> +        "BriefDescription": "Percentage of DERAT misses that resulted in TLB reloads",
> +        "MetricExpr": "PM_DTLB_MISS / PM_DERAT_MISS * 100",
> +        "MetricGroup": "Translation",
> +        "MetricName": "DERAT_MISS_RELOAD",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of icache misses that were reloaded from beyond the local L3",
> +        "MetricExpr": "PM_INST_FROM_L3MISS / PM_L1_ICACHE_MISS * 100",
> +        "MetricGroup": "Instruction_Misses",
> +        "MetricName": "INST_FROM_L3_MISS",
> +        "ScaleUnit": "1%"
> +    },
> +    {
> +        "BriefDescription": "Percentage of icache reloads from the beyond the L3 per completed run instruction",

s/completed run instruction/instruction/

> +        "MetricExpr": "PM_INST_FROM_L3MISS / PM_RUN_INST_CMPL * 100",
> +        "MetricGroup": "Instruction_Misses",
> +        "MetricName": "INST_FROM_L3_MISS_RATE",
> +        "ScaleUnit": "1%"
> +    }
> +]
> -- 

PC

^ permalink raw reply

* Re: [PATCH] locking: remove spin_lock_flags() etc
From: Helge Deller @ 2021-10-22 14:10 UTC (permalink / raw)
  To: Arnd Bergmann, Peter Zijlstra, Ingo Molnar, Will Deacon
  Cc: Jonas Bonn, linux-s390, Alexander Gordeev, linux-ia64,
	Vasily Gorbik, Arnd Bergmann, Boqun Feng, linux-kernel,
	Stefan Kristiansson, James E.J. Bottomley, Christian Borntraeger,
	openrisc, Paul Mackerras, linux-parisc, Waiman Long,
	Stafford Horne, linuxppc-dev, Heiko Carstens
In-Reply-To: <20211022120058.1031690-1-arnd@kernel.org>

On 10/22/21 13:59, Arnd Bergmann wrote:
> From: Arnd Bergmann <arnd@arndb.de>
>
> parisc, ia64 and powerpc32 are the only remaining architectures that
> provide custom arch_{spin,read,write}_lock_flags() functions, which are
> meant to re-enable interrupts while waiting for a spinlock.
>
> However, none of these can actually run into this codepath, because
> it is only called on architectures without CONFIG_GENERIC_LOCKBREAK,
> or when CONFIG_DEBUG_LOCK_ALLOC is set without CONFIG_LOCKDEP, and none
> of those combinations are possible on the three architectures.
>
> Going back in the git history, it appears that arch/mn10300 may have
> been able to run into this code path, but there is a good chance that
> it never worked. On the architectures that still exist, it was
> already impossible to hit back in 2008 after the introduction of
> CONFIG_GENERIC_LOCKBREAK, and possibly earlier.
>
> As this is all dead code, just remove it and the helper functions built
> around it. For arch/ia64, the inline asm could be cleaned up, but
> it seems safer to leave it untouched.
>
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Acked-by: Helge Deller <deller@gmx.de>  # parisc

Helge

> ---
>  arch/ia64/include/asm/spinlock.h           | 23 ++++++----------------
>  arch/openrisc/include/asm/spinlock.h       |  3 ---
>  arch/parisc/include/asm/spinlock.h         | 15 --------------
>  arch/powerpc/include/asm/simple_spinlock.h | 21 --------------------
>  arch/s390/include/asm/spinlock.h           |  8 --------
>  include/linux/lockdep.h                    | 17 ----------------
>  include/linux/rwlock.h                     | 15 --------------
>  include/linux/rwlock_api_smp.h             |  6 ++----
>  include/linux/spinlock.h                   | 13 ------------
>  include/linux/spinlock_api_smp.h           |  9 ---------
>  include/linux/spinlock_up.h                |  1 -
>  kernel/locking/spinlock.c                  |  3 +--
>  12 files changed, 9 insertions(+), 125 deletions(-)
>
> diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h
> index 864775970c50..0e5c1ad3239c 100644
> --- a/arch/ia64/include/asm/spinlock.h
> +++ b/arch/ia64/include/asm/spinlock.h
> @@ -124,18 +124,13 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
>  	__ticket_spin_unlock(lock);
>  }
>
> -static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
> -						  unsigned long flags)
> -{
> -	arch_spin_lock(lock);
> -}
> -#define arch_spin_lock_flags	arch_spin_lock_flags
> -
>  #ifdef ASM_SUPPORTED
>
>  static __always_inline void
> -arch_read_lock_flags(arch_rwlock_t *lock, unsigned long flags)
> +arch_read_lock(arch_rwlock_t *lock)
>  {
> +	unsigned long flags = 0;
> +
>  	__asm__ __volatile__ (
>  		"tbit.nz p6, p0 = %1,%2\n"
>  		"br.few 3f\n"
> @@ -157,13 +152,8 @@ arch_read_lock_flags(arch_rwlock_t *lock, unsigned long flags)
>  		: "p6", "p7", "r2", "memory");
>  }
>
> -#define arch_read_lock_flags arch_read_lock_flags
> -#define arch_read_lock(lock) arch_read_lock_flags(lock, 0)
> -
>  #else /* !ASM_SUPPORTED */
>
> -#define arch_read_lock_flags(rw, flags) arch_read_lock(rw)
> -
>  #define arch_read_lock(rw)								\
>  do {											\
>  	arch_rwlock_t *__read_lock_ptr = (rw);						\
> @@ -186,8 +176,10 @@ do {								\
>  #ifdef ASM_SUPPORTED
>
>  static __always_inline void
> -arch_write_lock_flags(arch_rwlock_t *lock, unsigned long flags)
> +arch_write_lock(arch_rwlock_t *lock)
>  {
> +	unsigned long flags = 0;
> +
>  	__asm__ __volatile__ (
>  		"tbit.nz p6, p0 = %1, %2\n"
>  		"mov ar.ccv = r0\n"
> @@ -210,9 +202,6 @@ arch_write_lock_flags(arch_rwlock_t *lock, unsigned long flags)
>  		: "ar.ccv", "p6", "p7", "r2", "r29", "memory");
>  }
>
> -#define arch_write_lock_flags arch_write_lock_flags
> -#define arch_write_lock(rw) arch_write_lock_flags(rw, 0)
> -
>  #define arch_write_trylock(rw)							\
>  ({										\
>  	register long result;							\
> diff --git a/arch/openrisc/include/asm/spinlock.h b/arch/openrisc/include/asm/spinlock.h
> index a8940bdfcb7e..264944a71535 100644
> --- a/arch/openrisc/include/asm/spinlock.h
> +++ b/arch/openrisc/include/asm/spinlock.h
> @@ -19,9 +19,6 @@
>
>  #include <asm/qrwlock.h>
>
> -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
> -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
> -
>  #define arch_spin_relax(lock)	cpu_relax()
>  #define arch_read_relax(lock)	cpu_relax()
>  #define arch_write_relax(lock)	cpu_relax()
> diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
> index fa5ee8a45dbd..a6e5d66a7656 100644
> --- a/arch/parisc/include/asm/spinlock.h
> +++ b/arch/parisc/include/asm/spinlock.h
> @@ -23,21 +23,6 @@ static inline void arch_spin_lock(arch_spinlock_t *x)
>  			continue;
>  }
>
> -static inline void arch_spin_lock_flags(arch_spinlock_t *x,
> -					unsigned long flags)
> -{
> -	volatile unsigned int *a;
> -
> -	a = __ldcw_align(x);
> -	while (__ldcw(a) == 0)
> -		while (*a == 0)
> -			if (flags & PSW_SM_I) {
> -				local_irq_enable();
> -				local_irq_disable();
> -			}
> -}
> -#define arch_spin_lock_flags arch_spin_lock_flags
> -
>  static inline void arch_spin_unlock(arch_spinlock_t *x)
>  {
>  	volatile unsigned int *a;
> diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h
> index 8985791a2ba5..7ae6aeef8464 100644
> --- a/arch/powerpc/include/asm/simple_spinlock.h
> +++ b/arch/powerpc/include/asm/simple_spinlock.h
> @@ -123,27 +123,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
>  	}
>  }
>
> -static inline
> -void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
> -{
> -	unsigned long flags_dis;
> -
> -	while (1) {
> -		if (likely(__arch_spin_trylock(lock) == 0))
> -			break;
> -		local_save_flags(flags_dis);
> -		local_irq_restore(flags);
> -		do {
> -			HMT_low();
> -			if (is_shared_processor())
> -				splpar_spin_yield(lock);
> -		} while (unlikely(lock->slock != 0));
> -		HMT_medium();
> -		local_irq_restore(flags_dis);
> -	}
> -}
> -#define arch_spin_lock_flags arch_spin_lock_flags
> -
>  static inline void arch_spin_unlock(arch_spinlock_t *lock)
>  {
>  	__asm__ __volatile__("# arch_spin_unlock\n\t"
> diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
> index ef59588a3042..888a2f1c9ee3 100644
> --- a/arch/s390/include/asm/spinlock.h
> +++ b/arch/s390/include/asm/spinlock.h
> @@ -67,14 +67,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lp)
>  		arch_spin_lock_wait(lp);
>  }
>
> -static inline void arch_spin_lock_flags(arch_spinlock_t *lp,
> -					unsigned long flags)
> -{
> -	if (!arch_spin_trylock_once(lp))
> -		arch_spin_lock_wait(lp);
> -}
> -#define arch_spin_lock_flags	arch_spin_lock_flags
> -
>  static inline int arch_spin_trylock(arch_spinlock_t *lp)
>  {
>  	if (!arch_spin_trylock_once(lp))
> diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
> index 9fe165beb0f9..467b94257105 100644
> --- a/include/linux/lockdep.h
> +++ b/include/linux/lockdep.h
> @@ -481,23 +481,6 @@ do {								\
>
>  #endif /* CONFIG_LOCK_STAT */
>
> -#ifdef CONFIG_LOCKDEP
> -
> -/*
> - * On lockdep we dont want the hand-coded irq-enable of
> - * _raw_*_lock_flags() code, because lockdep assumes
> - * that interrupts are not re-enabled during lock-acquire:
> - */
> -#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
> -	LOCK_CONTENDED((_lock), (try), (lock))
> -
> -#else /* CONFIG_LOCKDEP */
> -
> -#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
> -	lockfl((_lock), (flags))
> -
> -#endif /* CONFIG_LOCKDEP */
> -
>  #ifdef CONFIG_PROVE_LOCKING
>  extern void print_irqtrace_events(struct task_struct *curr);
>  #else
> diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
> index 7ce9a51ae5c0..2c0ad417ce3c 100644
> --- a/include/linux/rwlock.h
> +++ b/include/linux/rwlock.h
> @@ -30,31 +30,16 @@ do {								\
>
>  #ifdef CONFIG_DEBUG_SPINLOCK
>   extern void do_raw_read_lock(rwlock_t *lock) __acquires(lock);
> -#define do_raw_read_lock_flags(lock, flags) do_raw_read_lock(lock)
>   extern int do_raw_read_trylock(rwlock_t *lock);
>   extern void do_raw_read_unlock(rwlock_t *lock) __releases(lock);
>   extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock);
> -#define do_raw_write_lock_flags(lock, flags) do_raw_write_lock(lock)
>   extern int do_raw_write_trylock(rwlock_t *lock);
>   extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock);
>  #else
> -
> -#ifndef arch_read_lock_flags
> -# define arch_read_lock_flags(lock, flags)	arch_read_lock(lock)
> -#endif
> -
> -#ifndef arch_write_lock_flags
> -# define arch_write_lock_flags(lock, flags)	arch_write_lock(lock)
> -#endif
> -
>  # define do_raw_read_lock(rwlock)	do {__acquire(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0)
> -# define do_raw_read_lock_flags(lock, flags) \
> -		do {__acquire(lock); arch_read_lock_flags(&(lock)->raw_lock, *(flags)); } while (0)
>  # define do_raw_read_trylock(rwlock)	arch_read_trylock(&(rwlock)->raw_lock)
>  # define do_raw_read_unlock(rwlock)	do {arch_read_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
>  # define do_raw_write_lock(rwlock)	do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0)
> -# define do_raw_write_lock_flags(lock, flags) \
> -		do {__acquire(lock); arch_write_lock_flags(&(lock)->raw_lock, *(flags)); } while (0)
>  # define do_raw_write_trylock(rwlock)	arch_write_trylock(&(rwlock)->raw_lock)
>  # define do_raw_write_unlock(rwlock)	do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
>  #endif
> diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
> index abfb53ab11be..f1db6f17c4fb 100644
> --- a/include/linux/rwlock_api_smp.h
> +++ b/include/linux/rwlock_api_smp.h
> @@ -157,8 +157,7 @@ static inline unsigned long __raw_read_lock_irqsave(rwlock_t *lock)
>  	local_irq_save(flags);
>  	preempt_disable();
>  	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
> -	LOCK_CONTENDED_FLAGS(lock, do_raw_read_trylock, do_raw_read_lock,
> -			     do_raw_read_lock_flags, &flags);
> +	LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock);
>  	return flags;
>  }
>
> @@ -184,8 +183,7 @@ static inline unsigned long __raw_write_lock_irqsave(rwlock_t *lock)
>  	local_irq_save(flags);
>  	preempt_disable();
>  	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
> -	LOCK_CONTENDED_FLAGS(lock, do_raw_write_trylock, do_raw_write_lock,
> -			     do_raw_write_lock_flags, &flags);
> +	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
>  	return flags;
>  }
>
> diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
> index c04e99edfe92..40e467cdee2d 100644
> --- a/include/linux/spinlock.h
> +++ b/include/linux/spinlock.h
> @@ -176,7 +176,6 @@ do {									\
>
>  #ifdef CONFIG_DEBUG_SPINLOCK
>   extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
> -#define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock)
>   extern int do_raw_spin_trylock(raw_spinlock_t *lock);
>   extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
>  #else
> @@ -187,18 +186,6 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
>  	mmiowb_spin_lock();
>  }
>
> -#ifndef arch_spin_lock_flags
> -#define arch_spin_lock_flags(lock, flags)	arch_spin_lock(lock)
> -#endif
> -
> -static inline void
> -do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock)
> -{
> -	__acquire(lock);
> -	arch_spin_lock_flags(&lock->raw_lock, *flags);
> -	mmiowb_spin_lock();
> -}
> -
>  static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
>  {
>  	int ret = arch_spin_trylock(&(lock)->raw_lock);
> diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
> index 6b8e1a0b137b..51fa0dab68c4 100644
> --- a/include/linux/spinlock_api_smp.h
> +++ b/include/linux/spinlock_api_smp.h
> @@ -108,16 +108,7 @@ static inline unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock)
>  	local_irq_save(flags);
>  	preempt_disable();
>  	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
> -	/*
> -	 * On lockdep we dont want the hand-coded irq-enable of
> -	 * do_raw_spin_lock_flags() code, because lockdep assumes
> -	 * that interrupts are not re-enabled during lock-acquire:
> -	 */
> -#ifdef CONFIG_LOCKDEP
>  	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
> -#else
> -	do_raw_spin_lock_flags(lock, &flags);
> -#endif
>  	return flags;
>  }
>
> diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
> index 0ac9112c1bbe..16521074b6f7 100644
> --- a/include/linux/spinlock_up.h
> +++ b/include/linux/spinlock_up.h
> @@ -62,7 +62,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
>  #define arch_spin_is_locked(lock)	((void)(lock), 0)
>  /* for sched/core.c and kernel_lock.c: */
>  # define arch_spin_lock(lock)		do { barrier(); (void)(lock); } while (0)
> -# define arch_spin_lock_flags(lock, flags)	do { barrier(); (void)(lock); } while (0)
>  # define arch_spin_unlock(lock)	do { barrier(); (void)(lock); } while (0)
>  # define arch_spin_trylock(lock)	({ barrier(); (void)(lock); 1; })
>  #endif /* DEBUG_SPINLOCK */
> diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
> index c5830cfa379a..b562f9289372 100644
> --- a/kernel/locking/spinlock.c
> +++ b/kernel/locking/spinlock.c
> @@ -378,8 +378,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
>  	local_irq_save(flags);
>  	preempt_disable();
>  	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
> -	LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
> -				do_raw_spin_lock_flags, &flags);
> +	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>  	return flags;
>  }
>  EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
>


^ permalink raw reply

* Re: [PATCH] powerpc: Enhance pmem DMA bypass handling
From: Alexey Kardashevskiy @ 2021-10-22 12:24 UTC (permalink / raw)
  To: Brian King, linuxppc-dev
In-Reply-To: <20211021174449.120875-1-brking@linux.vnet.ibm.com>



On 22/10/2021 04:44, Brian King wrote:
> If ibm,pmemory is installed in the system, it can appear anywhere
> in the address space. This patch enhances how we handle DMA for devices when
> ibm,pmemory is present. In the case where we have enough DMA space to
> direct map all of RAM, but not ibm,pmemory, we use direct DMA for
> I/O to RAM and use the default window to dynamically map ibm,pmemory.
> In the case where we only have a single DMA window, this won't work, > so if the window is not big enough to map the entire address range,
> we cannot direct map.

but we want the pmem range to be mapped into the huge DMA window too if 
we can, why skip it?


> 
> Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
> ---
>   arch/powerpc/platforms/pseries/iommu.c | 19 ++++++++++---------
>   1 file changed, 10 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 269f61d519c2..d9ae985d10a4 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -1092,15 +1092,6 @@ static phys_addr_t ddw_memory_hotplug_max(void)
>   	phys_addr_t max_addr = memory_hotplug_max();
>   	struct device_node *memory;
>   
> -	/*
> -	 * The "ibm,pmemory" can appear anywhere in the address space.
> -	 * Assuming it is still backed by page structs, set the upper limit
> -	 * for the huge DMA window as MAX_PHYSMEM_BITS.
> -	 */
> -	if (of_find_node_by_type(NULL, "ibm,pmemory"))
> -		return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
> -			(phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS);
> -
>   	for_each_node_by_type(memory, "memory") {
>   		unsigned long start, size;
>   		int n_mem_addr_cells, n_mem_size_cells, len;
> @@ -1341,6 +1332,16 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>   	 */
>   	len = max_ram_len;
>   	if (pmem_present) {
> +		if (default_win_removed) {
> +			/*
> +			 * If we only have one DMA window and have pmem present,
> +			 * then we need to be able to map the entire address
> +			 * range in order to be able to do direct DMA to RAM.
> +			 */
> +			len = order_base_2((sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
> +					(phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS));
> +		}
> +
>   		if (query.largest_available_block >=
>   		    (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
>   			len = MAX_PHYSMEM_BITS;
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH v2] powerpc/pseries/mobility: ignore ibm, platform-facilities updates
From: Daniel Axtens @ 2021-10-22 12:21 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: tyreld, cheloha, ldufour, linuxppc-dev
In-Reply-To: <87o87iy3ji.fsf@linux.ibm.com>

Nathan Lynch <nathanl@linux.ibm.com> writes:

> Daniel Axtens <dja@axtens.net> writes:
>>> On VMs with NX encryption, compression, and/or RNG offload, these
>>> capabilities are described by nodes in the ibm,platform-facilities device
>>> tree hierarchy:
>>>
>>>   $ tree -d /sys/firmware/devicetree/base/ibm,platform-facilities/
>>>   /sys/firmware/devicetree/base/ibm,platform-facilities/
>>>   ├── ibm,compression-v1
>>>   ├── ibm,random-v1
>>>   └── ibm,sym-encryption-v1
>>>
>>>   3 directories
>>>
>>> The acceleration functions that these nodes describe are not disrupted by
>>> live migration, not even temporarily.
>>>
>>> But the post-migration ibm,update-nodes sequence firmware always sends
>>> "delete" messages for this hierarchy, followed by an "add" directive to
>>> reconstruct it via ibm,configure-connector (log with debugging statements
>>> enabled in mobility.c):
>>>
>>>   mobility: removing node /ibm,platform-facilities/ibm,random-v1:4294967285
>>>   mobility: removing node /ibm,platform-facilities/ibm,compression-v1:4294967284
>>>   mobility: removing node /ibm,platform-facilities/ibm,sym-encryption-v1:4294967283
>>>   mobility: removing node /ibm,platform-facilities:4294967286
>>>   ...
>>>   mobility: added node /ibm,platform-facilities:4294967286
>>>
>>> Note we receive a single "add" message for the entire hierarchy, and what
>>> we receive from the ibm,configure-connector sequence is the top-level
>>> platform-facilities node along with its three children. The debug message
>>> simply reports the parent node and not the whole subtree.
>>
>> If I understand correctly, (and again, this is not my area at all!) we
>> still have to go out to the firmware and call the
>> ibm,configure-connector sequence in order to figure out that the node
>> we're supposed to add is the ibm,platform-facilites node, right? We
>> can't save enough information at delete time to avoid the trip out to
>> firmware?
>
> That is right... but maybe I don't understand your angle here. Unsure
> what avoiding the configure-connector sequence for the nodes would buy
> us.

It's not meant to be a tricky question, so the simple answer is probably
the right one. Just wondering if there was a marginal efficiency gain -
although I believe it's not really a hot path anyway.

>
>
>>> Until that can be realized we have a confirmed use-after-free and the
>>> possibility of memory corruption. So add a limited workaround that
>>> discriminates on the node type, ignoring adds and removes. This should be
>>> amenable to backporting in the meantime.
>>
>> Yeah it's an unpleasant situation to find ourselves in. It's a bit icky
>> but as I think you said in a previous email, at least this isn't worse:
>> in the common case it should now succeed and and if properties change
>> significantly it will still fail.
>>
>> My one question (from more of a security point of view) is:
>>  1) Say you start using the facilities with a particular set of
>>     parameters.
>>
>>  2) Say you then get migrated and the parameters change.
>>
>>  3) If you keep using the platform facilities as if the original
>>     properties are still valid, can this cause any Interesting,
>>     unexpected or otherwise Bad consequences? Are we going to end up
>>     (for example) scribbling over random memory somehow?
>
> If drivers are safely handling errors from H_COP_OP etc, then no. (I
> know, this looks like a Well That Would Be a Driver Bug dismissal, but
> that's not my attitude.) And again this is a case where the change
> cannot make things worse.
>
> In the current design of the pseries LPM implementation, user space and
> other normal system activity resume as soon as we return from the
> stop_machine() call which suspends the partition, executing concurrently
> with any device tree updates. So even if we had code in place to
> correctly resolve the DT changes and the drivers were able to respond to
> the changes, there would still be a window of exposure to the kind of
> problem you describe: the changed characteristics, if any, of the
> destination obtain as soon as execution resumes, regardless of when the
> OS initiates the update-nodes sequence.
>
> The way out of that mess is to use the Linux suspend framework, or
> otherwise prevent user space from executing until the destination
> system's characteristics have been appropriately propagated out to the
> necessary drivers etc. I'm trying to get there.

Fair enough. I do appreciate the perfect not being the enemy of the good
especially in areas of the codebase like this where there is scope to
improve things but there is also a lot of complexity that we cannot
really get away from because the underlying problem domain is itself
just plain complex. (I think EEH is the other obvious example in
arch/powerpc.)

>> Apart from that, the code seems to do what it says, it seems to solve a
>> real problem, the error and memory handling makes sense, you _put the DT
>> nodes that you _get, the comments are helpful and descriptive, and it
>> passes the automated tests on patchwork/snowpatch.
>
> I appreciate your review!

With those questions answered, and with the caveats above and noting my
complete inability to test the code:

Reviewed-by: Daniel Axtens <dja@axtens.net>

Kind regards,
Daniel

^ permalink raw reply

* [PATCH] locking: remove spin_lock_flags() etc
From: Arnd Bergmann @ 2021-10-22 11:59 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon
  Cc: Jonas Bonn, linux-s390, Alexander Gordeev, linux-ia64,
	Vasily Gorbik, Arnd Bergmann, Boqun Feng, linux-kernel,
	Stefan Kristiansson, James E.J. Bottomley, Christian Borntraeger,
	openrisc, Paul Mackerras, linux-parisc, Waiman Long,
	Stafford Horne, linuxppc-dev, Helge Deller, Heiko Carstens

From: Arnd Bergmann <arnd@arndb.de>

parisc, ia64 and powerpc32 are the only remaining architectures that
provide custom arch_{spin,read,write}_lock_flags() functions, which are
meant to re-enable interrupts while waiting for a spinlock.

However, none of these can actually run into this codepath, because
it is only called on architectures without CONFIG_GENERIC_LOCKBREAK,
or when CONFIG_DEBUG_LOCK_ALLOC is set without CONFIG_LOCKDEP, and none
of those combinations are possible on the three architectures.

Going back in the git history, it appears that arch/mn10300 may have
been able to run into this code path, but there is a good chance that
it never worked. On the architectures that still exist, it was
already impossible to hit back in 2008 after the introduction of
CONFIG_GENERIC_LOCKBREAK, and possibly earlier.

As this is all dead code, just remove it and the helper functions built
around it. For arch/ia64, the inline asm could be cleaned up, but
it seems safer to leave it untouched.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/ia64/include/asm/spinlock.h           | 23 ++++++----------------
 arch/openrisc/include/asm/spinlock.h       |  3 ---
 arch/parisc/include/asm/spinlock.h         | 15 --------------
 arch/powerpc/include/asm/simple_spinlock.h | 21 --------------------
 arch/s390/include/asm/spinlock.h           |  8 --------
 include/linux/lockdep.h                    | 17 ----------------
 include/linux/rwlock.h                     | 15 --------------
 include/linux/rwlock_api_smp.h             |  6 ++----
 include/linux/spinlock.h                   | 13 ------------
 include/linux/spinlock_api_smp.h           |  9 ---------
 include/linux/spinlock_up.h                |  1 -
 kernel/locking/spinlock.c                  |  3 +--
 12 files changed, 9 insertions(+), 125 deletions(-)

diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h
index 864775970c50..0e5c1ad3239c 100644
--- a/arch/ia64/include/asm/spinlock.h
+++ b/arch/ia64/include/asm/spinlock.h
@@ -124,18 +124,13 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 	__ticket_spin_unlock(lock);
 }
 
-static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
-						  unsigned long flags)
-{
-	arch_spin_lock(lock);
-}
-#define arch_spin_lock_flags	arch_spin_lock_flags
-
 #ifdef ASM_SUPPORTED
 
 static __always_inline void
-arch_read_lock_flags(arch_rwlock_t *lock, unsigned long flags)
+arch_read_lock(arch_rwlock_t *lock)
 {
+	unsigned long flags = 0;
+
 	__asm__ __volatile__ (
 		"tbit.nz p6, p0 = %1,%2\n"
 		"br.few 3f\n"
@@ -157,13 +152,8 @@ arch_read_lock_flags(arch_rwlock_t *lock, unsigned long flags)
 		: "p6", "p7", "r2", "memory");
 }
 
-#define arch_read_lock_flags arch_read_lock_flags
-#define arch_read_lock(lock) arch_read_lock_flags(lock, 0)
-
 #else /* !ASM_SUPPORTED */
 
-#define arch_read_lock_flags(rw, flags) arch_read_lock(rw)
-
 #define arch_read_lock(rw)								\
 do {											\
 	arch_rwlock_t *__read_lock_ptr = (rw);						\
@@ -186,8 +176,10 @@ do {								\
 #ifdef ASM_SUPPORTED
 
 static __always_inline void
-arch_write_lock_flags(arch_rwlock_t *lock, unsigned long flags)
+arch_write_lock(arch_rwlock_t *lock)
 {
+	unsigned long flags = 0;
+
 	__asm__ __volatile__ (
 		"tbit.nz p6, p0 = %1, %2\n"
 		"mov ar.ccv = r0\n"
@@ -210,9 +202,6 @@ arch_write_lock_flags(arch_rwlock_t *lock, unsigned long flags)
 		: "ar.ccv", "p6", "p7", "r2", "r29", "memory");
 }
 
-#define arch_write_lock_flags arch_write_lock_flags
-#define arch_write_lock(rw) arch_write_lock_flags(rw, 0)
-
 #define arch_write_trylock(rw)							\
 ({										\
 	register long result;							\
diff --git a/arch/openrisc/include/asm/spinlock.h b/arch/openrisc/include/asm/spinlock.h
index a8940bdfcb7e..264944a71535 100644
--- a/arch/openrisc/include/asm/spinlock.h
+++ b/arch/openrisc/include/asm/spinlock.h
@@ -19,9 +19,6 @@
 
 #include <asm/qrwlock.h>
 
-#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
-#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
-
 #define arch_spin_relax(lock)	cpu_relax()
 #define arch_read_relax(lock)	cpu_relax()
 #define arch_write_relax(lock)	cpu_relax()
diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index fa5ee8a45dbd..a6e5d66a7656 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -23,21 +23,6 @@ static inline void arch_spin_lock(arch_spinlock_t *x)
 			continue;
 }
 
-static inline void arch_spin_lock_flags(arch_spinlock_t *x,
-					unsigned long flags)
-{
-	volatile unsigned int *a;
-
-	a = __ldcw_align(x);
-	while (__ldcw(a) == 0)
-		while (*a == 0)
-			if (flags & PSW_SM_I) {
-				local_irq_enable();
-				local_irq_disable();
-			}
-}
-#define arch_spin_lock_flags arch_spin_lock_flags
-
 static inline void arch_spin_unlock(arch_spinlock_t *x)
 {
 	volatile unsigned int *a;
diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h
index 8985791a2ba5..7ae6aeef8464 100644
--- a/arch/powerpc/include/asm/simple_spinlock.h
+++ b/arch/powerpc/include/asm/simple_spinlock.h
@@ -123,27 +123,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 	}
 }
 
-static inline
-void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
-{
-	unsigned long flags_dis;
-
-	while (1) {
-		if (likely(__arch_spin_trylock(lock) == 0))
-			break;
-		local_save_flags(flags_dis);
-		local_irq_restore(flags);
-		do {
-			HMT_low();
-			if (is_shared_processor())
-				splpar_spin_yield(lock);
-		} while (unlikely(lock->slock != 0));
-		HMT_medium();
-		local_irq_restore(flags_dis);
-	}
-}
-#define arch_spin_lock_flags arch_spin_lock_flags
-
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
 	__asm__ __volatile__("# arch_spin_unlock\n\t"
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index ef59588a3042..888a2f1c9ee3 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -67,14 +67,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lp)
 		arch_spin_lock_wait(lp);
 }
 
-static inline void arch_spin_lock_flags(arch_spinlock_t *lp,
-					unsigned long flags)
-{
-	if (!arch_spin_trylock_once(lp))
-		arch_spin_lock_wait(lp);
-}
-#define arch_spin_lock_flags	arch_spin_lock_flags
-
 static inline int arch_spin_trylock(arch_spinlock_t *lp)
 {
 	if (!arch_spin_trylock_once(lp))
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 9fe165beb0f9..467b94257105 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -481,23 +481,6 @@ do {								\
 
 #endif /* CONFIG_LOCK_STAT */
 
-#ifdef CONFIG_LOCKDEP
-
-/*
- * On lockdep we dont want the hand-coded irq-enable of
- * _raw_*_lock_flags() code, because lockdep assumes
- * that interrupts are not re-enabled during lock-acquire:
- */
-#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
-	LOCK_CONTENDED((_lock), (try), (lock))
-
-#else /* CONFIG_LOCKDEP */
-
-#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
-	lockfl((_lock), (flags))
-
-#endif /* CONFIG_LOCKDEP */
-
 #ifdef CONFIG_PROVE_LOCKING
 extern void print_irqtrace_events(struct task_struct *curr);
 #else
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index 7ce9a51ae5c0..2c0ad417ce3c 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -30,31 +30,16 @@ do {								\
 
 #ifdef CONFIG_DEBUG_SPINLOCK
  extern void do_raw_read_lock(rwlock_t *lock) __acquires(lock);
-#define do_raw_read_lock_flags(lock, flags) do_raw_read_lock(lock)
  extern int do_raw_read_trylock(rwlock_t *lock);
  extern void do_raw_read_unlock(rwlock_t *lock) __releases(lock);
  extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock);
-#define do_raw_write_lock_flags(lock, flags) do_raw_write_lock(lock)
  extern int do_raw_write_trylock(rwlock_t *lock);
  extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock);
 #else
-
-#ifndef arch_read_lock_flags
-# define arch_read_lock_flags(lock, flags)	arch_read_lock(lock)
-#endif
-
-#ifndef arch_write_lock_flags
-# define arch_write_lock_flags(lock, flags)	arch_write_lock(lock)
-#endif
-
 # define do_raw_read_lock(rwlock)	do {__acquire(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0)
-# define do_raw_read_lock_flags(lock, flags) \
-		do {__acquire(lock); arch_read_lock_flags(&(lock)->raw_lock, *(flags)); } while (0)
 # define do_raw_read_trylock(rwlock)	arch_read_trylock(&(rwlock)->raw_lock)
 # define do_raw_read_unlock(rwlock)	do {arch_read_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
 # define do_raw_write_lock(rwlock)	do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0)
-# define do_raw_write_lock_flags(lock, flags) \
-		do {__acquire(lock); arch_write_lock_flags(&(lock)->raw_lock, *(flags)); } while (0)
 # define do_raw_write_trylock(rwlock)	arch_write_trylock(&(rwlock)->raw_lock)
 # define do_raw_write_unlock(rwlock)	do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
 #endif
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index abfb53ab11be..f1db6f17c4fb 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -157,8 +157,7 @@ static inline unsigned long __raw_read_lock_irqsave(rwlock_t *lock)
 	local_irq_save(flags);
 	preempt_disable();
 	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED_FLAGS(lock, do_raw_read_trylock, do_raw_read_lock,
-			     do_raw_read_lock_flags, &flags);
+	LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock);
 	return flags;
 }
 
@@ -184,8 +183,7 @@ static inline unsigned long __raw_write_lock_irqsave(rwlock_t *lock)
 	local_irq_save(flags);
 	preempt_disable();
 	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED_FLAGS(lock, do_raw_write_trylock, do_raw_write_lock,
-			     do_raw_write_lock_flags, &flags);
+	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
 	return flags;
 }
 
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index c04e99edfe92..40e467cdee2d 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -176,7 +176,6 @@ do {									\
 
 #ifdef CONFIG_DEBUG_SPINLOCK
  extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
-#define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock)
  extern int do_raw_spin_trylock(raw_spinlock_t *lock);
  extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
 #else
@@ -187,18 +186,6 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
 	mmiowb_spin_lock();
 }
 
-#ifndef arch_spin_lock_flags
-#define arch_spin_lock_flags(lock, flags)	arch_spin_lock(lock)
-#endif
-
-static inline void
-do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock)
-{
-	__acquire(lock);
-	arch_spin_lock_flags(&lock->raw_lock, *flags);
-	mmiowb_spin_lock();
-}
-
 static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
 {
 	int ret = arch_spin_trylock(&(lock)->raw_lock);
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 6b8e1a0b137b..51fa0dab68c4 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -108,16 +108,7 @@ static inline unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock)
 	local_irq_save(flags);
 	preempt_disable();
 	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	/*
-	 * On lockdep we dont want the hand-coded irq-enable of
-	 * do_raw_spin_lock_flags() code, because lockdep assumes
-	 * that interrupts are not re-enabled during lock-acquire:
-	 */
-#ifdef CONFIG_LOCKDEP
 	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
-#else
-	do_raw_spin_lock_flags(lock, &flags);
-#endif
 	return flags;
 }
 
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index 0ac9112c1bbe..16521074b6f7 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -62,7 +62,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 #define arch_spin_is_locked(lock)	((void)(lock), 0)
 /* for sched/core.c and kernel_lock.c: */
 # define arch_spin_lock(lock)		do { barrier(); (void)(lock); } while (0)
-# define arch_spin_lock_flags(lock, flags)	do { barrier(); (void)(lock); } while (0)
 # define arch_spin_unlock(lock)	do { barrier(); (void)(lock); } while (0)
 # define arch_spin_trylock(lock)	({ barrier(); (void)(lock); 1; })
 #endif /* DEBUG_SPINLOCK */
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index c5830cfa379a..b562f9289372 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -378,8 +378,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
 	local_irq_save(flags);
 	preempt_disable();
 	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
-	LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
-				do_raw_spin_lock_flags, &flags);
+	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
 	return flags;
 }
 EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
-- 
2.29.2


^ permalink raw reply related

* Re: [PATCH v3 16/18] powerpc/64s: Move hash MMU support code under CONFIG_PPC_64S_HASH_MMU
From: Nicholas Piggin @ 2021-10-22  9:44 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev
In-Reply-To: <1634895021.4d2890ma8z.astroid@bobo.none>

Excerpts from Nicholas Piggin's message of October 22, 2021 7:34 pm:
> Excerpts from Christophe Leroy's message of October 22, 2021 5:18 pm:
>> 
>> 
>> Le 22/10/2021 à 00:30, Nicholas Piggin a écrit :
>>> Compiling out hash support code when CONFIG_PPC_64S_HASH_MMU=n saves
>>> 128kB kernel image size (90kB text) on powernv_defconfig minus KVM,
>>> 350kB on pseries_defconfig minus KVM, 40kB on a tiny config.
>>> 
>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>>> ---
>>>   arch/powerpc/Kconfig                          |  2 +-
>>>   arch/powerpc/include/asm/book3s/64/mmu.h      | 19 +++++++++--
>>>   .../include/asm/book3s/64/tlbflush-hash.h     |  7 ++++
>>>   arch/powerpc/include/asm/book3s/pgtable.h     |  4 +++
>>>   arch/powerpc/include/asm/mmu_context.h        |  2 ++
>>>   arch/powerpc/include/asm/paca.h               |  8 +++++
>>>   arch/powerpc/kernel/asm-offsets.c             |  2 ++
>>>   arch/powerpc/kernel/entry_64.S                |  4 +--
>>>   arch/powerpc/kernel/exceptions-64s.S          | 16 +++++++++
>>>   arch/powerpc/kernel/mce.c                     |  2 +-
>>>   arch/powerpc/kernel/mce_power.c               | 10 ++++--
>>>   arch/powerpc/kernel/paca.c                    | 18 ++++------
>>>   arch/powerpc/kernel/process.c                 | 13 +++----
>>>   arch/powerpc/kernel/prom.c                    |  2 ++
>>>   arch/powerpc/kernel/setup_64.c                |  5 +++
>>>   arch/powerpc/kexec/core_64.c                  |  4 +--
>>>   arch/powerpc/kexec/ranges.c                   |  4 +++
>>>   arch/powerpc/mm/book3s64/Makefile             | 15 ++++----
>>>   arch/powerpc/mm/book3s64/hugetlbpage.c        |  2 ++
>>>   arch/powerpc/mm/book3s64/mmu_context.c        | 34 +++++++++++++++----
>>>   arch/powerpc/mm/book3s64/pgtable.c            |  2 +-
>>>   arch/powerpc/mm/book3s64/radix_pgtable.c      |  4 +++
>>>   arch/powerpc/mm/copro_fault.c                 |  2 ++
>>>   arch/powerpc/mm/ptdump/Makefile               |  2 +-
>>>   arch/powerpc/platforms/powernv/idle.c         |  2 ++
>>>   arch/powerpc/platforms/powernv/setup.c        |  2 ++
>>>   arch/powerpc/platforms/pseries/lpar.c         | 11 ++++--
>>>   arch/powerpc/platforms/pseries/lparcfg.c      |  2 +-
>>>   arch/powerpc/platforms/pseries/mobility.c     |  6 ++++
>>>   arch/powerpc/platforms/pseries/ras.c          |  2 ++
>>>   arch/powerpc/platforms/pseries/reconfig.c     |  2 ++
>>>   arch/powerpc/platforms/pseries/setup.c        |  6 ++--
>>>   arch/powerpc/xmon/xmon.c                      |  8 +++--
>>>   drivers/misc/lkdtm/Makefile                   |  2 +-
>>>   drivers/misc/lkdtm/core.c                     |  2 +-
>>>   35 files changed, 177 insertions(+), 51 deletions(-)
>>> 
>> 
>>> diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
>>> index c02f42d1031e..d94ebae386b6 100644
>>> --- a/arch/powerpc/include/asm/book3s/64/mmu.h
>>> +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
>>> @@ -233,7 +245,8 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
>>>   	 * know which translations we will pick. Hence go with hash
>>>   	 * restrictions.
>>>   	 */
>>> -	return hash__setup_initial_memory_limit(first_memblock_base,
>>> +	if (!radix_enabled())
>>> +		return hash__setup_initial_memory_limit(first_memblock_base,
>>>   					   first_memblock_size);
>> 
>> It is a void function, using return is not correct.
> 
> I guess for this case I can fix as I go.
> 
>>> @@ -112,8 +112,15 @@ static inline void hash__flush_tlb_kernel_range(unsigned long start,
>>>   
>>>   struct mmu_gather;
>>>   extern void hash__tlb_flush(struct mmu_gather *tlb);
>>> +extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd,
>>> +				unsigned long addr);
>> 
>> 'extern' is superflous
> 
> Ditto.
> 
>>> @@ -144,12 +147,21 @@ static int hash__init_new_context(struct mm_struct *mm)
>>>   	return index;
>>>   }
>>>   
>>> +void slb_setup_new_exec(void);
>> 
>> Include arch/powerpc/mm/book3s64/internal.h instead
> 
> Will do.
> 
>>> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
>>> index 7d556b5513e4..57d2d797c4f6 100644
>>> --- a/arch/powerpc/mm/book3s64/pgtable.c
>>> +++ b/arch/powerpc/mm/book3s64/pgtable.c
>>> @@ -535,7 +535,7 @@ static int __init pgtable_debugfs_setup(void)
>>>   }
>>>   arch_initcall(pgtable_debugfs_setup);
>>>   
>>> -#ifdef CONFIG_ZONE_DEVICE
>>> +#if defined(CONFIG_ZONE_DEVICE) && defined(ARCH_HAS_MEMREMAP_COMPAT_ALIGN)
>> 
>> Patch 12 does
>> 
>> 	select ARCH_HAS_MEMREMAP_COMPAT_ALIGN	if PPC_BOOK3S_64
> 
> Ah, I meant to change that to PPC_64S_HASH_MMU.

Oh I did in this patch, I was looking at the wrong commit.

Thanks,
Nick

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox