* [PATCH v14 8/9] powerpc/vdso: Switch VDSO to generic C implementation.
From: Michael Ellerman @ 2020-11-26 13:10 UTC (permalink / raw)
To: linuxppc-dev
In-Reply-To: <20201126131006.2431205-1-mpe@ellerman.id.au>
From: Christophe Leroy <christophe.leroy@csgroup.eu>
For VDSO32 on PPC64, we create a fake 32 bits config, on the same
principle as MIPS architecture, in order to get the correct parts of
the different asm header files.
With the C VDSO, the performance is slightly lower, but it is worth
it as it will ease maintenance and evolution, and also brings clocks
that are not supported with the ASM VDSO.
On an 8xx at 132 MHz, vdsotest with the ASM VDSO:
gettimeofday: vdso: 828 nsec/call
clock-getres-realtime-coarse: vdso: 391 nsec/call
clock-gettime-realtime-coarse: vdso: 614 nsec/call
clock-getres-realtime: vdso: 460 nsec/call
clock-gettime-realtime: vdso: 876 nsec/call
clock-getres-monotonic-coarse: vdso: 399 nsec/call
clock-gettime-monotonic-coarse: vdso: 691 nsec/call
clock-getres-monotonic: vdso: 460 nsec/call
clock-gettime-monotonic: vdso: 1026 nsec/call
On an 8xx at 132 MHz, vdsotest with the C VDSO:
gettimeofday: vdso: 955 nsec/call
clock-getres-realtime-coarse: vdso: 545 nsec/call
clock-gettime-realtime-coarse: vdso: 592 nsec/call
clock-getres-realtime: vdso: 545 nsec/call
clock-gettime-realtime: vdso: 941 nsec/call
clock-getres-monotonic-coarse: vdso: 545 nsec/call
clock-gettime-monotonic-coarse: vdso: 591 nsec/call
clock-getres-monotonic: vdso: 545 nsec/call
clock-gettime-monotonic: vdso: 940 nsec/call
It is even better for gettime with monotonic clocks.
Unsupported clocks with ASM VDSO:
clock-gettime-boottime: vdso: 3851 nsec/call
clock-gettime-tai: vdso: 3852 nsec/call
clock-gettime-monotonic-raw: vdso: 3396 nsec/call
Same clocks with C VDSO:
clock-gettime-tai: vdso: 941 nsec/call
clock-gettime-monotonic-raw: vdso: 1001 nsec/call
clock-gettime-monotonic-coarse: vdso: 591 nsec/call
On an 8321E at 333 MHz, vdsotest with the ASM VDSO:
gettimeofday: vdso: 220 nsec/call
clock-getres-realtime-coarse: vdso: 102 nsec/call
clock-gettime-realtime-coarse: vdso: 178 nsec/call
clock-getres-realtime: vdso: 129 nsec/call
clock-gettime-realtime: vdso: 235 nsec/call
clock-getres-monotonic-coarse: vdso: 105 nsec/call
clock-gettime-monotonic-coarse: vdso: 208 nsec/call
clock-getres-monotonic: vdso: 129 nsec/call
clock-gettime-monotonic: vdso: 274 nsec/call
On an 8321E at 333 MHz, vdsotest with the C VDSO:
gettimeofday: vdso: 272 nsec/call
clock-getres-realtime-coarse: vdso: 160 nsec/call
clock-gettime-realtime-coarse: vdso: 184 nsec/call
clock-getres-realtime: vdso: 166 nsec/call
clock-gettime-realtime: vdso: 281 nsec/call
clock-getres-monotonic-coarse: vdso: 160 nsec/call
clock-gettime-monotonic-coarse: vdso: 184 nsec/call
clock-getres-monotonic: vdso: 169 nsec/call
clock-gettime-monotonic: vdso: 275 nsec/call
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
[mpe: Tweak include guards]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/ef30ff8270a96c5c3b981523cd9b4e3d89049d3d.1604426550.git.christophe.leroy@csgroup.eu
---
arch/powerpc/Kconfig | 2 +
arch/powerpc/include/asm/vdso/vsyscall.h | 25 ++
arch/powerpc/include/asm/vdso_datapage.h | 40 +--
arch/powerpc/kernel/asm-offsets.c | 49 +---
arch/powerpc/kernel/time.c | 91 +------
arch/powerpc/kernel/vdso.c | 5 +-
arch/powerpc/kernel/vdso32/Makefile | 26 +-
arch/powerpc/kernel/vdso32/gettimeofday.S | 291 +---------------------
arch/powerpc/kernel/vdso32/vdso32.lds.S | 1 +
arch/powerpc/kernel/vdso64/Makefile | 23 +-
arch/powerpc/kernel/vdso64/gettimeofday.S | 242 +-----------------
arch/powerpc/kernel/vdso64/vdso64.lds.S | 2 +-
12 files changed, 106 insertions(+), 691 deletions(-)
create mode 100644 arch/powerpc/include/asm/vdso/vsyscall.h
v14: unchanged
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7f4995b245a3..aad8532a718e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -176,6 +176,7 @@ config PPC
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
+ select GENERIC_GETTIMEOFDAY
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if PPC_BOOK3S_64 && PPC_RADIX_MMU
select HAVE_ARCH_JUMP_LABEL
@@ -206,6 +207,7 @@ config PPC
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC
+ select HAVE_GENERIC_VDSO
select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
select HAVE_IDE
select HAVE_IOREMAP_PROT
diff --git a/arch/powerpc/include/asm/vdso/vsyscall.h b/arch/powerpc/include/asm/vdso/vsyscall.h
new file mode 100644
index 000000000000..48cf23f1e273
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/vsyscall.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_VDSO_VSYSCALL_H
+#define _ASM_POWERPC_VDSO_VSYSCALL_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/timekeeper_internal.h>
+#include <asm/vdso_datapage.h>
+
+/*
+ * Update the vDSO data page to keep in sync with kernel timekeeping.
+ */
+static __always_inline
+struct vdso_data *__arch_get_k_vdso_data(void)
+{
+ return vdso_data->data;
+}
+#define __arch_get_k_vdso_data __arch_get_k_vdso_data
+
+/* The asm-generic header needs to be included after the definitions above */
+#include <asm-generic/vdso/vsyscall.h>
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_POWERPC_VDSO_VSYSCALL_H */
diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
index b9ef6cf50ea5..c4d320504d26 100644
--- a/arch/powerpc/include/asm/vdso_datapage.h
+++ b/arch/powerpc/include/asm/vdso_datapage.h
@@ -36,6 +36,7 @@
#include <linux/unistd.h>
#include <linux/time.h>
+#include <vdso/datapage.h>
#define SYSCALL_MAP_SIZE ((NR_syscalls + 31) / 32)
@@ -45,7 +46,7 @@
#ifdef CONFIG_PPC64
-struct vdso_data {
+struct vdso_arch_data {
__u8 eye_catcher[16]; /* Eyecatcher: SYSTEMCFG:PPC64 0x00 */
struct { /* Systemcfg version numbers */
__u32 major; /* Major number 0x10 */
@@ -59,13 +60,13 @@ struct vdso_data {
__u32 processor; /* Processor type 0x1C */
__u64 processorCount; /* # of physical processors 0x20 */
__u64 physicalMemorySize; /* Size of real memory(B) 0x28 */
- __u64 tb_orig_stamp; /* Timebase at boot 0x30 */
+ __u64 tb_orig_stamp; /* (NU) Timebase at boot 0x30 */
__u64 tb_ticks_per_sec; /* Timebase tics / sec 0x38 */
- __u64 tb_to_xs; /* Inverse of TB to 2^20 0x40 */
- __u64 stamp_xsec; /* 0x48 */
- __u64 tb_update_count; /* Timebase atomicity ctr 0x50 */
- __u32 tz_minuteswest; /* Minutes west of Greenwich 0x58 */
- __u32 tz_dsttime; /* Type of dst correction 0x5C */
+ __u64 tb_to_xs; /* (NU) Inverse of TB to 2^20 0x40 */
+ __u64 stamp_xsec; /* (NU) 0x48 */
+ __u64 tb_update_count; /* (NU) Timebase atomicity ctr 0x50 */
+ __u32 tz_minuteswest; /* (NU) Min. west of Greenwich 0x58 */
+ __u32 tz_dsttime; /* (NU) Type of dst correction 0x5C */
__u32 dcache_size; /* L1 d-cache size 0x60 */
__u32 dcache_line_size; /* L1 d-cache line size 0x64 */
__u32 icache_size; /* L1 i-cache size 0x68 */
@@ -78,14 +79,10 @@ struct vdso_data {
__u32 icache_block_size; /* L1 i-cache block size */
__u32 dcache_log_block_size; /* L1 d-cache log block size */
__u32 icache_log_block_size; /* L1 i-cache log block size */
- __u32 stamp_sec_fraction; /* fractional seconds of stamp_xtime */
- __s32 wtom_clock_nsec; /* Wall to monotonic clock nsec */
- __s64 wtom_clock_sec; /* Wall to monotonic clock sec */
- __s64 stamp_xtime_sec; /* xtime secs as at tb_orig_stamp */
- __s64 stamp_xtime_nsec; /* xtime nsecs as at tb_orig_stamp */
- __u32 hrtimer_res; /* hrtimer resolution */
__u32 syscall_map_64[SYSCALL_MAP_SIZE]; /* map of syscalls */
__u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */
+
+ struct vdso_data data[CS_BASES];
};
#else /* CONFIG_PPC64 */
@@ -93,26 +90,15 @@ struct vdso_data {
/*
* And here is the simpler 32 bits version
*/
-struct vdso_data {
- __u64 tb_orig_stamp; /* Timebase at boot 0x30 */
+struct vdso_arch_data {
__u64 tb_ticks_per_sec; /* Timebase tics / sec 0x38 */
- __u64 tb_to_xs; /* Inverse of TB to 2^20 0x40 */
- __u64 stamp_xsec; /* 0x48 */
- __u32 tb_update_count; /* Timebase atomicity ctr 0x50 */
- __u32 tz_minuteswest; /* Minutes west of Greenwich 0x58 */
- __u32 tz_dsttime; /* Type of dst correction 0x5C */
- __s32 wtom_clock_sec; /* Wall to monotonic clock */
- __s32 wtom_clock_nsec;
- __s32 stamp_xtime_sec; /* xtime seconds as at tb_orig_stamp */
- __s32 stamp_xtime_nsec; /* xtime nsecs as at tb_orig_stamp */
- __u32 stamp_sec_fraction; /* fractional seconds of stamp_xtime */
- __u32 hrtimer_res; /* hrtimer resolution */
__u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */
+ struct vdso_data data[CS_BASES];
};
#endif /* CONFIG_PPC64 */
-extern struct vdso_data *vdso_data;
+extern struct vdso_arch_data *vdso_data;
#else /* __ASSEMBLY__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index c2722ff36e98..a2dcb8ed79b9 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -398,47 +398,16 @@ int main(void)
#endif /* ! CONFIG_PPC64 */
/* datapage offsets for use by vdso */
- OFFSET(CFG_TB_ORIG_STAMP, vdso_data, tb_orig_stamp);
- OFFSET(CFG_TB_TICKS_PER_SEC, vdso_data, tb_ticks_per_sec);
- OFFSET(CFG_TB_TO_XS, vdso_data, tb_to_xs);
- OFFSET(CFG_TB_UPDATE_COUNT, vdso_data, tb_update_count);
- OFFSET(CFG_TZ_MINUTEWEST, vdso_data, tz_minuteswest);
- OFFSET(CFG_TZ_DSTTIME, vdso_data, tz_dsttime);
- OFFSET(CFG_SYSCALL_MAP32, vdso_data, syscall_map_32);
- OFFSET(WTOM_CLOCK_SEC, vdso_data, wtom_clock_sec);
- OFFSET(WTOM_CLOCK_NSEC, vdso_data, wtom_clock_nsec);
- OFFSET(STAMP_XTIME_SEC, vdso_data, stamp_xtime_sec);
- OFFSET(STAMP_XTIME_NSEC, vdso_data, stamp_xtime_nsec);
- OFFSET(STAMP_SEC_FRAC, vdso_data, stamp_sec_fraction);
- OFFSET(CLOCK_HRTIMER_RES, vdso_data, hrtimer_res);
+ OFFSET(VDSO_DATA_OFFSET, vdso_arch_data, data);
+ OFFSET(CFG_TB_TICKS_PER_SEC, vdso_arch_data, tb_ticks_per_sec);
+ OFFSET(CFG_SYSCALL_MAP32, vdso_arch_data, syscall_map_32);
#ifdef CONFIG_PPC64
- OFFSET(CFG_ICACHE_BLOCKSZ, vdso_data, icache_block_size);
- OFFSET(CFG_DCACHE_BLOCKSZ, vdso_data, dcache_block_size);
- OFFSET(CFG_ICACHE_LOGBLOCKSZ, vdso_data, icache_log_block_size);
- OFFSET(CFG_DCACHE_LOGBLOCKSZ, vdso_data, dcache_log_block_size);
- OFFSET(CFG_SYSCALL_MAP64, vdso_data, syscall_map_64);
- OFFSET(TVAL64_TV_SEC, __kernel_old_timeval, tv_sec);
- OFFSET(TVAL64_TV_USEC, __kernel_old_timeval, tv_usec);
-#endif
- OFFSET(TSPC64_TV_SEC, __kernel_timespec, tv_sec);
- OFFSET(TSPC64_TV_NSEC, __kernel_timespec, tv_nsec);
- OFFSET(TVAL32_TV_SEC, old_timeval32, tv_sec);
- OFFSET(TVAL32_TV_USEC, old_timeval32, tv_usec);
- OFFSET(TSPC32_TV_SEC, old_timespec32, tv_sec);
- OFFSET(TSPC32_TV_NSEC, old_timespec32, tv_nsec);
- /* timeval/timezone offsets for use by vdso */
- OFFSET(TZONE_TZ_MINWEST, timezone, tz_minuteswest);
- OFFSET(TZONE_TZ_DSTTIME, timezone, tz_dsttime);
-
- /* Other bits used by the vdso */
- DEFINE(CLOCK_REALTIME, CLOCK_REALTIME);
- DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
- DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
- DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
- DEFINE(CLOCK_MAX, CLOCK_TAI);
- DEFINE(NSEC_PER_SEC, NSEC_PER_SEC);
- DEFINE(EINVAL, EINVAL);
- DEFINE(KTIME_LOW_RES, KTIME_LOW_RES);
+ OFFSET(CFG_ICACHE_BLOCKSZ, vdso_arch_data, icache_block_size);
+ OFFSET(CFG_DCACHE_BLOCKSZ, vdso_arch_data, dcache_block_size);
+ OFFSET(CFG_ICACHE_LOGBLOCKSZ, vdso_arch_data, icache_log_block_size);
+ OFFSET(CFG_DCACHE_LOGBLOCKSZ, vdso_arch_data, dcache_log_block_size);
+ OFFSET(CFG_SYSCALL_MAP64, vdso_arch_data, syscall_map_64);
+#endif
#ifdef CONFIG_BUG
DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 74efe46f5532..92481463f9dc 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -82,6 +82,7 @@ static struct clocksource clocksource_timebase = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.mask = CLOCKSOURCE_MASK(64),
.read = timebase_read,
+ .vdso_clock_mode = VDSO_CLOCKMODE_ARCHTIMER,
};
#define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
@@ -831,95 +832,6 @@ static notrace u64 timebase_read(struct clocksource *cs)
return (u64)get_tb();
}
-
-void update_vsyscall(struct timekeeper *tk)
-{
- struct timespec64 xt;
- struct clocksource *clock = tk->tkr_mono.clock;
- u32 mult = tk->tkr_mono.mult;
- u32 shift = tk->tkr_mono.shift;
- u64 cycle_last = tk->tkr_mono.cycle_last;
- u64 new_tb_to_xs, new_stamp_xsec;
- u64 frac_sec;
-
- if (clock != &clocksource_timebase)
- return;
-
- xt.tv_sec = tk->xtime_sec;
- xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
-
- /* Make userspace gettimeofday spin until we're done. */
- ++vdso_data->tb_update_count;
- smp_mb();
-
- /*
- * This computes ((2^20 / 1e9) * mult) >> shift as a
- * 0.64 fixed-point fraction.
- * The computation in the else clause below won't overflow
- * (as long as the timebase frequency is >= 1.049 MHz)
- * but loses precision because we lose the low bits of the constant
- * in the shift. Note that 19342813113834067 ~= 2^(20+64) / 1e9.
- * For a shift of 24 the error is about 0.5e-9, or about 0.5ns
- * over a second. (Shift values are usually 22, 23 or 24.)
- * For high frequency clocks such as the 512MHz timebase clock
- * on POWER[6789], the mult value is small (e.g. 32768000)
- * and so we can shift the constant by 16 initially
- * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the
- * remaining shifts after the multiplication, which gives a
- * more accurate result (e.g. with mult = 32768000, shift = 24,
- * the error is only about 1.2e-12, or 0.7ns over 10 minutes).
- */
- if (mult <= 62500000 && clock->shift >= 16)
- new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16);
- else
- new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
-
- /*
- * Compute the fractional second in units of 2^-32 seconds.
- * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift
- * in nanoseconds, so multiplying that by 2^32 / 1e9 gives
- * it in units of 2^-32 seconds.
- * We assume shift <= 32 because clocks_calc_mult_shift()
- * generates shift values in the range 0 - 32.
- */
- frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift);
- do_div(frac_sec, NSEC_PER_SEC);
-
- /*
- * Work out new stamp_xsec value for any legacy users of systemcfg.
- * stamp_xsec is in units of 2^-20 seconds.
- */
- new_stamp_xsec = frac_sec >> 12;
- new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC;
-
- /*
- * tb_update_count is used to allow the userspace gettimeofday code
- * to assure itself that it sees a consistent view of the tb_to_xs and
- * stamp_xsec variables. It reads the tb_update_count, then reads
- * tb_to_xs and stamp_xsec and then reads tb_update_count again. If
- * the two values of tb_update_count match and are even then the
- * tb_to_xs and stamp_xsec values are consistent. If not, then it
- * loops back and reads them again until this criteria is met.
- */
- vdso_data->tb_orig_stamp = cycle_last;
- vdso_data->stamp_xsec = new_stamp_xsec;
- vdso_data->tb_to_xs = new_tb_to_xs;
- vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec;
- vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec;
- vdso_data->stamp_xtime_sec = xt.tv_sec;
- vdso_data->stamp_xtime_nsec = xt.tv_nsec;
- vdso_data->stamp_sec_fraction = frac_sec;
- vdso_data->hrtimer_res = hrtimer_resolution;
- smp_wmb();
- ++(vdso_data->tb_update_count);
-}
-
-void update_vsyscall_tz(void)
-{
- vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
- vdso_data->tz_dsttime = sys_tz.tz_dsttime;
-}
-
static void __init clocksource_init(void)
{
struct clocksource *clock = &clocksource_timebase;
@@ -1079,7 +991,6 @@ void __init time_init(void)
sys_tz.tz_dsttime = 0;
}
- vdso_data->tb_update_count = 0;
vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
/* initialise and enable the large decrementer (if we have one) */
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 8dad44262e75..23208a051af5 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -17,6 +17,7 @@
#include <linux/elf.h>
#include <linux/security.h>
#include <linux/memblock.h>
+#include <vdso/datapage.h>
#include <asm/processor.h>
#include <asm/mmu.h>
@@ -70,10 +71,10 @@ static int vdso_ready;
* with it, it will become dynamically allocated
*/
static union {
- struct vdso_data data;
+ struct vdso_arch_data data;
u8 page[PAGE_SIZE];
} vdso_data_store __page_aligned_data;
-struct vdso_data *vdso_data = &vdso_data_store.data;
+struct vdso_arch_data *vdso_data = &vdso_data_store.data;
/* Format of the patch table */
struct vdso_patch_def
diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
index 73eada6bc8cd..853545a19a1e 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -2,8 +2,20 @@
# List of files in the vdso, has to be asm only for now
+ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN
+include $(srctree)/lib/vdso/Makefile
+
obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o
+ifneq ($(c-gettimeofday-y),)
+ CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y)
+ CFLAGS_vgettimeofday.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+ CFLAGS_vgettimeofday.o += $(call cc-option, -fno-stack-protector)
+ CFLAGS_vgettimeofday.o += -DDISABLE_BRANCH_PROFILING
+ CFLAGS_vgettimeofday.o += -ffreestanding -fasynchronous-unwind-tables
+ CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE)
+endif
+
# Build rules
ifdef CROSS32_COMPILE
@@ -15,6 +27,7 @@ endif
CC32FLAGS :=
ifdef CONFIG_PPC64
CC32FLAGS += -m32
+KBUILD_CFLAGS := $(filter-out -mcmodel=medium,$(KBUILD_CFLAGS))
endif
targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
@@ -23,6 +36,7 @@ obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
GCOV_PROFILE := n
KCOV_INSTRUMENT := n
UBSAN_SANITIZE := n
+KASAN_SANITIZE := n
ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
-Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both
@@ -36,8 +50,8 @@ CPPFLAGS_vdso32.lds += -P -C -Upowerpc
$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
# link rule for the .so file, .lds has to be first
-$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE
- $(call if_changed,vdso32ld)
+$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) $(obj)/vgettimeofday.o FORCE
+ $(call if_changed,vdso32ld_and_check)
# strip rule for the .so file
$(obj)/%.so: OBJCOPYFLAGS := -S
@@ -47,12 +61,16 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
# assembly rules for the .S files
$(obj-vdso32): %.o: %.S FORCE
$(call if_changed_dep,vdso32as)
+$(obj)/vgettimeofday.o: %.o: %.c FORCE
+ $(call if_changed_dep,vdso32cc)
# actual build commands
-quiet_cmd_vdso32ld = VDSO32L $@
- cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
+quiet_cmd_vdso32ld_and_check = VDSO32L $@
+ cmd_vdso32ld_and_check = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) ; $(cmd_vdso_check)
quiet_cmd_vdso32as = VDSO32A $@
cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) -c -o $@ $<
+quiet_cmd_vdso32cc = VDSO32C $@
+ cmd_vdso32cc = $(VDSOCC) $(c_flags) $(CC32FLAGS) -c -o $@ $<
# install commands for the unstripped file
quiet_cmd_vdso_install = INSTALL $@
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S
index e7f8f9f1b3f4..fd7b01c51281 100644
--- a/arch/powerpc/kernel/vdso32/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -12,13 +12,7 @@
#include <asm/vdso_datapage.h>
#include <asm/asm-offsets.h>
#include <asm/unistd.h>
-
-/* Offset for the low 32-bit part of a field of long type */
-#ifdef CONFIG_PPC64
-#define LOPART 4
-#else
-#define LOPART 0
-#endif
+#include <asm/vdso/gettimeofday.h>
.text
/*
@@ -28,32 +22,7 @@
*
*/
V_FUNCTION_BEGIN(__kernel_gettimeofday)
- .cfi_startproc
- mflr r12
- .cfi_register lr,r12
-
- mr. r10,r3 /* r10 saves tv */
- mr r11,r4 /* r11 saves tz */
- get_datapage r9, r0
- beq 3f
- LOAD_REG_IMMEDIATE(r7, 1000000) /* load up USEC_PER_SEC */
- bl __do_get_tspec@local /* get sec/usec from tb & kernel */
- stw r3,TVAL32_TV_SEC(r10)
- stw r4,TVAL32_TV_USEC(r10)
-
-3: cmplwi r11,0 /* check if tz is NULL */
- mtlr r12
- crclr cr0*4+so
- li r3,0
- beqlr
-
- lwz r4,CFG_TZ_MINUTEWEST(r9)/* fill tz */
- lwz r5,CFG_TZ_DSTTIME(r9)
- stw r4,TZONE_TZ_MINWEST(r11)
- stw r5,TZONE_TZ_DSTTIME(r11)
-
- blr
- .cfi_endproc
+ cvdso_call __c_kernel_gettimeofday
V_FUNCTION_END(__kernel_gettimeofday)
/*
@@ -63,127 +32,7 @@ V_FUNCTION_END(__kernel_gettimeofday)
*
*/
V_FUNCTION_BEGIN(__kernel_clock_gettime)
- .cfi_startproc
- /* Check for supported clock IDs */
- cmpli cr0,r3,CLOCK_REALTIME
- cmpli cr1,r3,CLOCK_MONOTONIC
- cror cr0*4+eq,cr0*4+eq,cr1*4+eq
-
- cmpli cr5,r3,CLOCK_REALTIME_COARSE
- cmpli cr6,r3,CLOCK_MONOTONIC_COARSE
- cror cr5*4+eq,cr5*4+eq,cr6*4+eq
-
- cror cr0*4+eq,cr0*4+eq,cr5*4+eq
- bne cr0, .Lgettime_fallback
-
- mflr r12 /* r12 saves lr */
- .cfi_register lr,r12
- mr r11,r4 /* r11 saves tp */
- get_datapage r9, r0
- LOAD_REG_IMMEDIATE(r7, NSEC_PER_SEC) /* load up NSEC_PER_SEC */
- beq cr5, .Lcoarse_clocks
-.Lprecise_clocks:
- bl __do_get_tspec@local /* get sec/nsec from tb & kernel */
- bne cr1, .Lfinish /* not monotonic -> all done */
-
- /*
- * CLOCK_MONOTONIC
- */
-
- /* now we must fixup using wall to monotonic. We need to snapshot
- * that value and do the counter trick again. Fortunately, we still
- * have the counter value in r8 that was returned by __do_get_xsec.
- * At this point, r3,r4 contain our sec/nsec values, r5 and r6
- * can be used, r7 contains NSEC_PER_SEC.
- */
-
- lwz r5,(WTOM_CLOCK_SEC+LOPART)(r9)
- lwz r6,WTOM_CLOCK_NSEC(r9)
-
- /* We now have our offset in r5,r6. We create a fake dependency
- * on that value and re-check the counter
- */
- or r0,r6,r5
- xor r0,r0,r0
- add r9,r9,r0
- lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
- cmpl cr0,r8,r0 /* check if updated */
- bne- .Lprecise_clocks
- b .Lfinish_monotonic
-
- /*
- * For coarse clocks we get data directly from the vdso data page, so
- * we don't need to call __do_get_tspec, but we still need to do the
- * counter trick.
- */
-.Lcoarse_clocks:
- lwz r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
- andi. r0,r8,1 /* pending update ? loop */
- bne- .Lcoarse_clocks
- add r9,r9,r0 /* r0 is already 0 */
-
- /*
- * CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE
- * too
- */
- lwz r3,STAMP_XTIME_SEC+LOPART(r9)
- lwz r4,STAMP_XTIME_NSEC+LOPART(r9)
- bne cr6,1f
-
- /* CLOCK_MONOTONIC_COARSE */
- lwz r5,(WTOM_CLOCK_SEC+LOPART)(r9)
- lwz r6,WTOM_CLOCK_NSEC(r9)
-
- /* check if counter has updated */
- or r0,r6,r5
-1: or r0,r0,r3
- or r0,r0,r4
- xor r0,r0,r0
- add r3,r3,r0
- lwz r0,CFG_TB_UPDATE_COUNT+LOPART(r9)
- cmpl cr0,r0,r8 /* check if updated */
- bne- .Lcoarse_clocks
-
- /* Counter has not updated, so continue calculating proper values for
- * sec and nsec if monotonic coarse, or just return with the proper
- * values for realtime.
- */
- bne cr6, .Lfinish
-
- /* Calculate and store result. Note that this mimics the C code,
- * which may cause funny results if nsec goes negative... is that
- * possible at all ?
- */
-.Lfinish_monotonic:
- add r3,r3,r5
- add r4,r4,r6
- cmpw cr0,r4,r7
- cmpwi cr1,r4,0
- blt 1f
- subf r4,r7,r4
- addi r3,r3,1
-1: bge cr1, .Lfinish
- addi r3,r3,-1
- add r4,r4,r7
-
-.Lfinish:
- stw r3,TSPC32_TV_SEC(r11)
- stw r4,TSPC32_TV_NSEC(r11)
-
- mtlr r12
- crclr cr0*4+so
- li r3,0
- blr
-
- /*
- * syscall fallback
- */
-.Lgettime_fallback:
- li r0,__NR_clock_gettime
- .cfi_restore lr
- sc
- blr
- .cfi_endproc
+ cvdso_call __c_kernel_clock_gettime
V_FUNCTION_END(__kernel_clock_gettime)
@@ -194,37 +43,7 @@ V_FUNCTION_END(__kernel_clock_gettime)
*
*/
V_FUNCTION_BEGIN(__kernel_clock_getres)
- .cfi_startproc
- /* Check for supported clock IDs */
- cmplwi cr0, r3, CLOCK_MAX
- cmpwi cr1, r3, CLOCK_REALTIME_COARSE
- cmpwi cr7, r3, CLOCK_MONOTONIC_COARSE
- bgt cr0, 99f
- LOAD_REG_IMMEDIATE(r5, KTIME_LOW_RES)
- beq cr1, 1f
- beq cr7, 1f
-
- mflr r12
- .cfi_register lr,r12
- get_datapage r3, r0
- lwz r5, CLOCK_HRTIMER_RES(r3)
- mtlr r12
-1: li r3,0
- cmpli cr0,r4,0
- crclr cr0*4+so
- beqlr
- stw r3,TSPC32_TV_SEC(r4)
- stw r5,TSPC32_TV_NSEC(r4)
- blr
-
- /*
- * syscall fallback
- */
-99:
- li r0,__NR_clock_getres
- sc
- blr
- .cfi_endproc
+ cvdso_call __c_kernel_clock_getres
V_FUNCTION_END(__kernel_clock_getres)
@@ -235,105 +54,5 @@ V_FUNCTION_END(__kernel_clock_getres)
*
*/
V_FUNCTION_BEGIN(__kernel_time)
- .cfi_startproc
- mflr r12
- .cfi_register lr,r12
-
- mr r11,r3 /* r11 holds t */
- get_datapage r9, r0
-
- lwz r3,STAMP_XTIME_SEC+LOPART(r9)
-
- cmplwi r11,0 /* check if t is NULL */
- mtlr r12
- crclr cr0*4+so
- beqlr
- stw r3,0(r11) /* store result at *t */
- blr
- .cfi_endproc
+ cvdso_call_time __c_kernel_time
V_FUNCTION_END(__kernel_time)
-
-/*
- * This is the core of clock_gettime() and gettimeofday(),
- * it returns the current time in r3 (seconds) and r4.
- * On entry, r7 gives the resolution of r4, either USEC_PER_SEC
- * or NSEC_PER_SEC, giving r4 in microseconds or nanoseconds.
- * It expects the datapage ptr in r9 and doesn't clobber it.
- * It clobbers r0, r5 and r6.
- * On return, r8 contains the counter value that can be reused.
- * This clobbers cr0 but not any other cr field.
- */
-__do_get_tspec:
- .cfi_startproc
- /* Check for update count & load values. We use the low
- * order 32 bits of the update count
- */
-1: lwz r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
- andi. r0,r8,1 /* pending update ? loop */
- bne- 1b
- xor r0,r8,r8 /* create dependency */
- add r9,r9,r0
-
- /* Load orig stamp (offset to TB) */
- lwz r5,CFG_TB_ORIG_STAMP(r9)
- lwz r6,(CFG_TB_ORIG_STAMP+4)(r9)
-
- /* Get a stable TB value */
-2: MFTBU(r3)
- MFTBL(r4)
- MFTBU(r0)
- cmplw cr0,r3,r0
- bne- 2b
-
- /* Subtract tb orig stamp and shift left 12 bits.
- */
- subfc r4,r6,r4
- subfe r0,r5,r3
- slwi r0,r0,12
- rlwimi. r0,r4,12,20,31
- slwi r4,r4,12
-
- /*
- * Load scale factor & do multiplication.
- * We only use the high 32 bits of the tb_to_xs value.
- * Even with a 1GHz timebase clock, the high 32 bits of
- * tb_to_xs will be at least 4 million, so the error from
- * ignoring the low 32 bits will be no more than 0.25ppm.
- * The error will just make the clock run very very slightly
- * slow until the next time the kernel updates the VDSO data,
- * at which point the clock will catch up to the kernel's value,
- * so there is no long-term error accumulation.
- */
- lwz r5,CFG_TB_TO_XS(r9) /* load values */
- mulhwu r4,r4,r5
- li r3,0
-
- beq+ 4f /* skip high part computation if 0 */
- mulhwu r3,r0,r5
- mullw r5,r0,r5
- addc r4,r4,r5
- addze r3,r3
-4:
- /* At this point, we have seconds since the xtime stamp
- * as a 32.32 fixed-point number in r3 and r4.
- * Load & add the xtime stamp.
- */
- lwz r5,STAMP_XTIME_SEC+LOPART(r9)
- lwz r6,STAMP_SEC_FRAC(r9)
- addc r4,r4,r6
- adde r3,r3,r5
-
- /* We create a fake dependency on the result in r3/r4
- * and re-check the counter
- */
- or r6,r4,r3
- xor r0,r6,r6
- add r9,r9,r0
- lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
- cmplw cr0,r8,r0 /* check if updated */
- bne- 1b
-
- mulhwu r4,r4,r7 /* convert to micro or nanoseconds */
-
- blr
- .cfi_endproc
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
index 7eadac74c7f9..51e9b3f3f88a 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -111,6 +111,7 @@ SECTIONS
*(.note.GNU-stack)
*(.data .data.* .gnu.linkonce.d.* .sdata*)
*(.bss .sbss .dynbss .dynsbss)
+ *(.got1)
}
}
diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
index dfd34f68bfa1..4a8c5e4d25c0 100644
--- a/arch/powerpc/kernel/vdso64/Makefile
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -1,8 +1,20 @@
# SPDX-License-Identifier: GPL-2.0
# List of files in the vdso, has to be asm only for now
+ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN
+include $(srctree)/lib/vdso/Makefile
+
obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o
+ifneq ($(c-gettimeofday-y),)
+ CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y)
+ CFLAGS_vgettimeofday.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+ CFLAGS_vgettimeofday.o += $(call cc-option, -fno-stack-protector)
+ CFLAGS_vgettimeofday.o += -DDISABLE_BRANCH_PROFILING
+ CFLAGS_vgettimeofday.o += -ffreestanding -fasynchronous-unwind-tables
+ CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE)
+endif
+
# Build rules
targets := $(obj-vdso64) vdso64.so vdso64.so.dbg
@@ -11,6 +23,7 @@ obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64))
GCOV_PROFILE := n
KCOV_INSTRUMENT := n
UBSAN_SANITIZE := n
+KASAN_SANITIZE := n
ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
-Wl,-soname=linux-vdso64.so.1 -Wl,--hash-style=both
@@ -20,12 +33,14 @@ obj-y += vdso64_wrapper.o
targets += vdso64.lds
CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
+$(obj)/vgettimeofday.o: %.o: %.c FORCE
+
# Force dependency (incbin is bad)
$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so
# link rule for the .so file, .lds has to be first
-$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE
- $(call if_changed,vdso64ld)
+$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj)/vgettimeofday.o FORCE
+ $(call if_changed,vdso64ld_and_check)
# strip rule for the .so file
$(obj)/%.so: OBJCOPYFLAGS := -S
@@ -33,8 +48,8 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
# actual build commands
-quiet_cmd_vdso64ld = VDSO64L $@
- cmd_vdso64ld = $(CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
+quiet_cmd_vdso64ld_and_check = VDSO64L $@
+ cmd_vdso64ld_and_check = $(CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^); $(cmd_vdso_check)
# install commands for the unstripped file
quiet_cmd_vdso_install = INSTALL $@
diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
index 20f8be40c653..d7a7bfb51081 100644
--- a/arch/powerpc/kernel/vdso64/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
@@ -12,6 +12,7 @@
#include <asm/vdso_datapage.h>
#include <asm/asm-offsets.h>
#include <asm/unistd.h>
+#include <asm/vdso/gettimeofday.h>
.text
/*
@@ -21,31 +22,7 @@
*
*/
V_FUNCTION_BEGIN(__kernel_gettimeofday)
- .cfi_startproc
- mflr r12
- .cfi_register lr,r12
-
- mr r11,r3 /* r11 holds tv */
- mr r10,r4 /* r10 holds tz */
- get_datapage r3, r0
- cmpldi r11,0 /* check if tv is NULL */
- beq 2f
- lis r7,1000000@ha /* load up USEC_PER_SEC */
- addi r7,r7,1000000@l
- bl V_LOCAL_FUNC(__do_get_tspec) /* get sec/us from tb & kernel */
- std r4,TVAL64_TV_SEC(r11) /* store sec in tv */
- std r5,TVAL64_TV_USEC(r11) /* store usec in tv */
-2: cmpldi r10,0 /* check if tz is NULL */
- beq 1f
- lwz r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */
- lwz r5,CFG_TZ_DSTTIME(r3)
- stw r4,TZONE_TZ_MINWEST(r10)
- stw r5,TZONE_TZ_DSTTIME(r10)
-1: mtlr r12
- crclr cr0*4+so
- li r3,0 /* always success */
- blr
- .cfi_endproc
+ cvdso_call __c_kernel_gettimeofday
V_FUNCTION_END(__kernel_gettimeofday)
@@ -56,120 +33,7 @@ V_FUNCTION_END(__kernel_gettimeofday)
*
*/
V_FUNCTION_BEGIN(__kernel_clock_gettime)
- .cfi_startproc
- /* Check for supported clock IDs */
- cmpwi cr0,r3,CLOCK_REALTIME
- cmpwi cr1,r3,CLOCK_MONOTONIC
- cror cr0*4+eq,cr0*4+eq,cr1*4+eq
-
- cmpwi cr5,r3,CLOCK_REALTIME_COARSE
- cmpwi cr6,r3,CLOCK_MONOTONIC_COARSE
- cror cr5*4+eq,cr5*4+eq,cr6*4+eq
-
- cror cr0*4+eq,cr0*4+eq,cr5*4+eq
- bne cr0,99f
-
- mflr r12 /* r12 saves lr */
- .cfi_register lr,r12
- mr r11,r4 /* r11 saves tp */
- get_datapage r3, r0
- lis r7,NSEC_PER_SEC@h /* want nanoseconds */
- ori r7,r7,NSEC_PER_SEC@l
- beq cr5,70f
-50: bl V_LOCAL_FUNC(__do_get_tspec) /* get time from tb & kernel */
- bne cr1,80f /* if not monotonic, all done */
-
- /*
- * CLOCK_MONOTONIC
- */
-
- /* now we must fixup using wall to monotonic. We need to snapshot
- * that value and do the counter trick again. Fortunately, we still
- * have the counter value in r8 that was returned by __do_get_tspec.
- * At this point, r4,r5 contain our sec/nsec values.
- */
-
- ld r6,WTOM_CLOCK_SEC(r3)
- lwa r9,WTOM_CLOCK_NSEC(r3)
-
- /* We now have our result in r6,r9. We create a fake dependency
- * on that result and re-check the counter
- */
- or r0,r6,r9
- xor r0,r0,r0
- add r3,r3,r0
- ld r0,CFG_TB_UPDATE_COUNT(r3)
- cmpld cr0,r0,r8 /* check if updated */
- bne- 50b
- b 78f
-
- /*
- * For coarse clocks we get data directly from the vdso data page, so
- * we don't need to call __do_get_tspec, but we still need to do the
- * counter trick.
- */
-70: ld r8,CFG_TB_UPDATE_COUNT(r3)
- andi. r0,r8,1 /* pending update ? loop */
- bne- 70b
- add r3,r3,r0 /* r0 is already 0 */
-
- /*
- * CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE
- * too
- */
- ld r4,STAMP_XTIME_SEC(r3)
- ld r5,STAMP_XTIME_NSEC(r3)
- bne cr6,75f
-
- /* CLOCK_MONOTONIC_COARSE */
- ld r6,WTOM_CLOCK_SEC(r3)
- lwa r9,WTOM_CLOCK_NSEC(r3)
-
- /* check if counter has updated */
- or r0,r6,r9
-75: or r0,r0,r4
- or r0,r0,r5
- xor r0,r0,r0
- add r3,r3,r0
- ld r0,CFG_TB_UPDATE_COUNT(r3)
- cmpld cr0,r0,r8 /* check if updated */
- bne- 70b
-
- /* Counter has not updated, so continue calculating proper values for
- * sec and nsec if monotonic coarse, or just return with the proper
- * values for realtime.
- */
- bne cr6,80f
-
- /* Add wall->monotonic offset and check for overflow or underflow */
-78: add r4,r4,r6
- add r5,r5,r9
- cmpd cr0,r5,r7
- cmpdi cr1,r5,0
- blt 79f
- subf r5,r7,r5
- addi r4,r4,1
-79: bge cr1,80f
- addi r4,r4,-1
- add r5,r5,r7
-
-80: std r4,TSPC64_TV_SEC(r11)
- std r5,TSPC64_TV_NSEC(r11)
-
- mtlr r12
- crclr cr0*4+so
- li r3,0
- blr
-
- /*
- * syscall fallback
- */
-99:
- li r0,__NR_clock_gettime
- .cfi_restore lr
- sc
- blr
- .cfi_endproc
+ cvdso_call __c_kernel_clock_gettime
V_FUNCTION_END(__kernel_clock_gettime)
@@ -180,34 +44,7 @@ V_FUNCTION_END(__kernel_clock_gettime)
*
*/
V_FUNCTION_BEGIN(__kernel_clock_getres)
- .cfi_startproc
- /* Check for supported clock IDs */
- cmpwi cr0,r3,CLOCK_REALTIME
- cmpwi cr1,r3,CLOCK_MONOTONIC
- cror cr0*4+eq,cr0*4+eq,cr1*4+eq
- bne cr0,99f
-
- mflr r12
- .cfi_register lr,r12
- get_datapage r3, r0
- lwz r5, CLOCK_HRTIMER_RES(r3)
- mtlr r12
- li r3,0
- cmpldi cr0,r4,0
- crclr cr0*4+so
- beqlr
- std r3,TSPC64_TV_SEC(r4)
- std r5,TSPC64_TV_NSEC(r4)
- blr
-
- /*
- * syscall fallback
- */
-99:
- li r0,__NR_clock_getres
- sc
- blr
- .cfi_endproc
+ cvdso_call __c_kernel_clock_getres
V_FUNCTION_END(__kernel_clock_getres)
/*
@@ -217,74 +54,5 @@ V_FUNCTION_END(__kernel_clock_getres)
*
*/
V_FUNCTION_BEGIN(__kernel_time)
- .cfi_startproc
- mflr r12
- .cfi_register lr,r12
-
- mr r11,r3 /* r11 holds t */
- get_datapage r3, r0
-
- ld r4,STAMP_XTIME_SEC(r3)
-
- cmpldi r11,0 /* check if t is NULL */
- beq 2f
- std r4,0(r11) /* store result at *t */
-2: mtlr r12
- crclr cr0*4+so
- mr r3,r4
- blr
- .cfi_endproc
+ cvdso_call_time __c_kernel_time
V_FUNCTION_END(__kernel_time)
-
-
-/*
- * This is the core of clock_gettime() and gettimeofday(),
- * it returns the current time in r4 (seconds) and r5.
- * On entry, r7 gives the resolution of r5, either USEC_PER_SEC
- * or NSEC_PER_SEC, giving r5 in microseconds or nanoseconds.
- * It expects the datapage ptr in r3 and doesn't clobber it.
- * It clobbers r0, r6 and r9.
- * On return, r8 contains the counter value that can be reused.
- * This clobbers cr0 but not any other cr field.
- */
-V_FUNCTION_BEGIN(__do_get_tspec)
- .cfi_startproc
- /* check for update count & load values */
-1: ld r8,CFG_TB_UPDATE_COUNT(r3)
- andi. r0,r8,1 /* pending update ? loop */
- bne- 1b
- xor r0,r8,r8 /* create dependency */
- add r3,r3,r0
-
- /* Get TB & offset it. We use the MFTB macro which will generate
- * workaround code for Cell.
- */
- MFTB(r6)
- ld r9,CFG_TB_ORIG_STAMP(r3)
- subf r6,r9,r6
-
- /* Scale result */
- ld r5,CFG_TB_TO_XS(r3)
- sldi r6,r6,12 /* compute time since stamp_xtime */
- mulhdu r6,r6,r5 /* in units of 2^-32 seconds */
-
- /* Add stamp since epoch */
- ld r4,STAMP_XTIME_SEC(r3)
- lwz r5,STAMP_SEC_FRAC(r3)
- or r0,r4,r5
- or r0,r0,r6
- xor r0,r0,r0
- add r3,r3,r0
- ld r0,CFG_TB_UPDATE_COUNT(r3)
- cmpld r0,r8 /* check if updated */
- bne- 1b /* reload if so */
-
- /* convert to seconds & nanoseconds and add to stamp */
- add r6,r6,r5 /* add on fractional seconds of xtime */
- mulhwu r5,r6,r7 /* compute micro or nanoseconds and */
- srdi r6,r6,32 /* seconds since stamp_xtime */
- clrldi r5,r5,32
- add r4,r4,r6
- blr
- .cfi_endproc
-V_FUNCTION_END(__do_get_tspec)
diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S
index 256fb9720298..71be083b24ed 100644
--- a/arch/powerpc/kernel/vdso64/vdso64.lds.S
+++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S
@@ -61,7 +61,6 @@ SECTIONS
.gcc_except_table : { *(.gcc_except_table) }
.rela.dyn ALIGN(8) : { *(.rela.dyn) }
- .opd ALIGN(8) : { KEEP (*(.opd)) }
.got ALIGN(8) : { *(.got .toc) }
_end = .;
@@ -111,6 +110,7 @@ SECTIONS
*(.branch_lt)
*(.data .data.* .gnu.linkonce.d.* .sdata*)
*(.bss .sbss .dynbss .dynsbss)
+ *(.opd)
}
}
--
2.25.1
^ permalink raw reply related
* [PATCH v14 7/9] powerpc/vdso: Save and restore TOC pointer on PPC64
From: Michael Ellerman @ 2020-11-26 13:10 UTC (permalink / raw)
To: linuxppc-dev
In-Reply-To: <20201126131006.2431205-1-mpe@ellerman.id.au>
From: Christophe Leroy <christophe.leroy@csgroup.eu>
On PPC64, the TOC pointer needs to be saved and restored.
Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/bf21fd9762a124ce3afec34a8e403a44dfdf2f84.1604426550.git.christophe.leroy@csgroup.eu
---
arch/powerpc/include/asm/vdso/gettimeofday.h | 12 ++++++++++++
1 file changed, 12 insertions(+)
v14: unchanged
diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h
index 43dd1dc47c37..6f56a6bce615 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -19,10 +19,16 @@
.cfi_register lr, r0
PPC_STLU r1, -PPC_MIN_STKFRM(r1)
PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1)
+#ifdef __powerpc64__
+ PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1)
+#endif
get_datapage r5, r0
addi r5, r5, VDSO_DATA_OFFSET
bl DOTSYM(\funct)
PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1)
+#ifdef __powerpc64__
+ PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1)
+#endif
cmpwi r3, 0
mtlr r0
.cfi_restore lr
@@ -42,10 +48,16 @@
.cfi_register lr, r0
PPC_STLU r1, -PPC_MIN_STKFRM(r1)
PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1)
+#ifdef __powerpc64__
+ PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1)
+#endif
get_datapage r4, r0
addi r4, r4, VDSO_DATA_OFFSET
bl DOTSYM(\funct)
PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1)
+#ifdef __powerpc64__
+ PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1)
+#endif
crclr so
mtlr r0
.cfi_restore lr
--
2.25.1
^ permalink raw reply related
* [PATCH v14 6/9] powerpc/vdso: Prepare for switching VDSO to generic C implementation.
From: Michael Ellerman @ 2020-11-26 13:10 UTC (permalink / raw)
To: linuxppc-dev
In-Reply-To: <20201126131006.2431205-1-mpe@ellerman.id.au>
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Prepare for switching VDSO to generic C implementation in following
patch. Here, we:
- Prepare the helpers to call the C VDSO functions
- Prepare the required callbacks for the C VDSO functions
- Prepare the clocksource.h files to define VDSO_ARCH_CLOCKMODES
- Add the C trampolines to the generic C VDSO functions
powerpc is a bit special for VDSO as well as system calls in the
way that it requires setting CR SO bit which cannot be done in C.
Therefore, entry/exit needs to be performed in ASM.
Implementing __arch_get_vdso_data() would clobber the link register,
requiring the caller to save it. As the ASM calling function already
has to set a stack frame and saves the link register before calling
the C vdso function, retriving the vdso data pointer there is lighter.
Implement __arch_vdso_capable() and always return true.
Provide vdso_shift_ns(), as the generic x >> s gives the following
bad result:
18: 35 25 ff e0 addic. r9,r5,-32
1c: 41 80 00 10 blt 2c <shift+0x14>
20: 7c 64 4c 30 srw r4,r3,r9
24: 38 60 00 00 li r3,0
...
2c: 54 69 08 3c rlwinm r9,r3,1,0,30
30: 21 45 00 1f subfic r10,r5,31
34: 7c 84 2c 30 srw r4,r4,r5
38: 7d 29 50 30 slw r9,r9,r10
3c: 7c 63 2c 30 srw r3,r3,r5
40: 7d 24 23 78 or r4,r9,r4
In our case the shift is always <= 32. In addition, the upper 32 bits
of the result are likely nul. Lets GCC know it, it also optimises the
following calculations.
With the patch, we get:
0: 21 25 00 20 subfic r9,r5,32
4: 7c 69 48 30 slw r9,r3,r9
8: 7c 84 2c 30 srw r4,r4,r5
c: 7d 24 23 78 or r4,r9,r4
10: 7c 63 2c 30 srw r3,r3,r5
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
[mpe: Tweak include guards]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/b734c0bd0b65a538e8d2ab2367a46184d097fc55.1604426550.git.christophe.leroy@csgroup.eu
---
arch/powerpc/include/asm/clocksource.h | 7 +
arch/powerpc/include/asm/ppc_asm.h | 2 +
arch/powerpc/include/asm/vdso/clocksource.h | 7 +
arch/powerpc/include/asm/vdso/gettimeofday.h | 187 +++++++++++++++++++
arch/powerpc/kernel/vdso32/vgettimeofday.c | 28 +++
arch/powerpc/kernel/vdso64/vgettimeofday.c | 29 +++
6 files changed, 260 insertions(+)
create mode 100644 arch/powerpc/include/asm/clocksource.h
create mode 100644 arch/powerpc/include/asm/vdso/clocksource.h
create mode 100644 arch/powerpc/include/asm/vdso/gettimeofday.h
create mode 100644 arch/powerpc/kernel/vdso32/vgettimeofday.c
create mode 100644 arch/powerpc/kernel/vdso64/vgettimeofday.c
v14: mpe: Tweak include guards
diff --git a/arch/powerpc/include/asm/clocksource.h b/arch/powerpc/include/asm/clocksource.h
new file mode 100644
index 000000000000..0a26ef13a34a
--- /dev/null
+++ b/arch/powerpc/include/asm/clocksource.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_CLOCKSOURCE_H
+#define _ASM_POWERPC_CLOCKSOURCE_H
+
+#include <asm/vdso/clocksource.h>
+
+#endif /* _ASM_POWERPC_CLOCKSOURCE_H */
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 101986d4a29d..cfa814824285 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -251,6 +251,8 @@ GLUE(.,name):
#define _GLOBAL_TOC(name) _GLOBAL(name)
+#define DOTSYM(a) a
+
#endif
/*
diff --git a/arch/powerpc/include/asm/vdso/clocksource.h b/arch/powerpc/include/asm/vdso/clocksource.h
new file mode 100644
index 000000000000..c1ba56b82ee5
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/clocksource.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_VDSO_CLOCKSOURCE_H
+#define _ASM_POWERPC_VDSO_CLOCKSOURCE_H
+
+#define VDSO_ARCH_CLOCKMODES VDSO_CLOCKMODE_ARCHTIMER
+
+#endif
diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h
new file mode 100644
index 000000000000..43dd1dc47c37
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_VDSO_GETTIMEOFDAY_H
+#define _ASM_POWERPC_VDSO_GETTIMEOFDAY_H
+
+#ifdef __ASSEMBLY__
+
+#include <asm/ppc_asm.h>
+
+/*
+ * The macros sets two stack frames, one for the caller and one for the callee
+ * because there are no requirement for the caller to set a stack frame when
+ * calling VDSO so it may have omitted to set one, especially on PPC64
+ */
+
+.macro cvdso_call funct
+ .cfi_startproc
+ PPC_STLU r1, -PPC_MIN_STKFRM(r1)
+ mflr r0
+ .cfi_register lr, r0
+ PPC_STLU r1, -PPC_MIN_STKFRM(r1)
+ PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1)
+ get_datapage r5, r0
+ addi r5, r5, VDSO_DATA_OFFSET
+ bl DOTSYM(\funct)
+ PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1)
+ cmpwi r3, 0
+ mtlr r0
+ .cfi_restore lr
+ addi r1, r1, 2 * PPC_MIN_STKFRM
+ crclr so
+ beqlr+
+ crset so
+ neg r3, r3
+ blr
+ .cfi_endproc
+.endm
+
+.macro cvdso_call_time funct
+ .cfi_startproc
+ PPC_STLU r1, -PPC_MIN_STKFRM(r1)
+ mflr r0
+ .cfi_register lr, r0
+ PPC_STLU r1, -PPC_MIN_STKFRM(r1)
+ PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1)
+ get_datapage r4, r0
+ addi r4, r4, VDSO_DATA_OFFSET
+ bl DOTSYM(\funct)
+ PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1)
+ crclr so
+ mtlr r0
+ .cfi_restore lr
+ addi r1, r1, 2 * PPC_MIN_STKFRM
+ blr
+ .cfi_endproc
+.endm
+
+#else
+
+#include <asm/vdso/timebase.h>
+#include <asm/barrier.h>
+#include <asm/unistd.h>
+#include <uapi/linux/time.h>
+
+#define VDSO_HAS_CLOCK_GETRES 1
+
+#define VDSO_HAS_TIME 1
+
+static __always_inline int do_syscall_2(const unsigned long _r0, const unsigned long _r3,
+ const unsigned long _r4)
+{
+ register long r0 asm("r0") = _r0;
+ register unsigned long r3 asm("r3") = _r3;
+ register unsigned long r4 asm("r4") = _r4;
+ register int ret asm ("r3");
+
+ asm volatile(
+ " sc\n"
+ " bns+ 1f\n"
+ " neg %0, %0\n"
+ "1:\n"
+ : "=r" (ret), "+r" (r4), "+r" (r0)
+ : "r" (r3)
+ : "memory", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cr0", "ctr");
+
+ return ret;
+}
+
+static __always_inline
+int gettimeofday_fallback(struct __kernel_old_timeval *_tv, struct timezone *_tz)
+{
+ return do_syscall_2(__NR_gettimeofday, (unsigned long)_tv, (unsigned long)_tz);
+}
+
+static __always_inline
+int clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ return do_syscall_2(__NR_clock_gettime, _clkid, (unsigned long)_ts);
+}
+
+static __always_inline
+int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ return do_syscall_2(__NR_clock_getres, _clkid, (unsigned long)_ts);
+}
+
+#ifdef CONFIG_VDSO32
+
+#define BUILD_VDSO32 1
+
+static __always_inline
+int clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
+{
+ return do_syscall_2(__NR_clock_gettime, _clkid, (unsigned long)_ts);
+}
+
+static __always_inline
+int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
+{
+ return do_syscall_2(__NR_clock_getres, _clkid, (unsigned long)_ts);
+}
+#endif
+
+static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
+ const struct vdso_data *vd)
+{
+ return get_tb();
+}
+
+const struct vdso_data *__arch_get_vdso_data(void);
+
+static inline bool vdso_clocksource_ok(const struct vdso_data *vd)
+{
+ return true;
+}
+#define vdso_clocksource_ok vdso_clocksource_ok
+
+/*
+ * powerpc specific delta calculation.
+ *
+ * This variant removes the masking of the subtraction because the
+ * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
+ * which would result in a pointless operation. The compiler cannot
+ * optimize it away as the mask comes from the vdso data and is not compile
+ * time constant.
+ */
+static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+{
+ return (cycles - last) * mult;
+}
+#define vdso_calc_delta vdso_calc_delta
+
+#ifndef __powerpc64__
+static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
+{
+ u32 hi = ns >> 32;
+ u32 lo = ns;
+
+ lo >>= shift;
+ lo |= hi << (32 - shift);
+ hi >>= shift;
+
+ if (likely(hi == 0))
+ return lo;
+
+ return ((u64)hi << 32) | lo;
+}
+#define vdso_shift_ns vdso_shift_ns
+#endif
+
+#ifdef __powerpc64__
+int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts,
+ const struct vdso_data *vd);
+int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res,
+ const struct vdso_data *vd);
+#else
+int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts,
+ const struct vdso_data *vd);
+int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res,
+ const struct vdso_data *vd);
+#endif
+int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz,
+ const struct vdso_data *vd);
+__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time,
+ const struct vdso_data *vd);
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_POWERPC_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/powerpc/kernel/vdso32/vgettimeofday.c b/arch/powerpc/kernel/vdso32/vgettimeofday.c
new file mode 100644
index 000000000000..0d4bc217529e
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/vgettimeofday.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Powerpc userspace implementations of gettimeofday() and similar.
+ */
+#include <linux/types.h>
+
+int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts,
+ const struct vdso_data *vd)
+{
+ return __cvdso_clock_gettime32_data(vd, clock, ts);
+}
+
+int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz,
+ const struct vdso_data *vd)
+{
+ return __cvdso_gettimeofday_data(vd, tv, tz);
+}
+
+int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res,
+ const struct vdso_data *vd)
+{
+ return __cvdso_clock_getres_time32_data(vd, clock_id, res);
+}
+
+__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_data *vd)
+{
+ return __cvdso_time_data(vd, time);
+}
diff --git a/arch/powerpc/kernel/vdso64/vgettimeofday.c b/arch/powerpc/kernel/vdso64/vgettimeofday.c
new file mode 100644
index 000000000000..5b5500058344
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/vgettimeofday.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Powerpc userspace implementations of gettimeofday() and similar.
+ */
+#include <linux/time.h>
+#include <linux/types.h>
+
+int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts,
+ const struct vdso_data *vd)
+{
+ return __cvdso_clock_gettime_data(vd, clock, ts);
+}
+
+int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz,
+ const struct vdso_data *vd)
+{
+ return __cvdso_gettimeofday_data(vd, tv, tz);
+}
+
+int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res,
+ const struct vdso_data *vd)
+{
+ return __cvdso_clock_getres_data(vd, clock_id, res);
+}
+
+__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_data *vd)
+{
+ return __cvdso_time_data(vd, time);
+}
--
2.25.1
^ permalink raw reply related
* [PATCH v14 5/9] powerpc/barrier: Use CONFIG_PPC64 for barrier selection
From: Michael Ellerman @ 2020-11-26 13:10 UTC (permalink / raw)
To: linuxppc-dev
In-Reply-To: <20201126131006.2431205-1-mpe@ellerman.id.au>
Currently we use ifdef __powerpc64__ in barrier.h to decide if we
should use lwsync or eieio for SMPWMB which is then used by
__smp_wmb().
That means when we are building the compat VDSO we will use eieio,
because it's 32-bit code, even though we're building a 64-bit kernel
for a 64-bit CPU.
Although eieio should work, it would be cleaner if we always used the
same barrier, even for the 32-bit VDSO.
So change the ifdef to CONFIG_PPC64, so that the selection is made
based on the bitness of the kernel we're building for, not the current
compilation unit.
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
arch/powerpc/include/asm/barrier.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
v14: new
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index f53c42380832..aecfde829d5d 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -40,7 +40,7 @@
#define wmb() __asm__ __volatile__ ("sync" : : : "memory")
/* The sub-arch has lwsync */
-#if defined(__powerpc64__) || defined(CONFIG_PPC_E500MC)
+#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
# define SMPWMB LWSYNC
#else
# define SMPWMB eieio
--
2.25.1
^ permalink raw reply related
* [PATCH v14 4/9] powerpc/time: Fix mftb()/get_tb() for use with the compat VDSO
From: Michael Ellerman @ 2020-11-26 13:10 UTC (permalink / raw)
To: linuxppc-dev
In-Reply-To: <20201126131006.2431205-1-mpe@ellerman.id.au>
When we're building the compat VDSO we are building 32-bit code but in
the context of a 64-bit kernel configuration.
To make this work we need to be careful in some places when using
ifdefs to differentiate between CONFIG_PPC64 and __powerpc64__.
CONFIG_PPC64 indicates the kernel we're building is 64-bit, but it
doesn't tell us that we're currently building 64-bit code - we could
be building 32-bit code for the compat VDSO.
On the other hand __powerpc64__ tells us that we are currently
building 64-bit code (and therefore we must also be building a 64-bit
kernel).
In the case of get_tb() we want to use the 32-bit code sequence
regardless of whether the kernel we're building for is 64-bit or
32-bit, what matters is the word size of the current object. So we
need to check __powerpc64__ to decide if we use mftb() or the
mftbu()/mftb() sequence.
For mftb() the logic for CPU_FTR_CELL_TB_BUG only makes sense if we're
building 64-bit code, so guard that with a __powerpc64__ check.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
arch/powerpc/include/asm/vdso/timebase.h | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
v14: new
diff --git a/arch/powerpc/include/asm/vdso/timebase.h b/arch/powerpc/include/asm/vdso/timebase.h
index ac6769b348c6..b558b07959ce 100644
--- a/arch/powerpc/include/asm/vdso/timebase.h
+++ b/arch/powerpc/include/asm/vdso/timebase.h
@@ -8,7 +8,11 @@
#include <asm/reg.h>
-#if defined(CONFIG_PPC_CELL) || defined(CONFIG_E500)
+/*
+ * We use __powerpc64__ here because we want the compat VDSO to use the 32-bit
+ * version below in the else case of the ifdef.
+ */
+#if defined(__powerpc64__) && (defined(CONFIG_PPC_CELL) || defined(CONFIG_E500))
#define mftb() ({unsigned long rval; \
asm volatile( \
"90: mfspr %0, %2;\n" \
@@ -49,7 +53,11 @@ static inline u64 get_tb(void)
{
unsigned int tbhi, tblo, tbhi2;
- if (IS_ENABLED(CONFIG_PPC64))
+ /*
+ * We use __powerpc64__ here not CONFIG_PPC64 because we want the compat
+ * VDSO to use the 32-bit compatible version in the while loop below.
+ */
+ if (__is_defined(__powerpc64__))
return mftb();
do {
--
2.25.1
^ permalink raw reply related
* [PATCH v14 3/9] powerpc/time: Move timebase functions into new asm/vdso/timebase.h
From: Michael Ellerman @ 2020-11-26 13:10 UTC (permalink / raw)
To: linuxppc-dev
In-Reply-To: <20201126131006.2431205-1-mpe@ellerman.id.au>
From: Christophe Leroy <christophe.leroy@csgroup.eu>
In order to easily use get_tb() from C VDSO, move timebase
functions into a new header named asm/vdso/timebase.h
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
[mpe: Move into asm/vdso, drop dubious historical authorship comments,
move mftb() etc. also]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/2d03f4b466156c0a0bfe5494c8874dcac952445c.1604426550.git.christophe.leroy@csgroup.eu
---
arch/powerpc/include/asm/reg.h | 31 -----------
arch/powerpc/include/asm/time.h | 30 +---------
arch/powerpc/include/asm/timex.h | 2 +-
arch/powerpc/include/asm/vdso/timebase.h | 71 ++++++++++++++++++++++++
4 files changed, 73 insertions(+), 61 deletions(-)
create mode 100644 arch/powerpc/include/asm/vdso/timebase.h
v14: mpe: Move into asm/vdso, drop dubious historical authorship comments, move mftb() etc. also.
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index f877a576b338..602236e223c4 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1419,37 +1419,6 @@ static inline void msr_check_and_clear(unsigned long bits)
__msr_check_and_clear(bits);
}
-#if defined(CONFIG_PPC_CELL) || defined(CONFIG_E500)
-#define mftb() ({unsigned long rval; \
- asm volatile( \
- "90: mfspr %0, %2;\n" \
- ASM_FTR_IFSET( \
- "97: cmpwi %0,0;\n" \
- " beq- 90b;\n", "", %1) \
- : "=r" (rval) \
- : "i" (CPU_FTR_CELL_TB_BUG), "i" (SPRN_TBRL) : "cr0"); \
- rval;})
-#elif defined(CONFIG_PPC_8xx)
-#define mftb() ({unsigned long rval; \
- asm volatile("mftbl %0" : "=r" (rval)); rval;})
-#else
-#define mftb() ({unsigned long rval; \
- asm volatile("mfspr %0, %1" : \
- "=r" (rval) : "i" (SPRN_TBRL)); rval;})
-#endif /* !CONFIG_PPC_CELL */
-
-#if defined(CONFIG_PPC_8xx)
-#define mftbu() ({unsigned long rval; \
- asm volatile("mftbu %0" : "=r" (rval)); rval;})
-#else
-#define mftbu() ({unsigned long rval; \
- asm volatile("mfspr %0, %1" : "=r" (rval) : \
- "i" (SPRN_TBRU)); rval;})
-#endif
-
-#define mttbl(v) asm volatile("mttbl %0":: "r"(v))
-#define mttbu(v) asm volatile("mttbu %0":: "r"(v))
-
#ifdef CONFIG_PPC32
#define mfsrin(v) ({unsigned int rval; \
asm volatile("mfsrin %0,%1" : "=r" (rval) : "r" (v)); \
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 2f566c1a754c..a59f8030f020 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -15,6 +15,7 @@
#include <asm/processor.h>
#include <asm/cpu_has_feature.h>
+#include <asm/vdso/timebase.h>
/* time.c */
extern unsigned long tb_ticks_per_jiffy;
@@ -38,12 +39,6 @@ struct div_result {
u64 result_low;
};
-/* For compatibility, get_tbl() is defined as get_tb() on ppc64 */
-static inline unsigned long get_tbl(void)
-{
- return mftb();
-}
-
static inline u64 get_vtb(void)
{
#ifdef CONFIG_PPC_BOOK3S_64
@@ -53,29 +48,6 @@ static inline u64 get_vtb(void)
return 0;
}
-static inline u64 get_tb(void)
-{
- unsigned int tbhi, tblo, tbhi2;
-
- if (IS_ENABLED(CONFIG_PPC64))
- return mftb();
-
- do {
- tbhi = mftbu();
- tblo = mftb();
- tbhi2 = mftbu();
- } while (tbhi != tbhi2);
-
- return ((u64)tbhi << 32) | tblo;
-}
-
-static inline void set_tb(unsigned int upper, unsigned int lower)
-{
- mtspr(SPRN_TBWL, 0);
- mtspr(SPRN_TBWU, upper);
- mtspr(SPRN_TBWL, lower);
-}
-
/* Accessor functions for the decrementer register.
* The 4xx doesn't even have a decrementer. I tried to use the
* generic timer interrupt code, which seems OK, with the 4xx PIT
diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h
index 95988870a57b..fa2e76e4093a 100644
--- a/arch/powerpc/include/asm/timex.h
+++ b/arch/powerpc/include/asm/timex.h
@@ -9,7 +9,7 @@
*/
#include <asm/cputable.h>
-#include <asm/reg.h>
+#include <asm/vdso/timebase.h>
#define CLOCK_TICK_RATE 1024000 /* Underlying HZ */
diff --git a/arch/powerpc/include/asm/vdso/timebase.h b/arch/powerpc/include/asm/vdso/timebase.h
new file mode 100644
index 000000000000..ac6769b348c6
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/timebase.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Common timebase prototypes and such for all ppc machines.
+ */
+
+#ifndef _ASM_POWERPC_VDSO_TIMEBASE_H
+#define _ASM_POWERPC_VDSO_TIMEBASE_H
+
+#include <asm/reg.h>
+
+#if defined(CONFIG_PPC_CELL) || defined(CONFIG_E500)
+#define mftb() ({unsigned long rval; \
+ asm volatile( \
+ "90: mfspr %0, %2;\n" \
+ ASM_FTR_IFSET( \
+ "97: cmpwi %0,0;\n" \
+ " beq- 90b;\n", "", %1) \
+ : "=r" (rval) \
+ : "i" (CPU_FTR_CELL_TB_BUG), "i" (SPRN_TBRL) : "cr0"); \
+ rval;})
+#elif defined(CONFIG_PPC_8xx)
+#define mftb() ({unsigned long rval; \
+ asm volatile("mftbl %0" : "=r" (rval)); rval;})
+#else
+#define mftb() ({unsigned long rval; \
+ asm volatile("mfspr %0, %1" : \
+ "=r" (rval) : "i" (SPRN_TBRL)); rval;})
+#endif /* !CONFIG_PPC_CELL */
+
+#if defined(CONFIG_PPC_8xx)
+#define mftbu() ({unsigned long rval; \
+ asm volatile("mftbu %0" : "=r" (rval)); rval;})
+#else
+#define mftbu() ({unsigned long rval; \
+ asm volatile("mfspr %0, %1" : "=r" (rval) : \
+ "i" (SPRN_TBRU)); rval;})
+#endif
+
+#define mttbl(v) asm volatile("mttbl %0":: "r"(v))
+#define mttbu(v) asm volatile("mttbu %0":: "r"(v))
+
+/* For compatibility, get_tbl() is defined as get_tb() on ppc64 */
+static inline unsigned long get_tbl(void)
+{
+ return mftb();
+}
+
+static inline u64 get_tb(void)
+{
+ unsigned int tbhi, tblo, tbhi2;
+
+ if (IS_ENABLED(CONFIG_PPC64))
+ return mftb();
+
+ do {
+ tbhi = mftbu();
+ tblo = mftb();
+ tbhi2 = mftbu();
+ } while (tbhi != tbhi2);
+
+ return ((u64)tbhi << 32) | tblo;
+}
+
+static inline void set_tb(unsigned int upper, unsigned int lower)
+{
+ mtspr(SPRN_TBWL, 0);
+ mtspr(SPRN_TBWU, upper);
+ mtspr(SPRN_TBWL, lower);
+}
+
+#endif /* _ASM_POWERPC_VDSO_TIMEBASE_H */
--
2.25.1
^ permalink raw reply related
* [PATCH v14 2/9] powerpc/processor: Move cpu_relax() into asm/vdso/processor.h
From: Michael Ellerman @ 2020-11-26 13:09 UTC (permalink / raw)
To: linuxppc-dev
In-Reply-To: <20201126131006.2431205-1-mpe@ellerman.id.au>
From: Christophe Leroy <christophe.leroy@csgroup.eu>
cpu_relax() need to be in asm/vdso/processor.h to be used by
the C VDSO generic library.
Move it there.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
[mpe: Tweak include guards to match our usual formatting]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/3747db2c50eb98a86088871bef7ed7c593e336bb.1604426550.git.christophe.leroy@csgroup.eu
---
arch/powerpc/include/asm/processor.h | 13 ++-----------
arch/powerpc/include/asm/vdso/processor.h | 23 +++++++++++++++++++++++
2 files changed, 25 insertions(+), 11 deletions(-)
create mode 100644 arch/powerpc/include/asm/vdso/processor.h
v14: mpe: Tweak include guards to match our usual formatting
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index c61c859b51a8..333e3b6c76fb 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -6,6 +6,8 @@
* Copyright (C) 2001 PPC 64 Team, IBM Corp
*/
+#include <vdso/processor.h>
+
#include <asm/reg.h>
#ifdef CONFIG_VSX
@@ -63,14 +65,6 @@ extern int _chrp_type;
#endif /* defined(__KERNEL__) && defined(CONFIG_PPC32) */
-/* Macros for adjusting thread priority (hardware multi-threading) */
-#define HMT_very_low() asm volatile("or 31,31,31 # very low priority")
-#define HMT_low() asm volatile("or 1,1,1 # low priority")
-#define HMT_medium_low() asm volatile("or 6,6,6 # medium low priority")
-#define HMT_medium() asm volatile("or 2,2,2 # medium priority")
-#define HMT_medium_high() asm volatile("or 5,5,5 # medium high priority")
-#define HMT_high() asm volatile("or 3,3,3 # high priority")
-
#ifdef __KERNEL__
#ifdef CONFIG_PPC64
@@ -344,7 +338,6 @@ static inline unsigned long __pack_fe01(unsigned int fpmode)
}
#ifdef CONFIG_PPC64
-#define cpu_relax() do { HMT_low(); HMT_medium(); barrier(); } while (0)
#define spin_begin() HMT_low()
@@ -363,8 +356,6 @@ do { \
} \
} while (0)
-#else
-#define cpu_relax() barrier()
#endif
/* Check that a certain kernel stack pointer is valid in task_struct p */
diff --git a/arch/powerpc/include/asm/vdso/processor.h b/arch/powerpc/include/asm/vdso/processor.h
new file mode 100644
index 000000000000..e072577bc7c0
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/processor.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_POWERPC_VDSO_PROCESSOR_H
+#define _ASM_POWERPC_VDSO_PROCESSOR_H
+
+#ifndef __ASSEMBLY__
+
+/* Macros for adjusting thread priority (hardware multi-threading) */
+#define HMT_very_low() asm volatile("or 31, 31, 31 # very low priority")
+#define HMT_low() asm volatile("or 1, 1, 1 # low priority")
+#define HMT_medium_low() asm volatile("or 6, 6, 6 # medium low priority")
+#define HMT_medium() asm volatile("or 2, 2, 2 # medium priority")
+#define HMT_medium_high() asm volatile("or 5, 5, 5 # medium high priority")
+#define HMT_high() asm volatile("or 3, 3, 3 # high priority")
+
+#ifdef CONFIG_PPC64
+#define cpu_relax() do { HMT_low(); HMT_medium(); barrier(); } while (0)
+#else
+#define cpu_relax() barrier()
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_POWERPC_VDSO_PROCESSOR_H */
--
2.25.1
^ permalink raw reply related
* [PATCH v14 1/9] powerpc/feature: Use CONFIG_PPC64 instead of __powerpc64__ to define possible features
From: Michael Ellerman @ 2020-11-26 13:09 UTC (permalink / raw)
To: linuxppc-dev
From: Christophe Leroy <christophe.leroy@csgroup.eu>
In order to build VDSO32 for PPC64, we need to have CPU_FTRS_POSSIBLE
and CPU_FTRS_ALWAYS independant of whether we are building the
32 bits VDSO or the 64 bits VDSO.
Use #ifdef CONFIG_PPC64 instead of #ifdef __powerpc64__
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/08ffecd31403b3c692f25f97b07a378ba784873e.1604426550.git.christophe.leroy@csgroup.eu
---
arch/powerpc/include/asm/cputable.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
v14: unchanged
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 5e31960a56a9..e069a2d9f7c1 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -488,7 +488,7 @@ static inline void cpu_feature_keys_init(void) { }
CPU_FTR_PURR | CPU_FTR_REAL_LE | CPU_FTR_DABRX)
#define CPU_FTRS_COMPATIBLE (CPU_FTR_PPCAS_ARCH_V2)
-#ifdef __powerpc64__
+#ifdef CONFIG_PPC64
#ifdef CONFIG_PPC_BOOK3E
#define CPU_FTRS_POSSIBLE (CPU_FTRS_E6500 | CPU_FTRS_E5500)
#else
@@ -545,7 +545,7 @@ enum {
};
#endif /* __powerpc64__ */
-#ifdef __powerpc64__
+#ifdef CONFIG_PPC64
#ifdef CONFIG_PPC_BOOK3E
#define CPU_FTRS_ALWAYS (CPU_FTRS_E6500 & CPU_FTRS_E5500)
#else
--
2.25.1
^ permalink raw reply related
* Re: [PATCH v2 3/6] perf/core: Fix arch_perf_get_page_size()
From: Peter Zijlstra @ 2020-11-26 13:06 UTC (permalink / raw)
To: Matthew Wilcox
Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
kirill.shutemov, kan.liang
In-Reply-To: <20201126125606.GR4327@casper.infradead.org>
On Thu, Nov 26, 2020 at 12:56:06PM +0000, Matthew Wilcox wrote:
> On Thu, Nov 26, 2020 at 01:42:07PM +0100, Peter Zijlstra wrote:
> > + pgdp = pgd_offset(mm, addr);
> > + pgd = READ_ONCE(*pgdp);
>
> I forget how x86-32-PAE maps to Linux's PGD/P4D/PUD/PMD scheme, but
> according to volume 3, section 4.4.2, PAE paging uses a 64-bit PDE, so
> whether a PDE is a PGD or a PMD, we're only reading it with READ_ONCE
> rather than the lockless-retry method used by ptep_get_lockless().
> So it's potentially racy? Do we need a pmdp_get_lockless() or
> pgdp_get_lockless()?
Oh gawd... this isn't new here though, right? Current gup_fast also gets
that wrong, if it is in deed wrong.
I suppose it's a race far more likely today, with THP and all, than it
ever was back then.
^ permalink raw reply
* Re: [PATCH v2 1/6] mm/gup: Provide gup_get_pte() more generic
From: Peter Zijlstra @ 2020-11-26 13:02 UTC (permalink / raw)
To: Matthew Wilcox
Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
kirill.shutemov, kan.liang
In-Reply-To: <20201126124300.GP4327@casper.infradead.org>
On Thu, Nov 26, 2020 at 12:43:00PM +0000, Matthew Wilcox wrote:
> On Thu, Nov 26, 2020 at 01:01:15PM +0100, Peter Zijlstra wrote:
> > +#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
> > +/*
> > + * WARNING: only to be used in the get_user_pages_fast() implementation.
> > + * With get_user_pages_fast(), we walk down the pagetables without taking any
> > + * locks. For this we would like to load the pointers atomically, but sometimes
> > + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
> > + * we do have is the guarantee that a PTE will only either go from not present
> > + * to present, or present to not present or both -- it will not switch to a
> > + * completely different present page without a TLB flush in between; something
> > + * that we are blocking by holding interrupts off.
>
> I feel like this comment needs some love. How about:
>
> * For walking the pagetables without holding any locks. Some architectures
> * (eg x86-32 PAE) cannot load the entries atomically without using
> * expensive instructions. We are guaranteed that a PTE will only either go
> * from not present to present, or present to not present -- it will not
> * switch to a completely different present page without a TLB flush
> * inbetween; which we are blocking by holding interrupts off.
>
> And it would be nice to have an assertion that interrupts are disabled
> in the code. Because comments are nice, but nobody reads them.
Quite agreed, I'll stick a separate patch on with the updated comment
and a lockdep_assert_irqs_disabled() in. I'm afraid that latter will make
for header soup though :/
We'll see, let the robots have it.
^ permalink raw reply
* Re: [PATCH v2 4/6] arm64/mm: Implement pXX_leaf_size() support
From: Peter Zijlstra @ 2020-11-26 12:57 UTC (permalink / raw)
To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
eranian
Cc: linux-arch, ak, catalin.marinas, linuxppc-dev, willy,
linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
will, davem, kirill.shutemov
In-Reply-To: <20201126121121.226210959@infradead.org>
Now with pmd_cont() defined...
---
Subject: arm64/mm: Implement pXX_leaf_size() support
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri Nov 13 11:46:06 CET 2020
ARM64 has non-pagetable aligned large page support with PTE_CONT, when
this bit is set the page is part of a super-page. Match the hugetlb
code and support these super pages for PTE and PMD levels.
This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate
pagetable leaf sizes.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/arm64/include/asm/pgtable.h | 4 ++++
1 file changed, 4 insertions(+)
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -407,6 +407,7 @@ static inline int pmd_trans_huge(pmd_t p
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_valid(pmd) pte_valid(pmd_pte(pmd))
+#define pmd_cont(pmd) pte_cont(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
@@ -503,6 +504,9 @@ extern pgprot_t phys_mem_access_prot(str
PMD_TYPE_SECT)
#define pmd_leaf(pmd) pmd_sect(pmd)
+#define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
+#define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)
+
#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
static inline bool pud_sect(pud_t pud) { return false; }
static inline bool pud_table(pud_t pud) { return true; }
^ permalink raw reply
* Re: [PATCH v2 3/6] perf/core: Fix arch_perf_get_page_size()
From: Matthew Wilcox @ 2020-11-26 12:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
kirill.shutemov, kan.liang
In-Reply-To: <20201126124207.GM3040@hirez.programming.kicks-ass.net>
On Thu, Nov 26, 2020 at 01:42:07PM +0100, Peter Zijlstra wrote:
> + pgdp = pgd_offset(mm, addr);
> + pgd = READ_ONCE(*pgdp);
I forget how x86-32-PAE maps to Linux's PGD/P4D/PUD/PMD scheme, but
according to volume 3, section 4.4.2, PAE paging uses a 64-bit PDE, so
whether a PDE is a PGD or a PMD, we're only reading it with READ_ONCE
rather than the lockless-retry method used by ptep_get_lockless().
So it's potentially racy? Do we need a pmdp_get_lockless() or
pgdp_get_lockless()?
[...]
> + pmdp = pmd_offset_lockless(pudp, pud, addr);
> + pmd = READ_ONCE(*pmdp);
> + if (!pmd_present(pmd))
> return 0;
>
> + if (pmd_leaf(pmd))
> + return pmd_leaf_size(pmd);
>
^ permalink raw reply
* Re: [PATCH v2 2/6] mm: Introduce pXX_leaf_size()
From: Matthew Wilcox @ 2020-11-26 12:43 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
kirill.shutemov, kan.liang
In-Reply-To: <20201126121121.102580109@infradead.org>
On Thu, Nov 26, 2020 at 01:01:16PM +0100, Peter Zijlstra wrote:
> A number of architectures have non-pagetable aligned huge/large pages.
> For such architectures a leaf can actually be part of a larger entry.
>
> Provide generic helpers to determine the size of a page-table leaf.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
^ permalink raw reply
* Re: [PATCH v2 1/6] mm/gup: Provide gup_get_pte() more generic
From: Matthew Wilcox @ 2020-11-26 12:43 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
kirill.shutemov, kan.liang
In-Reply-To: <20201126121121.036370527@infradead.org>
On Thu, Nov 26, 2020 at 01:01:15PM +0100, Peter Zijlstra wrote:
> +#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
> +/*
> + * WARNING: only to be used in the get_user_pages_fast() implementation.
> + * With get_user_pages_fast(), we walk down the pagetables without taking any
> + * locks. For this we would like to load the pointers atomically, but sometimes
> + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
> + * we do have is the guarantee that a PTE will only either go from not present
> + * to present, or present to not present or both -- it will not switch to a
> + * completely different present page without a TLB flush in between; something
> + * that we are blocking by holding interrupts off.
I feel like this comment needs some love. How about:
* For walking the pagetables without holding any locks. Some architectures
* (eg x86-32 PAE) cannot load the entries atomically without using
* expensive instructions. We are guaranteed that a PTE will only either go
* from not present to present, or present to not present -- it will not
* switch to a completely different present page without a TLB flush
* inbetween; which we are blocking by holding interrupts off.
And it would be nice to have an assertion that interrupts are disabled
in the code. Because comments are nice, but nobody reads them.
> +static inline pte_t ptep_get_lockless(pte_t *ptep)
> +{
> + pte_t pte;
> +
> + do {
> + pte.pte_low = ptep->pte_low;
> + smp_rmb();
> + pte.pte_high = ptep->pte_high;
> + smp_rmb();
> + } while (unlikely(pte.pte_low != ptep->pte_low));
> +
> + return pte;
> +}
^ permalink raw reply
* Re: [PATCH v2 3/6] perf/core: Fix arch_perf_get_page_size()
From: Peter Zijlstra @ 2020-11-26 12:42 UTC (permalink / raw)
To: Matthew Wilcox
Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
kirill.shutemov, kan.liang
In-Reply-To: <20201126123458.GO4327@casper.infradead.org>
On Thu, Nov 26, 2020 at 12:34:58PM +0000, Matthew Wilcox wrote:
> On Thu, Nov 26, 2020 at 01:01:17PM +0100, Peter Zijlstra wrote:
> > The (new) page-table walker in arch_perf_get_page_size() is broken in
> > various ways. Specifically while it is used in a lockless manner, it
> > doesn't depend on CONFIG_HAVE_FAST_GUP nor uses the proper _lockless
> > offset methods, nor is careful to only read each entry only once.
> >
> > Also the hugetlb support is broken due to calling pte_page() without
> > first checking pte_special().
> >
> > Rewrite the whole thing to be a proper lockless page-table walker and
> > employ the new pXX_leaf_size() pgtable functions to determine the
> > pagetable size without looking at the page-frames.
> >
> > Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
> > Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Tested-by: Kan Liang <kan.liang@linux.intel.com>
> > ---
> > arch/arm64/include/asm/pgtable.h | 3 +
> > arch/sparc/include/asm/pgtable_64.h | 13 ++++
> > arch/sparc/mm/hugetlbpage.c | 19 ++++--
> > include/linux/pgtable.h | 16 +++++
> > kernel/events/core.c | 102 +++++++++++++-----------------------
> > 5 files changed, 82 insertions(+), 71 deletions(-)
>
> This diffstat doesn't match the patch in this email ...
Urgh, no idea how I did that... I must've edited the diff and not done a
quilt-refresh. Updated below.
---
Subject: perf/core: Fix arch_perf_get_page_size()
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Nov 2020 13:43:57 +0100
The (new) page-table walker in arch_perf_get_page_size() is broken in
various ways. Specifically while it is used in a lockless manner, it
doesn't depend on CONFIG_HAVE_FAST_GUP nor uses the proper _lockless
offset methods, nor is careful to only read each entry only once.
Also the hugetlb support is broken due to calling pte_page() without
first checking pte_special().
Rewrite the whole thing to be a proper lockless page-table walker and
employ the new pXX_leaf_size() pgtable functions to determine the
pagetable size without looking at the page-frames.
Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Kan Liang <kan.liang@linux.intel.com>
---
kernel/events/core.c | 105 ++++++++++++++++++---------------------------------
1 file changed, 39 insertions(+), 66 deletions(-)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -52,6 +52,7 @@
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/highmem.h>
+#include <linux/pgtable.h>
#include "internal.h"
@@ -7001,90 +7002,62 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}
-#ifdef CONFIG_MMU
-
/*
- * Return the MMU page size of a given virtual address.
- *
- * This generic implementation handles page-table aligned huge pages, as well
- * as non-page-table aligned hugetlbfs compound pages.
- *
- * If an architecture supports and uses non-page-table aligned pages in their
- * kernel mapping it will need to provide it's own implementation of this
- * function.
+ * Return the pagetable size of a given virtual address.
*/
-__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
{
- struct page *page;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd))
- return 0;
+ u64 size = 0;
- p4d = p4d_offset(pgd, addr);
- if (!p4d_present(*p4d))
+#ifdef CONFIG_HAVE_FAST_GUP
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ pgdp = pgd_offset(mm, addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
return 0;
- if (p4d_leaf(*p4d))
- return 1ULL << P4D_SHIFT;
+ if (pgd_leaf(pgd))
+ return pgd_leaf_size(pgd);
- pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (!p4d_present(p4d))
return 0;
- if (pud_leaf(*pud)) {
-#ifdef pud_page
- page = pud_page(*pud);
- if (PageHuge(page))
- return page_size(compound_head(page));
-#endif
- return 1ULL << PUD_SHIFT;
- }
+ if (p4d_leaf(p4d))
+ return p4d_leaf_size(p4d);
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
+ pud = READ_ONCE(*pudp);
+ if (!pud_present(pud))
return 0;
- if (pmd_leaf(*pmd)) {
-#ifdef pmd_page
- page = pmd_page(*pmd);
- if (PageHuge(page))
- return page_size(compound_head(page));
-#endif
- return 1ULL << PMD_SHIFT;
- }
+ if (pud_leaf(pud))
+ return pud_leaf_size(pud);
- pte = pte_offset_map(pmd, addr);
- if (!pte_present(*pte)) {
- pte_unmap(pte);
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (!pmd_present(pmd))
return 0;
- }
- page = pte_page(*pte);
- if (PageHuge(page)) {
- u64 size = page_size(compound_head(page));
- pte_unmap(pte);
- return size;
- }
+ if (pmd_leaf(pmd))
+ return pmd_leaf_size(pmd);
- pte_unmap(pte);
- return PAGE_SIZE;
-}
-
-#else
+ ptep = pte_offset_map(&pmd, addr);
+ pte = ptep_get_lockless(ptep);
+ if (pte_present(pte))
+ size = pte_leaf_size(pte);
+ pte_unmap(ptep);
+#endif /* CONFIG_HAVE_FAST_GUP */
-static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
+ return size;
}
-#endif
-
static u64 perf_get_page_size(unsigned long addr)
{
struct mm_struct *mm;
@@ -7109,7 +7082,7 @@ static u64 perf_get_page_size(unsigned l
mm = &init_mm;
}
- size = arch_perf_get_page_size(mm, addr);
+ size = perf_get_pgtable_size(mm, addr);
local_irq_restore(flags);
^ permalink raw reply
* Re: [PATCH v2 3/6] perf/core: Fix arch_perf_get_page_size()
From: Matthew Wilcox @ 2020-11-26 12:34 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
davem, dave.hansen, ak, eranian, sparclinux, jolsa, mingo,
kirill.shutemov, kan.liang
In-Reply-To: <20201126121121.164675154@infradead.org>
On Thu, Nov 26, 2020 at 01:01:17PM +0100, Peter Zijlstra wrote:
> The (new) page-table walker in arch_perf_get_page_size() is broken in
> various ways. Specifically while it is used in a lockless manner, it
> doesn't depend on CONFIG_HAVE_FAST_GUP nor uses the proper _lockless
> offset methods, nor is careful to only read each entry only once.
>
> Also the hugetlb support is broken due to calling pte_page() without
> first checking pte_special().
>
> Rewrite the whole thing to be a proper lockless page-table walker and
> employ the new pXX_leaf_size() pgtable functions to determine the
> pagetable size without looking at the page-frames.
>
> Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
> Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: Kan Liang <kan.liang@linux.intel.com>
> ---
> arch/arm64/include/asm/pgtable.h | 3 +
> arch/sparc/include/asm/pgtable_64.h | 13 ++++
> arch/sparc/mm/hugetlbpage.c | 19 ++++--
> include/linux/pgtable.h | 16 +++++
> kernel/events/core.c | 102 +++++++++++++-----------------------
> 5 files changed, 82 insertions(+), 71 deletions(-)
This diffstat doesn't match the patch in this email ...
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -52,6 +52,7 @@
> #include <linux/mount.h>
> #include <linux/min_heap.h>
> #include <linux/highmem.h>
> +#include <linux/pgtable.h>
>
> #include "internal.h"
>
> @@ -7001,90 +7001,62 @@ static u64 perf_virt_to_phys(u64 virt)
> return phys_addr;
> }
>
> -#ifdef CONFIG_MMU
> -
> /*
> - * Return the MMU page size of a given virtual address.
> - *
> - * This generic implementation handles page-table aligned huge pages, as well
> - * as non-page-table aligned hugetlbfs compound pages.
> - *
> - * If an architecture supports and uses non-page-table aligned pages in their
> - * kernel mapping it will need to provide it's own implementation of this
> - * function.
> + * Return the pagetable size of a given virtual address.
> */
> -__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> +static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
> {
> - struct page *page;
> - pgd_t *pgd;
> - p4d_t *p4d;
> - pud_t *pud;
> - pmd_t *pmd;
> - pte_t *pte;
> + u64 size = 0;
>
> - pgd = pgd_offset(mm, addr);
> - if (pgd_none(*pgd))
> - return 0;
> +#ifdef CONFIG_HAVE_FAST_GUP
> + pgd_t *pgdp, pgd;
> + p4d_t *p4dp, p4d;
> + pud_t *pudp, pud;
> + pmd_t *pmdp, pmd;
> + pte_t *ptep, pte;
>
> - p4d = p4d_offset(pgd, addr);
> - if (!p4d_present(*p4d))
> + pgdp = pgd_offset(mm, addr);
> + pgd = READ_ONCE(*pgdp);
> + if (pgd_none(pgd))
> return 0;
>
> - if (p4d_leaf(*p4d))
> - return 1ULL << P4D_SHIFT;
> + if (pgd_leaf(pgd))
> + return pgd_leaf_size(pgd);
>
> - pud = pud_offset(p4d, addr);
> - if (!pud_present(*pud))
> + p4dp = p4d_offset_lockless(pgdp, pgd, addr);
> + p4d = READ_ONCE(*p4dp);
> + if (!p4d_present(p4d))
> return 0;
>
> - if (pud_leaf(*pud)) {
> -#ifdef pud_page
> - page = pud_page(*pud);
> - if (PageHuge(page))
> - return page_size(compound_head(page));
> -#endif
> - return 1ULL << PUD_SHIFT;
> - }
> + if (p4d_leaf(p4d))
> + return p4d_leaf_size(p4d);
>
> - pmd = pmd_offset(pud, addr);
> - if (!pmd_present(*pmd))
> + pudp = pud_offset_lockless(p4dp, p4d, addr);
> + pud = READ_ONCE(*pudp);
> + if (!pud_present(pud))
> return 0;
>
> - if (pmd_leaf(*pmd)) {
> -#ifdef pmd_page
> - page = pmd_page(*pmd);
> - if (PageHuge(page))
> - return page_size(compound_head(page));
> -#endif
> - return 1ULL << PMD_SHIFT;
> - }
> + if (pud_leaf(pud))
> + return pud_leaf_size(pud);
>
> - pte = pte_offset_map(pmd, addr);
> - if (!pte_present(*pte)) {
> - pte_unmap(pte);
> + pmdp = pmd_offset_lockless(pudp, pud, addr);
> + pmd = READ_ONCE(*pmdp);
> + if (!pmd_present(pmd))
> return 0;
> - }
>
> - page = pte_page(*pte);
> - if (PageHuge(page)) {
> - u64 size = page_size(compound_head(page));
> - pte_unmap(pte);
> - return size;
> - }
> -
> - pte_unmap(pte);
> - return PAGE_SIZE;
> -}
> + if (pmd_leaf(pmd))
> + return pmd_leaf_size(pmd);
>
> -#else
> + ptep = pte_offset_map(&pmd, addr);
> + pte = ptep_get_lockless(ptep);
> + if (pte_present(pte))
> + size = pte_leaf_size(pte);
> + pte_unmap(ptep);
> +#endif /* CONFIG_HAVE_FAST_GUP */
>
> -static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> -{
> - return 0;
> + return size;
> }
>
> -#endif
> -
> static u64 perf_get_page_size(unsigned long addr)
> {
> struct mm_struct *mm;
> @@ -7109,7 +7081,7 @@ static u64 perf_get_page_size(unsigned l
> mm = &init_mm;
> }
>
> - size = arch_perf_get_page_size(mm, addr);
> + size = perf_get_pgtable_size(mm, addr);
>
> local_irq_restore(flags);
>
>
>
^ permalink raw reply
* [PATCH v2 1/6] mm/gup: Provide gup_get_pte() more generic
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
eranian
Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>
In order to write another lockless page-table walker, we need
gup_get_pte() exposed. While doing that, rename it to
ptep_get_lockless() to match the existing ptep_get() naming.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/pgtable.h | 55 +++++++++++++++++++++++++++++++++++++++++++++
mm/gup.c | 58 ------------------------------------------------
2 files changed, 56 insertions(+), 57 deletions(-)
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -258,6 +258,61 @@ static inline pte_t ptep_get(pte_t *ptep
}
#endif
+#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
+/*
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking any
+ * locks. For this we would like to load the pointers atomically, but sometimes
+ * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
+ * we do have is the guarantee that a PTE will only either go from not present
+ * to present, or present to not present or both -- it will not switch to a
+ * completely different present page without a TLB flush in between; something
+ * that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
+ * We load pte_high *after* loading pte_low, which ensures we don't see an older
+ * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't
+ * picked up a changed pte high. We might have gotten rubbish values from
+ * pte_low and pte_high, but we are guaranteed that pte_low will not have the
+ * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
+ * operates on present ptes we're safe.
+ */
+static inline pte_t ptep_get_lockless(pte_t *ptep)
+{
+ pte_t pte;
+
+ do {
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ } while (unlikely(pte.pte_low != ptep->pte_low));
+
+ return pte;
+}
+#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
+/*
+ * We require that the PTE can be read atomically.
+ */
+static inline pte_t ptep_get_lockless(pte_t *ptep)
+{
+ return ptep_get(ptep);
+}
+#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2079,62 +2079,6 @@ static void put_compound_head(struct pag
put_page(page);
}
-#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
-
-/*
- * WARNING: only to be used in the get_user_pages_fast() implementation.
- *
- * With get_user_pages_fast(), we walk down the pagetables without taking any
- * locks. For this we would like to load the pointers atomically, but sometimes
- * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
- * we do have is the guarantee that a PTE will only either go from not present
- * to present, or present to not present or both -- it will not switch to a
- * completely different present page without a TLB flush in between; something
- * that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- *
- * ptep->pte_high = h;
- * smp_wmb();
- * ptep->pte_low = l;
- *
- * And present to not present goes:
- *
- * ptep->pte_low = 0;
- * smp_wmb();
- * ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
- * We load pte_high *after* loading pte_low, which ensures we don't see an older
- * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't
- * picked up a changed pte high. We might have gotten rubbish values from
- * pte_low and pte_high, but we are guaranteed that pte_low will not have the
- * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
- * operates on present ptes we're safe.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
- pte_t pte;
-
- do {
- pte.pte_low = ptep->pte_low;
- smp_rmb();
- pte.pte_high = ptep->pte_high;
- smp_rmb();
- } while (unlikely(pte.pte_low != ptep->pte_low));
-
- return pte;
-}
-#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-/*
- * We require that the PTE can be read atomically.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
- return ptep_get(ptep);
-}
-#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-
static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
unsigned int flags,
struct page **pages)
@@ -2160,7 +2104,7 @@ static int gup_pte_range(pmd_t pmd, unsi
ptem = ptep = pte_offset_map(&pmd, addr);
do {
- pte_t pte = gup_get_pte(ptep);
+ pte_t pte = ptep_get_lockless(ptep);
struct page *head, *page;
/*
^ permalink raw reply
* [PATCH v2 0/6] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
eranian
Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
will, davem, kirill.shutemov
Hi,
These patches provide generic infrastructure to determine TLB page size from
page table entries alone. Perf will use this (for either data or code address)
to aid in profiling TLB issues.
While most architectures only have page table aligned large pages, some
(notably ARM64, Sparc64 and Power) provide non page table aligned large pages
and need to provide their own implementation of these functions.
I've provided (completely untested) implementations for ARM64, Sparc64 and
Power/8xxx (it looks like I'm still missing Power/Book3s64/hash support).
Changes since -v1:
- Changed wording to reflect these are page-table sizes; actual TLB sizes
might vary.
- added Power/8xx
Barring any objections I'll queue these in tip/perf/core, as these patches fix
the code that's currently in there.
^ permalink raw reply
* [PATCH v2 4/6] arm64/mm: Implement pXX_leaf_size() support
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
eranian
Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>
ARM64 has non-pagetable aligned large page support with PTE_CONT, when
this bit is set the page is part of a super-page. Match the hugetlb
code and support these super pages for PTE and PMD levels.
This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate
pagetable leaf sizes.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/arm64/include/asm/pgtable.h | 3 +++
1 file changed, 3 insertions(+)
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -503,6 +503,9 @@ extern pgprot_t phys_mem_access_prot(str
PMD_TYPE_SECT)
#define pmd_leaf(pmd) pmd_sect(pmd)
+#define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
+#define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)
+
#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
static inline bool pud_sect(pud_t pud) { return false; }
static inline bool pud_table(pud_t pud) { return true; }
^ permalink raw reply
* [PATCH v2 3/6] perf/core: Fix arch_perf_get_page_size()
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
eranian
Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>
The (new) page-table walker in arch_perf_get_page_size() is broken in
various ways. Specifically while it is used in a lockless manner, it
doesn't depend on CONFIG_HAVE_FAST_GUP nor uses the proper _lockless
offset methods, nor is careful to only read each entry only once.
Also the hugetlb support is broken due to calling pte_page() without
first checking pte_special().
Rewrite the whole thing to be a proper lockless page-table walker and
employ the new pXX_leaf_size() pgtable functions to determine the
pagetable size without looking at the page-frames.
Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/arm64/include/asm/pgtable.h | 3 +
arch/sparc/include/asm/pgtable_64.h | 13 ++++
arch/sparc/mm/hugetlbpage.c | 19 ++++--
include/linux/pgtable.h | 16 +++++
kernel/events/core.c | 102 +++++++++++++-----------------------
5 files changed, 82 insertions(+), 71 deletions(-)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -52,6 +52,7 @@
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/highmem.h>
+#include <linux/pgtable.h>
#include "internal.h"
@@ -7001,90 +7001,62 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}
-#ifdef CONFIG_MMU
-
/*
- * Return the MMU page size of a given virtual address.
- *
- * This generic implementation handles page-table aligned huge pages, as well
- * as non-page-table aligned hugetlbfs compound pages.
- *
- * If an architecture supports and uses non-page-table aligned pages in their
- * kernel mapping it will need to provide it's own implementation of this
- * function.
+ * Return the pagetable size of a given virtual address.
*/
-__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
{
- struct page *page;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ u64 size = 0;
- pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd))
- return 0;
+#ifdef CONFIG_HAVE_FAST_GUP
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
- p4d = p4d_offset(pgd, addr);
- if (!p4d_present(*p4d))
+ pgdp = pgd_offset(mm, addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
return 0;
- if (p4d_leaf(*p4d))
- return 1ULL << P4D_SHIFT;
+ if (pgd_leaf(pgd))
+ return pgd_leaf_size(pgd);
- pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (!p4d_present(p4d))
return 0;
- if (pud_leaf(*pud)) {
-#ifdef pud_page
- page = pud_page(*pud);
- if (PageHuge(page))
- return page_size(compound_head(page));
-#endif
- return 1ULL << PUD_SHIFT;
- }
+ if (p4d_leaf(p4d))
+ return p4d_leaf_size(p4d);
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
+ pud = READ_ONCE(*pudp);
+ if (!pud_present(pud))
return 0;
- if (pmd_leaf(*pmd)) {
-#ifdef pmd_page
- page = pmd_page(*pmd);
- if (PageHuge(page))
- return page_size(compound_head(page));
-#endif
- return 1ULL << PMD_SHIFT;
- }
+ if (pud_leaf(pud))
+ return pud_leaf_size(pud);
- pte = pte_offset_map(pmd, addr);
- if (!pte_present(*pte)) {
- pte_unmap(pte);
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (!pmd_present(pmd))
return 0;
- }
- page = pte_page(*pte);
- if (PageHuge(page)) {
- u64 size = page_size(compound_head(page));
- pte_unmap(pte);
- return size;
- }
-
- pte_unmap(pte);
- return PAGE_SIZE;
-}
+ if (pmd_leaf(pmd))
+ return pmd_leaf_size(pmd);
-#else
+ ptep = pte_offset_map(&pmd, addr);
+ pte = ptep_get_lockless(ptep);
+ if (pte_present(pte))
+ size = pte_leaf_size(pte);
+ pte_unmap(ptep);
+#endif /* CONFIG_HAVE_FAST_GUP */
-static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
+ return size;
}
-#endif
-
static u64 perf_get_page_size(unsigned long addr)
{
struct mm_struct *mm;
@@ -7109,7 +7081,7 @@ static u64 perf_get_page_size(unsigned l
mm = &init_mm;
}
- size = arch_perf_get_page_size(mm, addr);
+ size = perf_get_pgtable_size(mm, addr);
local_irq_restore(flags);
^ permalink raw reply
* [PATCH v2 5/6] sparc64/mm: Implement pXX_leaf_size() support
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
eranian
Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>
Sparc64 has non-pagetable aligned large page support; wire up the
pXX_leaf_size() functions to report the correct pagetable page size.
This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate
pagetable leaf sizes.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/sparc/include/asm/pgtable_64.h | 13 +++++++++++++
arch/sparc/mm/hugetlbpage.c | 19 +++++++++++++------
2 files changed, 26 insertions(+), 6 deletions(-)
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1121,6 +1121,19 @@ extern unsigned long cmdline_memory_size
asmlinkage void do_sparc64_fault(struct pt_regs *regs);
+#ifdef CONFIG_HUGETLB_PAGE
+
+#define pud_leaf_size pud_leaf_size
+extern unsigned long pud_leaf_size(pud_t pud);
+
+#define pmd_leaf_size pmd_leaf_size
+extern unsigned long pmd_leaf_size(pmd_t pmd);
+
+#define pte_leaf_size pte_leaf_size
+extern unsigned long pte_leaf_size(pte_t pte);
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
#endif /* !(__ASSEMBLY__) */
#endif /* !(_SPARC64_PGTABLE_H) */
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -247,14 +247,17 @@ static unsigned int sun4u_huge_tte_to_sh
return shift;
}
-static unsigned int huge_tte_to_shift(pte_t entry)
+static unsigned long tte_to_shift(pte_t entry)
{
- unsigned long shift;
-
if (tlb_type == hypervisor)
- shift = sun4v_huge_tte_to_shift(entry);
- else
- shift = sun4u_huge_tte_to_shift(entry);
+ return sun4v_huge_tte_to_shift(entry);
+
+ return sun4u_huge_tte_to_shift(entry);
+}
+
+static unsigned int huge_tte_to_shift(pte_t entry)
+{
+ unsigned long shift = tte_to_shift(entry);
if (shift == PAGE_SHIFT)
WARN_ONCE(1, "tto_to_shift: invalid hugepage tte=0x%lx\n",
@@ -272,6 +275,10 @@ static unsigned long huge_tte_to_size(pt
return size;
}
+unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift((pte_t)pud); }
+unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift((pte_t)pmd); }
+unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift((pte_t)pte); }
+
pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
^ permalink raw reply
* [PATCH v2 6/6] powerpc/8xx: Implement pXX_leaf_size() support
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
eranian
Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>
Christophe Leroy wrote:
> I can help with powerpc 8xx. It is a 32 bits powerpc. The PGD has 1024
> entries, that means each entry maps 4M.
>
> Page sizes are 4k, 16k, 512k and 8M.
>
> For the 8M pages we use hugepd with a single entry. The two related PGD
> entries point to the same hugepd.
>
> For the other sizes, they are in standard page tables. 16k pages appear
> 4 times in the page table. 512k entries appear 128 times in the page
> table.
>
> When the PGD entry has _PMD_PAGE_8M bits, the PMD entry points to a
> hugepd with holds the single 8M entry.
>
> In the PTE, we have two bits: _PAGE_SPS and _PAGE_HUGE
>
> _PAGE_HUGE means it is a 512k page
> _PAGE_SPS means it is not a 4k page
>
> The kernel can by build either with 4k pages as standard page size, or
> 16k pages. It doesn't change the page table layout though.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/powerpc/include/asm/nohash/32/pte-8xx.h | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -135,6 +135,29 @@ static inline pte_t pte_mkhuge(pte_t pte
}
#define pte_mkhuge pte_mkhuge
+
+static inline unsigned long pgd_leaf_size(pgd_t pgd)
+{
+ if (pgd_val(pgd) & _PMD_PAGE_8M)
+ return SZ_8M;
+ return SZ_4M;
+}
+
+#define pgd_leaf_size pgd_leaf_size
+
+static inline unsigned long pte_leaf_size(pte_t pte)
+{
+ pte_basic_t val = pte_val(pte);
+
+ if (val & _PAGE_HUGE)
+ return SZ_512K;
+ if (val & _PAGE_SPS)
+ return SZ_16K;
+ return SZ_4K;
+}
+
+#define pte_leaf_size pte_leaf_size
+
#endif
#endif /* __KERNEL__ */
^ permalink raw reply
* [PATCH v2 2/6] mm: Introduce pXX_leaf_size()
From: Peter Zijlstra @ 2020-11-26 12:01 UTC (permalink / raw)
To: kan.liang, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
eranian
Cc: linux-arch, ak, catalin.marinas, peterz, linuxppc-dev, willy,
linux-kernel, dave.hansen, npiggin, aneesh.kumar, sparclinux,
will, davem, kirill.shutemov
In-Reply-To: <20201126120114.071913521@infradead.org>
A number of architectures have non-pagetable aligned huge/large pages.
For such architectures a leaf can actually be part of a larger entry.
Provide generic helpers to determine the size of a page-table leaf.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/pgtable.h | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1536,4 +1536,20 @@ typedef unsigned int pgtbl_mod_mask;
#define pmd_leaf(x) 0
#endif
+#ifndef pgd_leaf_size
+#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
+#endif
+#ifndef p4d_leaf_size
+#define p4d_leaf_size(x) P4D_SIZE
+#endif
+#ifndef pud_leaf_size
+#define pud_leaf_size(x) PUD_SIZE
+#endif
+#ifndef pmd_leaf_size
+#define pmd_leaf_size(x) PMD_SIZE
+#endif
+#ifndef pte_leaf_size
+#define pte_leaf_size(x) PAGE_SIZE
+#endif
+
#endif /* _LINUX_PGTABLE_H */
^ permalink raw reply
* Re: [PATCH 0/5] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Peter Zijlstra @ 2020-11-26 10:46 UTC (permalink / raw)
To: Christophe Leroy
Cc: mark.rutland, aneesh.kumar, willy, catalin.marinas, will,
alexander.shishkin, linuxppc-dev, npiggin, linux-kernel, acme,
davem, dave.hansen, ak, eranian, sparclinux, linux-arch, jolsa,
mingo, kirill.shutemov, kan.liang
In-Reply-To: <20201120122004.GG3021@hirez.programming.kicks-ass.net>
On Fri, Nov 20, 2020 at 01:20:04PM +0100, Peter Zijlstra wrote:
> > > I can help with powerpc 8xx. It is a 32 bits powerpc. The PGD has 1024
> > > entries, that means each entry maps 4M.
> > >
> > > Page sizes are 4k, 16k, 512k and 8M.
> > >
> > > For the 8M pages we use hugepd with a single entry. The two related PGD
> > > entries point to the same hugepd.
> > >
> > > For the other sizes, they are in standard page tables. 16k pages appear
> > > 4 times in the page table. 512k entries appear 128 times in the page
> > > table.
> > >
> > > When the PGD entry has _PMD_PAGE_8M bits, the PMD entry points to a
> > > hugepd with holds the single 8M entry.
> > >
> > > In the PTE, we have two bits: _PAGE_SPS and _PAGE_HUGE
> > >
> > > _PAGE_HUGE means it is a 512k page
> > > _PAGE_SPS means it is not a 4k page
> > >
> > > The kernel can by build either with 4k pages as standard page size, or
> > > 16k pages. It doesn't change the page table layout though.
> > >
> > > Hope this is clear. Now I don't really know to wire that up to your series.
Does the below accurately reflect things?
Let me go find a suitable cross-compiler ..
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 1581204467e1..fcc48d590d88 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -135,6 +135,29 @@ static inline pte_t pte_mkhuge(pte_t pte)
}
#define pte_mkhuge pte_mkhuge
+
+static inline unsigned long pgd_leaf_size(pgd_t pgd)
+{
+ if (pgd_val(pgd) & _PMD_PAGE_8M)
+ return SZ_8M;
+ return SZ_4M;
+}
+
+#define pgd_leaf_size pgd_leaf_size
+
+static inline unsigned long pte_leaf_size(pte_t pte)
+{
+ pte_basic_t val = pte_val(pte);
+
+ if (val & _PAGE_HUGE)
+ return SZ_512K;
+ if (val & _PAGE_SPS)
+ return SZ_16K;
+ return SZ_4K;
+}
+
+#define pte_leaf_size pte_leaf_size
+
#endif
#endif /* __KERNEL__ */
^ permalink raw reply related
* Re: [PATCH v6 16/22] powerpc/book3s64/kuap: Improve error reporting with KUAP
From: Christophe Leroy @ 2020-11-26 10:39 UTC (permalink / raw)
To: Michael Ellerman, Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <87r1ogxy8j.fsf@mpe.ellerman.id.au>
Le 26/11/2020 à 10:29, Michael Ellerman a écrit :
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>
>>> Le 25/11/2020 à 06:16, Aneesh Kumar K.V a écrit :
>>>> With hash translation use DSISR_KEYFAULT to identify a wrong access.
>>>> With Radix we look at the AMR value and type of fault.
>>>>
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>> ---
>>>> arch/powerpc/include/asm/book3s/32/kup.h | 4 +--
>>>> arch/powerpc/include/asm/book3s/64/kup.h | 27 ++++++++++++++++----
>>>> arch/powerpc/include/asm/kup.h | 4 +--
>>>> arch/powerpc/include/asm/nohash/32/kup-8xx.h | 4 +--
>>>> arch/powerpc/mm/fault.c | 2 +-
>>>> 5 files changed, 29 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
>>>> index 32fd4452e960..b18cd931e325 100644
>>>> --- a/arch/powerpc/include/asm/book3s/32/kup.h
>>>> +++ b/arch/powerpc/include/asm/book3s/32/kup.h
>>>> @@ -177,8 +177,8 @@ static inline void restore_user_access(unsigned long flags)
>>>> allow_user_access(to, to, end - addr, KUAP_READ_WRITE);
>>>> }
>>>>
>>>> -static inline bool
>>>> -bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
>>>> +static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address,
>>>> + bool is_write, unsigned long error_code)
>>>> {
>>>> unsigned long begin = regs->kuap & 0xf0000000;
>>>> unsigned long end = regs->kuap << 28;
>>>> diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
>>>> index 4a3d0d601745..2922c442a218 100644
>>>> --- a/arch/powerpc/include/asm/book3s/64/kup.h
>>>> +++ b/arch/powerpc/include/asm/book3s/64/kup.h
>>>> @@ -301,12 +301,29 @@ static inline void set_kuap(unsigned long value)
>>>> isync();
>>>> }
>>>>
>>>> -static inline bool
>>>> -bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
>>>> +#define RADIX_KUAP_BLOCK_READ UL(0x4000000000000000)
>>>> +#define RADIX_KUAP_BLOCK_WRITE UL(0x8000000000000000)
>>>> +
>>>> +static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address,
>>>> + bool is_write, unsigned long error_code)
>>>> {
>>>> - return WARN(mmu_has_feature(MMU_FTR_KUAP) &&
>>>> - (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)),
>>>> - "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
>>>> + if (!mmu_has_feature(MMU_FTR_KUAP))
>>>> + return false;
>>>> +
>>>> + if (radix_enabled()) {
>>>> + /*
>>>> + * Will be a storage protection fault.
>>>> + * Only check the details of AMR[0]
>>>> + */
>>>> + return WARN((regs->kuap & (is_write ? RADIX_KUAP_BLOCK_WRITE : RADIX_KUAP_BLOCK_READ)),
>>>> + "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
>>>
>>> I think it is pointless to keep the WARN() here.
>>>
>>> I have a series aiming at removing them. See
>>> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/cc9129bdda1dbc2f0a09cf45fece7d0b0e690784.1605541983.git.christophe.leroy@csgroup.eu/
>>
>> Can we do this as a spearate patch as you posted above? We can drop the
>> WARN in that while keeping the hash branch to look at DSISR value.
>
> Yeah we can reconcile Christophe's series with yours later.
>
> I'm still not 100% convinced I want to drop that WARN.
Ok, you can still take the rest of the series as that patch is the last one.
But, I really can't see the point with the WARN. When I hip a kuap bad fault, I get a double dump
(see below). The second one is the interesting one, it tells me everything about the fault. But the
WARN provides internals of do_page_fault() function. What interesting information do I get from there ?
[ 37.842509] lkdtm: attempting bad write at b7bae000
[ 37.842526] ------------[ cut here ]------------
[ 37.842536] Bug: write fault blocked by segment registers !
[ 37.842598] WARNING: CPU: 0 PID: 434 at arch/powerpc/include/asm/book3s/32/kup.h:189
do_page_fault+0x3c8/0x5f0
[ 37.842630] CPU: 0 PID: 434 Comm: busybox Not tainted 5.10.0-rc5-s3k-dev-01343-g8bec80f73baa #4165
[ 37.842650] NIP: c00155e4 LR: c00155e4 CTR: 00000000
[ 37.842670] REGS: e6719c78 TRAP: 0700 Not tainted (5.10.0-rc5-s3k-dev-01343-g8bec80f73baa)
[ 37.842683] MSR: 00021032 <ME,IR,DR,RI> CR: 22002224 XER: 20000000
[ 37.842750]
[ 37.842750] GPR00: c00155e4 e6719d30 c113c660 0000002f c097adf8 c097af10 00000027 00000027
[ 37.842750] GPR08: c0b0afbc 00000000 00000023 00000001 24002224 100d166a 100a0920 00000000
[ 37.842750] GPR16: 100cac0c 100b0000 10169444 1016a685 100d0000 100d0000 00000000 100a0900
[ 37.842750] GPR24: ffffffef ffffffff c1392220 00000300 c076f424 02000000 b7bae000 e6719d70
[ 37.843049] NIP [c00155e4] do_page_fault+0x3c8/0x5f0
[ 37.843074] LR [c00155e4] do_page_fault+0x3c8/0x5f0
[ 37.843087] Call Trace:
[ 37.843114] [e6719d30] [c00155e4] do_page_fault+0x3c8/0x5f0 (unreliable)
[ 37.843154] [e6719d60] [c0014384] handle_page_fault+0x10/0x3c
[ 37.843211] --- interrupt: 301 at lkdtm_ACCESS_USERSPACE+0xdc/0xe4
[ 37.843211] LR = lkdtm_ACCESS_USERSPACE+0xd0/0xe4
[ 37.843238] [e6719e48] [c039d76c] direct_entry+0xe0/0x164
[ 37.843281] [e6719e68] [c0286730] full_proxy_write+0x78/0xbc
[ 37.843325] [e6719e88] [c01657a8] vfs_write+0xdc/0x458
[ 37.843359] [e6719f08] [c0165cb0] ksys_write+0x6c/0x11c
[ 37.843397] [e6719f38] [c0014164] ret_from_syscall+0x0/0x34
[ 37.843426] --- interrupt: c01 at 0xfd55784
[ 37.843426] LR = 0xfe16244
[ 37.843438] Instruction dump:
[ 37.843459] 38600007 4bff7a19 3bc00000 4bfffdbc 419e0110 813f00b0 55280006 7c1e4040
[ 37.843529] 408000f4 3c60c080 3863e148 4801552d <0fe00000> 3c80c072 3c60c097 38840d84
[ 37.843602] ---[ end trace 29c115c8ef352681 ]---
[ 37.843627] Kernel attempted to write user page (b7bae000) - exploit attempt? (uid: 0)
[ 37.851531] BUG: Unable to handle kernel data access on write at 0xb7bae000
[ 37.858472] Faulting instruction address: 0xc039e550
[ 37.863432] Oops: Kernel access of bad area, sig: 11 [#1]
[ 37.868822] BE PAGE_SIZE=4K PREEMPT CMPCPRO
[ 37.873029] SAF3000 DIE NOTIFICATION
[ 37.876624] CPU: 0 PID: 434 Comm: busybox Tainted: G W
5.10.0-rc5-s3k-dev-01343-g8bec80f73baa #4165
[ 37.886940] NIP: c039e550 LR: c039e544 CTR: 00000000
[ 37.891988] REGS: e6719d70 TRAP: 0300 Tainted: G W
(5.10.0-rc5-s3k-dev-01343-g8bec80f73baa)
[ 37.901866] MSR: 00009032 <EE,ME,IR,DR,RI> CR: 24002224 XER: 00000000
[ 37.908617] DAR: b7bae000 DSISR: 0a000000
[ 37.908617] GPR00: c039e544 e6719e28 c113c660 c083aad8 c097adf8 c097af10 00000027 00000027
[ 37.908617] GPR08: c0b0afbc c0dec0de 00000023 00000001 28002224 100d166a 100a0920 00000000
[ 37.908617] GPR16: 100cac0c 100b0000 10169444 1016a685 100d0000 100d0000 00000000 100a0900
[ 37.908617] GPR24: ffffffef ffffffff e6719f10 00000011 c076f424 c1cb1000 c0839e24 b7bae000
[ 37.946267] NIP [c039e550] lkdtm_ACCESS_USERSPACE+0xdc/0xe4
[ 37.951842] LR [c039e544] lkdtm_ACCESS_USERSPACE+0xd0/0xe4
[ 37.957316] Call Trace:
[ 37.959782] [e6719e28] [c039e544] lkdtm_ACCESS_USERSPACE+0xd0/0xe4 (unreliable)
[ 37.967102] [e6719e48] [c039d76c] direct_entry+0xe0/0x164
[ 37.972524] [e6719e68] [c0286730] full_proxy_write+0x78/0xbc
[ 37.978204] [e6719e88] [c01657a8] vfs_write+0xdc/0x458
[ 37.983358] [e6719f08] [c0165cb0] ksys_write+0x6c/0x11c
[ 37.988605] [e6719f38] [c0014164] ret_from_syscall+0x0/0x34
[ 37.994185] --- interrupt: c01 at 0xfd55784
[ 37.994185] LR = 0xfe16244
[ 38.001385] Instruction dump:
[ 38.004360] 3863ac00 3d29c0df 3929c0de 91210008 4bcd04c9 3c60c084 3863ac24 7fe4fb78
[ 38.012149] 4bcd04b9 3c60c084 81210008 3863aad8 <913f0000> 4bffff80 3c60c084 3863aa00
[ 38.020120] ---[ end trace 29c115c8ef352682 ]---
Christophe
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox