Linux s390 Architecture development
 help / color / mirror / Atom feed
From: Heiko Carstens <hca@linux.ibm.com>
To: Alexander Gordeev <agordeev@linux.ibm.com>,
	Sven Schnelle <svens@linux.ibm.com>,
	Vasily Gorbik <gor@linux.ibm.com>,
	Christian Borntraeger <borntraeger@linux.ibm.com>,
	Juergen Christ <jchrist@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>,
	Yang Shi <yang@os.amperecomputing.com>,
	Shrikanth Hegde <sshegde@linux.ibm.com>,
	linux-kernel@vger.kernel.org, linux-s390@vger.kernel.org
Subject: [PATCH v3 2/9] s390/percpu: Infrastructure for more efficient this_cpu operations
Date: Wed, 20 May 2026 11:22:36 +0200	[thread overview]
Message-ID: <20260520092243.264847-3-hca@linux.ibm.com> (raw)
In-Reply-To: <20260520092243.264847-1-hca@linux.ibm.com>

With the intended removal of PREEMPT_NONE this_cpu operations based on
atomic instructions, guarded with preempt_disable()/preempt_enable() pairs
become more expensive: the preempt_disable() / preempt_enable() pairs are
not optimized away anymore during compile time.

In particular the conditional call to preempt_schedule_notrace() after
preempt_enable() adds additional code and register pressure.

E.g. this simple C code sequence

DEFINE_PER_CPU(long, foo);
long bar(long a) { return this_cpu_add_return(foo, a); }

generates this code:

  11a976:       eb af f0 68 00 24       stmg    %r10,%r15,104(%r15)
  11a97c:       b9 04 00 ef             lgr     %r14,%r15
  11a980:       b9 04 00 b2             lgr     %r11,%r2
  11a984:       e3 f0 ff c8 ff 71       lay     %r15,-56(%r15)
  11a98a:       e3 e0 f0 98 00 24       stg     %r14,152(%r15)
  11a990:       eb 01 03 a8 00 6a       asi     936,1            <- __preempt_count_add(1)
  11a996:       c0 10 00 d2 ac b5       larl    %r1,1b70300      <- address of percpu var
  11a9a0:       e3 10 23 b8 00 08       ag      %r1,952          <- add percpu offset
  11a9a6:       eb ab 10 00 00 e8       laag    %r10,%r11,0(%r1) <- atomic op
  11a9ac:       eb ff 03 a8 00 6e       alsi    936,-1           <- __preempt_count_dec_and_test()
  11a9b2:       a7 54 00 05             jnhe    11a9bc <bar+0x4c>
  11a9b6:       c0 e5 00 76 d1 bd       brasl   %r14,ff4d30 <preempt_schedule_notrace>
  11a9bc:       b9 e8 b0 2a             agrk    %r2,%r10,%r11
  11a9c0:       eb af f0 a0 00 04       lmg     %r10,%r15,160(%r15)
  11a9c6        07 fe                   br      %r14

Even though the above example is more or less the worst case, since the
branch to preempt_schedule_notrace() requires a stackframe, which
otherwise wouldn't be necessary, there is also the conditional jnhe branch
instruction.

Get rid of the conditional branch with the following code sequence:

  11a8e6:       c0 30 00 d0 c5 0d       larl    %r3,1b33300
  11a8ec:       b9 04 00 43             lgr     %r4,%r3
  11a8f0:       eb 00 43 c0 00 52       mviy    960,4
  11a8f6:       e3 40 03 b8 00 08       ag      %r4,952
  11a8fc:       eb 52 40 00 00 e8       laag    %r5,%r2,0(%r4)
  11a902:       eb 00 03 c0 00 52       mviy    960,0
  11a908:       b9 08 00 25             agr     %r2,%r5
  11a90c        07 fe                   br      %r14

The general idea is that this_cpu operations based on atomic instructions
are guarded with mviy instructions:

- The first mviy instruction writes the register number, which contains
  the percpu address variable to lowcore. This also indicates that a
  percpu code section is executed.

- The first instruction following the mviy instruction must be the ag
  instruction which adds the percpu offset to the percpu address register.

- Afterwards the atomic percpu operation follows.

- Then a second mviy instruction writes a zero to lowcore, which indicates
  the end of the percpu code section.

- In case of an interrupt/exception/nmi the register number which was
  written to lowcore is copied to the exception frame (pt_regs), and a zero
  is written to lowcore.

- On return to the previous context it is checked if a percpu code section
  was executed (saved register number not zero), and if the process was
  migrated to a different cpu. If the percpu offset was already added to
  the percpu address register (instruction address does _not_ point to the
  ag instruction) the content of the percpu address register is adjusted so
  it points to percpu variable of the new cpu.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/include/asm/entry-percpu.h | 76 ++++++++++++++++++++++++++++
 arch/s390/include/asm/lowcore.h      |  3 +-
 arch/s390/include/asm/percpu.h       | 54 ++++++++++++++++++++
 arch/s390/include/asm/ptrace.h       |  2 +
 arch/s390/kernel/irq.c               | 26 +++++++---
 arch/s390/kernel/nmi.c               |  6 +++
 arch/s390/kernel/traps.c             |  6 +++
 7 files changed, 165 insertions(+), 8 deletions(-)
 create mode 100644 arch/s390/include/asm/entry-percpu.h

diff --git a/arch/s390/include/asm/entry-percpu.h b/arch/s390/include/asm/entry-percpu.h
new file mode 100644
index 000000000000..e25108e773ab
--- /dev/null
+++ b/arch/s390/include/asm/entry-percpu.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_S390_ENTRY_PERCPU_H
+#define ARCH_S390_ENTRY_PERCPU_H
+
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <asm/lowcore.h>
+#include <asm/ptrace.h>
+#include <asm/asm-offsets.h>
+
+static __always_inline void percpu_entry(struct pt_regs *regs)
+{
+	struct lowcore *lc = get_lowcore();
+
+	if (user_mode(regs))
+		return;
+	regs->cpu = lc->cpu_nr;
+	regs->percpu_register = lc->percpu_register;
+	lc->percpu_register = 0;
+}
+
+static __always_inline bool percpu_code_check(struct pt_regs *regs)
+{
+	unsigned int insn, disp;
+	struct kprobe *p;
+
+	if (user_mode(regs) || !regs->percpu_register)
+		return false;
+	/*
+	 * Within a percpu code section - check if the percpu base register
+	 * needs to be updated. This is the case if the PSW does not point to
+	 * the ADD instruction within the section.
+	 * - AG %rx,percpu_offset_in_lowcore(%r0,%r0)
+	 * which adds the percpu offset to the percpu base register.
+	 */
+	lockdep_assert_preemption_disabled();
+again:
+	insn = READ_ONCE(*(u16 *)psw_bits(regs->psw).ia);
+	if (unlikely(insn == BREAKPOINT_INSTRUCTION)) {
+		p = get_kprobe((void *)psw_bits(regs->psw).ia);
+		/*
+		 * If the kprobe is concurrently removed on a different CPU
+		 * it might not be found anymore. However text must have
+		 * been restored - try again.
+		 */
+		if (!p)
+			goto again;
+		insn = p->opcode;
+	}
+	if ((insn & 0xff0f) != 0xe300)
+		return false;
+	disp = offsetof(struct lowcore, percpu_offset);
+	if (machine_has_relocated_lowcore())
+		disp += LOWCORE_ALT_ADDRESS;
+	insn = (disp & 0xff000) >> 4 | (disp & 0x00fff) << 16 | 0x8;
+	if (*(u32 *)(psw_bits(regs->psw).ia + 2) != insn)
+		return false;
+	return true;
+}
+
+static __always_inline void percpu_fixup(struct pt_regs *regs)
+{
+	struct lowcore *lc = get_lowcore();
+	unsigned char reg;
+
+	reg = regs->percpu_register;
+	lc->percpu_register = reg;
+	/* Check if process has been migrated to a different CPU. */
+	if (regs->cpu == lc->cpu_nr)
+		return;
+	/* Fixup percpu base register */
+	regs->gprs[reg] -= __per_cpu_offset[regs->cpu];
+	regs->gprs[reg] += lc->percpu_offset;
+}
+
+#endif
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 50ffe75adeb4..cd1ddfdb5d35 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -165,7 +165,8 @@ struct lowcore {
 	__u32	spinlock_index;			/* 0x03b0 */
 	__u8	pad_0x03b4[0x03b8-0x03b4];	/* 0x03b4 */
 	__u64	percpu_offset;			/* 0x03b8 */
-	__u8	pad_0x03c0[0x0400-0x03c0];	/* 0x03c0 */
+	__u8	percpu_register;		/* 0x03c0 */
+	__u8	pad_0x03c1[0x0400-0x03c1];	/* 0x03c1 */
 
 	__u32	return_lpswe;			/* 0x0400 */
 	__u32	return_mcck_lpswe;		/* 0x0404 */
diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index b18a96f3a334..1af622a8aa67 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -60,6 +60,60 @@
 #define this_cpu_or_1(pcp, val)		arch_this_cpu_to_op_simple(pcp, val, |)
 #define this_cpu_or_2(pcp, val)		arch_this_cpu_to_op_simple(pcp, val, |)
 
+/*
+ * Macros to be used for percpu code section based on atomic instructions.
+ *
+ * Avoid the need to use preempt_disable() / preempt_disable() pairs and the
+ * conditional preempt_schedule_notrace() function calls which come with
+ * this. The idea is that this_cpu operations based on atomic instructions are
+ * guarded with mviy instructions:
+ *
+ * - The first mviy instruction writes the register number, which contains the
+ *   percpu address variable to lowcore. This also indicates that a percpu
+ *   code section is executed.
+ *
+ * - The first mviy instruction following the mviy instruction must be the ag
+ *   instruction which adds the percpu offset to the percpu address register.
+ *
+ * - Afterwards the atomic percpu operation follows.
+ *
+ * - Then a second mviy instruction writes a zero to lowcore, which indicates
+ *   the end of the percpu code section.
+ *
+ * - In case of an interrupt/exception/nmi the register number which was
+ *   written to lowcore is copied to the exception frame (pt_regs), and a zero
+ *   is written to lowcore.
+ *
+ * - On return to the previous context it is checked if a percpu code section
+ *   was executed (saved register number not zero), and if the process was
+ *   migrated to a different cpu. If the percpu offset was already added to
+ *   the percpu address register (instruction address does _not_ point to the
+ *   ag instruction) the content of the percpu address register is adjusted so
+ *   it points to percpu variable of the new cpu.
+ *
+ * Inline assemblies making use of this typically have a code sequence like:
+ *
+ *   MVIY_PERCPU(...) <- start of percpu code section
+ *   AG_ALT(...)      <- add percpu offset; must be the second instruction
+ *   atomic_op	      <- atomic op
+ *   MVIY_ALT(...)    <- end of percpu code section
+ */
+
+#define MVIY_PERCPU(disp, dispalt, base)				\
+	ALTERNATIVE("	mviy	" disp	  "(" base " ),0\n",		\
+		    "	mviy	" dispalt "(" base " ),0\n",		\
+		    ALT_PERCPU(0))
+
+#define MVIY_ALT(disp, dispalt, base)					\
+	ALTERNATIVE("	mviy	" disp	  "(" base " ),0\n",		\
+		    "	mviy	" dispalt "(" base " ),0\n",		\
+		    ALT_FEATURE(MFEATURE_LOWCORE))
+
+#define AG_ALT(disp, dispalt, reg)					\
+	ALTERNATIVE("	ag	" reg ", " disp    "(%%r0)\n",		\
+		    "	ag	" reg ", " dispalt "(%%r0)\n",		\
+		    ALT_FEATURE(MFEATURE_LOWCORE))
+
 #ifndef MARCH_HAS_Z196_FEATURES
 
 #define this_cpu_add_4(pcp, val)	arch_this_cpu_to_op_simple(pcp, val, +)
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index aaceb1d9110a..495e310c3d6d 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -134,6 +134,8 @@ struct pt_regs {
 	};
 	unsigned long flags;
 	unsigned long last_break;
+	unsigned int cpu;
+	unsigned char percpu_register;
 };
 
 /*
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index d10a17e6531d..92fdc2ae96f8 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -33,6 +33,7 @@
 #include <asm/softirq_stack.h>
 #include <asm/vtime.h>
 #include <asm/asm.h>
+#include <asm/entry-percpu.h>
 #include "entry.h"
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
@@ -142,10 +143,13 @@ static int irq_pending(struct pt_regs *regs)
 
 void noinstr do_io_irq(struct pt_regs *regs)
 {
-	irqentry_state_t state = irqentry_enter(regs);
-	struct pt_regs *old_regs = set_irq_regs(regs);
-	bool from_idle;
+	bool from_idle, percpu_needs_fixup;
+	struct pt_regs *old_regs;
+	irqentry_state_t state;
 
+	percpu_entry(regs);
+	state = irqentry_enter(regs);
+	old_regs = set_irq_regs(regs);
 	from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT);
 	if (from_idle)
 		update_timer_idle();
@@ -170,21 +174,26 @@ void noinstr do_io_irq(struct pt_regs *regs)
 			do_irq_async(regs, IO_INTERRUPT);
 	} while (machine_is_lpar() && irq_pending(regs));
 
+	percpu_needs_fixup = percpu_code_check(regs);
 	irq_exit_rcu();
-
 	set_irq_regs(old_regs);
 	irqentry_exit(regs, state);
 
 	if (from_idle)
 		regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT);
+	if (percpu_needs_fixup)
+		percpu_fixup(regs);
 }
 
 void noinstr do_ext_irq(struct pt_regs *regs)
 {
-	irqentry_state_t state = irqentry_enter(regs);
-	struct pt_regs *old_regs = set_irq_regs(regs);
-	bool from_idle;
+	bool from_idle, percpu_needs_fixup;
+	struct pt_regs *old_regs;
+	irqentry_state_t state;
 
+	percpu_entry(regs);
+	state = irqentry_enter(regs);
+	old_regs = set_irq_regs(regs);
 	from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT);
 	if (from_idle)
 		update_timer_idle();
@@ -206,12 +215,15 @@ void noinstr do_ext_irq(struct pt_regs *regs)
 
 	do_irq_async(regs, EXT_INTERRUPT);
 
+	percpu_needs_fixup = percpu_code_check(regs);
 	irq_exit_rcu();
 	set_irq_regs(old_regs);
 	irqentry_exit(regs, state);
 
 	if (from_idle)
 		regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT);
+	if (percpu_needs_fixup)
+		percpu_fixup(regs);
 }
 
 static void show_msi_interrupt(struct seq_file *p, int irq)
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 94fbfad49f62..d43cc18fe9be 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/sched/signal.h>
 #include <linux/kvm_host.h>
+#include <asm/entry-percpu.h>
 #include <asm/lowcore.h>
 #include <asm/ctlreg.h>
 #include <asm/fpu.h>
@@ -363,6 +364,7 @@ NOKPROBE_SYMBOL(s390_backup_mcck_info);
  */
 void notrace s390_do_machine_check(struct pt_regs *regs)
 {
+	bool percpu_needs_fixup;
 	static int ipd_count;
 	static DEFINE_SPINLOCK(ipd_lock);
 	static unsigned long long last_ipd;
@@ -374,6 +376,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 	unsigned long mcck_dam_code;
 	int mcck_pending = 0;
 
+	percpu_entry(regs);
 	irq_state = irqentry_nmi_enter(regs);
 
 	if (user_mode(regs))
@@ -495,7 +498,10 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 	if (mcck_pending)
 		schedule_mcck_handler();
 
+	percpu_needs_fixup = percpu_code_check(regs);
 	irqentry_nmi_exit(regs, irq_state);
+	if (percpu_needs_fixup)
+		percpu_fixup(regs);
 }
 NOKPROBE_SYMBOL(s390_do_machine_check);
 
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 1b5c6fc431cc..fb16e9bee80b 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -24,6 +24,7 @@
 #include <linux/entry-common.h>
 #include <linux/kmsan.h>
 #include <linux/bug.h>
+#include <asm/entry-percpu.h>
 #include <asm/asm-extable.h>
 #include <asm/irqflags.h>
 #include <asm/ptrace.h>
@@ -329,6 +330,7 @@ static void (*pgm_check_table[128])(struct pt_regs *regs);
 void noinstr __do_pgm_check(struct pt_regs *regs)
 {
 	struct lowcore *lc = get_lowcore();
+	bool percpu_needs_fixup;
 	irqentry_state_t state;
 	unsigned int trapnr;
 	union teid teid;
@@ -349,6 +351,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
 		current->thread.gmap_int_code = regs->int_code & 0xffff;
 		return;
 	}
+	percpu_entry(regs);
 	state = irqentry_enter(regs);
 	if (user_mode(regs)) {
 		update_timer_sys();
@@ -385,7 +388,10 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
 		pgm_check_table[trapnr](regs);
 out:
 	local_irq_disable();
+	percpu_needs_fixup = percpu_code_check(regs);
 	irqentry_exit(regs, state);
+	if (percpu_needs_fixup)
+		percpu_fixup(regs);
 }
 
 /*
-- 
2.51.0


  parent reply	other threads:[~2026-05-20  9:23 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-20  9:22 [PATCH v3 0/9] s390: Improve this_cpu operations Heiko Carstens
2026-05-20  9:22 ` [PATCH v3 1/9] s390/alternatives: Add new ALT_TYPE_PERCPU type Heiko Carstens
2026-05-20 12:43   ` David Laight
2026-05-20 13:50     ` Heiko Carstens
2026-05-20 14:16       ` Heiko Carstens
2026-05-20  9:22 ` Heiko Carstens [this message]
2026-05-20  9:22 ` [PATCH v3 3/9] s390/percpu: Add missing do { } while (0) constructs Heiko Carstens
2026-05-20  9:22 ` [PATCH v3 4/9] s390/percpu: Use new percpu code section for arch_this_cpu_add() Heiko Carstens
2026-05-20  9:22 ` [PATCH v3 5/9] s390/percpu: Use new percpu code section for arch_this_cpu_add_return() Heiko Carstens
2026-05-20  9:22 ` [PATCH v3 6/9] s390/percpu: Use new percpu code section for arch_this_cpu_[and|or]() Heiko Carstens
2026-05-20  9:22 ` [PATCH v3 7/9] s390/percpu: Provide arch_this_cpu_read() implementation Heiko Carstens
2026-05-20  9:22 ` [PATCH v3 8/9] s390/percpu: Provide arch_this_cpu_write() implementation Heiko Carstens
2026-05-20  9:22 ` [PATCH v3 9/9] s390/percpu: Remove one and two byte this_cpu operation implementation Heiko Carstens
2026-05-20 18:42 ` [PATCH v3 0/9] s390: Improve this_cpu operations Yang Shi
2026-05-20 22:34   ` David Laight
2026-05-21  0:23     ` Yang Shi
2026-05-21 10:17       ` David Laight
2026-05-21 16:57         ` Yang Shi
2026-05-21 17:55           ` David Laight
2026-05-21 20:46             ` Yang Shi
2026-05-21 22:13               ` David Laight
2026-05-21 23:41                 ` Yang Shi
2026-05-21 10:23       ` David Laight
2026-05-21 17:48         ` Yang Shi
2026-05-21 10:37       ` Heiko Carstens
2026-05-21 17:47         ` Yang Shi
2026-05-22  9:18           ` Heiko Carstens
2026-05-27 19:09             ` Christoph Lameter (Ampere)
2026-05-27 20:38               ` Yang Shi
2026-05-28  8:36                 ` David Laight
2026-05-27 23:44             ` Yang Shi
2026-05-28  9:03               ` David Laight
2026-05-28 19:19                 ` Yang Shi
2026-05-28 20:34                   ` David Laight
2026-05-28 14:14               ` Heiko Carstens
2026-05-28 17:14                 ` David Laight
2026-05-28 18:39                 ` Yang Shi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260520092243.264847-3-hca@linux.ibm.com \
    --to=hca@linux.ibm.com \
    --cc=agordeev@linux.ibm.com \
    --cc=borntraeger@linux.ibm.com \
    --cc=gor@linux.ibm.com \
    --cc=jchrist@linux.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=peterz@infradead.org \
    --cc=sshegde@linux.ibm.com \
    --cc=svens@linux.ibm.com \
    --cc=yang@os.amperecomputing.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox