linux-arm-kernel.lists.infradead.org archive mirror
* [PATCH 0/2] use adrp/add pairs for PLT entries
@ 2018-11-22  8:46 Ard Biesheuvel
  2018-11-22  8:46 ` [PATCH 1/2] arm64/insn: add support for emitting ADR/ADRP instructions Ard Biesheuvel
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2018-11-22  8:46 UTC (permalink / raw)
  To: linux-arm-kernel

Currently, PLT entries use a non-idiomatic movn/movk/movk/br instruction
sequence, which is also longer than necessary. In addition, the code
emitting them does not use the instruction generation code but open-codes
the opcodes directly.

The extended KASLR range is now 4 GB, given that we switched to the
small code model everywhere else (including for modules), so we can
switch to adrp/add/br sequences, which are shorter and easier on the
I-cache.

So implement adrp/add pair generation in the instruction generation code
and wire it up into the PLT code. Note that the Cortex-A53 erratum
handling requires some special care to ensure that generated veneers
are not susceptible to the erratum.
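
For reference, the address produced by an adrp/add pair can be modelled
as follows (a minimal C sketch for illustration only; the helper name
and the open-coded bit manipulation are illustrative, not part of the
patches):

	#include <stdint.h>

	/* adrp Xn, sym      : Xn = (pc & ~0xfff) + (imm21 << 12)	*/
	/* add  Xn, Xn, #lo12: Xn += sym & 0xfff			*/
	static uint64_t adrp_add_target(uint64_t pc, int64_t imm21,
					uint64_t lo12)
	{
		return (pc & ~0xfffUL) + ((uint64_t)imm21 << 12) + lo12;
	}

The signed 21-bit ADRP immediate covers +/- 4 GB from the page of the
adrp instruction, which is exactly what the reduced KASLR range
guarantees for module branch targets.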

Cc: Torsten Duwe <duwe@lst.de>
Cc: Jessica Yu <jeyu@kernel.org>

Ard Biesheuvel (2):
  arm64/insn: add support for emitting ADR/ADRP instructions
  arm64/module: switch to ADRP/ADD sequences for PLT entries

 arch/arm64/include/asm/insn.h   |  8 ++
 arch/arm64/include/asm/module.h | 36 ++------
 arch/arm64/kernel/ftrace.c      |  2 +-
 arch/arm64/kernel/insn.c        | 29 ++++++
 arch/arm64/kernel/module-plts.c | 93 +++++++++++++++-----
 arch/arm64/kernel/module.c      |  4 +-
 6 files changed, 119 insertions(+), 53 deletions(-)

-- 
2.17.1


* [PATCH 1/2] arm64/insn: add support for emitting ADR/ADRP instructions
  2018-11-22  8:46 [PATCH 0/2] use adrp/add pairs for PLT entries Ard Biesheuvel
@ 2018-11-22  8:46 ` Ard Biesheuvel
  2018-11-22  8:46 ` [PATCH 2/2] arm64/module: switch to ADRP/ADD sequences for PLT entries Ard Biesheuvel
  2018-11-27 19:44 ` [PATCH 0/2] use adrp/add pairs " Will Deacon
  2 siblings, 0 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2018-11-22  8:46 UTC (permalink / raw)
  To: linux-arm-kernel

Add support for emitting ADR and ADRP instructions so we can switch
over our PLT generation code in a subsequent patch.
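
For illustration, the new helper is intended to be used along these
lines (a sketch only, mirroring how the next patch generates the ADRP
of a PLT entry; the surrounding error handling is hypothetical):

	u32 insn;

	/* emit 'adrp x16, <page of addr>' at address pc */
	insn = aarch64_insn_gen_adr(pc, addr, AARCH64_INSN_REG_16,
				    AARCH64_INSN_ADR_TYPE_ADRP);
	if (insn == AARCH64_BREAK_FAULT)
		return -ERANGE;	/* addr out of ADRP range of pc */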

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/include/asm/insn.h |  8 ++++++
 arch/arm64/kernel/insn.c      | 29 ++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index c6802dea6cab..9c01f04db64d 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -261,6 +261,11 @@ enum aarch64_insn_prfm_policy {
 	AARCH64_INSN_PRFM_POLICY_STRM,
 };
 
+enum aarch64_insn_adr_type {
+	AARCH64_INSN_ADR_TYPE_ADRP,
+	AARCH64_INSN_ADR_TYPE_ADR,
+};
+
 #define	__AARCH64_INSN_FUNCS(abbr, mask, val)	\
 static __always_inline bool aarch64_insn_is_##abbr(u32 code) \
 { return (code & (mask)) == (val); } \
@@ -393,6 +398,9 @@ u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst,
 				 enum aarch64_insn_register src,
 				 int imm, enum aarch64_insn_variant variant,
 				 enum aarch64_insn_adsb_type type);
+u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr,
+			 enum aarch64_insn_register reg,
+			 enum aarch64_insn_adr_type type);
 u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst,
 			      enum aarch64_insn_register src,
 			      int immr, int imms,
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 2b3413549734..7820a4a688fa 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -1239,6 +1239,35 @@ u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
 	return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift);
 }
 
+u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr,
+			 enum aarch64_insn_register reg,
+			 enum aarch64_insn_adr_type type)
+{
+	u32 insn;
+	s32 offset;
+
+	switch (type) {
+	case AARCH64_INSN_ADR_TYPE_ADR:
+		insn = aarch64_insn_get_adr_value();
+		offset = addr - pc;
+		break;
+	case AARCH64_INSN_ADR_TYPE_ADRP:
+		insn = aarch64_insn_get_adrp_value();
+		offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12;
+		break;
+	default:
+		pr_err("%s: unknown adr encoding %d\n", __func__, type);
+		return AARCH64_BREAK_FAULT;
+	}
+
+	if (offset < -SZ_1M || offset >= SZ_1M)
+		return AARCH64_BREAK_FAULT;
+
+	insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg);
+
+	return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset);
+}
+
 /*
  * Decode the imm field of a branch, and return the byte offset as a
  * signed value (so it can be used when computing a new branch
-- 
2.17.1


* [PATCH 2/2] arm64/module: switch to ADRP/ADD sequences for PLT entries
  2018-11-22  8:46 [PATCH 0/2] use adrp/add pairs for PLT entries Ard Biesheuvel
  2018-11-22  8:46 ` [PATCH 1/2] arm64/insn: add support for emitting ADR/ADRP instructions Ard Biesheuvel
@ 2018-11-22  8:46 ` Ard Biesheuvel
  2018-11-23 16:11   ` Torsten Duwe
  2018-11-27 19:44 ` [PATCH 0/2] use adrp/add pairs " Will Deacon
  2 siblings, 1 reply; 8+ messages in thread
From: Ard Biesheuvel @ 2018-11-22  8:46 UTC (permalink / raw)
  To: linux-arm-kernel

Now that we have switched to the small code model entirely, and
reduced the extended KASLR range to 4 GB, we can be sure that the
targets of relative branches that are out of range are in range
for an ADRP/ADD pair, which is one instruction shorter than our
current MOVN/MOVK/MOVK sequence, and is more idiomatic and so more
likely to be implemented efficiently by micro-architectures.

So switch over the ordinary PLT code and the special handling of
the Cortex-A53 ADRP erratum, as well as the ftrace trampoline
handling.
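
To recap the erratum angle: Cortex-A53 erratum 843419 may affect an
ADRP that lands in the last two instruction slots of a 4 KB page, so
such PLT slots have to be skipped. The check added by this patch boils
down to the following (sketch only; the helper name below is made up,
the in-tree version is is_forbidden_offset_for_adrp() and additionally
tests the config option and the CPU capability):

	/* ADRP in the last 8 bytes of a 4 KB page may hit erratum 843419 */
	static bool adrp_offset_is_forbidden(void *place)
	{
		return ((u64)place & 0xfff) >= 0xff8;
	}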

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/include/asm/module.h | 36 ++------
 arch/arm64/kernel/ftrace.c      |  2 +-
 arch/arm64/kernel/module-plts.c | 93 +++++++++++++++-----
 arch/arm64/kernel/module.c      |  4 +-
 4 files changed, 82 insertions(+), 53 deletions(-)

diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h
index 97d0ef12e2ff..9ce31b056ac9 100644
--- a/arch/arm64/include/asm/module.h
+++ b/arch/arm64/include/asm/module.h
@@ -56,39 +56,19 @@ struct plt_entry {
 	 * is exactly what we are dealing with here, we are free to use x16
 	 * as a scratch register in the PLT veneers.
 	 */
-	__le32	mov0;	/* movn	x16, #0x....			*/
-	__le32	mov1;	/* movk	x16, #0x...., lsl #16		*/
-	__le32	mov2;	/* movk	x16, #0x...., lsl #32		*/
+	__le32	adrp;	/* adrp	x16, ....			*/
+	__le32	add;	/* add	x16, x16, #0x....		*/
 	__le32	br;	/* br	x16				*/
 };
 
-static inline struct plt_entry get_plt_entry(u64 val)
+static inline bool is_forbidden_offset_for_adrp(void *place)
 {
-	/*
-	 * MOVK/MOVN/MOVZ opcode:
-	 * +--------+------------+--------+-----------+-------------+---------+
-	 * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] |
-	 * +--------+------------+--------+-----------+-------------+---------+
-	 *
-	 * Rd     := 0x10 (x16)
-	 * hw     := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32)
-	 * opc    := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ)
-	 * sf     := 1 (64-bit variant)
-	 */
-	return (struct plt_entry){
-		cpu_to_le32(0x92800010 | (((~val      ) & 0xffff)) << 5),
-		cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5),
-		cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5),
-		cpu_to_le32(0xd61f0200)
-	};
+	return IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) &&
+	       cpus_have_const_cap(ARM64_WORKAROUND_843419) &&
+	       ((u64)place & 0xfff) >= 0xff8;
 }
 
-static inline bool plt_entries_equal(const struct plt_entry *a,
-				     const struct plt_entry *b)
-{
-	return a->mov0 == b->mov0 &&
-	       a->mov1 == b->mov1 &&
-	       a->mov2 == b->mov2;
-}
+struct plt_entry get_plt_entry(u64 dst, void *pc);
+bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b);
 
 #endif /* __ASM_MODULE_H */
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index 50986e388d2b..2135665a8ab3 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -104,7 +104,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 		 * is added in the future, but for now, the pr_err() below
 		 * deals with a theoretical issue only.
 		 */
-		trampoline = get_plt_entry(addr);
+		trampoline = get_plt_entry(addr, mod->arch.ftrace_trampoline);
 		if (!plt_entries_equal(mod->arch.ftrace_trampoline,
 				       &trampoline)) {
 			if (!plt_entries_equal(mod->arch.ftrace_trampoline,
diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
index f0690c2ca3e0..3c6e5f3a4973 100644
--- a/arch/arm64/kernel/module-plts.c
+++ b/arch/arm64/kernel/module-plts.c
@@ -11,6 +11,55 @@
 #include <linux/module.h>
 #include <linux/sort.h>
 
+static struct plt_entry __get_adrp_add_pair(u64 dst, u64 pc,
+					    enum aarch64_insn_register reg)
+{
+	u32 adrp, add;
+
+	adrp = aarch64_insn_gen_adr(pc, dst, reg, AARCH64_INSN_ADR_TYPE_ADRP);
+	add = aarch64_insn_gen_add_sub_imm(reg, reg, dst % SZ_4K,
+					   AARCH64_INSN_VARIANT_64BIT,
+					   AARCH64_INSN_ADSB_ADD);
+
+	return (struct plt_entry){ cpu_to_le32(adrp), cpu_to_le32(add) };
+}
+
+struct plt_entry get_plt_entry(u64 dst, void *pc)
+{
+	struct plt_entry plt;
+	static u32 br;
+
+	if (!br)
+		br = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_16,
+						 AARCH64_INSN_BRANCH_NOLINK);
+
+	plt = __get_adrp_add_pair(dst, (u64)pc, AARCH64_INSN_REG_16);
+	plt.br = cpu_to_le32(br);
+
+	return plt;
+}
+
+bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b)
+{
+	u64 p, q;
+
+	/*
+	 * Check whether both entries refer to the same target:
+	 * do the cheapest checks first.
+	 */
+	if (a->add != b->add || a->br != b->br)
+		return false;
+
+	p = ALIGN_DOWN((u64)a, SZ_4K);
+	q = ALIGN_DOWN((u64)b, SZ_4K);
+
+	if (a->adrp == b->adrp && p == q)
+		return true;
+
+	return (p + aarch64_insn_adrp_get_offset(le32_to_cpu(a->adrp))) ==
+	       (q + aarch64_insn_adrp_get_offset(le32_to_cpu(b->adrp)));
+}
+
 static bool in_init(const struct module *mod, void *loc)
 {
 	return (u64)loc - (u64)mod->init_layout.base < mod->init_layout.size;
@@ -23,19 +72,23 @@ u64 module_emit_plt_entry(struct module *mod, void *loc, const Elf64_Rela *rela,
 							  &mod->arch.init;
 	struct plt_entry *plt = (struct plt_entry *)pltsec->plt->sh_addr;
 	int i = pltsec->plt_num_entries;
+	int j = i - 1;
 	u64 val = sym->st_value + rela->r_addend;
 
-	plt[i] = get_plt_entry(val);
+	if (is_forbidden_offset_for_adrp(&plt[i].adrp))
+		i++;
+
+	plt[i] = get_plt_entry(val, &plt[i]);
 
 	/*
 	 * Check if the entry we just created is a duplicate. Given that the
 	 * relocations are sorted, this will be the last entry we allocated.
 	 * (if one exists).
 	 */
-	if (i > 0 && plt_entries_equal(plt + i, plt + i - 1))
-		return (u64)&plt[i - 1];
+	if (j >= 0 && plt_entries_equal(plt + i, plt + j))
+		return (u64)&plt[j];
 
-	pltsec->plt_num_entries++;
+	pltsec->plt_num_entries += i - j;
 	if (WARN_ON(pltsec->plt_num_entries > pltsec->plt_max_entries))
 		return 0;
 
@@ -49,35 +102,24 @@ u64 module_emit_veneer_for_adrp(struct module *mod, void *loc, u64 val)
 							  &mod->arch.init;
 	struct plt_entry *plt = (struct plt_entry *)pltsec->plt->sh_addr;
 	int i = pltsec->plt_num_entries++;
-	u32 mov0, mov1, mov2, br;
+	u32 br;
 	int rd;
 
 	if (WARN_ON(pltsec->plt_num_entries > pltsec->plt_max_entries))
 		return 0;
 
+	if (is_forbidden_offset_for_adrp(&plt[i].adrp))
+		i = pltsec->plt_num_entries++;
+
 	/* get the destination register of the ADRP instruction */
 	rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD,
 					  le32_to_cpup((__le32 *)loc));
 
-	/* generate the veneer instructions */
-	mov0 = aarch64_insn_gen_movewide(rd, (u16)~val, 0,
-					 AARCH64_INSN_VARIANT_64BIT,
-					 AARCH64_INSN_MOVEWIDE_INVERSE);
-	mov1 = aarch64_insn_gen_movewide(rd, (u16)(val >> 16), 16,
-					 AARCH64_INSN_VARIANT_64BIT,
-					 AARCH64_INSN_MOVEWIDE_KEEP);
-	mov2 = aarch64_insn_gen_movewide(rd, (u16)(val >> 32), 32,
-					 AARCH64_INSN_VARIANT_64BIT,
-					 AARCH64_INSN_MOVEWIDE_KEEP);
 	br = aarch64_insn_gen_branch_imm((u64)&plt[i].br, (u64)loc + 4,
 					 AARCH64_INSN_BRANCH_NOLINK);
 
-	plt[i] = (struct plt_entry){
-			cpu_to_le32(mov0),
-			cpu_to_le32(mov1),
-			cpu_to_le32(mov2),
-			cpu_to_le32(br)
-		};
+	plt[i] = __get_adrp_add_pair(val, (u64)&plt[i], rd);
+	plt[i].br = cpu_to_le32(br);
 
 	return (u64)&plt[i];
 }
@@ -193,6 +235,15 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num,
 			break;
 		}
 	}
+
+	if (IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) &&
+	    cpus_have_const_cap(ARM64_WORKAROUND_843419))
+		/*
+		 * Add some slack so we can skip PLT slots that may trigger
+		 * the erratum due to the placement of the ADRP instruction.
+		 */
+		ret += DIV_ROUND_UP(ret, (SZ_4K / sizeof(struct plt_entry)));
+
 	return ret;
 }
 
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index f0f27aeefb73..3b6dc4ce7ec7 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -202,9 +202,7 @@ static int reloc_insn_adrp(struct module *mod, __le32 *place, u64 val)
 {
 	u32 insn;
 
-	if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) ||
-	    !cpus_have_const_cap(ARM64_WORKAROUND_843419) ||
-	    ((u64)place & 0xfff) < 0xff8)
+	if (!is_forbidden_offset_for_adrp(place))
 		return reloc_insn_imm(RELOC_OP_PAGE, place, val, 12, 21,
 				      AARCH64_INSN_IMM_ADR);
 
-- 
2.17.1


* [PATCH 2/2] arm64/module: switch to ADRP/ADD sequences for PLT entries
  2018-11-22  8:46 ` [PATCH 2/2] arm64/module: switch to ADRP/ADD sequences for PLT entries Ard Biesheuvel
@ 2018-11-23 16:11   ` Torsten Duwe
  2018-11-23 16:24     ` Ard Biesheuvel
  0 siblings, 1 reply; 8+ messages in thread
From: Torsten Duwe @ 2018-11-23 16:11 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Nov 22, 2018 at 09:46:46AM +0100, Ard Biesheuvel wrote:
> Now that we have switched to the small code model entirely, and
> reduced the extended KASLR range to 4 GB, we can be sure that the
> targets of relative branches that are out of range are in range
> for an ADRP/ADD pair, which is one instruction shorter than our
> current MOVN/MOVK/MOVK sequence, and is more idiomatic and so more
> likely to be implemented efficiently by micro-architectures.
> 
> So switch over the ordinary PLT code and the special handling of
> the Cortex-A53 ADRP erratum, as well as the ftrace trampoline
> handling.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

Generally, an ACK by me, but...

> diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
> index f0690c2ca3e0..3c6e5f3a4973 100644
> --- a/arch/arm64/kernel/module-plts.c
> +++ b/arch/arm64/kernel/module-plts.c
> @@ -11,6 +11,55 @@
>  #include <linux/module.h>
>  #include <linux/sort.h>
>  
> +static struct plt_entry __get_adrp_add_pair(u64 dst, u64 pc,
> +					    enum aarch64_insn_register reg)
> +{
> +	u32 adrp, add;
> +
> +	adrp = aarch64_insn_gen_adr(pc, dst, reg, AARCH64_INSN_ADR_TYPE_ADRP);
> +	add = aarch64_insn_gen_add_sub_imm(reg, reg, dst % SZ_4K,
> +					   AARCH64_INSN_VARIANT_64BIT,
> +					   AARCH64_INSN_ADSB_ADD);
> +
> +	return (struct plt_entry){ cpu_to_le32(adrp), cpu_to_le32(add) };
> +}

Will __get_adrp_add_pair get reused? Otherwise it would just be inlined
below, but then again why is it returning a partial struct plt_entry?

> +struct plt_entry get_plt_entry(u64 dst, void *pc)
> +{
> +	struct plt_entry plt;
> +	static u32 br;

Well, _I_ would call this variable insn_br_x16...

> +	if (!br)
> +		br = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_16,
> +						 AARCH64_INSN_BRANCH_NOLINK);
> +
> +	plt = __get_adrp_add_pair(dst, (u64)pc, AARCH64_INSN_REG_16);
> +	plt.br = cpu_to_le32(br);
> +
> +	return plt;
> +}

But I'm really lost with this one:

> +bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b)
> +{
> +	u64 p, q;
> +
> +	/*
> +	 * Check whether both entries refer to the same target:
> +	 * do the cheapest checks first.
> +	 */
> +	if (a->add != b->add || a->br != b->br)
> +		return false;
> +
> +	p = ALIGN_DOWN((u64)a, SZ_4K);
> +	q = ALIGN_DOWN((u64)b, SZ_4K);
> +
> +	if (a->adrp == b->adrp && p == q)
> +		return true;
> +
> +	return (p + aarch64_insn_adrp_get_offset(le32_to_cpu(a->adrp))) ==
> +	       (q + aarch64_insn_adrp_get_offset(le32_to_cpu(b->adrp)));
> +}

IIUC adr/adrp are PC-relative? So in order to tell whether they lead to
the same destination, their locations (a and b) must be taken into
account _fully_, not just some bits?

Also, plt entries residing at different locations might address the same
target, but (a->add != b->add || a->br != b->br) would yield true
despite that. Is this intended?

	Torsten


* [PATCH 2/2] arm64/module: switch to ADRP/ADD sequences for PLT entries
  2018-11-23 16:11   ` Torsten Duwe
@ 2018-11-23 16:24     ` Ard Biesheuvel
  2018-11-24 12:20       ` Torsten Duwe
  0 siblings, 1 reply; 8+ messages in thread
From: Ard Biesheuvel @ 2018-11-23 16:24 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 23 Nov 2018 at 17:12, Torsten Duwe <duwe@lst.de> wrote:
>
> On Thu, Nov 22, 2018 at 09:46:46AM +0100, Ard Biesheuvel wrote:
> > Now that we have switched to the small code model entirely, and
> > reduced the extended KASLR range to 4 GB, we can be sure that the
> > targets of relative branches that are out of range are in range
> > for an ADRP/ADD pair, which is one instruction shorter than our
> > current MOVN/MOVK/MOVK sequence, and is more idiomatic and so more
> > likely to be implemented efficiently by micro-architectures.
> >
> > So switch over the ordinary PLT code and the special handling of
> > the Cortex-A53 ADRP erratum, as well as the ftrace trampoline
> > handling.
> >
> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>
> Generally, an ACK by me, but...
>
> > diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
> > index f0690c2ca3e0..3c6e5f3a4973 100644
> > --- a/arch/arm64/kernel/module-plts.c
> > +++ b/arch/arm64/kernel/module-plts.c
> > @@ -11,6 +11,55 @@
> >  #include <linux/module.h>
> >  #include <linux/sort.h>
> >
> > +static struct plt_entry __get_adrp_add_pair(u64 dst, u64 pc,
> > +                                         enum aarch64_insn_register reg)
> > +{
> > +     u32 adrp, add;
> > +
> > +     adrp = aarch64_insn_gen_adr(pc, dst, reg, AARCH64_INSN_ADR_TYPE_ADRP);
> > +     add = aarch64_insn_gen_add_sub_imm(reg, reg, dst % SZ_4K,
> > +                                        AARCH64_INSN_VARIANT_64BIT,
> > +                                        AARCH64_INSN_ADSB_ADD);
> > +
> > +     return (struct plt_entry){ cpu_to_le32(adrp), cpu_to_le32(add) };
> > +}
>
> Will __get_adrp_add_pair get reused? Otherwise it would just be inlined
> below, but then again why is it returning a partial struct plt_entry?
>

Because it is used in two places: get_plt_entry() and
module_emit_veneer_for_adrp()

> > +struct plt_entry get_plt_entry(u64 dst, void *pc)
> > +{
> > +     struct plt_entry plt;
> > +     static u32 br;
>
> Well, _I_ would call this variable insn_br_x16...
>
> > +     if (!br)
> > +             br = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_16,
> > +                                              AARCH64_INSN_BRANCH_NOLINK);
> > +
> > +     plt = __get_adrp_add_pair(dst, (u64)pc, AARCH64_INSN_REG_16);
> > +     plt.br = cpu_to_le32(br);
> > +
> > +     return plt;
> > +}
>
> But I'm really lost with this one:
>
> > +bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b)
> > +{
> > +     u64 p, q;
> > +
> > +     /*
> > +      * Check whether both entries refer to the same target:
> > +      * do the cheapest checks first.
> > +      */
> > +     if (a->add != b->add || a->br != b->br)
> > +             return false;
> > +
> > +     p = ALIGN_DOWN((u64)a, SZ_4K);
> > +     q = ALIGN_DOWN((u64)b, SZ_4K);
> > +
> > +     if (a->adrp == b->adrp && p == q)
> > +             return true;
> > +
> > +     return (p + aarch64_insn_adrp_get_offset(le32_to_cpu(a->adrp))) ==
> > +            (q + aarch64_insn_adrp_get_offset(le32_to_cpu(b->adrp)));
> > +}
>
> IIUC adr/adrp are PC-relative? So in order to tell whether they lead to
> the same destination, their locations (a and b) must be taken into
> account _fully_, not just some bits?
>

The criterion is whether they point to the same target.

So the reasoning is as follows:
- if the 'add' or 'br' opcodes are different, they are definitely not equal
- if the 'add' and 'br' opcodes are the same, the 'adrp' opcodes are
the same, and the adrp instructions reside in the same 4 KB sized/4 KB
aligned window, they must point to the same symbol
- otherwise, decode the instructions to see if they point to the same
symbol. Note that we already checked the 'add's so no need to check
them again.

> Also, plt entries residing at different locations might address the same
> target, but (a->add != b->add || a->br != b->br) would yield true
> despite that. Is this intended?
>

If they address the same target, the add will be the same. The br also
has to be the same because we cannot reuse an ordinary PLT as an ADRP
veneer or vice versa.
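
As a worked example (the addresses below are hypothetical and only
meant to illustrate the comparison):

	u64 a = 0xffff000011ab0010;	/* veneer A			*/
	u64 b = 0xffff000011ab0ff0;	/* veneer B, same 4 KB window	*/
	u64 c = 0xffff000011ab1010;	/* veneer C, next window	*/

	/*
	 * A and B both round down to 0xffff000011ab0000, so identical
	 * adrp/add/br opcodes are enough to prove that they share a
	 * target. C rounds down to 0xffff000011ab1000, so its adrp
	 * immediate must be decoded and added to that base before the
	 * targets can be compared.
	 */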


* [PATCH 2/2] arm64/module: switch to ADRP/ADD sequences for PLT entries
  2018-11-23 16:24     ` Ard Biesheuvel
@ 2018-11-24 12:20       ` Torsten Duwe
  0 siblings, 0 replies; 8+ messages in thread
From: Torsten Duwe @ 2018-11-24 12:20 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Nov 23, 2018 at 05:24:13PM +0100, Ard Biesheuvel wrote:
> On Fri, 23 Nov 2018 at 17:12, Torsten Duwe <duwe@lst.de> wrote:
> > On Thu, Nov 22, 2018 at 09:46:46AM +0100, Ard Biesheuvel wrote:
> >
> > But I'm really lost with this one:
> >
> > > +bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b)
> > > +{
> > > +     u64 p, q;
> > > +
> > > +     /*
> > > +      * Check whether both entries refer to the same target:
> > > +      * do the cheapest checks first.
> > > +      */
> > > +     if (a->add != b->add || a->br != b->br)
> > > +             return false;
> > > +
> > > +     p = ALIGN_DOWN((u64)a, SZ_4K);
> > > +     q = ALIGN_DOWN((u64)b, SZ_4K);
> > > +
> > > +     if (a->adrp == b->adrp && p == q)
> > > +             return true;
> > > +
> > > +     return (p + aarch64_insn_adrp_get_offset(le32_to_cpu(a->adrp))) ==
> > > +            (q + aarch64_insn_adrp_get_offset(le32_to_cpu(b->adrp)));
> > > +}
> >
> > IIUC adr/adrp are PC-relative? So in order to tell whether they lead to
> > the same destination, their locations (a and b) must be taken into
> > account _fully_, not just some bits?

Ok, so for adrp only the 4 KB aligned part of the PC is taken into
account, I learned; well then.

> 
> The criterion is whether they point to the same target.
> 
> So the reasoning is as follows:
> - if the 'add' or 'br' opcodes are different, they are definitely not equal
> - if the 'add' and 'br' opcodes are the same, the 'adrp' opcodes are
> the same, and the adrp instructions reside in the same 4 KB sized/4 KB
> aligned window, they must point to the same symbol
> - otherwise, decode the instructions to see if they point to the same
> symbol. Note that we already checked the 'add's so no need to check
> them again.
> 
> > Also, plt entries residing at different locations might address the same
> > target, but (a->add != b->add || a->br != b->br) would yield true
> > despite that. Is this intended?
> >
> 
> If they address the same target, the add will be the same. The br also
> has to be the same because we cannot reuse an ordinary PLT as an ADRP
> veneer or vice versa.

Ah, _that's_ the purpose! Could you please clarify it like

/* make sure we're comparing equally typed veneers (or quote the above) */
if (a->br != b->br)
	return false;
/* different offsets into the page can never lead to equal dest */
if (a->add != b->add)
	return false;
/* it remains to compare the destination pages */ ...

Seems like this is per se difficult territory, see erratum#843419 ;-)

Reviewed-by: Torsten Duwe <duwe@lst.de>

	Torsten


* [PATCH 0/2] use adrp/add pairs for PLT entries
  2018-11-22  8:46 [PATCH 0/2] use adrp/add pairs for PLT entries Ard Biesheuvel
  2018-11-22  8:46 ` [PATCH 1/2] arm64/insn: add support for emitting ADR/ADRP instructions Ard Biesheuvel
  2018-11-22  8:46 ` [PATCH 2/2] arm64/module: switch to ADRP/ADD sequences for PLT entries Ard Biesheuvel
@ 2018-11-27 19:44 ` Will Deacon
  2018-11-27 21:13   ` Ard Biesheuvel
  2 siblings, 1 reply; 8+ messages in thread
From: Will Deacon @ 2018-11-27 19:44 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Ard,

On Thu, Nov 22, 2018 at 09:46:44AM +0100, Ard Biesheuvel wrote:
> Currently, PLT entries use a non-idiomatic movn/movk/movk/br instruction
> sequence, which is also longer than necessary. In addition, the code
> emitting them does not use the instruction generation code but open-codes
> the opcodes directly.
> 
> The extended KASLR range is now 4 GB, given that we switched to the
> small code model everywhere else (including for modules), so we can
> switch to adrp/add/br sequences, which are shorter and easier on the
> I-cache.
> 
> So implement adrp/add pair generation in the instruction generation code
> and wire it up into the PLT code. Note that the Cortex-A53 erratum
> handling requires some special care to ensure that generated veneers
> are not susceptible to the erratum.
> 
> Cc: Torsten Duwe <duwe@lst.de>
> Cc: Jessica Yu <jeyu@kernel.org>

I've applied this, with a couple of extra comments in the plt comparison
code and the Reviewed-by from Torsten. There were some trivial conflicts
with Jessica's rework of the plt lookup, but I think I got it right. Please
take a look at for-next/core when you get a chance.

Will


* [PATCH 0/2] use adrp/add pairs for PLT entries
  2018-11-27 19:44 ` [PATCH 0/2] use adrp/add pairs " Will Deacon
@ 2018-11-27 21:13   ` Ard Biesheuvel
  0 siblings, 0 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2018-11-27 21:13 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 27 Nov 2018 at 20:44, Will Deacon <will.deacon@arm.com> wrote:
>
> Hi Ard,
>
> On Thu, Nov 22, 2018 at 09:46:44AM +0100, Ard Biesheuvel wrote:
> > Currently, PLT entries use a non-idiomatic movn/movk/movk/br instruction
> > sequence, which is also longer than necessary. In addition, the code
> > emitting them does not use the instruction generation code but open-codes
> > the opcodes directly.
> >
> > The extended KASLR range is now 4 GB, given that we switched to the
> > small code model everywhere else (including for modules), so we can
> > switch to adrp/add/br sequences, which are shorter and easier on the
> > I-cache.
> >
> > So implement adrp/add pair generation in the instruction generation code
> > and wire it up into the PLT code. Note that the Cortex-A53 erratum
> > handling requires some special care to ensure that generated veneers
> > are not susceptible to the erratum.
> >
> > Cc: Torsten Duwe <duwe@lst.de>
> > Cc: Jessica Yu <jeyu@kernel.org>
>
> I've applied this, with a couple of extra comments in the plt comparison
> code and the Reviewed-by from Torsten. There were some trivial conflicts
> with Jessica's rework of the plt lookup, but I think I got it right. Please
> take a look at for-next/core when you get a chance.
>

Looks fine to me.
