public inbox for linux-crypto@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/2] crypto: some hardening against AES cache-timing attacks
@ 2018-10-17  6:18 Eric Biggers
  2018-10-17  6:18 ` [PATCH v2 1/2] crypto: aes_ti - disable interrupts while accessing S-box Eric Biggers
  2018-10-17  6:18 ` [PATCH v2 2/2] crypto: arm/aes - add some hardening against cache-timing attacks Eric Biggers
  0 siblings, 2 replies; 7+ messages in thread
From: Eric Biggers @ 2018-10-17  6:18 UTC (permalink / raw)
  To: linux-crypto, Herbert Xu; +Cc: Ard Biesheuvel, Paul Crowley

This series makes the "aes-fixed-time" and "aes-arm" implementations of
AES more resistant to cache-timing attacks.

Note that even after these changes, the implementations still aren't
necessarily guaranteed to be constant-time; see
https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion
of the many difficulties involved in writing truly constant-time AES
software.  But it's valuable to make such attacks more difficult.

Eric Biggers (2):
  crypto: aes_ti - disable interrupts while accessing S-box
  crypto: arm/aes - add some hardening against cache-timing attacks

 arch/arm/crypto/aes-cipher-core.S | 26 ++++++++++++++++++++++++++
 arch/arm/crypto/aes-cipher-glue.c | 13 +++++++++++++
 crypto/aes_generic.c              |  9 +++++----
 crypto/aes_ti.c                   | 18 ++++++++++++++++++
 4 files changed, 62 insertions(+), 4 deletions(-)

-- 
2.19.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2 1/2] crypto: aes_ti - disable interrupts while accessing S-box
  2018-10-17  6:18 [PATCH v2 0/2] crypto: some hardening against AES cache-timing attacks Eric Biggers
@ 2018-10-17  6:18 ` Eric Biggers
  2018-10-18  4:01   ` Ard Biesheuvel
  2018-10-17  6:18 ` [PATCH v2 2/2] crypto: arm/aes - add some hardening against cache-timing attacks Eric Biggers
  1 sibling, 1 reply; 7+ messages in thread
From: Eric Biggers @ 2018-10-17  6:18 UTC (permalink / raw)
  To: linux-crypto, Herbert Xu; +Cc: Ard Biesheuvel, Paul Crowley

From: Eric Biggers <ebiggers@google.com>

In the "aes-fixed-time" AES implementation, disable interrupts while
accessing the S-box, in order to make cache-timing attacks more
difficult.  Previously it was possible for the CPU to be interrupted
while the S-box was loaded into L1 cache, potentially evicting the
cachelines and causing later table lookups to be time-variant.

In tests I did on x86 and ARM, this doesn't affect performance
significantly.  Responsiveness is potentially a concern, but interrupts
are only disabled for a single AES block.

Note that even after this change, the implementation still isn't
necessarily guaranteed to be constant-time; see
https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion
of the many difficulties involved in writing truly constant-time AES
software.  But it's valuable to make such attacks more difficult.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 crypto/aes_ti.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c
index 03023b2290e8..1ff9785b30f5 100644
--- a/crypto/aes_ti.c
+++ b/crypto/aes_ti.c
@@ -269,6 +269,7 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	const u32 *rkp = ctx->key_enc + 4;
 	int rounds = 6 + ctx->key_length / 4;
 	u32 st0[4], st1[4];
+	unsigned long flags;
 	int round;
 
 	st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
@@ -276,6 +277,12 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
 	st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
 
+	/*
+	 * Temporarily disable interrupts to avoid races where cachelines are
+	 * evicted when the CPU is interrupted to do something else.
+	 */
+	local_irq_save(flags);
+
 	st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
 	st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
 	st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
@@ -300,6 +307,8 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
 	put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
 	put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
+
+	local_irq_restore(flags);
 }
 
 static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
@@ -308,6 +317,7 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	const u32 *rkp = ctx->key_dec + 4;
 	int rounds = 6 + ctx->key_length / 4;
 	u32 st0[4], st1[4];
+	unsigned long flags;
 	int round;
 
 	st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
@@ -315,6 +325,12 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
 	st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
 
+	/*
+	 * Temporarily disable interrupts to avoid races where cachelines are
+	 * evicted when the CPU is interrupted to do something else.
+	 */
+	local_irq_save(flags);
+
 	st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
 	st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
 	st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
@@ -339,6 +355,8 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
 	put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
 	put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
+
+	local_irq_restore(flags);
 }
 
 static struct crypto_alg aes_alg = {
-- 
2.19.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 2/2] crypto: arm/aes - add some hardening against cache-timing attacks
  2018-10-17  6:18 [PATCH v2 0/2] crypto: some hardening against AES cache-timing attacks Eric Biggers
  2018-10-17  6:18 ` [PATCH v2 1/2] crypto: aes_ti - disable interrupts while accessing S-box Eric Biggers
@ 2018-10-17  6:18 ` Eric Biggers
  2018-10-18  3:46   ` Ard Biesheuvel
  1 sibling, 1 reply; 7+ messages in thread
From: Eric Biggers @ 2018-10-17  6:18 UTC (permalink / raw)
  To: linux-crypto, Herbert Xu; +Cc: Ard Biesheuvel, Paul Crowley

From: Eric Biggers <ebiggers@google.com>

Make the ARM scalar AES implementation closer to constant-time by
disabling interrupts and prefetching the tables into L1 cache.  This is
feasible because due to ARM's "free" rotations, the main tables are only
1024 bytes instead of the usual 4096 used by most AES implementations.

On ARM Cortex-A7, the speed loss is only about 5%.  The resulting
implementation is still over twice as fast as aes_ti.c.

Note that even after these changes, the implementation still isn't
necessarily guaranteed to be constant-time; see
https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion
of the many difficulties involved in writing truly constant-time AES
software.  But it's valuable to make such attacks more difficult.

Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/arm/crypto/aes-cipher-core.S | 26 ++++++++++++++++++++++++++
 arch/arm/crypto/aes-cipher-glue.c | 13 +++++++++++++
 crypto/aes_generic.c              |  9 +++++----
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
index 184d6c2d15d5..ba9d4aefe585 100644
--- a/arch/arm/crypto/aes-cipher-core.S
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -138,6 +138,23 @@
 	eor		r7, r7, r11
 
 	__adrl		ttab, \ttab
+	/*
+	 * Prefetch the 1024-byte 'ft' or 'it' table into L1 cache, assuming
+	 * cacheline size >= 32.  This, along with the caller disabling
+	 * interrupts, is a hardening measure intended to make cache-timing
+	 * attacks more difficult.  They may not be fully prevented, however;
+	 * see the paper https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
+	 * ("Cache-timing attacks on AES") for a discussion of the many
+	 * difficulties involved in writing truly constant-time AES software.
+	 */
+	.set i, 0
+.rept 1024 / 128
+	ldr		r8, [ttab, #i + 0]
+	ldr		r9, [ttab, #i + 32]
+	ldr		r10, [ttab, #i + 64]
+	ldr		r11, [ttab, #i + 96]
+	.set i, i + 128
+.endr
 
 	tst		rounds, #2
 	bne		1f
@@ -152,6 +169,15 @@
 	b		0b
 
 2:	__adrl		ttab, \ltab
+.if \bsz == 0
+	/* Prefetch the 256-byte inverse S-box; see explanation above */
+	.set i, 0
+.rept 256 / 64
+	ldr		t0, [ttab, #i + 0]
+	ldr		t1, [ttab, #i + 32]
+	.set i, i + 64
+.endr
+.endif
 	\round		r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
index c222f6e072ad..f40e35eb22e4 100644
--- a/arch/arm/crypto/aes-cipher-glue.c
+++ b/arch/arm/crypto/aes-cipher-glue.c
@@ -23,16 +23,29 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
 	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
 	int rounds = 6 + ctx->key_length / 4;
+	unsigned long flags;
 
+	/*
+	 * This AES implementation prefetches the lookup table into L1 cache to
+	 * try to make timing attacks on the table lookups more difficult.
+	 * Temporarily disable interrupts to avoid races where cachelines are
+	 * evicted when the CPU is interrupted to do something else.
+	 */
+	local_irq_save(flags);
 	__aes_arm_encrypt(ctx->key_enc, rounds, in, out);
+	local_irq_restore(flags);
 }
 
 static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
 	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
 	int rounds = 6 + ctx->key_length / 4;
+	unsigned long flags;
 
+	/* Disable interrupts to help mitigate timing attacks, see above */
+	local_irq_save(flags);
 	__aes_arm_decrypt(ctx->key_dec, rounds, in, out);
+	local_irq_restore(flags);
 }
 
 static struct crypto_alg aes_alg = {
diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
index ca554d57d01e..13df33aca463 100644
--- a/crypto/aes_generic.c
+++ b/crypto/aes_generic.c
@@ -63,7 +63,8 @@ static inline u8 byte(const u32 x, const unsigned n)
 
 static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 };
 
-__visible const u32 crypto_ft_tab[4][256] = {
+/* cacheline-aligned to facilitate prefetching into cache */
+__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = {
 	{
 		0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
 		0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
@@ -327,7 +328,7 @@ __visible const u32 crypto_ft_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_fl_tab[4][256] = {
+__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
 		0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
@@ -591,7 +592,7 @@ __visible const u32 crypto_fl_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_it_tab[4][256] = {
+__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = {
 	{
 		0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
 		0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
@@ -855,7 +856,7 @@ __visible const u32 crypto_it_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_il_tab[4][256] = {
+__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000052, 0x00000009, 0x0000006a, 0x000000d5,
 		0x00000030, 0x00000036, 0x000000a5, 0x00000038,
-- 
2.19.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2 2/2] crypto: arm/aes - add some hardening against cache-timing attacks
  2018-10-17  6:18 ` [PATCH v2 2/2] crypto: arm/aes - add some hardening against cache-timing attacks Eric Biggers
@ 2018-10-18  3:46   ` Ard Biesheuvel
  2018-10-18  3:46     ` Ard Biesheuvel
  0 siblings, 1 reply; 7+ messages in thread
From: Ard Biesheuvel @ 2018-10-18  3:46 UTC (permalink / raw)
  To: Eric Biggers
  Cc: open list:HARDWARE RANDOM NUMBER GENERATOR CORE, Herbert Xu,
	Paul Crowley

Hi Eric,

Thanks for looking into this.

On 17 October 2018 at 14:18, Eric Biggers <ebiggers@kernel.org> wrote:
> From: Eric Biggers <ebiggers@google.com>
>
> Make the ARM scalar AES implementation closer to constant-time by
> disabling interrupts and prefetching the tables into L1 cache.  This is
> feasible because due to ARM's "free" rotations, the main tables are only
> 1024 bytes instead of the usual 4096 used by most AES implementations.
>
> On ARM Cortex-A7, the speed loss is only about 5%.  The resulting
> implementation is still over twice as fast as aes_ti.c.
>
> Note that even after these changes, the implementation still isn't
> necessarily guaranteed to be constant-time; see
> https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion
> of the many difficulties involved in writing truly constant-time AES
> software.  But it's valuable to make such attacks more difficult.
>
> Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
>  arch/arm/crypto/aes-cipher-core.S | 26 ++++++++++++++++++++++++++
>  arch/arm/crypto/aes-cipher-glue.c | 13 +++++++++++++
>  crypto/aes_generic.c              |  9 +++++----
>  3 files changed, 44 insertions(+), 4 deletions(-)
>
> diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
> index 184d6c2d15d5..ba9d4aefe585 100644
> --- a/arch/arm/crypto/aes-cipher-core.S
> +++ b/arch/arm/crypto/aes-cipher-core.S
> @@ -138,6 +138,23 @@
>         eor             r7, r7, r11
>
>         __adrl          ttab, \ttab
> +       /*
> +        * Prefetch the 1024-byte 'ft' or 'it' table into L1 cache, assuming
> +        * cacheline size >= 32.  This, along with the caller disabling
> +        * interrupts, is a hardening measure intended to make cache-timing
> +        * attacks more difficult.  They may not be fully prevented, however;
> +        * see the paper https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
> +        * ("Cache-timing attacks on AES") for a discussion of the many
> +        * difficulties involved in writing truly constant-time AES software.
> +        */
> +       .set i, 0
> +.rept 1024 / 128
> +       ldr             r8, [ttab, #i + 0]
> +       ldr             r9, [ttab, #i + 32]
> +       ldr             r10, [ttab, #i + 64]
> +       ldr             r11, [ttab, #i + 96]
> +       .set i, i + 128
> +.endr
>

Mind sticking a bit more to the coding style of the file? I.e., indent
the gas directives with the code, and putting two tabs after them

>         tst             rounds, #2
>         bne             1f
> @@ -152,6 +169,15 @@
>         b               0b
>
>  2:     __adrl          ttab, \ltab
> +.if \bsz == 0
> +       /* Prefetch the 256-byte inverse S-box; see explanation above */
> +       .set i, 0
> +.rept 256 / 64
> +       ldr             t0, [ttab, #i + 0]
> +       ldr             t1, [ttab, #i + 32]
> +       .set i, i + 64
> +.endr
> +.endif
>         \round          r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b
>
>  #ifdef CONFIG_CPU_BIG_ENDIAN
> diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
> index c222f6e072ad..f40e35eb22e4 100644
> --- a/arch/arm/crypto/aes-cipher-glue.c
> +++ b/arch/arm/crypto/aes-cipher-glue.c
> @@ -23,16 +23,29 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  {
>         struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>         int rounds = 6 + ctx->key_length / 4;
> +       unsigned long flags;
>
> +       /*
> +        * This AES implementation prefetches the lookup table into L1 cache to
> +        * try to make timing attacks on the table lookups more difficult.
> +        * Temporarily disable interrupts to avoid races where cachelines are
> +        * evicted when the CPU is interrupted to do something else.
> +        */
> +       local_irq_save(flags);
>         __aes_arm_encrypt(ctx->key_enc, rounds, in, out);
> +       local_irq_restore(flags);
>  }
>

Disabling interrupts like that is going to raise some eyebrows, so I'd
prefer it if we can reduce the scope of the IRQ blackout as much as we
can. This means we should move the IRQ en/disable into the .S file,
and only let it cover the parts of the code where we are actually
doing table lookups that are indexed by the input.

>  static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  {
>         struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>         int rounds = 6 + ctx->key_length / 4;
> +       unsigned long flags;
>
> +       /* Disable interrupts to help mitigate timing attacks, see above */
> +       local_irq_save(flags);
>         __aes_arm_decrypt(ctx->key_dec, rounds, in, out);
> +       local_irq_restore(flags);
>  }
>
>  static struct crypto_alg aes_alg = {
> diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
> index ca554d57d01e..13df33aca463 100644
> --- a/crypto/aes_generic.c
> +++ b/crypto/aes_generic.c
> @@ -63,7 +63,8 @@ static inline u8 byte(const u32 x, const unsigned n)
>
>  static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 };
>
> -__visible const u32 crypto_ft_tab[4][256] = {
> +/* cacheline-aligned to facilitate prefetching into cache */
> +__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = {
>         {
>                 0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
>                 0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
> @@ -327,7 +328,7 @@ __visible const u32 crypto_ft_tab[4][256] = {
>         }
>  };
>
> -__visible const u32 crypto_fl_tab[4][256] = {
> +__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = {
>         {
>                 0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
>                 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
> @@ -591,7 +592,7 @@ __visible const u32 crypto_fl_tab[4][256] = {
>         }
>  };
>
> -__visible const u32 crypto_it_tab[4][256] = {
> +__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = {
>         {
>                 0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
>                 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
> @@ -855,7 +856,7 @@ __visible const u32 crypto_it_tab[4][256] = {
>         }
>  };
>
> -__visible const u32 crypto_il_tab[4][256] = {
> +__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = {
>         {
>                 0x00000052, 0x00000009, 0x0000006a, 0x000000d5,
>                 0x00000030, 0x00000036, 0x000000a5, 0x00000038,
> --
> 2.19.1
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2 2/2] crypto: arm/aes - add some hardening against cache-timing attacks
  2018-10-18  3:46   ` Ard Biesheuvel
@ 2018-10-18  3:46     ` Ard Biesheuvel
  0 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2018-10-18  3:46 UTC (permalink / raw)
  To: Eric Biggers
  Cc: open list:HARDWARE RANDOM NUMBER GENERATOR CORE, Herbert Xu,
	Paul Crowley

Hi Eric,

Thanks for looking into this.

On 17 October 2018 at 14:18, Eric Biggers <ebiggers@kernel.org> wrote:
> From: Eric Biggers <ebiggers@google.com>
>
> Make the ARM scalar AES implementation closer to constant-time by
> disabling interrupts and prefetching the tables into L1 cache.  This is
> feasible because due to ARM's "free" rotations, the main tables are only
> 1024 bytes instead of the usual 4096 used by most AES implementations.
>
> On ARM Cortex-A7, the speed loss is only about 5%.  The resulting
> implementation is still over twice as fast as aes_ti.c.
>
> Note that even after these changes, the implementation still isn't
> necessarily guaranteed to be constant-time; see
> https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion
> of the many difficulties involved in writing truly constant-time AES
> software.  But it's valuable to make such attacks more difficult.
>
> Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
>  arch/arm/crypto/aes-cipher-core.S | 26 ++++++++++++++++++++++++++
>  arch/arm/crypto/aes-cipher-glue.c | 13 +++++++++++++
>  crypto/aes_generic.c              |  9 +++++----
>  3 files changed, 44 insertions(+), 4 deletions(-)
>
> diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
> index 184d6c2d15d5..ba9d4aefe585 100644
> --- a/arch/arm/crypto/aes-cipher-core.S
> +++ b/arch/arm/crypto/aes-cipher-core.S
> @@ -138,6 +138,23 @@
>         eor             r7, r7, r11
>
>         __adrl          ttab, \ttab
> +       /*
> +        * Prefetch the 1024-byte 'ft' or 'it' table into L1 cache, assuming
> +        * cacheline size >= 32.  This, along with the caller disabling
> +        * interrupts, is a hardening measure intended to make cache-timing
> +        * attacks more difficult.  They may not be fully prevented, however;
> +        * see the paper https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
> +        * ("Cache-timing attacks on AES") for a discussion of the many
> +        * difficulties involved in writing truly constant-time AES software.
> +        */
> +       .set i, 0
> +.rept 1024 / 128
> +       ldr             r8, [ttab, #i + 0]
> +       ldr             r9, [ttab, #i + 32]
> +       ldr             r10, [ttab, #i + 64]
> +       ldr             r11, [ttab, #i + 96]
> +       .set i, i + 128
> +.endr
>

Mind sticking a bit more to the coding style of the file? I.e., indent
the gas directives with the code, and putting two tabs after them

>         tst             rounds, #2
>         bne             1f
> @@ -152,6 +169,15 @@
>         b               0b
>
>  2:     __adrl          ttab, \ltab
> +.if \bsz == 0
> +       /* Prefetch the 256-byte inverse S-box; see explanation above */
> +       .set i, 0
> +.rept 256 / 64
> +       ldr             t0, [ttab, #i + 0]
> +       ldr             t1, [ttab, #i + 32]
> +       .set i, i + 64
> +.endr
> +.endif
>         \round          r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b
>
>  #ifdef CONFIG_CPU_BIG_ENDIAN
> diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
> index c222f6e072ad..f40e35eb22e4 100644
> --- a/arch/arm/crypto/aes-cipher-glue.c
> +++ b/arch/arm/crypto/aes-cipher-glue.c
> @@ -23,16 +23,29 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  {
>         struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>         int rounds = 6 + ctx->key_length / 4;
> +       unsigned long flags;
>
> +       /*
> +        * This AES implementation prefetches the lookup table into L1 cache to
> +        * try to make timing attacks on the table lookups more difficult.
> +        * Temporarily disable interrupts to avoid races where cachelines are
> +        * evicted when the CPU is interrupted to do something else.
> +        */
> +       local_irq_save(flags);
>         __aes_arm_encrypt(ctx->key_enc, rounds, in, out);
> +       local_irq_restore(flags);
>  }
>

Disabling interrupts like that is going to raise some eyebrows, so I'd
prefer it if we can reduce the scope of the IRQ blackout as much as we
can. This means we should move the IRQ en/disable into the .S file,
and only let it cover the parts of the code where we are actually
doing table lookups that are indexed by the input.

>  static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  {
>         struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>         int rounds = 6 + ctx->key_length / 4;
> +       unsigned long flags;
>
> +       /* Disable interrupts to help mitigate timing attacks, see above */
> +       local_irq_save(flags);
>         __aes_arm_decrypt(ctx->key_dec, rounds, in, out);
> +       local_irq_restore(flags);
>  }
>
>  static struct crypto_alg aes_alg = {
> diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
> index ca554d57d01e..13df33aca463 100644
> --- a/crypto/aes_generic.c
> +++ b/crypto/aes_generic.c
> @@ -63,7 +63,8 @@ static inline u8 byte(const u32 x, const unsigned n)
>
>  static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 };
>
> -__visible const u32 crypto_ft_tab[4][256] = {
> +/* cacheline-aligned to facilitate prefetching into cache */
> +__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = {
>         {
>                 0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
>                 0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
> @@ -327,7 +328,7 @@ __visible const u32 crypto_ft_tab[4][256] = {
>         }
>  };
>
> -__visible const u32 crypto_fl_tab[4][256] = {
> +__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = {
>         {
>                 0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
>                 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
> @@ -591,7 +592,7 @@ __visible const u32 crypto_fl_tab[4][256] = {
>         }
>  };
>
> -__visible const u32 crypto_it_tab[4][256] = {
> +__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = {
>         {
>                 0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
>                 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
> @@ -855,7 +856,7 @@ __visible const u32 crypto_it_tab[4][256] = {
>         }
>  };
>
> -__visible const u32 crypto_il_tab[4][256] = {
> +__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = {
>         {
>                 0x00000052, 0x00000009, 0x0000006a, 0x000000d5,
>                 0x00000030, 0x00000036, 0x000000a5, 0x00000038,
> --
> 2.19.1
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2 1/2] crypto: aes_ti - disable interrupts while accessing S-box
  2018-10-17  6:18 ` [PATCH v2 1/2] crypto: aes_ti - disable interrupts while accessing S-box Eric Biggers
@ 2018-10-18  4:01   ` Ard Biesheuvel
  2018-10-18  4:01     ` Ard Biesheuvel
  0 siblings, 1 reply; 7+ messages in thread
From: Ard Biesheuvel @ 2018-10-18  4:01 UTC (permalink / raw)
  To: Eric Biggers
  Cc: open list:HARDWARE RANDOM NUMBER GENERATOR CORE, Herbert Xu,
	Paul Crowley

Hi Eric,

On 17 October 2018 at 14:18, Eric Biggers <ebiggers@kernel.org> wrote:
> From: Eric Biggers <ebiggers@google.com>
>
> In the "aes-fixed-time" AES implementation, disable interrupts while
> accessing the S-box, in order to make cache-timing attacks more
> difficult.  Previously it was possible for the CPU to be interrupted
> while the S-box was loaded into L1 cache, potentially evicting the
> cachelines and causing later table lookups to be time-variant.
>
> In tests I did on x86 and ARM, this doesn't affect performance
> significantly.  Responsiveness is potentially a concern, but interrupts
> are only disabled for a single AES block.
>
> Note that even after this change, the implementation still isn't
> necessarily guaranteed to be constant-time; see
> https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion
> of the many difficulties involved in writing truly constant-time AES
> software.  But it's valuable to make such attacks more difficult.
>
> Signed-off-by: Eric Biggers <ebiggers@google.com>

Thanks for taking a look. Could we add something to the Kconfig blurb
that mentions that it runs the algorithm with interrupts disabled? In
any case,

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>


> ---
>  crypto/aes_ti.c | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
>
> diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c
> index 03023b2290e8..1ff9785b30f5 100644
> --- a/crypto/aes_ti.c
> +++ b/crypto/aes_ti.c
> @@ -269,6 +269,7 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         const u32 *rkp = ctx->key_enc + 4;
>         int rounds = 6 + ctx->key_length / 4;
>         u32 st0[4], st1[4];
> +       unsigned long flags;
>         int round;
>
>         st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
> @@ -276,6 +277,12 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
>         st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
>
> +       /*
> +        * Temporarily disable interrupts to avoid races where cachelines are
> +        * evicted when the CPU is interrupted to do something else.
> +        */
> +       local_irq_save(flags);
> +
>         st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
>         st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
>         st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
> @@ -300,6 +307,8 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
>         put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
>         put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
> +
> +       local_irq_restore(flags);
>  }
>
>  static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
> @@ -308,6 +317,7 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         const u32 *rkp = ctx->key_dec + 4;
>         int rounds = 6 + ctx->key_length / 4;
>         u32 st0[4], st1[4];
> +       unsigned long flags;
>         int round;
>
>         st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
> @@ -315,6 +325,12 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
>         st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
>
> +       /*
> +        * Temporarily disable interrupts to avoid races where cachelines are
> +        * evicted when the CPU is interrupted to do something else.
> +        */
> +       local_irq_save(flags);
> +
>         st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
>         st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
>         st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
> @@ -339,6 +355,8 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
>         put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
>         put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
> +
> +       local_irq_restore(flags);
>  }
>
>  static struct crypto_alg aes_alg = {
> --
> 2.19.1
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2 1/2] crypto: aes_ti - disable interrupts while accessing S-box
  2018-10-18  4:01   ` Ard Biesheuvel
@ 2018-10-18  4:01     ` Ard Biesheuvel
  0 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2018-10-18  4:01 UTC (permalink / raw)
  To: Eric Biggers
  Cc: open list:HARDWARE RANDOM NUMBER GENERATOR CORE, Herbert Xu,
	Paul Crowley

Hi Eric,

On 17 October 2018 at 14:18, Eric Biggers <ebiggers@kernel.org> wrote:
> From: Eric Biggers <ebiggers@google.com>
>
> In the "aes-fixed-time" AES implementation, disable interrupts while
> accessing the S-box, in order to make cache-timing attacks more
> difficult.  Previously it was possible for the CPU to be interrupted
> while the S-box was loaded into L1 cache, potentially evicting the
> cachelines and causing later table lookups to be time-variant.
>
> In tests I did on x86 and ARM, this doesn't affect performance
> significantly.  Responsiveness is potentially a concern, but interrupts
> are only disabled for a single AES block.
>
> Note that even after this change, the implementation still isn't
> necessarily guaranteed to be constant-time; see
> https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion
> of the many difficulties involved in writing truly constant-time AES
> software.  But it's valuable to make such attacks more difficult.
>
> Signed-off-by: Eric Biggers <ebiggers@google.com>

Thanks for taking a look. Could we add something to the Kconfig blurb
that mentions that it runs the algorithm with interrupts disabled? In
any case,

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>


> ---
>  crypto/aes_ti.c | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
>
> diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c
> index 03023b2290e8..1ff9785b30f5 100644
> --- a/crypto/aes_ti.c
> +++ b/crypto/aes_ti.c
> @@ -269,6 +269,7 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         const u32 *rkp = ctx->key_enc + 4;
>         int rounds = 6 + ctx->key_length / 4;
>         u32 st0[4], st1[4];
> +       unsigned long flags;
>         int round;
>
>         st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
> @@ -276,6 +277,12 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
>         st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
>
> +       /*
> +        * Temporarily disable interrupts to avoid races where cachelines are
> +        * evicted when the CPU is interrupted to do something else.
> +        */
> +       local_irq_save(flags);
> +
>         st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
>         st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
>         st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
> @@ -300,6 +307,8 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
>         put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
>         put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
> +
> +       local_irq_restore(flags);
>  }
>
>  static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
> @@ -308,6 +317,7 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         const u32 *rkp = ctx->key_dec + 4;
>         int rounds = 6 + ctx->key_length / 4;
>         u32 st0[4], st1[4];
> +       unsigned long flags;
>         int round;
>
>         st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
> @@ -315,6 +325,12 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
>         st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
>
> +       /*
> +        * Temporarily disable interrupts to avoid races where cachelines are
> +        * evicted when the CPU is interrupted to do something else.
> +        */
> +       local_irq_save(flags);
> +
>         st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
>         st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
>         st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
> @@ -339,6 +355,8 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>         put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
>         put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
>         put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
> +
> +       local_irq_restore(flags);
>  }
>
>  static struct crypto_alg aes_alg = {
> --
> 2.19.1
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2018-10-18 12:00 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-10-17  6:18 [PATCH v2 0/2] crypto: some hardening against AES cache-timing attacks Eric Biggers
2018-10-17  6:18 ` [PATCH v2 1/2] crypto: aes_ti - disable interrupts while accessing S-box Eric Biggers
2018-10-18  4:01   ` Ard Biesheuvel
2018-10-18  4:01     ` Ard Biesheuvel
2018-10-17  6:18 ` [PATCH v2 2/2] crypto: arm/aes - add some hardening against cache-timing attacks Eric Biggers
2018-10-18  3:46   ` Ard Biesheuvel
2018-10-18  3:46     ` Ard Biesheuvel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox