Linux cryptographic layer development

Linux cryptographic layer development
 help / color / mirror / Atom feed

* [RFC PATCH 4.10 5/6] bpf: Rename fdinfo's prog_digest to prog_sha256
From: Andy Lutomirski @ 2016-12-24  2:22 UTC (permalink / raw)
  To: Daniel Borkmann, Netdev, LKML, Linux Crypto Mailing List
  Cc: Jason A. Donenfeld, Hannes Frederic Sowa, Alexei Starovoitov,
	Eric Dumazet, Eric Biggers, Tom Herbert, David S. Miller,
	Andy Lutomirski, Alexei Starovoitov
In-Reply-To: <cover.1482545792.git.luto@kernel.org>

This makes it easier to add another digest algorithm down the road
if needed.  It also serves to force any programs that might have
been written against a kernel that had 'prog_digest' to be updated.

This shouldn't violate any stable API policies, as no released
kernel has ever had 'prog_digest'.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 kernel/bpf/syscall.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e89acea22ecf..956370b80296 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -694,7 +694,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 	seq_printf(m,
 		   "prog_type:\t%u\n"
 		   "prog_jited:\t%u\n"
-		   "prog_digest:\t%s\n"
+		   "prog_sha256:\t%s\n"
 		   "memlock:\t%llu\n",
 		   prog->type,
 		   prog->jited,
-- 
2.9.3

^ permalink raw reply related

* [RFC PATCH 4.10 4/6] bpf: Avoid copying the entire BPF program when hashing it
From: Andy Lutomirski @ 2016-12-24  2:22 UTC (permalink / raw)
  To: Daniel Borkmann, Netdev, LKML, Linux Crypto Mailing List
  Cc: Jason A. Donenfeld, Hannes Frederic Sowa, Alexei Starovoitov,
	Eric Dumazet, Eric Biggers, Tom Herbert, David S. Miller,
	Andy Lutomirski, Alexei Starovoitov
In-Reply-To: <cover.1482545792.git.luto@kernel.org>

The sha256 helpers can consume a message incrementally, so there's no need
to allocate a buffer to store the whole blob to be hashed.

This may be a slight slowdown for very long messages because gcc can't
inline the sha256_update() calls.  For reasonably-sized programs,
however, this should be a considerable speedup as vmalloc() is quite
slow.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 kernel/bpf/core.c | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 911993863799..1c2931f505af 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -149,43 +149,37 @@ void __bpf_prog_free(struct bpf_prog *fp)
 int bpf_prog_calc_digest(struct bpf_prog *fp)
 {
 	struct sha256_state sha;
-	u32 i, psize;
-	struct bpf_insn *dst;
+	u32 i;
 	bool was_ld_map;
-	u8 *raw;
-
-	psize = bpf_prog_insn_size(fp);
-	raw = vmalloc(psize);
-	if (!raw)
-		return -ENOMEM;
 
 	sha256_init(&sha);
 
 	/* We need to take out the map fd for the digest calculation
 	 * since they are unstable from user space side.
 	 */
-	dst = (void *)raw;
 	for (i = 0, was_ld_map = false; i < fp->len; i++) {
-		dst[i] = fp->insnsi[i];
+		struct bpf_insn insn = fp->insnsi[i];
+
 		if (!was_ld_map &&
-		    dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
-		    dst[i].src_reg == BPF_PSEUDO_MAP_FD) {
+		    insn.code == (BPF_LD | BPF_IMM | BPF_DW) &&
+		    insn.src_reg == BPF_PSEUDO_MAP_FD) {
 			was_ld_map = true;
-			dst[i].imm = 0;
+			insn.imm = 0;
 		} else if (was_ld_map &&
-			   dst[i].code == 0 &&
-			   dst[i].dst_reg == 0 &&
-			   dst[i].src_reg == 0 &&
-			   dst[i].off == 0) {
+			   insn.code == 0 &&
+			   insn.dst_reg == 0 &&
+			   insn.src_reg == 0 &&
+			   insn.off == 0) {
 			was_ld_map = false;
-			dst[i].imm = 0;
+			insn.imm = 0;
 		} else {
 			was_ld_map = false;
 		}
+
+		sha256_update(&sha, (const u8 *)&insn, sizeof(insn));
 	}
 
-	sha256_finup(&sha, raw, psize, fp->digest);
-	vfree(raw);
+	sha256_final(&sha, fp->digest);
 	return 0;
 }
 
-- 
2.9.3

^ permalink raw reply related

* [RFC PATCH 4.10 2/6] crypto/sha256: Make the sha256 library functions selectable
From: Andy Lutomirski @ 2016-12-24  2:22 UTC (permalink / raw)
  To: Daniel Borkmann, Netdev, LKML, Linux Crypto Mailing List
  Cc: Jason A. Donenfeld, Hannes Frederic Sowa, Alexei Starovoitov,
	Eric Dumazet, Eric Biggers, Tom Herbert, David S. Miller,
	Andy Lutomirski
In-Reply-To: <cover.1482545792.git.luto@kernel.org>

This will let other kernel code call into sha256_init(), etc. without
pulling in the core crypto code.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 crypto/Kconfig          | 8 ++++++++
 crypto/Makefile         | 2 +-
 crypto/sha256_generic.c | 4 ++++
 include/crypto/sha.h    | 4 ++++
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 160f08e721cc..85a2b3440c2b 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -10,6 +10,13 @@ config XOR_BLOCKS
 source "crypto/async_tx/Kconfig"
 
 #
+# Cryptographic algorithms that are usable without the Crypto API.
+# None of these should have visible config options.
+#
+config CRYPTO_SHA256_LIB
+	bool
+
+#
 # Cryptographic API Configuration
 #
 menuconfig CRYPTO
@@ -763,6 +770,7 @@ config CRYPTO_SHA512_MB
 
 config CRYPTO_SHA256
 	tristate "SHA224 and SHA256 digest algorithm"
+	select CRYPTO_SHA256_LIB
 	select CRYPTO_HASH
 	help
 	  SHA256 secure hash standard (DFIPS 180-2).
diff --git a/crypto/Makefile b/crypto/Makefile
index b8f0e3eb0791..d147d4c911f5 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_CRYPTO_RMD160) += rmd160.o
 obj-$(CONFIG_CRYPTO_RMD256) += rmd256.o
 obj-$(CONFIG_CRYPTO_RMD320) += rmd320.o
 obj-$(CONFIG_CRYPTO_SHA1) += sha1_generic.o
-obj-$(CONFIG_CRYPTO_SHA256) += sha256_generic.o
+obj-$(CONFIG_CRYPTO_SHA256_LIB) += sha256_generic.o
 obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o
 obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o
 obj-$(CONFIG_CRYPTO_WP512) += wp512.o
diff --git a/crypto/sha256_generic.c b/crypto/sha256_generic.c
index f2747893402c..9df71ac66dc4 100644
--- a/crypto/sha256_generic.c
+++ b/crypto/sha256_generic.c
@@ -261,6 +261,8 @@ void sha256_final(struct sha256_state *sctx, u8 *out)
 }
 EXPORT_SYMBOL(sha256_final);
 
+#ifdef CONFIG_CRYPTO_HASH
+
 static int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
 				unsigned int len)
 {
@@ -328,6 +330,8 @@ static void __exit sha256_generic_mod_fini(void)
 module_init(sha256_generic_mod_init);
 module_exit(sha256_generic_mod_fini);
 
+#endif /* CONFIG_CRYPTO_HASH */
+
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm");
 
diff --git a/include/crypto/sha.h b/include/crypto/sha.h
index 2b6978471605..381ba7fa5e3f 100644
--- a/include/crypto/sha.h
+++ b/include/crypto/sha.h
@@ -96,6 +96,8 @@ extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
 extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data,
 			     unsigned int len, u8 *hash);
 
+#ifdef CONFIG_CRYPTO_SHA256_LIB
+
 static inline void sha256_init(struct sha256_state *sctx)
 {
 	sctx->state[0] = SHA256_H0;
@@ -121,6 +123,8 @@ static inline void sha256_finup(struct sha256_state *sctx, const u8 *data,
 	sha256_final(sctx, hash);
 }
 
+#endif /* CONFIG_CRYPTO_SHA256_LIB */
+
 extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
 			      unsigned int len);
 
-- 
2.9.3

^ permalink raw reply related

* [RFC PATCH 4.10 1/6] crypto/sha256: Refactor the API so it can be used without shash
From: Andy Lutomirski @ 2016-12-24  2:22 UTC (permalink / raw)
  To: Daniel Borkmann, Netdev, LKML, Linux Crypto Mailing List
  Cc: Jason A. Donenfeld, Hannes Frederic Sowa, Alexei Starovoitov,
	Eric Dumazet, Eric Biggers, Tom Herbert, David S. Miller,
	Andy Lutomirski, Ard Biesheuvel, Herbert Xu
In-Reply-To: <cover.1482545792.git.luto@kernel.org>

There are some pieecs of kernel code that want to compute SHA256
directly without going through the crypto core.  Adjust the exported
API to decouple it from the crypto core.

I suspect this will very slightly speed up the SHA256 shash operations
as well by reducing the amount of indirection involved.

Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/arm/crypto/sha2-ce-glue.c      | 10 ++++---
 arch/arm/crypto/sha256_glue.c       | 23 ++++++++++-----
 arch/arm/crypto/sha256_neon_glue.c  | 34 +++++++++++----------
 arch/arm64/crypto/sha2-ce-glue.c    | 13 ++++----
 arch/arm64/crypto/sha256-glue.c     | 59 +++++++++++++++++++++----------------
 arch/x86/crypto/sha256_ssse3_glue.c | 46 +++++++++++++++++------------
 arch/x86/purgatory/purgatory.c      |  2 +-
 arch/x86/purgatory/sha256.c         | 25 ++--------------
 arch/x86/purgatory/sha256.h         | 22 --------------
 crypto/sha256_generic.c             | 50 +++++++++++++++++++++++--------
 include/crypto/sha.h                | 29 ++++++++++++++----
 include/crypto/sha256_base.h        | 40 ++++++++-----------------
 12 files changed, 184 insertions(+), 169 deletions(-)
 delete mode 100644 arch/x86/purgatory/sha256.h

diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
index 0755b2d657f3..8832c2f85591 100644
--- a/arch/arm/crypto/sha2-ce-glue.c
+++ b/arch/arm/crypto/sha2-ce-glue.c
@@ -38,7 +38,7 @@ static int sha2_ce_update(struct shash_desc *desc, const u8 *data,
 		return crypto_sha256_arm_update(desc, data, len);
 
 	kernel_neon_begin();
-	sha256_base_do_update(desc, data, len,
+	sha256_base_do_update(sctx, data, len,
 			      (sha256_block_fn *)sha2_ce_transform);
 	kernel_neon_end();
 
@@ -48,17 +48,19 @@ static int sha2_ce_update(struct shash_desc *desc, const u8 *data,
 static int sha2_ce_finup(struct shash_desc *desc, const u8 *data,
 			 unsigned int len, u8 *out)
 {
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
 	if (!may_use_simd())
 		return crypto_sha256_arm_finup(desc, data, len, out);
 
 	kernel_neon_begin();
 	if (len)
-		sha256_base_do_update(desc, data, len,
+		sha256_base_do_update(sctx, data, len,
 				      (sha256_block_fn *)sha2_ce_transform);
-	sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform);
+	sha256_base_do_finalize(sctx, (sha256_block_fn *)sha2_ce_transform);
 	kernel_neon_end();
 
-	return sha256_base_finish(desc, out);
+	return crypto_sha2_final(desc, out);
 }
 
 static int sha2_ce_final(struct shash_desc *desc, u8 *out)
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
index a84e869ef900..405a29a9a9d3 100644
--- a/arch/arm/crypto/sha256_glue.c
+++ b/arch/arm/crypto/sha256_glue.c
@@ -36,27 +36,34 @@ asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
 int crypto_sha256_arm_update(struct shash_desc *desc, const u8 *data,
 			     unsigned int len)
 {
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
 	/* make sure casting to sha256_block_fn() is safe */
 	BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
 
-	return sha256_base_do_update(desc, data, len,
+	sha256_base_do_update(sctx, data, len,
 				(sha256_block_fn *)sha256_block_data_order);
+	return 0;
 }
 EXPORT_SYMBOL(crypto_sha256_arm_update);
 
-static int sha256_final(struct shash_desc *desc, u8 *out)
+static int sha256_arm_final(struct shash_desc *desc, u8 *out)
 {
-	sha256_base_do_finalize(desc,
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sha256_base_do_finalize(sctx,
 				(sha256_block_fn *)sha256_block_data_order);
-	return sha256_base_finish(desc, out);
+	return crypto_sha2_final(desc, out);
 }
 
 int crypto_sha256_arm_finup(struct shash_desc *desc, const u8 *data,
 			    unsigned int len, u8 *out)
 {
-	sha256_base_do_update(desc, data, len,
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sha256_base_do_update(sctx, data, len,
 			      (sha256_block_fn *)sha256_block_data_order);
-	return sha256_final(desc, out);
+	return crypto_sha2_final(desc, out);
 }
 EXPORT_SYMBOL(crypto_sha256_arm_finup);
 
@@ -64,7 +71,7 @@ static struct shash_alg algs[] = { {
 	.digestsize	=	SHA256_DIGEST_SIZE,
 	.init		=	sha256_base_init,
 	.update		=	crypto_sha256_arm_update,
-	.final		=	sha256_final,
+	.final		=	sha256_arm_final,
 	.finup		=	crypto_sha256_arm_finup,
 	.descsize	=	sizeof(struct sha256_state),
 	.base		=	{
@@ -79,7 +86,7 @@ static struct shash_alg algs[] = { {
 	.digestsize	=	SHA224_DIGEST_SIZE,
 	.init		=	sha224_base_init,
 	.update		=	crypto_sha256_arm_update,
-	.final		=	sha256_final,
+	.final		=	sha256_arm_final,
 	.finup		=	crypto_sha256_arm_finup,
 	.descsize	=	sizeof(struct sha256_state),
 	.base		=	{
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
index 39ccd658817e..40c85d1d4c1e 100644
--- a/arch/arm/crypto/sha256_neon_glue.c
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -29,8 +29,8 @@
 asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
 					     unsigned int num_blks);
 
-static int sha256_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int len)
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
 {
 	struct sha256_state *sctx = shash_desc_ctx(desc);
 
@@ -39,41 +39,43 @@ static int sha256_update(struct shash_desc *desc, const u8 *data,
 		return crypto_sha256_arm_update(desc, data, len);
 
 	kernel_neon_begin();
-	sha256_base_do_update(desc, data, len,
+	sha256_base_do_update(sctx, data, len,
 			(sha256_block_fn *)sha256_block_data_order_neon);
 	kernel_neon_end();
 
 	return 0;
 }
 
-static int sha256_finup(struct shash_desc *desc, const u8 *data,
-			unsigned int len, u8 *out)
+static int sha256_neon_finup(struct shash_desc *desc, const u8 *data,
+			     unsigned int len, u8 *out)
 {
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
 	if (!may_use_simd())
 		return crypto_sha256_arm_finup(desc, data, len, out);
 
 	kernel_neon_begin();
 	if (len)
-		sha256_base_do_update(desc, data, len,
+		sha256_base_do_update(sctx, data, len,
 			(sha256_block_fn *)sha256_block_data_order_neon);
-	sha256_base_do_finalize(desc,
+	sha256_base_do_finalize(sctx,
 			(sha256_block_fn *)sha256_block_data_order_neon);
 	kernel_neon_end();
 
-	return sha256_base_finish(desc, out);
+	return crypto_sha2_final(desc, out);
 }
 
-static int sha256_final(struct shash_desc *desc, u8 *out)
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
 {
-	return sha256_finup(desc, NULL, 0, out);
+	return sha256_neon_finup(desc, NULL, 0, out);
 }
 
 struct shash_alg sha256_neon_algs[] = { {
 	.digestsize	=	SHA256_DIGEST_SIZE,
 	.init		=	sha256_base_init,
-	.update		=	sha256_update,
-	.final		=	sha256_final,
-	.finup		=	sha256_finup,
+	.update		=	sha256_neon_update,
+	.final		=	sha256_neon_final,
+	.finup		=	sha256_neon_finup,
 	.descsize	=	sizeof(struct sha256_state),
 	.base		=	{
 		.cra_name	=	"sha256",
@@ -86,9 +88,9 @@ struct shash_alg sha256_neon_algs[] = { {
 }, {
 	.digestsize	=	SHA224_DIGEST_SIZE,
 	.init		=	sha224_base_init,
-	.update		=	sha256_update,
-	.final		=	sha256_final,
-	.finup		=	sha256_finup,
+	.update		=	sha256_neon_update,
+	.final		=	sha256_neon_final,
+	.finup		=	sha256_neon_finup,
 	.descsize	=	sizeof(struct sha256_state),
 	.base		=	{
 		.cra_name	=	"sha224",
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index 7cd587564a41..e38dd301abce 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -39,7 +39,7 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
 
 	sctx->finalize = 0;
 	kernel_neon_begin_partial(28);
-	sha256_base_do_update(desc, data, len,
+	sha256_base_do_update(&sctx->sst, data, len,
 			      (sha256_block_fn *)sha2_ce_transform);
 	kernel_neon_end();
 
@@ -64,13 +64,13 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
 	sctx->finalize = finalize;
 
 	kernel_neon_begin_partial(28);
-	sha256_base_do_update(desc, data, len,
+	sha256_base_do_update(&sctx->sst, data, len,
 			      (sha256_block_fn *)sha2_ce_transform);
 	if (!finalize)
-		sha256_base_do_finalize(desc,
+		sha256_base_do_finalize(&sctx->sst,
 					(sha256_block_fn *)sha2_ce_transform);
 	kernel_neon_end();
-	return sha256_base_finish(desc, out);
+	return crypto_sha2_final(desc, out);
 }
 
 static int sha256_ce_final(struct shash_desc *desc, u8 *out)
@@ -79,9 +79,10 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out)
 
 	sctx->finalize = 0;
 	kernel_neon_begin_partial(28);
-	sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform);
+	sha256_base_do_finalize(&sctx->sst,
+				(sha256_block_fn *)sha2_ce_transform);
 	kernel_neon_end();
-	return sha256_base_finish(desc, out);
+	return crypto_sha2_final(desc, out);
 }
 
 static struct shash_alg algs[] = { {
diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c
index a2226f841960..132a1ef89a71 100644
--- a/arch/arm64/crypto/sha256-glue.c
+++ b/arch/arm64/crypto/sha256-glue.c
@@ -33,36 +33,39 @@ asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
 asmlinkage void sha256_block_neon(u32 *digest, const void *data,
 				  unsigned int num_blks);
 
-static int sha256_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int len)
+static int sha256_update_arm64(struct shash_desc *desc, const u8 *data,
+			       unsigned int len)
 {
-	return sha256_base_do_update(desc, data, len,
-				(sha256_block_fn *)sha256_block_data_order);
+	sha256_base_do_update(shash_desc_ctx(desc), data, len,
+			      (sha256_block_fn *)sha256_block_data_order);
+	return 0;
 }
 
-static int sha256_finup(struct shash_desc *desc, const u8 *data,
-			unsigned int len, u8 *out)
+static int sha256_finup_arm64(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
 {
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
 	if (len)
-		sha256_base_do_update(desc, data, len,
+		sha256_base_do_update(sctx, data, len,
 				(sha256_block_fn *)sha256_block_data_order);
-	sha256_base_do_finalize(desc,
+	sha256_base_do_finalize(sctx,
 				(sha256_block_fn *)sha256_block_data_order);
 
-	return sha256_base_finish(desc, out);
+	return crypto_sha2_final(desc, out);
 }
 
-static int sha256_final(struct shash_desc *desc, u8 *out)
+static int sha256_final_arm64(struct shash_desc *desc, u8 *out)
 {
-	return sha256_finup(desc, NULL, 0, out);
+	return sha256_finup_arm64(desc, NULL, 0, out);
 }
 
 static struct shash_alg algs[] = { {
 	.digestsize		= SHA256_DIGEST_SIZE,
 	.init			= sha256_base_init,
-	.update			= sha256_update,
-	.final			= sha256_final,
-	.finup			= sha256_finup,
+	.update			= sha256_update_arm64,
+	.final			= sha256_final_arm64,
+	.finup			= sha256_finup_arm64,
 	.descsize		= sizeof(struct sha256_state),
 	.base.cra_name		= "sha256",
 	.base.cra_driver_name	= "sha256-arm64",
@@ -73,9 +76,9 @@ static struct shash_alg algs[] = { {
 }, {
 	.digestsize		= SHA224_DIGEST_SIZE,
 	.init			= sha224_base_init,
-	.update			= sha256_update,
-	.final			= sha256_final,
-	.finup			= sha256_finup,
+	.update			= sha256_update_arm64,
+	.final			= sha256_final_arm64,
+	.finup			= sha256_finup_arm64,
 	.descsize		= sizeof(struct sha256_state),
 	.base.cra_name		= "sha224",
 	.base.cra_driver_name	= "sha224-arm64",
@@ -88,18 +91,22 @@ static struct shash_alg algs[] = { {
 static int sha256_update_neon(struct shash_desc *desc, const u8 *data,
 			      unsigned int len)
 {
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
 	/*
 	 * Stacking and unstacking a substantial slice of the NEON register
 	 * file may significantly affect performance for small updates when
 	 * executing in interrupt context, so fall back to the scalar code
 	 * in that case.
 	 */
-	if (!may_use_simd())
-		return sha256_base_do_update(desc, data, len,
+	if (!may_use_simd()) {
+		sha256_base_do_update(sctx, data, len,
 				(sha256_block_fn *)sha256_block_data_order);
+		return 0;
+	}
 
 	kernel_neon_begin();
-	sha256_base_do_update(desc, data, len,
+	sha256_base_do_update(sctx, data, len,
 				(sha256_block_fn *)sha256_block_neon);
 	kernel_neon_end();
 
@@ -109,22 +116,24 @@ static int sha256_update_neon(struct shash_desc *desc, const u8 *data,
 static int sha256_finup_neon(struct shash_desc *desc, const u8 *data,
 			     unsigned int len, u8 *out)
 {
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
 	if (!may_use_simd()) {
 		if (len)
-			sha256_base_do_update(desc, data, len,
+			sha256_base_do_update(sctx, data, len,
 				(sha256_block_fn *)sha256_block_data_order);
-		sha256_base_do_finalize(desc,
+		sha256_base_do_finalize(sctx,
 				(sha256_block_fn *)sha256_block_data_order);
 	} else {
 		kernel_neon_begin();
 		if (len)
-			sha256_base_do_update(desc, data, len,
+			sha256_base_do_update(sctx, data, len,
 				(sha256_block_fn *)sha256_block_neon);
-		sha256_base_do_finalize(desc,
+		sha256_base_do_finalize(sctx,
 				(sha256_block_fn *)sha256_block_neon);
 		kernel_neon_end();
 	}
-	return sha256_base_finish(desc, out);
+	return crypto_sha2_final(desc, out);
 }
 
 static int sha256_final_neon(struct shash_desc *desc, u8 *out)
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 9e79baf03a4b..e722fbaf0558 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -44,52 +44,60 @@ asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
 				       u64 rounds);
 typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds);
 
-static int sha256_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int len, sha256_transform_fn *sha256_xform)
+static int sha256_fpu_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len,
+			       sha256_transform_fn *sha256_xform)
 {
 	struct sha256_state *sctx = shash_desc_ctx(desc);
 
 	if (!irq_fpu_usable() ||
-	    (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
-		return crypto_sha256_update(desc, data, len);
+	    (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) {
+		sha256_update(sctx, data, len);
+		return 0;
+	}
 
 	/* make sure casting to sha256_block_fn() is safe */
 	BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
 
 	kernel_fpu_begin();
-	sha256_base_do_update(desc, data, len,
+	sha256_base_do_update(sctx, data, len,
 			      (sha256_block_fn *)sha256_xform);
 	kernel_fpu_end();
 
 	return 0;
 }
 
-static int sha256_finup(struct shash_desc *desc, const u8 *data,
+static int sha256_fpu_finup(struct shash_desc *desc, const u8 *data,
 	      unsigned int len, u8 *out, sha256_transform_fn *sha256_xform)
 {
-	if (!irq_fpu_usable())
-		return crypto_sha256_finup(desc, data, len, out);
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	if (!irq_fpu_usable()) {
+		sha256_finup(sctx, data, len, out);
+		return 0;
+	}
 
 	kernel_fpu_begin();
 	if (len)
-		sha256_base_do_update(desc, data, len,
+		sha256_base_do_update(sctx, data, len,
 				      (sha256_block_fn *)sha256_xform);
-	sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform);
+	sha256_base_do_finalize(sctx, (sha256_block_fn *)sha256_xform);
 	kernel_fpu_end();
 
-	return sha256_base_finish(desc, out);
+	crypto_sha2_final(desc, out);
+	return 0;
 }
 
 static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int len)
 {
-	return sha256_update(desc, data, len, sha256_transform_ssse3);
+	return sha256_fpu_update(desc, data, len, sha256_transform_ssse3);
 }
 
 static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
 	      unsigned int len, u8 *out)
 {
-	return sha256_finup(desc, data, len, out, sha256_transform_ssse3);
+	return sha256_fpu_finup(desc, data, len, out, sha256_transform_ssse3);
 }
 
 /* Add padding and return the message digest. */
@@ -152,13 +160,13 @@ asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
 static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int len)
 {
-	return sha256_update(desc, data, len, sha256_transform_avx);
+	return sha256_fpu_update(desc, data, len, sha256_transform_avx);
 }
 
 static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
 		      unsigned int len, u8 *out)
 {
-	return sha256_finup(desc, data, len, out, sha256_transform_avx);
+	return sha256_fpu_finup(desc, data, len, out, sha256_transform_avx);
 }
 
 static int sha256_avx_final(struct shash_desc *desc, u8 *out)
@@ -236,13 +244,13 @@ asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
 static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int len)
 {
-	return sha256_update(desc, data, len, sha256_transform_rorx);
+	return sha256_fpu_update(desc, data, len, sha256_transform_rorx);
 }
 
 static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
 		      unsigned int len, u8 *out)
 {
-	return sha256_finup(desc, data, len, out, sha256_transform_rorx);
+	return sha256_fpu_finup(desc, data, len, out, sha256_transform_rorx);
 }
 
 static int sha256_avx2_final(struct shash_desc *desc, u8 *out)
@@ -318,13 +326,13 @@ asmlinkage void sha256_ni_transform(u32 *digest, const char *data,
 static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int len)
 {
-	return sha256_update(desc, data, len, sha256_ni_transform);
+	return sha256_fpu_update(desc, data, len, sha256_ni_transform);
 }
 
 static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
 		      unsigned int len, u8 *out)
 {
-	return sha256_finup(desc, data, len, out, sha256_ni_transform);
+	return sha256_fpu_finup(desc, data, len, out, sha256_ni_transform);
 }
 
 static int sha256_ni_final(struct shash_desc *desc, u8 *out)
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index 25e068ba3382..ed6e80b844cf 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -10,7 +10,7 @@
  * Version 2.  See the file COPYING for more details.
  */
 
-#include "sha256.h"
+#include <crypto/sha.h>
 #include "../boot/string.h"
 
 struct sha_region {
diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c
index 548ca675a14a..724925d5da61 100644
--- a/arch/x86/purgatory/sha256.c
+++ b/arch/x86/purgatory/sha256.c
@@ -17,7 +17,7 @@
 
 #include <linux/bitops.h>
 #include <asm/byteorder.h>
-#include "sha256.h"
+#include <crypto/sha.h>
 #include "../boot/string.h"
 
 static inline u32 Ch(u32 x, u32 y, u32 z)
@@ -208,22 +208,7 @@ static void sha256_transform(u32 *state, const u8 *input)
 	memset(W, 0, 64 * sizeof(u32));
 }
 
-int sha256_init(struct sha256_state *sctx)
-{
-	sctx->state[0] = SHA256_H0;
-	sctx->state[1] = SHA256_H1;
-	sctx->state[2] = SHA256_H2;
-	sctx->state[3] = SHA256_H3;
-	sctx->state[4] = SHA256_H4;
-	sctx->state[5] = SHA256_H5;
-	sctx->state[6] = SHA256_H6;
-	sctx->state[7] = SHA256_H7;
-	sctx->count = 0;
-
-	return 0;
-}
-
-int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
+void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
 {
 	unsigned int partial, done;
 	const u8 *src;
@@ -249,11 +234,9 @@ int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
 		partial = 0;
 	}
 	memcpy(sctx->buf + partial, src, len - done);
-
-	return 0;
 }
 
-int sha256_final(struct sha256_state *sctx, u8 *out)
+void sha256_final(struct sha256_state *sctx, u8 *out)
 {
 	__be32 *dst = (__be32 *)out;
 	__be64 bits;
@@ -278,6 +261,4 @@ int sha256_final(struct sha256_state *sctx, u8 *out)
 
 	/* Zeroize sensitive information. */
 	memset(sctx, 0, sizeof(*sctx));
-
-	return 0;
 }
diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h
deleted file mode 100644
index bd15a4127735..000000000000
--- a/arch/x86/purgatory/sha256.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- *  Copyright (C) 2014 Red Hat Inc.
- *
- *  Author: Vivek Goyal <vgoyal@redhat.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2.  See the file COPYING for more details.
- */
-
-#ifndef SHA256_H
-#define SHA256_H
-
-
-#include <linux/types.h>
-#include <crypto/sha.h>
-
-extern int sha256_init(struct sha256_state *sctx);
-extern int sha256_update(struct sha256_state *sctx, const u8 *input,
-				unsigned int length);
-extern int sha256_final(struct sha256_state *sctx, u8 *hash);
-
-#endif /* SHA256_H */
diff --git a/crypto/sha256_generic.c b/crypto/sha256_generic.c
index 8f9c47e1a96e..f2747893402c 100644
--- a/crypto/sha256_generic.c
+++ b/crypto/sha256_generic.c
@@ -231,6 +231,13 @@ static void sha256_transform(u32 *state, const u8 *input)
 	memzero_explicit(W, 64 * sizeof(u32));
 }
 
+int sha256_base_init(struct shash_desc *desc)
+{
+	sha256_init(shash_desc_ctx(desc));
+	return 0;
+}
+EXPORT_SYMBOL(sha256_base_init);
+
 static void sha256_generic_block_fn(struct sha256_state *sst, u8 const *src,
 				    int blocks)
 {
@@ -240,32 +247,49 @@ static void sha256_generic_block_fn(struct sha256_state *sst, u8 const *src,
 	}
 }
 
-int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
+void sha256_update(struct sha256_state *sctx, const u8 *data,
 			  unsigned int len)
 {
-	return sha256_base_do_update(desc, data, len, sha256_generic_block_fn);
+	sha256_base_do_update(sctx, data, len, sha256_generic_block_fn);
+}
+EXPORT_SYMBOL(sha256_update);
+
+void sha256_final(struct sha256_state *sctx, u8 *out)
+{
+	sha256_base_do_finalize(sctx, sha256_generic_block_fn);
+	sha256_base_finish(sctx, out);
 }
-EXPORT_SYMBOL(crypto_sha256_update);
+EXPORT_SYMBOL(sha256_final);
 
-static int sha256_final(struct shash_desc *desc, u8 *out)
+static int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
+				unsigned int len)
 {
-	sha256_base_do_finalize(desc, sha256_generic_block_fn);
-	return sha256_base_finish(desc, out);
+	sha256_update(shash_desc_ctx(desc), data, len);
+	return 0;
+}
+
+int crypto_sha2_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sha256_base_do_finalize(sctx, sha256_generic_block_fn);
+	sha2_base_finish(sctx, crypto_shash_digestsize(desc->tfm), out);
+	return 0;
 }
+EXPORT_SYMBOL(crypto_sha2_final);
 
-int crypto_sha256_finup(struct shash_desc *desc, const u8 *data,
-			unsigned int len, u8 *hash)
+static int crypto_sha256_finup(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, u8 *hash)
 {
-	sha256_base_do_update(desc, data, len, sha256_generic_block_fn);
-	return sha256_final(desc, hash);
+	sha256_finup(shash_desc_ctx(desc), data, len, hash);
+	return 0;
 }
-EXPORT_SYMBOL(crypto_sha256_finup);
 
 static struct shash_alg sha256_algs[2] = { {
 	.digestsize	=	SHA256_DIGEST_SIZE,
 	.init		=	sha256_base_init,
 	.update		=	crypto_sha256_update,
-	.final		=	sha256_final,
+	.final		=	crypto_sha2_final,
 	.finup		=	crypto_sha256_finup,
 	.descsize	=	sizeof(struct sha256_state),
 	.base		=	{
@@ -279,7 +303,7 @@ static struct shash_alg sha256_algs[2] = { {
 	.digestsize	=	SHA224_DIGEST_SIZE,
 	.init		=	sha224_base_init,
 	.update		=	crypto_sha256_update,
-	.final		=	sha256_final,
+	.final		=	crypto_sha2_final,
 	.finup		=	crypto_sha256_finup,
 	.descsize	=	sizeof(struct sha256_state),
 	.base		=	{
diff --git a/include/crypto/sha.h b/include/crypto/sha.h
index c94d3eb1cefd..2b6978471605 100644
--- a/include/crypto/sha.h
+++ b/include/crypto/sha.h
@@ -96,11 +96,30 @@ extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
 extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data,
 			     unsigned int len, u8 *hash);
 
-extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
-			      unsigned int len);
-
-extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data,
-			       unsigned int len, u8 *hash);
+static inline void sha256_init(struct sha256_state *sctx)
+{
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+}
+
+extern void sha256_update(struct sha256_state *sctx, const u8 *data,
+			  unsigned int len);
+
+extern void sha256_final(struct sha256_state *sctx, u8 *out);
+
+static inline void sha256_finup(struct sha256_state *sctx, const u8 *data,
+				unsigned int len, u8 *hash)
+{
+	sha256_update(sctx, data, len);
+	sha256_final(sctx, hash);
+}
 
 extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
 			      unsigned int len);
diff --git a/include/crypto/sha256_base.h b/include/crypto/sha256_base.h
index d1f2195bb7de..f65d9a516b36 100644
--- a/include/crypto/sha256_base.h
+++ b/include/crypto/sha256_base.h
@@ -35,29 +35,13 @@ static inline int sha224_base_init(struct shash_desc *desc)
 	return 0;
 }
 
-static inline int sha256_base_init(struct shash_desc *desc)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA256_H0;
-	sctx->state[1] = SHA256_H1;
-	sctx->state[2] = SHA256_H2;
-	sctx->state[3] = SHA256_H3;
-	sctx->state[4] = SHA256_H4;
-	sctx->state[5] = SHA256_H5;
-	sctx->state[6] = SHA256_H6;
-	sctx->state[7] = SHA256_H7;
-	sctx->count = 0;
-
-	return 0;
-}
+extern int sha256_base_init(struct shash_desc *desc);
 
-static inline int sha256_base_do_update(struct shash_desc *desc,
+static inline void sha256_base_do_update(struct sha256_state *sctx,
 					const u8 *data,
 					unsigned int len,
 					sha256_block_fn *block_fn)
 {
-	struct sha256_state *sctx = shash_desc_ctx(desc);
 	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
 
 	sctx->count += len;
@@ -86,15 +70,12 @@ static inline int sha256_base_do_update(struct shash_desc *desc,
 	}
 	if (len)
 		memcpy(sctx->buf + partial, data, len);
-
-	return 0;
 }
 
-static inline int sha256_base_do_finalize(struct shash_desc *desc,
+static inline void sha256_base_do_finalize(struct sha256_state *sctx,
 					  sha256_block_fn *block_fn)
 {
 	const int bit_offset = SHA256_BLOCK_SIZE - sizeof(__be64);
-	struct sha256_state *sctx = shash_desc_ctx(desc);
 	__be64 *bits = (__be64 *)(sctx->buf + bit_offset);
 	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
 
@@ -109,14 +90,11 @@ static inline int sha256_base_do_finalize(struct shash_desc *desc,
 	memset(sctx->buf + partial, 0x0, bit_offset - partial);
 	*bits = cpu_to_be64(sctx->count << 3);
 	block_fn(sctx, sctx->buf, 1);
-
-	return 0;
 }
 
-static inline int sha256_base_finish(struct shash_desc *desc, u8 *out)
+static inline void sha2_base_finish(struct sha256_state *sctx,
+				    unsigned int digest_size, u8 *out)
 {
-	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
-	struct sha256_state *sctx = shash_desc_ctx(desc);
 	__be32 *digest = (__be32 *)out;
 	int i;
 
@@ -124,5 +102,11 @@ static inline int sha256_base_finish(struct shash_desc *desc, u8 *out)
 		put_unaligned_be32(sctx->state[i], digest++);
 
 	*sctx = (struct sha256_state){};
-	return 0;
 }
+
+static inline void sha256_base_finish(struct sha256_state *sctx, u8 *out)
+{
+	sha2_base_finish(sctx, SHA256_DIGEST_SIZE, out);
+}
+
+extern int crypto_sha2_final(struct shash_desc *desc, u8 *out);
-- 
2.9.3

^ permalink raw reply related

* Re: George's crazy full state idea (Re: HalfSipHash Acceptable Usage)
From: George Spelvin @ 2016-12-24  1:17 UTC (permalink / raw)
  To: daniel, hannes, linux
  Cc: ak, davem, David.Laight, ebiggers3, eric.dumazet, Jason,
	kernel-hardening, linux-crypto, linux-kernel, luto, netdev, tom,
	tytso, vegard.nossum
In-Reply-To: <254f8cd7-d045-172d-8692-6052d9da287e@stressinduktion.org>

Hannes Frederic Sowa wrote:
> On 24.12.2016 00:39, George Spelvin wrote:
>> We just finished discussing why 8 bytes isn't enough.  If you only
>> feed back 8 bytes, an attacker who can do 2^64 computation can find it
>> (by guessing and computing forward to verify the guess) and recover the
>> previous state.  You need to feed back at least as much output as your
>> security targete.  For /dev/urandom's ChaCha20, that's 256 bits.

> I followed the discussion but it appeared to me that this has the
> additional constraint of time until the next reseeding event happenes,
> which is 300s (under the assumption that calls to get_random_int happen
> regularly, which I expect right now). After that the existing reseeding
> mechansim will ensure enough backtracking protection. The number of
> bytes can easily be increased here, given that reseeding was shown to be
> quite fast already and we produce enough output. But I am not sure if
> this is a bit overengineered in the end?

I'm not following your description of how the time-based and call-based
mechanisms interact, but for any mix-back, you should either do enough
or none at all.  (Also called "catastrophic reseeding".)

For example, two mix-backs of 64 bits gives you 65 bit security, not 128.
(Because each mixback can be guessed and verified separately.)

> Also agreed. Given your solution below to prandom_u32, I do think it
> might also work without the seqlock now.

It's not technically a seqlock; in particular the reader doesn't
spin.  But the write side, and general logic is so similar it's
a good mental model.

Basically, assume a 64-byte buffer.  The reader has gone through
32 bytes of it, and has 32 left, and when he reads another 8 bytes,
has to distinguish three cases:

1) No update; we read the old bytes and there are now 32 - 24 bytes left.
2) Update completed while we weren't looking.  There are now new
   bytes in the buffer, and we took 8 leaving 64 - 8 = 56.
3) Update in progress at the time of the read.  We don't know if we
   are seeing old bytes or new bytes, so we have to assume the worst
   and not proceeed unless 32 >= 8, but assume at the end there are
   64 - 8 = 56 new bytes left.

> I wouldn't have added a disable irqs, but given that I really like your
> proposal, I would take it in my todo branch and submit it when net-next
> window opens.

If you want that, I have a pile of patches to prandom I really
should push upstream.  Shall I refresh them and send them to you?


commit 4cf1b3d9f4fbccc29ffc2fe4ca4ff52ea77253f1
Author: George Spelvin <linux@horizon.com>
Date:   Mon Aug 31 00:05:00 2015 -0400

    net: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    The net/802 code was already efficient, but prandom_u32_max() is simpler.
    
    In net/batman-adv/bat_iv_ogm.c, batadv_iv_ogm_fwd_send_time() got changed
    from picking a random number of milliseconds and converting to jiffies to
    picking a random number of jiffies, since the number of milliseconds (and
    thus the conversion to jiffies) is a compile-time constant.  The equivalent
    code in batadv_iv_ogm_emit_send_time was not changed, because the number
    of milliseconds is variable.
    
    In net/ipv6/ip6_flowlabel.c, ip6_flowlabel had htonl(prandom_u32()),
    which is silly.  Just cast to __be32 without permuting the bits.
    
    net/sched/sch_netem.c got adjusted to only need one call to prandom_u32
    instead of 2.  (Assuming skb_headlen can't exceed 512 MiB, which is
    hopefully safe for some time yet.)
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 9c8fb80e1fd2be42c35cab1af27187d600fd85e3
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 15:20:47 2014 -0400

    mm/swapfile.c: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 2743eb01e5c5958fd88ae78d19c5fea772d4b117
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 15:19:53 2014 -0400

    lib: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 6a5e91bf395060a3351bfe5efc40ac20ffba2c1b
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 15:18:50 2014 -0400

    fs/xfs: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range".
    
    Also changed the last argument of xfs_error_test() from "unsigned long"
    to "unsigned", since the code never did support values > 2^32, and
    the largest value ever passed is 100.
    
    The code could be improved even further by passing in 2^32/rf rather
    than rf, but I'll leave that to some XFS developers.
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 6f6d485d9179ca6ec4e30caa06ade0e0c6931810
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 15:00:17 2014 -0400

    fs/ubifs: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range".
    
    In fs/ubifs/debug.c, the chance() function got rewritten entirely
    to take advantage of the fact its arguments are always constant.
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 0b6bf2c874bbbcfa74f6be0413c772b3ac134d12
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 14:52:17 2014 -0400

    fs/extX: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range".
    
    In ext4/ialloc.c:436, there's one more chance to do that, but
    the modulo is required to keep the deterministic option consistent,
    so I left it alone.
    
    Switching to "parent_group = ((u64)grp * ngroups) >> 32;" would be more
    efficient, but would change user-visible behavior.
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit e79e0e8b491bc976c0b4e1b2070eccdf55b34f43
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 14:47:15 2014 -0400

    fs/ceph: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit fc628326d8cf4abe364ea01259bc392c0bbaf5a0
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 14:46:29 2014 -0400

    drivers/scsi/fcoe/fcoe_ctlr.c: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 4810d67dd2edf08e7801ef47550d46b7397fe2dc
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 14:45:55 2014 -0400

    drivers/ntb/ntb_hw.c: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit f4a806abbc0785e8f0363e02fac613246eed9e34
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 14:45:27 2014 -0400

    drivers/net: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    In some cases (like ethernet/broadcom/cnic.c and drivers/net/hamradio),
    the range is a compile-time constant power of 2, so the code doesn't
    get any better, but I'm trying to do a full sweep.
    
    In drivers/net/wireless/brcm80211/brcmfmac/p2p.c, a timeout is selected from
    100, 200 or 300 ms.  It would be easy enough to make the granularity finer,
    but I assume the existing code is that way for a reason.
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 603e0c311fac09d633ff6c0fd9242108f1c52ead
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 13:55:09 2014 -0400

    drivers/mtd: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range".
    
    And rewrote the 1-in-N code in drivers/mtd/ubi/debug.h to avoid
    even more arithmetic.
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit e0657cc865e8e02768906935b8e8bf63af58aa46
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 13:52:13 2014 -0400

    drivers/mmc/core: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 017ee6841ec8d416093fc1f18bdd3df0dfc6aacc
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 13:51:33 2014 -0400

    drivers/lguest/page_tables.c: Use prandom_u32_max()
    
    This doesn't actually change the code because the array size is
    a power of 2 (it's a 4-element array).  But I'm on a roll.
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 87556165f46eb42c756bcb94e062c3fbd272a4bf
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 13:49:41 2014 -0400

    drivers/infiniband: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 1eafe1d429f442218810e8c604d4e7c466414cf3
Author: George Spelvin <linux@horizon.com>
Date:   Sun Aug 30 23:42:41 2015 -0400

    block/blk-mq-tag.c: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit f03ee59a63098d244d5b8932fc68c9fc3e2bb222
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 13:46:52 2014 -0400

    arch/x86: Use prandom_u32_max()
    
    It's slightly more efficient than "prandom_u32() % range"
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit feafd3a3fb09924ea633d677a7ab8a25a817f39d
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 13:44:49 2014 -0400

    lib/random32.c: Remove redundant U suffixes on integers
    
    Get rid of a few of the extraneous U suffixes on ordinary integers.
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit f14328d248e59c862478633479528c9cb8554d7a
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 12:40:19 2014 -0400

    lib/random32.c: Randomize timeout to the millisecond, not the second.
    
    If you're going to bother randomizing it, do it right.
    And use prandom_u32_max(), which is designed for the job, rather
    than "% 40".
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 143342006adfff718afedf58f639b72337d7c816
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 12:51:26 2014 -0400

    lib/random32.c: Make prandom_u32_max efficient for powers of 2
    
    The multiply-and-shift is efficient in the general case, but slower
    than a simple bitwise AND if the range is a power of 2.  Make the function
    handle this case so callers don't have to worry about micro-optimizing it.
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 99864db5cb023e6b09d71cde4997a5f6cafb5cca
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 12:02:17 2014 -0400

    lib/random32.c: Use <asm/unaligned.h> instead of hand-rolling it
    
    The functions exist for a reason; the manual byte-at-a-time code
    is unnecessarily slow (and bloated).
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 4ecd45f6eadd4d171782dc6b451ed1040e47d419
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 11:55:59 2014 -0400

    lib/random32.c: Debloat non-speed-critical code
    
    Unrolling code in rarely-used code paths is just silly.  There are
    two places that static calls to prandom_u32_state() can be removed:
    
    1) prandom_warmup() calls prandom_u32_state 10 times.
    2) prandom_state_selftest() can avoid one call and simplify the
       loop logic by repeating an assignment to a local variable
       (which probably adds zero code anyway).
    
    Signed-off-by: George Spelvin <linux@horizon.com>

commit 8765cff45da1d96e4310d50dd49231790c49b612
Author: George Spelvin <linux@horizon.com>
Date:   Sat May 24 11:52:34 2014 -0400

    lib/random32.c: Mark self-test data as __initconst
    
    So it can be thrown away along with the code that uses it.
    
    Signed-off-by: George Spelvin <linux@horizon.com>

^ permalink raw reply

* Re: George's crazy full state idea (Re: HalfSipHash Acceptable Usage)
From: Hannes Frederic Sowa @ 2016-12-24  0:12 UTC (permalink / raw)
  To: George Spelvin, daniel
  Cc: ak, davem, David.Laight, ebiggers3, eric.dumazet, Jason,
	kernel-hardening, linux-crypto, linux-kernel, luto, netdev, tom,
	tytso, vegard.nossum
In-Reply-To: <20161223233904.11739.qmail@ns.sciencehorizons.net>

Hi,

On 24.12.2016 00:39, George Spelvin wrote:
> Hannes Frederic Sowa wrote:
>> In general this looks good, but bitbuffer needs to be protected from
>> concurrent access, thus needing at least some atomic instruction and
>> disabling of interrupts for the locking if done outside of
>> get_random_long. Thus I liked your previous approach more where you
>> simply embed this in the already necessary get_random_long and aliased
>> get_random_long as get_random_bits(BITS_PER_LONG) accordingly, IMHO.
> 
> It's meant to be part of the same approach, and I didn't include locking
> because that's a requirement for *any* solution, and isn't specific
> to the part I was trying to illustrate.
> 
> (As for re-using the name "get_random_long", that was just so
> I didn't have to explain it.  Call it "get_random_long_internal"
> if you like.)
> 
> Possible locking implementations include:
> 1) Use the same locking as applies to get_random_long_internal(), or
> 2) Make bitbuffer a per-CPU variable (note that we currently have 128
>    bits of per-CPU state in get_random_int_hash[]), and this is all a
>    fast-path to bypass heavier locking in get_random_long_internal().

I understood that this is definitely a solvable problem.

>>> But, I just realized I've been overlooking something glaringly obvious...
>>> there's no reason you can't compute multple blocks in advance.
>>
>> In the current code on the cost of per cpu allocations thus memory.
> 
> Yes, but on 4-core machines it's still not much, and 4096-core
> behemoths have RAM to burn.
> 
>> In the extract_crng case, couldn't we simply use 8 bytes of the 64 byte
>> return block to feed it directly back into the state chacha? So we pass
>> on 56 bytes into the pcpu buffer, and consume 8 bytes for the next
>> state. This would make the window max shorter than the anti
>> backtracking protection right now from 300s to 14 get_random_int calls.
>> Not sure if this is worth it.
> 
> We just finished discussing why 8 bytes isn't enough.  If you only
> feed back 8 bytes, an attacker who can do 2^64 computation can find it
> (by guessing and computing forward to verify the guess) and recover the
> previous state.  You need to feed back at least as much output as your
> security targete.  For /dev/urandom's ChaCha20, that's 256 bits.

I followed the discussion but it appeared to me that this has the
additional constraint of time until the next reseeding event happenes,
which is 300s (under the assumption that calls to get_random_int happen
regularly, which I expect right now). After that the existing reseeding
mechansim will ensure enough backtracking protection. The number of
bytes can easily be increased here, given that reseeding was shown to be
quite fast already and we produce enough output. But I am not sure if
this is a bit overengineered in the end?

>>> For example, suppose we gave each CPU a small pool to minimize locking.
>>> When one runs out and calls the global refill, it could refill *all*
>>> of the CPU pools.  (You don't even need locking; there may be a race to
>>> determine *which* random numbers the reader sees, but they're equally
>>> good either way.)
> 
>> Yes, but still disabled interrupts, otherwise the same state could be
>> used twice on the same CPU. Uff, I think we have this problem in
>> prandom_u32.
> 
> There are some locking gotchas, but it is doable lock-free.
> 
> Basically, it's a seqlock.  The writer increments it once (to an odd
> number) before starting to overwrite the buffer, and a second time (to
> an even number) after.  "Before" and "after" mean smp_wmb().
> 
> The reader can use this to figure out how much of the data in the buffer
> is safely fresh.  The full sequence of checks is a bit intricate,
> but straightforward.
> 
> I didn't discuss the locking because I'm confident it's solvable,
> not because I wasn't aware it has to be solved.

Also agreed. Given your solution below to prandom_u32, I do think it
might also work without the seqlock now.

> As for prandom_u32(), what's the problem?  Are you worried that
> get_cpu_var disables preemption but not interrupts, and so an
> ISR might return the same value as process-level code?

Yes, exactly those were my thoughts.

> First of all, that's not a problem because prandom_u32() doesn't
> have security guarantees.  Occasionally returning a dupicate number
> is okay.
>
> Second, if you do care, that could be trivially fixed by throwing
> a barrier() in the middle of the code.  (Patch appended; S-o-B
> if anyone wants it.)

I wouldn't have added a disable irqs, but given that I really like your
proposal, I would take it in my todo branch and submit it when net-next
window opens.

> diff --git a/lib/random32.c b/lib/random32.c
> index c750216d..6bee4a36 100644
> --- a/lib/random32.c
> +++ b/lib/random32.c
> [...]

Thanks,
Hannes

^ permalink raw reply

* Re: George's crazy full state idea (Re: HalfSipHash Acceptable Usage)
From: George Spelvin @ 2016-12-23 23:39 UTC (permalink / raw)
  To: daniel, hannes, linux
  Cc: ak, davem, David.Laight, ebiggers3, eric.dumazet, Jason,
	kernel-hardening, linux-crypto, linux-kernel, luto, netdev, tom,
	tytso, vegard.nossum
In-Reply-To: <1482526135.2554.5.camel@stressinduktion.org>

Hannes Frederic Sowa wrote:
> In general this looks good, but bitbuffer needs to be protected from
> concurrent access, thus needing at least some atomic instruction and
> disabling of interrupts for the locking if done outside of
> get_random_long. Thus I liked your previous approach more where you
> simply embed this in the already necessary get_random_long and aliased
> get_random_long as get_random_bits(BITS_PER_LONG) accordingly, IMHO.

It's meant to be part of the same approach, and I didn't include locking
because that's a requirement for *any* solution, and isn't specific
to the part I was trying to illustrate.

(As for re-using the name "get_random_long", that was just so
I didn't have to explain it.  Call it "get_random_long_internal"
if you like.)

Possible locking implementations include:
1) Use the same locking as applies to get_random_long_internal(), or
2) Make bitbuffer a per-CPU variable (note that we currently have 128
   bits of per-CPU state in get_random_int_hash[]), and this is all a
   fast-path to bypass heavier locking in get_random_long_internal().

>> But, I just realized I've been overlooking something glaringly obvious...
>> there's no reason you can't compute multple blocks in advance.
>
> In the current code on the cost of per cpu allocations thus memory.

Yes, but on 4-core machines it's still not much, and 4096-core
behemoths have RAM to burn.

> In the extract_crng case, couldn't we simply use 8 bytes of the 64 byte
> return block to feed it directly back into the state chacha? So we pass
> on 56 bytes into the pcpu buffer, and consume 8 bytes for the next
> state. This would make the window max shorter than the anti
> backtracking protection right now from 300s to 14 get_random_int calls.
> Not sure if this is worth it.

We just finished discussing why 8 bytes isn't enough.  If you only
feed back 8 bytes, an attacker who can do 2^64 computation can find it
(by guessing and computing forward to verify the guess) and recover the
previous state.  You need to feed back at least as much output as your
security targete.  For /dev/urandom's ChaCha20, that's 256 bits.

>> For example, suppose we gave each CPU a small pool to minimize locking.
>> When one runs out and calls the global refill, it could refill *all*
>> of the CPU pools.  (You don't even need locking; there may be a race to
>> determine *which* random numbers the reader sees, but they're equally
>> good either way.)

> Yes, but still disabled interrupts, otherwise the same state could be
> used twice on the same CPU. Uff, I think we have this problem in
> prandom_u32.

There are some locking gotchas, but it is doable lock-free.

Basically, it's a seqlock.  The writer increments it once (to an odd
number) before starting to overwrite the buffer, and a second time (to
an even number) after.  "Before" and "after" mean smp_wmb().

The reader can use this to figure out how much of the data in the buffer
is safely fresh.  The full sequence of checks is a bit intricate,
but straightforward.

I didn't discuss the locking because I'm confident it's solvable,
not because I wasn't aware it has to be solved.

As for prandom_u32(), what's the problem?  Are you worried that
get_cpu_var disables preemption but not interrupts, and so an
ISR might return the same value as process-level code?

First of all, that's not a problem because prandom_u32() doesn't
have security guarantees.  Occasionally returning a dupicate number
is okay.

Second, if you do care, that could be trivially fixed by throwing
a barrier() in the middle of the code.  (Patch appended; S-o-B
if anyone wants it.)

diff --git a/lib/random32.c b/lib/random32.c
index c750216d..6bee4a36 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -55,16 +55,29 @@ static DEFINE_PER_CPU(struct rnd_state, net_rand_state);
  *
  *	This is used for pseudo-randomness with no outside seeding.
  *	For more random results, use prandom_u32().
+ *
+ *	The barrier() is to allow prandom_u32() to be called from interupt
+ *	context without locking.  An interrupt will run to completion,
+ *	updating all four state variables.  The barrier() ensures that
+ *	the interrupted code will compute a different result.  Either it
+ *	will have written s1 and s2 (so the interrupt will start with
+ *	the updated values), or it will use the values of s3 and s4
+ *	updated by the interrupt.
+ *
+ *	(The same logic applies recursively to nested interrupts, trap
+ *	handlers, and NMIs.)
  */
 u32 prandom_u32_state(struct rnd_state *state)
 {
+	register u32 x;
 #define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b)
-	state->s1 = TAUSWORTHE(state->s1,  6, 13, 4294967294U, 18U);
-	state->s2 = TAUSWORTHE(state->s2,  2, 27, 4294967288U,  2U);
-	state->s3 = TAUSWORTHE(state->s3, 13, 21, 4294967280U,  7U);
-	state->s4 = TAUSWORTHE(state->s4,  3, 12, 4294967168U, 13U);
+	x  = state->s1 = TAUSWORTHE(state->s1,  6, 13,   ~1u, 18);
+	x += state->s2 = TAUSWORTHE(state->s2,  2, 27,   ~7u,  2);
+	barrier();
+	x ^= state->s3 = TAUSWORTHE(state->s3, 13, 21,  ~15u,  7);
+	x += state->s4 = TAUSWORTHE(state->s4,  3, 12, ~127u, 13);

-	return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4);
+	return x;
 }
 EXPORT_SYMBOL(prandom_u32_state);

^ permalink raw reply related

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Jason A. Donenfeld @ 2016-12-23 21:18 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Andy Lutomirski, Daniel Borkmann, Alexei Starovoitov,
	kernel-hardening@lists.openwall.com, Theodore Ts'o, Netdev,
	LKML, Linux Crypto Mailing List, David Laight, Eric Dumazet,
	Linus Torvalds, Eric Biggers, Tom Herbert, Andi Kleen,
	David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <ec43894a-7b54-ff30-9874-bb4a25314e09@stressinduktion.org>

On Fri, Dec 23, 2016 at 7:19 PM, Hannes Frederic Sowa
<hannes@stressinduktion.org> wrote:
> Factoring out sha3

Per the other thread, you probably don't actually want SHA3, because
it's slow in software. You want SHA2. If you want something faster and
better, then Blake2 is most certainly the way to go. I'll be
submitting some patches in a month or so adding Blake2 for future use
in the tree.

Jason

^ permalink raw reply

* Re: George's crazy full state idea (Re: HalfSipHash Acceptable Usage)
From: Hannes Frederic Sowa @ 2016-12-23 20:48 UTC (permalink / raw)
  To: George Spelvin
  Cc: ak, davem, David.Laight, ebiggers3, eric.dumazet, Jason,
	kernel-hardening, linux-crypto, linux-kernel, luto, netdev, tom,
	tytso, vegard.nossum
In-Reply-To: <20161223182626.6290.qmail@ns.sciencehorizons.net>

Hi,

On Fri, 2016-12-23 at 13:26 -0500, George Spelvin wrote:
> (Cc: list trimmed slightly as the topic is wandering a bit.)
> 
> Hannes Frederic Sowa wrote:
> > On Thu, 2016-12-22 at 19:07 -0500, George Spelvin wrote:
> > > Adding might_lock() annotations will improve coverage a lot.
> > 
> > Might be hard to find the correct lock we take later down the code
> > path, but if that is possible, certainly.
> 
> The point of might_lock() is that you don't have to.  You find the
> worst case (most global) lock that the code *might* take if all the
> buffer-empty conditions are true, and tell lockdep "assume this lock is
> taken all the time".

Yes, of course. Probably indicating input_pool's and primary_crng's
locks are good to indicate here, but on NUMA other locks might be
taken.

> > > Hannes Frederic Sowa wrote:
> > > > Yes, that does look nice indeed. Accounting for bits instead of bytes
> > > > shouldn't be a huge problem either. Maybe it gets a bit more verbose in
> > > > case you can't satisfy a request with one batched entropy block and have
> > > > to consume randomness from two.
> 
> For example, here's a simple bit-buffer implementation that wraps around
> a get_random_long.  The bitbuffer is of the form "00001xxxx", where the
> x bits are valid, and the position of the msbit indicates how many bits
> are valid.
> 
> extern unsigned long get_random_long();
> static unsigned long bitbuffer = 1;	/* Holds 0..BITS_PER_LONG-1 bits */
> unsigned long get_random_bits(unsigned char bits)
> {
> 	/* We handle bits == BITS_PER_LONG,and not bits == 0 */
> 	unsigned long mask = -1ul >> (BITS_PER_LONG - bits);
> 	unsigned long val;
> 
> 	if (bitbuffer > mask) {
> 		/* Request can be satisfied out of the bit buffer */
> 		val = bitbuffer;
> 		bitbuffer >>= bits;
> 	} else {
> 		/*
> 		 * Not enough bits, but enough room in bitbuffer for the
> 		 * leftovers.  avail < bits, so avail + 64 <= bits + 63.
> 		 */
> 		val = get_random_long();
> 		bitbuffer = bitbuffer << (BITS_PER_LONG - bits)
> 			| val >> 1 >> (bits-1);
> 	}
> 	return val & mask;
> }

In general this looks good, but bitbuffer needs to be protected from
concurrent access, thus needing at least some atomic instruction and
disabling of interrupts for the locking if done outside of
get_random_long. Thus I liked your previous approach more where you
simply embed this in the already necessary get_random_long and aliased
get_random_long as get_random_bits(BITS_PER_LONG) accordingly, IMHO.

> > When we hit the chacha20 without doing a reseed we only mutate the
> > state of chacha, but being an invertible function in its own, a
> > proposal would be to mix parts of the chacha20 output back into the
> > state, which, as a result, would cause slowdown because we couldn't
> > propagate the complete output of the cipher back to the caller (looking
> > at the function _extract_crng).
> 
> Basically, yes.  Half of the output goes to rekeying itself.

Okay, thanks and understood. :)

> But, I just realized I've been overlooking something glaringly obvious...
> there's no reason you can't compute multple blocks in advance.

In the current code on the cost of per cpu allocations thus memory.

> The standard assumption in antibacktracking is that you'll *notice* the
> state capture and stop trusting the random numbers afterward; you just
> want the output *before* to be secure.  In other words, cops busting
> down the door can't find the session key used in the message you just sent.

Yep, analogous to forward secrecy and future secrecy (self healing) is
provided by catastrophic reseeding from /dev/random.

In the extract_crng case, couldn't we simply use 8 bytes of the 64 byte
return block to feed it directly back into the state chacha? So we pass
on 56 bytes into the pcpu buffer, and consume 8 bytes for the next
state. This would make the window max shorter than the anti
backtracking protection right now from 300s to 14 get_random_int calls.
Not sure if this is worth it.

> So you can compute and store random numbers ahead of need.
> 
> This can amortize the antibacktracking as much as you'd like.
> 
> For example, suppose we gave each CPU a small pool to minimize locking.
> When one runs out and calls the global refill, it could refill *all*
> of the CPU pools.  (You don't even need locking; there may be a race to
> determine *which* random numbers the reader sees, but they're equally
> good either way.)

Yes, but still disabled interrupts, otherwise the same state could be
used twice on the same CPU. Uff, I think we have this problem in
prandom_u32.

> > Or are you referring that the anti-backtrack protection should happen
> > in every call from get_random_int?
> 
> If you ask for anti-backtracking without qualification, that's the
> goal, since you don't know how long will elapse until the next call.  
> 
> It's like fsync().  There are lots of more limited forms of "keep my
> data safe in case of a crash", but the most basic one is "if we lost
> power the very instant the call returned, the data would be safe."

Yes, it absolutely makes sense.

Thanks a lot,
Hannes

^ permalink raw reply

* Re: George's crazy full state idea (Re: HalfSipHash Acceptable Usage)
From: George Spelvin @ 2016-12-23 18:26 UTC (permalink / raw)
  To: hannes, linux
  Cc: ak, davem, David.Laight, ebiggers3, eric.dumazet, Jason,
	kernel-hardening, linux-crypto, linux-kernel, luto, netdev, tom,
	tytso, vegard.nossum
In-Reply-To: <1482494724.3353.7.camel@stressinduktion.org>

(Cc: list trimmed slightly as the topic is wandering a bit.)

Hannes Frederic Sowa wrote:
> On Thu, 2016-12-22 at 19:07 -0500, George Spelvin wrote:
>> Adding might_lock() annotations will improve coverage a lot.
>
> Might be hard to find the correct lock we take later down the code
> path, but if that is possible, certainly.

The point of might_lock() is that you don't have to.  You find the
worst case (most global) lock that the code *might* take if all the
buffer-empty conditions are true, and tell lockdep "assume this lock is
taken all the time".

>> Hannes Frederic Sowa wrote:
>>> Yes, that does look nice indeed. Accounting for bits instead of bytes
>>> shouldn't be a huge problem either. Maybe it gets a bit more verbose in
>>> case you can't satisfy a request with one batched entropy block and have
>>> to consume randomness from two.

For example, here's a simple bit-buffer implementation that wraps around
a get_random_long.  The bitbuffer is of the form "00001xxxx", where the
x bits are valid, and the position of the msbit indicates how many bits
are valid.

extern unsigned long get_random_long();
static unsigned long bitbuffer = 1;	/* Holds 0..BITS_PER_LONG-1 bits */
unsigned long get_random_bits(unsigned char bits)
{
	/* We handle bits == BITS_PER_LONG,and not bits == 0 */
	unsigned long mask = -1ul >> (BITS_PER_LONG - bits);
	unsigned long val;

	if (bitbuffer > mask) {
		/* Request can be satisfied out of the bit buffer */
		val = bitbuffer;
		bitbuffer >>= bits;
	} else {
		/*
		 * Not enough bits, but enough room in bitbuffer for the
		 * leftovers.  avail < bits, so avail + 64 <= bits + 63.
		 */
		val = get_random_long();
		bitbuffer = bitbuffer << (BITS_PER_LONG - bits)
			| val >> 1 >> (bits-1);
	}
	return val & mask;
}

> When we hit the chacha20 without doing a reseed we only mutate the
> state of chacha, but being an invertible function in its own, a
> proposal would be to mix parts of the chacha20 output back into the
> state, which, as a result, would cause slowdown because we couldn't
> propagate the complete output of the cipher back to the caller (looking
> at the function _extract_crng).

Basically, yes.  Half of the output goes to rekeying itself.

But, I just realized I've been overlooking something glaringly obvious...
there's no reason you can't compute multple blocks in advance.

The standard assumption in antibacktracking is that you'll *notice* the
state capture and stop trusting the random numbers afterward; you just
want the output *before* to be secure.  In other words, cops busting
down the door can't find the session key used in the message you just sent.

So you can compute and store random numbers ahead of need.

This can amortize the antibacktracking as much as you'd like.

For example, suppose we gave each CPU a small pool to minimize locking.
When one runs out and calls the global refill, it could refill *all*
of the CPU pools.  (You don't even need locking; there may be a race to
determine *which* random numbers the reader sees, but they're equally
good either way.)

> Or are you referring that the anti-backtrack protection should happen
> in every call from get_random_int?

If you ask for anti-backtracking without qualification, that's the
goal, since you don't know how long will elapse until the next call.  

It's like fsync().  There are lots of more limited forms of "keep my
data safe in case of a crash", but the most basic one is "if we lost
power the very instant the call returned, the data would be safe."

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Hannes Frederic Sowa @ 2016-12-23 18:19 UTC (permalink / raw)
  To: Andy Lutomirski, Daniel Borkmann
  Cc: Alexei Starovoitov, Jason A. Donenfeld,
	kernel-hardening@lists.openwall.com, Theodore Ts'o, Netdev,
	LKML, Linux Crypto Mailing List, David Laight, Eric Dumazet,
	Linus Torvalds, Eric Biggers, Tom Herbert, Andi Kleen,
	David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <CALCETrXHoPBV1UpLwtLjjmqJXVpgEUTCnwePDLx4g6a1SFMZaw@mail.gmail.com>

On 23.12.2016 17:42, Andy Lutomirski wrote:
> On Fri, Dec 23, 2016 at 8:23 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Dec 23, 2016 at 3:59 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>> On 12/23/2016 11:59 AM, Hannes Frederic Sowa wrote:
>>>>
>>>> On Fri, 2016-12-23 at 11:04 +0100, Daniel Borkmann wrote:
>>>>>
>>>>> On 12/22/2016 05:59 PM, Hannes Frederic Sowa wrote:
>>>>>>
>>>>>> On Thu, 2016-12-22 at 08:07 -0800, Andy Lutomirski wrote:
>>>
>>> [...]
>>>
>>>>>> The hashing is not a proper sha1 neither, unfortunately. I think that
>>>>>> is why it will have a custom implementation in iproute2?
>>>>>
>>>>>
>>>>> Still trying to catch up on this admittedly bit confusing thread. I
>>>>> did run automated tests over couple of days comparing the data I got
>>>>> from fdinfo with the one from af_alg and found no mismatch on the test
>>>>> cases varying from min to max possible program sizes. In the process
>>>>> of testing, as you might have seen on netdev, I found couple of other
>>>>> bugs in bpf code along the way and fixed them up as well. So my question,
>>>>> do you or Andy or anyone participating in claiming this have any
>>>>> concrete data or test cases that suggests something different? If yes,
>>>>> I'm very curious to hear about it and willing fix it up, of course.
>>>>> When I'm back from pto I'll prep and cook up my test suite to be
>>>>> included into the selftests/bpf/, should have done this initially,
>>>>> sorry about that. I'll also post something to expose the alg, that
>>>>> sounds fine to me.
>>>>
>>>>
>>>> Looking into your code closer, I noticed that you indeed seem to do the
>>>> finalization of sha-1 by hand by aligning and padding the buffer
>>>> accordingly and also patching in the necessary payload length.
>>>>
>>>> Apologies for my side for claiming that this is not correct sha1
>>>> output, I was only looking at sha_transform and its implementation and
>>>> couldn't see the padding and finalization round with embedding the data
>>>> length in there and hadn't thought of it being done manually.
>>>>
>>>> Anyway, is it difficult to get the sha finalization into some common
>>>> code library? It is not very bpf specific and crypto code reviewers
>>>> won't find it there at all.
>>>
>>>
>>> Yes, sure, I'll rework it that way (early next year when I'm back if
>>> that's fine with you).
>>
>> Can we make it SHA-256 before 4.10 comes out, though?  This really
>> looks like it will be used in situations where collisions matter and
>> it will be exposed to malicious programs, and SHA-1 should not be used
>> for new designs for this purpose because it simply isn't long enough.
>>
>> Also, a SHA-1 digest isn't a pile of u32s, so u32 digest[...] is very
>> misleading.  That should be u8 or, at the very least, __be32.
>>
>> I realize that there isn't a sha-256 implementation in lib, but would
>> it really be so bad to make the bpf digest only work (for now) when
>> crypto is enabled?  I would *love* to see the crypto core learn how to
>> export simple primitives for direct use without needing the whole
>> crypto core, and this doesn't seem particularly hard to do, but I
>> don't think that's 4.10 material.
> 
> I'm going to try to send out RFC patches for all of this today or
> tomorrow.  It doesn't look bad at all.

Factoring out sha3 to lib/ and use it as standalone and in crypto api
doesn't seem hard, yep. I also proposed this to Daniel offlist.

Bye,
Hannes

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Andy Lutomirski @ 2016-12-23 16:42 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Hannes Frederic Sowa, Alexei Starovoitov, Jason A. Donenfeld,
	kernel-hardening@lists.openwall.com, Theodore Ts'o, Netdev,
	LKML, Linux Crypto Mailing List, David Laight, Eric Dumazet,
	Linus Torvalds, Eric Biggers, Tom Herbert, Andi Kleen,
	David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <CALCETrVgnnbstPoPf-0bZ_ySDydXNCp5dLAEvhAD9cApozFsng@mail.gmail.com>

On Fri, Dec 23, 2016 at 8:23 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Dec 23, 2016 at 3:59 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 12/23/2016 11:59 AM, Hannes Frederic Sowa wrote:
>>>
>>> On Fri, 2016-12-23 at 11:04 +0100, Daniel Borkmann wrote:
>>>>
>>>> On 12/22/2016 05:59 PM, Hannes Frederic Sowa wrote:
>>>>>
>>>>> On Thu, 2016-12-22 at 08:07 -0800, Andy Lutomirski wrote:
>>
>> [...]
>>
>>>>> The hashing is not a proper sha1 neither, unfortunately. I think that
>>>>> is why it will have a custom implementation in iproute2?
>>>>
>>>>
>>>> Still trying to catch up on this admittedly bit confusing thread. I
>>>> did run automated tests over couple of days comparing the data I got
>>>> from fdinfo with the one from af_alg and found no mismatch on the test
>>>> cases varying from min to max possible program sizes. In the process
>>>> of testing, as you might have seen on netdev, I found couple of other
>>>> bugs in bpf code along the way and fixed them up as well. So my question,
>>>> do you or Andy or anyone participating in claiming this have any
>>>> concrete data or test cases that suggests something different? If yes,
>>>> I'm very curious to hear about it and willing fix it up, of course.
>>>> When I'm back from pto I'll prep and cook up my test suite to be
>>>> included into the selftests/bpf/, should have done this initially,
>>>> sorry about that. I'll also post something to expose the alg, that
>>>> sounds fine to me.
>>>
>>>
>>> Looking into your code closer, I noticed that you indeed seem to do the
>>> finalization of sha-1 by hand by aligning and padding the buffer
>>> accordingly and also patching in the necessary payload length.
>>>
>>> Apologies for my side for claiming that this is not correct sha1
>>> output, I was only looking at sha_transform and its implementation and
>>> couldn't see the padding and finalization round with embedding the data
>>> length in there and hadn't thought of it being done manually.
>>>
>>> Anyway, is it difficult to get the sha finalization into some common
>>> code library? It is not very bpf specific and crypto code reviewers
>>> won't find it there at all.
>>
>>
>> Yes, sure, I'll rework it that way (early next year when I'm back if
>> that's fine with you).
>
> Can we make it SHA-256 before 4.10 comes out, though?  This really
> looks like it will be used in situations where collisions matter and
> it will be exposed to malicious programs, and SHA-1 should not be used
> for new designs for this purpose because it simply isn't long enough.
>
> Also, a SHA-1 digest isn't a pile of u32s, so u32 digest[...] is very
> misleading.  That should be u8 or, at the very least, __be32.
>
> I realize that there isn't a sha-256 implementation in lib, but would
> it really be so bad to make the bpf digest only work (for now) when
> crypto is enabled?  I would *love* to see the crypto core learn how to
> export simple primitives for direct use without needing the whole
> crypto core, and this doesn't seem particularly hard to do, but I
> don't think that's 4.10 material.

I'm going to try to send out RFC patches for all of this today or
tomorrow.  It doesn't look bad at all.

--Andy

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Andy Lutomirski @ 2016-12-23 16:23 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Hannes Frederic Sowa, Alexei Starovoitov, Jason A. Donenfeld,
	kernel-hardening@lists.openwall.com, Theodore Ts'o, Netdev,
	LKML, Linux Crypto Mailing List, David Laight, Eric Dumazet,
	Linus Torvalds, Eric Biggers, Tom Herbert, Andi Kleen,
	David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <585D11BF.60903@iogearbox.net>

On Fri, Dec 23, 2016 at 3:59 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 12/23/2016 11:59 AM, Hannes Frederic Sowa wrote:
>>
>> On Fri, 2016-12-23 at 11:04 +0100, Daniel Borkmann wrote:
>>>
>>> On 12/22/2016 05:59 PM, Hannes Frederic Sowa wrote:
>>>>
>>>> On Thu, 2016-12-22 at 08:07 -0800, Andy Lutomirski wrote:
>
> [...]
>
>>>> The hashing is not a proper sha1 neither, unfortunately. I think that
>>>> is why it will have a custom implementation in iproute2?
>>>
>>>
>>> Still trying to catch up on this admittedly bit confusing thread. I
>>> did run automated tests over couple of days comparing the data I got
>>> from fdinfo with the one from af_alg and found no mismatch on the test
>>> cases varying from min to max possible program sizes. In the process
>>> of testing, as you might have seen on netdev, I found couple of other
>>> bugs in bpf code along the way and fixed them up as well. So my question,
>>> do you or Andy or anyone participating in claiming this have any
>>> concrete data or test cases that suggests something different? If yes,
>>> I'm very curious to hear about it and willing fix it up, of course.
>>> When I'm back from pto I'll prep and cook up my test suite to be
>>> included into the selftests/bpf/, should have done this initially,
>>> sorry about that. I'll also post something to expose the alg, that
>>> sounds fine to me.
>>
>>
>> Looking into your code closer, I noticed that you indeed seem to do the
>> finalization of sha-1 by hand by aligning and padding the buffer
>> accordingly and also patching in the necessary payload length.
>>
>> Apologies for my side for claiming that this is not correct sha1
>> output, I was only looking at sha_transform and its implementation and
>> couldn't see the padding and finalization round with embedding the data
>> length in there and hadn't thought of it being done manually.
>>
>> Anyway, is it difficult to get the sha finalization into some common
>> code library? It is not very bpf specific and crypto code reviewers
>> won't find it there at all.
>
>
> Yes, sure, I'll rework it that way (early next year when I'm back if
> that's fine with you).

Can we make it SHA-256 before 4.10 comes out, though?  This really
looks like it will be used in situations where collisions matter and
it will be exposed to malicious programs, and SHA-1 should not be used
for new designs for this purpose because it simply isn't long enough.

Also, a SHA-1 digest isn't a pile of u32s, so u32 digest[...] is very
misleading.  That should be u8 or, at the very least, __be32.

I realize that there isn't a sha-256 implementation in lib, but would
it really be so bad to make the bpf digest only work (for now) when
crypto is enabled?  I would *love* to see the crypto core learn how to
export simple primitives for direct use without needing the whole
crypto core, and this doesn't seem particularly hard to do, but I
don't think that's 4.10 material.

--Andy

^ permalink raw reply

* Re: George's crazy full state idea (Re: HalfSipHash Acceptable Usage)
From: Hannes Frederic Sowa @ 2016-12-23 12:05 UTC (permalink / raw)
  To: George Spelvin, luto
  Cc: ak, davem, David.Laight, djb, ebiggers3, eric.dumazet, Jason,
	jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, netdev, tom, torvalds, tytso, vegard.nossum
In-Reply-To: <20161223000716.5768.qmail@ns.sciencehorizons.net>

On Thu, 2016-12-22 at 19:07 -0500, George Spelvin wrote:
> Hannes Frederic Sowa wrote:
> > A lockdep test should still be done. ;)
> 
> Adding might_lock() annotations will improve coverage a lot.

Might be hard to find the correct lock we take later down the code
path, but if that is possible, certainly.

> > Yes, that does look nice indeed. Accounting for bits instead of bytes
> > shouldn't be a huge problem either. Maybe it gets a bit more verbose in
> > case you can't satisfy a request with one batched entropy block and have
> > to consume randomness from two.
> 
> The bit granularity is also for the callers' convenience, so they don't
> have to mask again.  Whether get_random_bits rounds up to byte boundaries
> internally or not is something else.
> 
> When the current batch runs low, I was actually thinking of throwing
> away the remaining bits and computing a new batch of 512.  But it's
> whatever works best at implementation time.
> 
> > > > It could only mix the output back in every two calls, in which case
> > > > you can backtrack up to one call but you need to do 2^128 work to
> > > > backtrack farther.  But yes, this is getting excessively complicated.
> > > No, if you're willing to accept limited backtrack, this is a perfectly
> > > acceptable solution, and not too complicated.  You could do it phase-less
> > > if you like; store the previous output, then after generating the new
> > > one, mix in both.  Then overwrite the previous output.  (But doing two
> > > rounds of a crypto primtive to avoid one conditional jump is stupid,
> > > so forget that.)
> > Can you quickly explain why we lose the backtracking capability?
> 
> Sure.  An RNG is (state[i], output[i]) = f(state[i-1]).  The goal of
> backtracking is to compute output[i], or better yet state[i-1], given
> state[i].
> 
> For example, consider an OFB or CTR mode generator.  The state is a key
> and and IV, and you encrypt the IV with the key to produce output, then
> either replace the IV with the output, or increment it.  Either way,
> since you still have the key, you can invert the transformation and
> recover the previous IV.
> 
> The standard way around this is to use the Davies-Meyer construction:
> 
> IV[i] = IV[i-1] + E(IV[i-1], key)
> 
> This is the standard way to make a non-invertible random function
> out of an invertible random permutation.
> 
> From the sum, there's no easy way to find the ciphertext *or* the
> plaintext that was encrypted.  Assuming the encryption is secure,
> the only way to reverse it is brute force: guess IV[i-1] and run the
> operation forward to see if the resultant IV[i] matches.
> 
> There are a variety of ways to organize this computation, since the
> guess gives toy both IV[i-1] and E(IV[i-1], key) = IV[i] - IV[i-1], including
> running E forward, backward, or starting from both ends to see if you
> meet in the middle.
> 
> The way you add the encryption output to the IV is not very important.
> It can be addition, xor, or some more complex invertible transformation.
> In the case of SipHash, the "encryption" output is smaller than the
> input, so we have to get a bit more creative, but it's still basically
> the same thing.
> 
> The problem is that the output which is combined with the IV is too small.
> With only 64 bits, trying all possible values is practical.  (The world's
> Bitcoin miners are collectively computing SHA-256(SHA-256(input)) 1.7 * 2^64
> times per second.)
> 
> By basically doing two iterations at once and mixing in 128 bits of
> output, the guessing attack is rendered impractical.  The only downside
> is that you need to remember and store one result between when it's
> computed and last used.  This is part of the state, so an attack can
> find output[i-1], but not anything farther back.

Thanks a lot for the explanation!

> > ChaCha as a block cipher gives a "perfect" permutation from the output
> > of either the CRNG or the CPRNG, which actually itself has backtracking
> > protection.
> 
> I'm not quite understanding.  The /dev/random implementation uses some
> of the ChaCha output as a new ChaCha key (that's another way to mix output
> back into the state) to prevent backtracking.  But this slows it down, and
> again if you want to be efficient, you're generating and storing large batches
> of entropy and storing it in the RNG state.

I was actually referring to the anti-backtrack protection in
/dev/random and also /dev/urandom, from where we reseed every 300
seconds and if our batched entropy runs low with Ted's/Jason's current
patch for get_random_int.

As far as I can understand it, backtracking is not a problem in case of
a reseed event inside extract_crng.

When we hit the chacha20 without doing a reseed we only mutate the
state of chacha, but being an invertible function in its own, a
proposal would be to mix parts of the chacha20 output back into the
state, which, as a result, would cause slowdown because we couldn't
propagate the complete output of the cipher back to the caller (looking
at the function _extract_crng).

Or are you referring that the anti-backtrack protection should happen
in every call from get_random_int?

Thanks,
Hannes

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Daniel Borkmann @ 2016-12-23 11:59 UTC (permalink / raw)
  To: Hannes Frederic Sowa, Andy Lutomirski, Alexei Starovoitov
  Cc: Jason A. Donenfeld, kernel-hardening@lists.openwall.com,
	Theodore Ts'o, Netdev, LKML, Linux Crypto Mailing List,
	David Laight, Eric Dumazet, Linus Torvalds, Eric Biggers,
	Tom Herbert, Andi Kleen, David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <1482490762.3353.2.camel@stressinduktion.org>

On 12/23/2016 11:59 AM, Hannes Frederic Sowa wrote:
> On Fri, 2016-12-23 at 11:04 +0100, Daniel Borkmann wrote:
>> On 12/22/2016 05:59 PM, Hannes Frederic Sowa wrote:
>>> On Thu, 2016-12-22 at 08:07 -0800, Andy Lutomirski wrote:
[...]
>>> The hashing is not a proper sha1 neither, unfortunately. I think that
>>> is why it will have a custom implementation in iproute2?
>>
>> Still trying to catch up on this admittedly bit confusing thread. I
>> did run automated tests over couple of days comparing the data I got
>> from fdinfo with the one from af_alg and found no mismatch on the test
>> cases varying from min to max possible program sizes. In the process
>> of testing, as you might have seen on netdev, I found couple of other
>> bugs in bpf code along the way and fixed them up as well. So my question,
>> do you or Andy or anyone participating in claiming this have any
>> concrete data or test cases that suggests something different? If yes,
>> I'm very curious to hear about it and willing fix it up, of course.
>> When I'm back from pto I'll prep and cook up my test suite to be
>> included into the selftests/bpf/, should have done this initially,
>> sorry about that. I'll also post something to expose the alg, that
>> sounds fine to me.
>
> Looking into your code closer, I noticed that you indeed seem to do the
> finalization of sha-1 by hand by aligning and padding the buffer
> accordingly and also patching in the necessary payload length.
>
> Apologies for my side for claiming that this is not correct sha1
> output, I was only looking at sha_transform and its implementation and
> couldn't see the padding and finalization round with embedding the data
> length in there and hadn't thought of it being done manually.
>
> Anyway, is it difficult to get the sha finalization into some common
> code library? It is not very bpf specific and crypto code reviewers
> won't find it there at all.

Yes, sure, I'll rework it that way (early next year when I'm back if
that's fine with you).

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH v2 11/12] crypto: atmel-authenc: add support to authenc(hmac(shaX), Y(aes)) modes
From: Stephan Müller @ 2016-12-23 11:34 UTC (permalink / raw)
  To: Cyrille Pitchen
  Cc: herbert, nicolas.ferre, linux-kernel, linux-crypto, davem,
	linux-arm-kernel
In-Reply-To: <2a6cc4637621d2ef0d84754817128c43795cb022.1482424395.git.cyrille.pitchen@atmel.com>

Am Donnerstag, 22. Dezember 2016, 17:38:00 CET schrieb Cyrille Pitchen:

Hi Cyrille,

> This patchs allows to combine the AES and SHA hardware accelerators on
> some Atmel SoCs. Doing so, AES blocks are only written to/read from the
> AES hardware. Those blocks are also transferred from the AES to the SHA
> accelerator internally, without additionnal accesses to the system busses.
> 
> Hence, the AES and SHA accelerators work in parallel to process all the
> data blocks, instead of serializing the process by (de)crypting those
> blocks first then authenticating them after like the generic
> crypto/authenc.c driver does.
> 
> Of course, both the AES and SHA hardware accelerators need to be available
> before we can start to process the data blocks. Hence we use their crypto
> request queue to synchronize both drivers.
> 
> Signed-off-by: Cyrille Pitchen <cyrille.pitchen@atmel.com>
> ---
>  drivers/crypto/Kconfig          |  12 +
>  drivers/crypto/atmel-aes-regs.h |  16 ++
>  drivers/crypto/atmel-aes.c      | 471
> +++++++++++++++++++++++++++++++++++++++- drivers/crypto/atmel-authenc.h  | 
> 64 ++++++
>  drivers/crypto/atmel-sha-regs.h |  14 ++
>  drivers/crypto/atmel-sha.c      | 344 +++++++++++++++++++++++++++--
>  6 files changed, 906 insertions(+), 15 deletions(-)
>  create mode 100644 drivers/crypto/atmel-authenc.h
> 
> diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
> index 79564785ae30..719a868d8ea1 100644
> --- a/drivers/crypto/Kconfig
> +++ b/drivers/crypto/Kconfig
> @@ -415,6 +415,18 @@ config CRYPTO_DEV_BFIN_CRC
>  	  Newer Blackfin processors have CRC hardware. Select this if you
>  	  want to use the Blackfin CRC module.
> 
> +config CRYPTO_DEV_ATMEL_AUTHENC
> +	tristate "Support for Atmel IPSEC/SSL hw accelerator"
> +	depends on (ARCH_AT91 && HAS_DMA) || COMPILE_TEST
> +	select CRYPTO_AUTHENC
> +	select CRYPTO_DEV_ATMEL_AES
> +	select CRYPTO_DEV_ATMEL_SHA
> +	help
> +	  Some Atmel processors can combine the AES and SHA hw accelerators
> +	  to enhance support of IPSEC/SSL.
> +	  Select this if you want to use the Atmel modules for
> +	  authenc(hmac(shaX),Y(cbc)) algorithms.
> +
>  config CRYPTO_DEV_ATMEL_AES
>  	tristate "Support for Atmel AES hw accelerator"
>  	depends on HAS_DMA
> diff --git a/drivers/crypto/atmel-aes-regs.h
> b/drivers/crypto/atmel-aes-regs.h index 0ec04407b533..7694679802b3 100644
> --- a/drivers/crypto/atmel-aes-regs.h
> +++ b/drivers/crypto/atmel-aes-regs.h
> @@ -68,6 +68,22 @@
>  #define AES_CTRR	0x98
>  #define AES_GCMHR(x)	(0x9c + ((x) * 0x04))
> 
> +#define AES_EMR		0xb0
> +#define AES_EMR_APEN		BIT(0)	/* Auto Padding Enable */
> +#define AES_EMR_APM		BIT(1)	/* Auto Padding Mode */
> +#define AES_EMR_APM_IPSEC	0x0
> +#define AES_EMR_APM_SSL		BIT(1)
> +#define AES_EMR_PLIPEN		BIT(4)	/* PLIP Enable */
> +#define AES_EMR_PLIPD		BIT(5)	/* PLIP Decipher */
> +#define AES_EMR_PADLEN_MASK	(0xFu << 8)
> +#define AES_EMR_PADLEN_OFFSET	8
> +#define AES_EMR_PADLEN(padlen)	(((padlen) << AES_EMR_PADLEN_OFFSET) &\
> +				 AES_EMR_PADLEN_MASK)
> +#define AES_EMR_NHEAD_MASK	(0xFu << 16)
> +#define AES_EMR_NHEAD_OFFSET	16
> +#define AES_EMR_NHEAD(nhead)	(((nhead) << AES_EMR_NHEAD_OFFSET) &\
> +				 AES_EMR_NHEAD_MASK)
> +
>  #define AES_TWR(x)	(0xc0 + ((x) * 0x04))
>  #define AES_ALPHAR(x)	(0xd0 + ((x) * 0x04))
> 
> diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c
> index 9fd2f63b8bc0..3c651e0c3113 100644
> --- a/drivers/crypto/atmel-aes.c
> +++ b/drivers/crypto/atmel-aes.c
> @@ -41,6 +41,7 @@
>  #include <linux/platform_data/crypto-atmel.h>
>  #include <dt-bindings/dma/at91.h>
>  #include "atmel-aes-regs.h"
> +#include "atmel-authenc.h"
> 
>  #define ATMEL_AES_PRIORITY	300
> 
> @@ -78,6 +79,7 @@
>  #define AES_FLAGS_INIT		BIT(2)
>  #define AES_FLAGS_BUSY		BIT(3)
>  #define AES_FLAGS_DUMP_REG	BIT(4)
> +#define AES_FLAGS_OWN_SHA	BIT(5)
> 
>  #define AES_FLAGS_PERSISTENT	(AES_FLAGS_INIT | AES_FLAGS_BUSY)
> 
> @@ -92,6 +94,7 @@ struct atmel_aes_caps {
>  	bool			has_ctr32;
>  	bool			has_gcm;
>  	bool			has_xts;
> +	bool			has_authenc;
>  	u32			max_burst_size;
>  };
> 
> @@ -144,10 +147,31 @@ struct atmel_aes_xts_ctx {
>  	u32			key2[AES_KEYSIZE_256 / sizeof(u32)];
>  };
> 
> +#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
> +struct atmel_aes_authenc_ctx {
> +	struct atmel_aes_base_ctx	base;
> +	struct atmel_sha_authenc_ctx	*auth;
> +};
> +#endif
> +
>  struct atmel_aes_reqctx {
>  	unsigned long		mode;
>  };
> 
> +#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
> +struct atmel_aes_authenc_reqctx {
> +	struct atmel_aes_reqctx	base;
> +
> +	struct scatterlist	src[2];
> +	struct scatterlist	dst[2];
> +	size_t			textlen;
> +	u32			digest[SHA512_DIGEST_SIZE / sizeof(u32)];
> +
> +	/* auth_req MUST be place last. */
> +	struct ahash_request	auth_req;
> +};
> +#endif
> +
>  struct atmel_aes_dma {
>  	struct dma_chan		*chan;
>  	struct scatterlist	*sg;
> @@ -291,6 +315,9 @@ static const char *atmel_aes_reg_name(u32 offset, char
> *tmp, size_t sz) snprintf(tmp, sz, "GCMHR[%u]", (offset - AES_GCMHR(0)) >>
> 2);
>  		break;
> 
> +	case AES_EMR:
> +		return "EMR";
> +
>  	case AES_TWR(0):
>  	case AES_TWR(1):
>  	case AES_TWR(2):
> @@ -463,8 +490,16 @@ static inline bool atmel_aes_is_encrypt(const struct
> atmel_aes_dev *dd) return (dd->flags & AES_FLAGS_ENCRYPT);
>  }
> 
> +#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
> +static void atmel_aes_authenc_complete(struct atmel_aes_dev *dd, int err);
> +#endif
> +
>  static inline int atmel_aes_complete(struct atmel_aes_dev *dd, int err)
>  {
> +#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
> +	atmel_aes_authenc_complete(dd, err);
> +#endif
> +
>  	clk_disable(dd->iclk);
>  	dd->flags &= ~AES_FLAGS_BUSY;
> 
> @@ -1931,6 +1966,407 @@ static struct crypto_alg aes_xts_alg = {
>  	}
>  };
> 
> +#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
> +/* authenc aead functions */
> +
> +static int atmel_aes_authenc_start(struct atmel_aes_dev *dd);
> +static int atmel_aes_authenc_init(struct atmel_aes_dev *dd, int err,
> +				  bool is_async);
> +static int atmel_aes_authenc_transfer(struct atmel_aes_dev *dd, int err,
> +				      bool is_async);
> +static int atmel_aes_authenc_digest(struct atmel_aes_dev *dd);
> +static int atmel_aes_authenc_final(struct atmel_aes_dev *dd, int err,
> +				   bool is_async);
> +
> +static void atmel_aes_authenc_complete(struct atmel_aes_dev *dd, int err)
> +{
> +	struct aead_request *req = aead_request_cast(dd->areq);
> +	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
> +
> +	if (err && (dd->flags & AES_FLAGS_OWN_SHA))
> +		atmel_sha_authenc_abort(&rctx->auth_req);
> +	dd->flags &= ~AES_FLAGS_OWN_SHA;
> +}
> +
> +static int atmel_aes_authenc_copy_assoc(struct aead_request *req)
> +{
> +	size_t buflen, assoclen = req->assoclen;
> +	off_t skip = 0;
> +	u8 buf[256];
> +
> +	while (assoclen) {
> +		buflen = min_t(size_t, assoclen, sizeof(buf));
> +
> +		if (sg_pcopy_to_buffer(req->src, sg_nents(req->src),
> +				       buf, buflen, skip) != buflen)
> +			return -EINVAL;
> +
> +		if (sg_pcopy_from_buffer(req->dst, sg_nents(req->dst),
> +					 buf, buflen, skip) != buflen)
> +			return -EINVAL;
> +
> +		skip += buflen;
> +		assoclen -= buflen;
> +	}

This seems to be a very expansive operation. Wouldn't it be easier, leaner and 
with one less memcpy to use the approach of crypto_authenc_copy_assoc?

Instead of copying crypto_authenc_copy_assoc, what about carving the logic in 
crypto/authenc.c out into a generic aead helper code as we need to add that to 
other AEAD implementations?
> +
> +	return 0;
> +}
> +
> +static int atmel_aes_authenc_start(struct atmel_aes_dev *dd)
> +{
> +	struct aead_request *req = aead_request_cast(dd->areq);
> +	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
> +	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
> +	struct atmel_aes_authenc_ctx *ctx = crypto_aead_ctx(tfm);
> +	int err;
> +
> +	atmel_aes_set_mode(dd, &rctx->base);
> +
> +	err = atmel_aes_hw_init(dd);
> +	if (err)
> +		return atmel_aes_complete(dd, err);
> +
> +	return atmel_sha_authenc_schedule(&rctx->auth_req, ctx->auth,
> +					  atmel_aes_authenc_init, dd);
> +}
> +
> +static int atmel_aes_authenc_init(struct atmel_aes_dev *dd, int err,
> +				  bool is_async)
> +{
> +	struct aead_request *req = aead_request_cast(dd->areq);
> +	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
> +
> +	if (is_async)
> +		dd->is_async = true;
> +	if (err)
> +		return atmel_aes_complete(dd, err);
> +
> +	/* If here, we've got the ownership of the SHA device. */
> +	dd->flags |= AES_FLAGS_OWN_SHA;
> +
> +	/* Configure the SHA device. */
> +	return atmel_sha_authenc_init(&rctx->auth_req,
> +				      req->src, req->assoclen,
> +				      rctx->textlen,
> +				      atmel_aes_authenc_transfer, dd);
> +}
> +
> +static int atmel_aes_authenc_transfer(struct atmel_aes_dev *dd, int err,
> +				      bool is_async)
> +{
> +	struct aead_request *req = aead_request_cast(dd->areq);
> +	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
> +	bool enc = atmel_aes_is_encrypt(dd);
> +	struct scatterlist *src, *dst;
> +	u32 iv[AES_BLOCK_SIZE / sizeof(u32)];
> +	u32 emr;
> +
> +	if (is_async)
> +		dd->is_async = true;
> +	if (err)
> +		return atmel_aes_complete(dd, err);
> +
> +	/* Prepare src and dst scatter-lists to transfer cipher/plain texts. */
> +	src = scatterwalk_ffwd(rctx->src, req->src, req->assoclen);
> +	dst = src;
> +
> +	if (req->src != req->dst) {
> +		err = atmel_aes_authenc_copy_assoc(req);
> +		if (err)
> +			return atmel_aes_complete(dd, err);
> +
> +		dst = scatterwalk_ffwd(rctx->dst, req->dst, req->assoclen);
> +	}
> +
> +	/* Configure the AES device. */
> +	memcpy(iv, req->iv, sizeof(iv));
> +
> +	/*
> +	 * Here we always set the 2nd parameter of atmel_aes_write_ctrl() to
> +	 * 'true' even if the data transfer is actually performed by the CPU (so
> +	 * not by the DMA) because we must force the AES_MR_SMOD bitfield to the
> +	 * value AES_MR_SMOD_IDATAR0. Indeed, both AES_MR_SMOD and SHA_MR_SMOD
> +	 * must be set to *_MR_SMOD_IDATAR0.
> +	 */
> +	atmel_aes_write_ctrl(dd, true, iv);
> +	emr = AES_EMR_PLIPEN;
> +	if (!enc)
> +		emr |= AES_EMR_PLIPD;
> +	atmel_aes_write(dd, AES_EMR, emr);
> +
> +	/* Transfer data. */
> +	return atmel_aes_dma_start(dd, src, dst, rctx->textlen,
> +				   atmel_aes_authenc_digest);
> +}
> +
> +static int atmel_aes_authenc_digest(struct atmel_aes_dev *dd)
> +{
> +	struct aead_request *req = aead_request_cast(dd->areq);
> +	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
> +
> +	/* atmel_sha_authenc_final() releases the SHA device. */
> +	dd->flags &= ~AES_FLAGS_OWN_SHA;
> +	return atmel_sha_authenc_final(&rctx->auth_req,
> +				       rctx->digest, sizeof(rctx->digest),
> +				       atmel_aes_authenc_final, dd);
> +}
> +
> +static int atmel_aes_authenc_final(struct atmel_aes_dev *dd, int err,
> +				   bool is_async)
> +{
> +	struct aead_request *req = aead_request_cast(dd->areq);
> +	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
> +	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
> +	bool enc = atmel_aes_is_encrypt(dd);
> +	u32 idigest[SHA512_DIGEST_SIZE / sizeof(u32)], *odigest = rctx->digest;
> +	u32 offs, authsize;
> +
> +	if (is_async)
> +		dd->is_async = true;
> +	if (err)
> +		goto complete;
> +
> +	offs = req->assoclen + rctx->textlen;
> +	authsize = crypto_aead_authsize(tfm);
> +	if (enc) {
> +		scatterwalk_map_and_copy(odigest, req->dst, offs, authsize, 1);
> +	} else {
> +		scatterwalk_map_and_copy(idigest, req->src, offs, authsize, 0);
> +		if (crypto_memneq(idigest, odigest, authsize))
> +			err = -EBADMSG;
> +	}
> +
> +complete:
> +	return atmel_aes_complete(dd, err);
> +}
> +
> +static int atmel_aes_authenc_setkey(struct crypto_aead *tfm, const u8 *key,
> +				    unsigned int keylen)
> +{
> +	struct atmel_aes_authenc_ctx *ctx = crypto_aead_ctx(tfm);
> +	struct crypto_authenc_keys keys;
> +	u32 flags;
> +	int err;
> +
> +	if (crypto_authenc_extractkeys(&keys, key, keylen) != 0)
> +		goto badkey;
> +
> +	if (keys.enckeylen > sizeof(ctx->base.key))
> +		goto badkey;
> +
> +	/* Save auth key. */
> +	flags = crypto_aead_get_flags(tfm);
> +	err = atmel_sha_authenc_setkey(ctx->auth,
> +				       keys.authkey, keys.authkeylen,
> +				       &flags);
> +	crypto_aead_set_flags(tfm, flags & CRYPTO_TFM_RES_MASK);
> +	if (err)
> +		return err;
> +
> +	/* Save enc key. */
> +	ctx->base.keylen = keys.enckeylen;
> +	memcpy(ctx->base.key, keys.enckey, keys.enckeylen);

memzero_explicit(keys) please
> +
> +	return 0;
> +
> +badkey:
> +	crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
> +	return -EINVAL;
> +}
> +
> +static int atmel_aes_authenc_init_tfm(struct crypto_aead *tfm,
> +				      unsigned long auth_mode)
> +{
> +	struct atmel_aes_authenc_ctx *ctx = crypto_aead_ctx(tfm);
> +	unsigned int auth_reqsize = atmel_sha_authenc_get_reqsize();
> +
> +	ctx->auth = atmel_sha_authenc_spawn(auth_mode);
> +	if (IS_ERR(ctx->auth))
> +		return PTR_ERR(ctx->auth);
> +
> +	crypto_aead_set_reqsize(tfm, (sizeof(struct atmel_aes_authenc_reqctx) +
> +				      auth_reqsize));
> +	ctx->base.start = atmel_aes_authenc_start;
> +
> +	return 0;
> +}
> +
> +static int atmel_aes_authenc_hmac_sha1_init_tfm(struct crypto_aead *tfm)
> +{
> +	return atmel_aes_authenc_init_tfm(tfm, SHA_FLAGS_HMAC_SHA1);
> +}
> +
> +static int atmel_aes_authenc_hmac_sha224_init_tfm(struct crypto_aead *tfm)
> +{
> +	return atmel_aes_authenc_init_tfm(tfm, SHA_FLAGS_HMAC_SHA224);
> +}
> +
> +static int atmel_aes_authenc_hmac_sha256_init_tfm(struct crypto_aead *tfm)
> +{
> +	return atmel_aes_authenc_init_tfm(tfm, SHA_FLAGS_HMAC_SHA256);
> +}
> +
> +static int atmel_aes_authenc_hmac_sha384_init_tfm(struct crypto_aead *tfm)
> +{
> +	return atmel_aes_authenc_init_tfm(tfm, SHA_FLAGS_HMAC_SHA384);
> +}
> +
> +static int atmel_aes_authenc_hmac_sha512_init_tfm(struct crypto_aead *tfm)
> +{
> +	return atmel_aes_authenc_init_tfm(tfm, SHA_FLAGS_HMAC_SHA512);
> +}
> +
> +static void atmel_aes_authenc_exit_tfm(struct crypto_aead *tfm)
> +{
> +	struct atmel_aes_authenc_ctx *ctx = crypto_aead_ctx(tfm);
> +
> +	atmel_sha_authenc_free(ctx->auth);
> +}
> +
> +static int atmel_aes_authenc_crypt(struct aead_request *req,
> +				   unsigned long mode)
> +{
> +	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
> +	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
> +	struct atmel_aes_base_ctx *ctx = crypto_aead_ctx(tfm);
> +	u32 authsize = crypto_aead_authsize(tfm);
> +	bool enc = (mode & AES_FLAGS_ENCRYPT);
> +	struct atmel_aes_dev *dd;
> +
> +	/* Compute text length. */
> +	rctx->textlen = req->cryptlen - (enc ? 0 : authsize);

Is there somewhere a check that authsize is always < req->cryptlen (at least 
it escaped me)? Note, this logic will be exposed to user space which may do 
funky things.
> +
> +	/*
> +	 * Currently, empty messages are not supported yet:
> +	 * the SHA auto-padding can be used only on non-empty messages.
> +	 * Hence a special case needs to be implemented for empty message.
> +	 */
> +	if (!rctx->textlen && !req->assoclen)
> +		return -EINVAL;
> +
> +	rctx->base.mode = mode;
> +	ctx->block_size = AES_BLOCK_SIZE;
> +
> +	dd = atmel_aes_find_dev(ctx);
> +	if (!dd)
> +		return -ENODEV;
> +
> +	return atmel_aes_handle_queue(dd, &req->base);

Ciao
Stephan

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Hannes Frederic Sowa @ 2016-12-23 10:59 UTC (permalink / raw)
  To: Daniel Borkmann, Andy Lutomirski, Alexei Starovoitov
  Cc: Jason A. Donenfeld, kernel-hardening@lists.openwall.com,
	Theodore Ts'o, Netdev, LKML, Linux Crypto Mailing List,
	David Laight, Eric Dumazet, Linus Torvalds, Eric Biggers,
	Tom Herbert, Andi Kleen, David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <585CF6A3.1050107@iogearbox.net>

On Fri, 2016-12-23 at 11:04 +0100, Daniel Borkmann wrote:
> On 12/22/2016 05:59 PM, Hannes Frederic Sowa wrote:
> > On Thu, 2016-12-22 at 08:07 -0800, Andy Lutomirski wrote:
> > > On Thu, Dec 22, 2016 at 7:51 AM, Hannes Frederic Sowa
> > > <hannes@stressinduktion.org> wrote:
> > > > On Thu, 2016-12-22 at 16:41 +0100, Jason A. Donenfeld wrote:
> > > > > On Thu, Dec 22, 2016 at 4:33 PM, Hannes Frederic Sowa
> > > > > <hannes@stressinduktion.org> wrote:
> > > > > > IPv6 you cannot touch anymore. The hashing algorithm is part of uAPI.
> > > > > > You don't want to give people new IPv6 addresses with the same stable
> > > > > > secret (across reboots) after a kernel upgrade. Maybe they lose
> > > > > > connectivity then and it is extra work?
> > > > > 
> > > > > Ahh, too bad. So it goes.
> > > > 
> > > > If no other users survive we can put it into the ipv6 module.
> > > > 
> > > > > > The bpf hash stuff can be changed during this merge window, as it is
> > > > > > not yet in a released kernel. Albeit I would probably have preferred
> > > > > > something like sha256 here, which can be easily replicated by user
> > > > > > space tools (minus the problem of patching out references to not
> > > > > > hashable data, which must be zeroed).
> > > > > 
> > > > > Oh, interesting, so time is of the essence then. Do you want to handle
> > > > > changing the new eBPF code to something not-SHA1 before it's too late,
> > > > > as part of a ne
> > > 
> > > w patchset that can fast track itself to David? And
> > > > > then I can preserve my large series for the next merge window.
> > > > 
> > > > This algorithm should be a non-seeded algorithm, because the hashes
> > > > should be stable and verifiable by user space tooling. Thus this would
> > > > need a hashing algorithm that is hardened against pre-image
> > > > attacks/collision resistance, which siphash is not. I would prefer some
> > > > higher order SHA algorithm for that actually.
> > > > 
> > > 
> > > You mean:
> > > 
> > > commit 7bd509e311f408f7a5132fcdde2069af65fa05ae
> > > Author: Daniel Borkmann <daniel@iogearbox.net>
> > > Date:   Sun Dec 4 23:19:41 2016 +0100
> > > 
> > >      bpf: add prog_digest and expose it via fdinfo/netlink
> > > 
> > > 
> > > Yes, please!  This actually matters for security -- imagine a
> > > malicious program brute-forcing a collision so that it gets loaded
> > > wrong.  And this is IMO a use case for SHA-256 or SHA-512/256
> > > (preferably the latter).  Speed basically doesn't matter here and
> > > Blake2 is both less stable (didn't they slightly change it recently?)
> > > and much less well studied.
> > 
> > We don't prevent ebpf programs being loaded based on the digest but
> > just to uniquely identify loaded programs from user space and match up
> > with their source.
> > 
> > There have been talks about signing bpf programs, thus this would
> > probably need another digest algorithm additionally to that one,
> > wasting probably instructions. Probably going somewhere in direction of
> > PKCS#7 might be the thing to do (which leads to the problem to make
> > PKCS#7 attackable by ordinary unpriv users, hmpf).
> > 
> > > My inclination would have been to seed them with something that isn't
> > > exposed to userspace for the precise reason that it would prevent user
> > > code from making assumptions about what's in the hash.  But if there's
> > > a use case for why user code needs to be able to calculate the hash on
> > > its own, then that's fine.  But perhaps the actual fdinfo string
> > > should be "sha256:abcd1234..." to give some flexibility down the road.
> > > 
> > > Also:
> > > 
> > > +       result = (__force __be32 *)fp->digest;
> > > +       for (i = 0; i < SHA_DIGEST_WORDS; i++)
> > > +               result[i] = cpu_to_be32(fp->digest[i]);
> > > 
> > > Everyone, please, please, please don't open-code crypto primitives.
> > > Is this and the code above it even correct?  It might be but on a very
> > > brief glance it looks wrong to me.  If you're doing this to avoid
> > > depending on crypto, then fix crypto so you can pull in the algorithm
> > > without pulling in the whole crypto core.
> > 
> > The hashing is not a proper sha1 neither, unfortunately. I think that
> > is why it will have a custom implementation in iproute2?
> 
> Still trying to catch up on this admittedly bit confusing thread. I
> did run automated tests over couple of days comparing the data I got
> from fdinfo with the one from af_alg and found no mismatch on the test
> cases varying from min to max possible program sizes. In the process
> of testing, as you might have seen on netdev, I found couple of other
> bugs in bpf code along the way and fixed them up as well. So my question,
> do you or Andy or anyone participating in claiming this have any
> concrete data or test cases that suggests something different? If yes,
> I'm very curious to hear about it and willing fix it up, of course.
> When I'm back from pto I'll prep and cook up my test suite to be
> included into the selftests/bpf/, should have done this initially,
> sorry about that. I'll also post something to expose the alg, that
> sounds fine to me.

Looking into your code closer, I noticed that you indeed seem to do the
finalization of sha-1 by hand by aligning and padding the buffer
accordingly and also patching in the necessary payload length.

Apologies for my side for claiming that this is not correct sha1
output, I was only looking at sha_transform and its implementation and
couldn't see the padding and finalization round with embedding the data
length in there and hadn't thought of it being done manually.

Anyway, is it difficult to get the sha finalization into some common
code library? It is not very bpf specific and crypto code reviewers
won't find it there at all.

Thanks,
Hannes

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Daniel Borkmann @ 2016-12-23 10:23 UTC (permalink / raw)
  To: Andy Lutomirski, Hannes Frederic Sowa
  Cc: Alexei Starovoitov, Jason A. Donenfeld,
	kernel-hardening@lists.openwall.com, Theodore Ts'o, Netdev,
	LKML, Linux Crypto Mailing List, David Laight, Eric Dumazet,
	Linus Torvalds, Eric Biggers, Tom Herbert, Andi Kleen,
	David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <CALCETrW_T1v4qKPJDs5dXwAnAit3M52AWMH-K+GJLb1WoLMuRQ@mail.gmail.com>

On 12/22/2016 06:25 PM, Andy Lutomirski wrote:
> On Thu, Dec 22, 2016 at 8:59 AM, Hannes Frederic Sowa
[...]
>> I wondered if bpf program loading should have used the module loading
>> infrastructure from the beginning...
>
> That would be way too complicated and would be nasty for the unprivileged cases.

Also, there are users be it privileged or not that don't require to have a full
obj loader from user space, but are fine with just hard-coding parts or all of
the insns in their application. Back then we settled with using fds based on
Andy's suggestion, it has both ups and downs as we saw along the way but worked
okay thus far.

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Daniel Borkmann @ 2016-12-23 10:04 UTC (permalink / raw)
  To: Hannes Frederic Sowa, Andy Lutomirski, Alexei Starovoitov
  Cc: Jason A. Donenfeld, kernel-hardening@lists.openwall.com,
	Theodore Ts'o, Netdev, LKML, Linux Crypto Mailing List,
	David Laight, Eric Dumazet, Linus Torvalds, Eric Biggers,
	Tom Herbert, Andi Kleen, David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <1482425969.2673.5.camel@stressinduktion.org>

On 12/22/2016 05:59 PM, Hannes Frederic Sowa wrote:
> On Thu, 2016-12-22 at 08:07 -0800, Andy Lutomirski wrote:
>> On Thu, Dec 22, 2016 at 7:51 AM, Hannes Frederic Sowa
>> <hannes@stressinduktion.org> wrote:
>>> On Thu, 2016-12-22 at 16:41 +0100, Jason A. Donenfeld wrote:
>>>> On Thu, Dec 22, 2016 at 4:33 PM, Hannes Frederic Sowa
>>>> <hannes@stressinduktion.org> wrote:
>>>>> IPv6 you cannot touch anymore. The hashing algorithm is part of uAPI.
>>>>> You don't want to give people new IPv6 addresses with the same stable
>>>>> secret (across reboots) after a kernel upgrade. Maybe they lose
>>>>> connectivity then and it is extra work?
>>>>
>>>> Ahh, too bad. So it goes.
>>>
>>> If no other users survive we can put it into the ipv6 module.
>>>
>>>>> The bpf hash stuff can be changed during this merge window, as it is
>>>>> not yet in a released kernel. Albeit I would probably have preferred
>>>>> something like sha256 here, which can be easily replicated by user
>>>>> space tools (minus the problem of patching out references to not
>>>>> hashable data, which must be zeroed).
>>>>
>>>> Oh, interesting, so time is of the essence then. Do you want to handle
>>>> changing the new eBPF code to something not-SHA1 before it's too late,
>>>> as part of a ne
>>
>> w patchset that can fast track itself to David? And
>>>> then I can preserve my large series for the next merge window.
>>>
>>> This algorithm should be a non-seeded algorithm, because the hashes
>>> should be stable and verifiable by user space tooling. Thus this would
>>> need a hashing algorithm that is hardened against pre-image
>>> attacks/collision resistance, which siphash is not. I would prefer some
>>> higher order SHA algorithm for that actually.
>>>
>>
>> You mean:
>>
>> commit 7bd509e311f408f7a5132fcdde2069af65fa05ae
>> Author: Daniel Borkmann <daniel@iogearbox.net>
>> Date:   Sun Dec 4 23:19:41 2016 +0100
>>
>>      bpf: add prog_digest and expose it via fdinfo/netlink
>>
>>
>> Yes, please!  This actually matters for security -- imagine a
>> malicious program brute-forcing a collision so that it gets loaded
>> wrong.  And this is IMO a use case for SHA-256 or SHA-512/256
>> (preferably the latter).  Speed basically doesn't matter here and
>> Blake2 is both less stable (didn't they slightly change it recently?)
>> and much less well studied.
>
> We don't prevent ebpf programs being loaded based on the digest but
> just to uniquely identify loaded programs from user space and match up
> with their source.
>
> There have been talks about signing bpf programs, thus this would
> probably need another digest algorithm additionally to that one,
> wasting probably instructions. Probably going somewhere in direction of
> PKCS#7 might be the thing to do (which leads to the problem to make
> PKCS#7 attackable by ordinary unpriv users, hmpf).
>
>> My inclination would have been to seed them with something that isn't
>> exposed to userspace for the precise reason that it would prevent user
>> code from making assumptions about what's in the hash.  But if there's
>> a use case for why user code needs to be able to calculate the hash on
>> its own, then that's fine.  But perhaps the actual fdinfo string
>> should be "sha256:abcd1234..." to give some flexibility down the road.
>>
>> Also:
>>
>> +       result = (__force __be32 *)fp->digest;
>> +       for (i = 0; i < SHA_DIGEST_WORDS; i++)
>> +               result[i] = cpu_to_be32(fp->digest[i]);
>>
>> Everyone, please, please, please don't open-code crypto primitives.
>> Is this and the code above it even correct?  It might be but on a very
>> brief glance it looks wrong to me.  If you're doing this to avoid
>> depending on crypto, then fix crypto so you can pull in the algorithm
>> without pulling in the whole crypto core.
>
> The hashing is not a proper sha1 neither, unfortunately. I think that
> is why it will have a custom implementation in iproute2?

Still trying to catch up on this admittedly bit confusing thread. I
did run automated tests over couple of days comparing the data I got
from fdinfo with the one from af_alg and found no mismatch on the test
cases varying from min to max possible program sizes. In the process
of testing, as you might have seen on netdev, I found couple of other
bugs in bpf code along the way and fixed them up as well. So my question,
do you or Andy or anyone participating in claiming this have any
concrete data or test cases that suggests something different? If yes,
I'm very curious to hear about it and willing fix it up, of course.
When I'm back from pto I'll prep and cook up my test suite to be
included into the selftests/bpf/, should have done this initially,
sorry about that. I'll also post something to expose the alg, that
sounds fine to me.

> I wondered if bpf program loading should have used the module loading
> infrastructure from the beginning...
>
>> At the very least, there should be a separate function that calculates
>> the hash of a buffer and that function should explicitly run itself
>> against test vectors of various lengths to make sure that it's
>> calculating what it claims to be calculating.  And it doesn't look
>> like the userspace code has landed, so, if this thing isn't
>> calculating SHA1 correctly, it's plausible that no one has noticed.
>
> I hope this was known from the beginning, this is not sha1 unfortunately.
>
> But ebpf elf programs also need preprocessing to get rid of some
> embedded load-depending data, so maybe it was considered to be just
> enough?
>
> Bye,
> Hannes
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [RFC PATCH v2] crypto: Add IV generation algorithms
From: Herbert Xu @ 2016-12-23  7:51 UTC (permalink / raw)
  To: Binoy Jayan
  Cc: Milan Broz, Oded, Ofir, David S. Miller, linux-crypto, Mark Brown,
	Arnd Bergmann, Linux kernel mailing list, Alasdair Kergon,
	Mike Snitzer, dm-devel, Shaohua Li, linux-raid, Rajendra
In-Reply-To: <CAHv-k_-EKq2g=Wb+YkPVp9gCXTDEyrxZhQpT5JMSw=WtZ1OC9w@mail.gmail.com>

On Thu, Dec 22, 2016 at 04:25:12PM +0530, Binoy Jayan wrote:
>
> > It doesn't have to live outside of dm-crypt.  You can register
> > these IV generators from there if you really want.
> 
> Sorry, but I didn't understand this part.

What I mean is that moving the IV generators into the crypto API
does not mean the dm-crypt team giving up control over them.  You
could continue to keep them within the dm-crypt code base and
still register them through the normal crypto API mechanism.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: Test AEAD/authenc algorithms from userspace
From: Harsh Jain @ 2016-12-23  5:46 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Stephan Mueller, linux-crypto
In-Reply-To: <20161221085429.GB29501@gondor.apana.org.au>



On 21-12-2016 14:24, Herbert Xu wrote:
> On Mon, Dec 19, 2016 at 04:08:11PM +0530, Harsh Jain wrote:
>> Hi Herbert,
>>
>> TLS default mode of operation is MAC-then-Encrypt for Authenc algos.
>> Currently framework only supports EtM used in IPSec. User space
>> programs like openssl cannot use af-alg interface to encrypt/decrypt
>> in TLS mode.
>> Are we going to support Mac-then-Encrypt mode in future kernel releases?
> If someone finally adds TLS to the kernel then we'll likely do
> something about it.  
Till that time we cannot use crypto authenc type algos with AF-ALG socket interface for TLS or MtE( separation into 2 operation always not possible).  TLS RFC7366 allow users to decide weather to use EtM or MtE in TLS. We can solve this, If we have some way to communicate drivers  to operate in TLS mode like in setsockopt or msghdr of sendmsg.

> Otherwise you can just separate it out into
> two operations via af-alg.
 Always not possible. If openssl has software implementation of Authec( Cipher and hash with 1 algo) it expects same from af-alg engine only then he will override. Its like if Openssl has super set(AES+ SHA256) available it expect same super set in engine(af-alg) for comparison.
The machines with instruction set extensions has authenc implemented in user space like intel aes-ni.

>
> Cheers,

^ permalink raw reply

* Re: George's crazy full state idea (Re: HalfSipHash Acceptable Usage)
From: George Spelvin @ 2016-12-23  0:07 UTC (permalink / raw)
  To: hannes, linux, luto
  Cc: ak, davem, David.Laight, djb, ebiggers3, eric.dumazet, Jason,
	jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, netdev, tom, torvalds, tytso, vegard.nossum
In-Reply-To: <30d9513a-129b-d246-1461-2326130e118f@stressinduktion.org>

Hannes Frederic Sowa wrote:
> A lockdep test should still be done. ;)

Adding might_lock() annotations will improve coverage a lot.

> Yes, that does look nice indeed. Accounting for bits instead of bytes
> shouldn't be a huge problem either. Maybe it gets a bit more verbose in
> case you can't satisfy a request with one batched entropy block and have
> to consume randomness from two.

The bit granularity is also for the callers' convenience, so they don't
have to mask again.  Whether get_random_bits rounds up to byte boundaries
internally or not is something else.

When the current batch runs low, I was actually thinking of throwing
away the remaining bits and computing a new batch of 512.  But it's
whatever works best at implementation time.

>>> It could only mix the output back in every two calls, in which case
>>> you can backtrack up to one call but you need to do 2^128 work to
>>> backtrack farther.  But yes, this is getting excessively complicated.
> 
>> No, if you're willing to accept limited backtrack, this is a perfectly
>> acceptable solution, and not too complicated.  You could do it phase-less
>> if you like; store the previous output, then after generating the new
>> one, mix in both.  Then overwrite the previous output.  (But doing two
>> rounds of a crypto primtive to avoid one conditional jump is stupid,
>> so forget that.)

> Can you quickly explain why we lose the backtracking capability?

Sure.  An RNG is (state[i], output[i]) = f(state[i-1]).  The goal of
backtracking is to compute output[i], or better yet state[i-1], given
state[i].

For example, consider an OFB or CTR mode generator.  The state is a key
and and IV, and you encrypt the IV with the key to produce output, then
either replace the IV with the output, or increment it.  Either way,
since you still have the key, you can invert the transformation and
recover the previous IV.

The standard way around this is to use the Davies-Meyer construction:

IV[i] = IV[i-1] + E(IV[i-1], key)

This is the standard way to make a non-invertible random function
out of an invertible random permutation.

>From the sum, there's no easy way to find the ciphertext *or* the
plaintext that was encrypted.  Assuming the encryption is secure,
the only way to reverse it is brute force: guess IV[i-1] and run the
operation forward to see if the resultant IV[i] matches.

There are a variety of ways to organize this computation, since the
guess gives toy both IV[i-1] and E(IV[i-1], key) = IV[i] - IV[i-1], including
running E forward, backward, or starting from both ends to see if you
meet in the middle.

The way you add the encryption output to the IV is not very important.
It can be addition, xor, or some more complex invertible transformation.
In the case of SipHash, the "encryption" output is smaller than the
input, so we have to get a bit more creative, but it's still basically
the same thing.

The problem is that the output which is combined with the IV is too small.
With only 64 bits, trying all possible values is practical.  (The world's
Bitcoin miners are collectively computing SHA-256(SHA-256(input)) 1.7 * 2^64
times per second.)

By basically doing two iterations at once and mixing in 128 bits of
output, the guessing attack is rendered impractical.  The only downside
is that you need to remember and store one result between when it's
computed and last used.  This is part of the state, so an attack can
find output[i-1], but not anything farther back.

> ChaCha as a block cipher gives a "perfect" permutation from the output
> of either the CRNG or the CPRNG, which actually itself has backtracking
> protection.

I'm not quite understanding.  The /dev/random implementation uses some
of the ChaCha output as a new ChaCha key (that's another way to mix output
back into the state) to prevent backtracking.  But this slows it down, and
again if you want to be efficient, you're generating and storing large batches
of entropy and storing it in the RNG state.

^ permalink raw reply

* Re: George's crazy full state idea (Re: HalfSipHash Acceptable Usage)
From: Hannes Frederic Sowa @ 2016-12-22 21:38 UTC (permalink / raw)
  To: George Spelvin, luto
  Cc: ak, davem, David.Laight, djb, ebiggers3, eric.dumazet, Jason,
	jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, netdev, tom, torvalds, tytso, vegard.nossum
In-Reply-To: <20161222211140.2816.qmail@ns.sciencehorizons.net>

On 22.12.2016 22:11, George Spelvin wrote:
>> I do tend to like Ted's version in which we use batched
>> get_random_bytes() output.  If it's fast enough, it's simpler and lets
>> us get the full strength of a CSPRNG.
> 
> With the ChaCha20 generator, that's fine, although note that this abandons
> anti-backtracking entirely.
> 
> It also takes locks, something the previous get_random_int code
> path avoided.  Do we need to audit the call sites to ensure that's safe?

We have spin_lock_irq* locks on the way. Of course they can hurt in when
contended. The situation should be the same as with get_random_bytes,
callable from every possible situation in the kernel without fear, so
this should also work for get_random_int. A lockdep test should still be
done. ;)

> And there is the issue that the existing callers assume that there's a
> fixed cost per word.  A good half of get_random_long calls are followed by
> "& ~PAGE_MASK" to extract the low 12 bits.  Or "& ((1ul << mmap_rnd_bits)
> - 1)" to extract the low 28.  If we have a buffer we're going to have to
> pay to refill, it would be nice to use less than 8 bytes to satisfy those.
> 
> But that can be a followup patch.  I'm thinking
> 
> unsigned long get_random_bits(unsigned bits)
> 	E.g. get_random_bits(PAGE_SHIFT),
> 	     get_random_bits(mmap_rnd_bits),
> 	u32 imm_rnd = get_random_bits(32)
> 
> unsigned get_random_mod(unsigned modulus)
> 	E.g. get_random_mod(hole) & ~(alignment - 1);
> 	     get_random_mod(port_scan_backoff)
> 	(Althogh probably drivers/s390/scsi/zfcp_fc.c should be changed
> 	to prandom.)
> 
> with, until the audit is completed:
> #define get_random_int() get_random_bits(32)
> #define get_random_long() get_random_bits(BITS_PER_LONG)

Yes, that does look nice indeed. Accounting for bits instead of bytes
shouldn't be a huge problem either. Maybe it gets a bit more verbose in
case you can't satisfy a request with one batched entropy block and have
to consume randomness from two.

>> It could only mix the output back in every two calls, in which case
>> you can backtrack up to one call but you need to do 2^128 work to
>> backtrack farther.  But yes, this is getting excessively complicated.
> 
> No, if you're willing to accept limited backtrack, this is a perfectly
> acceptable solution, and not too complicated.  You could do it phase-less
> if you like; store the previous output, then after generating the new
> one, mix in both.  Then overwrite the previous output.  (But doing two
> rounds of a crypto primtive to avoid one conditional jump is stupid,
> so forget that.)

Can you quickly explain why we lose the backtracking capability?

ChaCha as a block cipher gives a "perfect" permutation from the output
of either the CRNG or the CPRNG, which actually itself has backtracking
protection.

Thanks for explaining,
Hannes

^ permalink raw reply

* Re: George's crazy full state idea (Re: HalfSipHash Acceptable Usage)
From: George Spelvin @ 2016-12-22 21:11 UTC (permalink / raw)
  To: linux, luto
  Cc: ak, davem, David.Laight, djb, ebiggers3, eric.dumazet, hannes,
	Jason, jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, netdev, tom, torvalds, tytso, vegard.nossum
In-Reply-To: <CALCETrW63gL1821jCMvxnm8gR+o5ifdOXKnWHsLnqoFK77-epw@mail.gmail.com>

> I do tend to like Ted's version in which we use batched
> get_random_bytes() output.  If it's fast enough, it's simpler and lets
> us get the full strength of a CSPRNG.

With the ChaCha20 generator, that's fine, although note that this abandons
anti-backtracking entirely.

It also takes locks, something the previous get_random_int code
path avoided.  Do we need to audit the call sites to ensure that's safe?

And there is the issue that the existing callers assume that there's a
fixed cost per word.  A good half of get_random_long calls are followed by
"& ~PAGE_MASK" to extract the low 12 bits.  Or "& ((1ul << mmap_rnd_bits)
- 1)" to extract the low 28.  If we have a buffer we're going to have to
pay to refill, it would be nice to use less than 8 bytes to satisfy those.

But that can be a followup patch.  I'm thinking

unsigned long get_random_bits(unsigned bits)
	E.g. get_random_bits(PAGE_SHIFT),
	     get_random_bits(mmap_rnd_bits),
	u32 imm_rnd = get_random_bits(32)

unsigned get_random_mod(unsigned modulus)
	E.g. get_random_mod(hole) & ~(alignment - 1);
	     get_random_mod(port_scan_backoff)
	(Althogh probably drivers/s390/scsi/zfcp_fc.c should be changed
	to prandom.)

with, until the audit is completed:
#define get_random_int() get_random_bits(32)
#define get_random_long() get_random_bits(BITS_PER_LONG)

> It could only mix the output back in every two calls, in which case
> you can backtrack up to one call but you need to do 2^128 work to
> backtrack farther.  But yes, this is getting excessively complicated.

No, if you're willing to accept limited backtrack, this is a perfectly
acceptable solution, and not too complicated.  You could do it phase-less
if you like; store the previous output, then after generating the new
one, mix in both.  Then overwrite the previous output.  (But doing two
rounds of a crypto primtive to avoid one conditional jump is stupid,
so forget that.)

>> Hmm, interesting.  Although, for ASLR, we could use get_random_bytes()
>> directly and be done with it.  It won't be a bottleneck.

Isn't that what you already suggested?

I don't mind fewer primtives; I got a bit fixated on "Replace MD5 with
SipHash".  It's just the locking that I want to check isn't a problem.

^ permalink raw reply

* Re: [PATCH v2 11/12] crypto: atmel-authenc: add support to authenc(hmac(shaX),Y(aes)) modes
From: kbuild test robot @ 2016-12-22 21:10 UTC (permalink / raw)
  To: Cyrille Pitchen
  Cc: kbuild-all, herbert, davem, nicolas.ferre, linux-crypto,
	linux-kernel, linux-arm-kernel, Cyrille Pitchen
In-Reply-To: <2a6cc4637621d2ef0d84754817128c43795cb022.1482424395.git.cyrille.pitchen@atmel.com>

[-- Attachment #1: Type: text/plain, Size: 18106 bytes --]

Hi Cyrille,

[auto build test WARNING on cryptodev/master]
[also build test WARNING on next-20161222]
[cannot apply to v4.9]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Cyrille-Pitchen/crypto-atmel-authenc-add-support-to-authenc-hmac-shaX-Y-aes-modes/20161223-015820
base:   https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git master
config: alpha-allyesconfig (attached as .config)
compiler: alpha-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=alpha 

All warnings (new ones prefixed by >>):

   In file included from include/linux/printk.h:305:0,
                    from include/linux/kernel.h:13,
                    from drivers/crypto/atmel-sha.c:17:
   drivers/crypto/atmel-sha.c: In function 'atmel_sha_xmit_cpu':
>> drivers/crypto/atmel-sha.c:465:19: warning: format '%d' expects argument of type 'int', but argument 6 has type 'size_t {aka long unsigned int}' [-Wformat=]
     dev_dbg(dd->dev, "xmit_cpu: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
                      ^
   include/linux/dynamic_debug.h:134:39: note: in definition of macro 'dynamic_dev_dbg'
      __dynamic_dev_dbg(&descriptor, dev, fmt, \
                                          ^~~
>> drivers/crypto/atmel-sha.c:465:2: note: in expansion of macro 'dev_dbg'
     dev_dbg(dd->dev, "xmit_cpu: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
     ^~~~~~~
   drivers/crypto/atmel-sha.c: In function 'atmel_sha_xmit_pdc':
   drivers/crypto/atmel-sha.c:494:19: warning: format '%d' expects argument of type 'int', but argument 6 has type 'size_t {aka long unsigned int}' [-Wformat=]
     dev_dbg(dd->dev, "xmit_pdc: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
                      ^
   include/linux/dynamic_debug.h:134:39: note: in definition of macro 'dynamic_dev_dbg'
      __dynamic_dev_dbg(&descriptor, dev, fmt, \
                                          ^~~
   drivers/crypto/atmel-sha.c:494:2: note: in expansion of macro 'dev_dbg'
     dev_dbg(dd->dev, "xmit_pdc: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
     ^~~~~~~
   drivers/crypto/atmel-sha.c: In function 'atmel_sha_xmit_dma':
   drivers/crypto/atmel-sha.c:541:19: warning: format '%d' expects argument of type 'int', but argument 6 has type 'size_t {aka long unsigned int}' [-Wformat=]
     dev_dbg(dd->dev, "xmit_dma: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
                      ^
   include/linux/dynamic_debug.h:134:39: note: in definition of macro 'dynamic_dev_dbg'
      __dynamic_dev_dbg(&descriptor, dev, fmt, \
                                          ^~~
   drivers/crypto/atmel-sha.c:541:2: note: in expansion of macro 'dev_dbg'
     dev_dbg(dd->dev, "xmit_dma: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
     ^~~~~~~
   drivers/crypto/atmel-sha.c: In function 'atmel_sha_xmit_dma_map':
>> drivers/crypto/atmel-sha.c:620:26: warning: format '%u' expects argument of type 'unsigned int', but argument 3 has type 'size_t {aka long unsigned int}' [-Wformat=]
      dev_err(dd->dev, "dma %u bytes error\n", ctx->buflen +
                             ^
   In file included from include/linux/printk.h:305:0,
                    from include/linux/kernel.h:13,
                    from drivers/crypto/atmel-sha.c:17:
   drivers/crypto/atmel-sha.c: In function 'atmel_sha_update_dma_slow':
   drivers/crypto/atmel-sha.c:641:19: warning: format '%u' expects argument of type 'unsigned int', but argument 4 has type 'size_t {aka long unsigned int}' [-Wformat=]
     dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: 0x%llx 0x%llx, final: %d\n",
                      ^
   include/linux/dynamic_debug.h:134:39: note: in definition of macro 'dynamic_dev_dbg'
      __dynamic_dev_dbg(&descriptor, dev, fmt, \
                                          ^~~
   drivers/crypto/atmel-sha.c:641:2: note: in expansion of macro 'dev_dbg'
     dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: 0x%llx 0x%llx, final: %d\n",
     ^~~~~~~
   drivers/crypto/atmel-sha.c: In function 'atmel_sha_update_dma_start':
   drivers/crypto/atmel-sha.c:669:19: warning: format '%u' expects argument of type 'unsigned int', but argument 6 has type 'size_t {aka long unsigned int}' [-Wformat=]
     dev_dbg(dd->dev, "fast: digcnt: 0x%llx 0x%llx, bufcnt: %u, total: %u\n",
                      ^
   include/linux/dynamic_debug.h:134:39: note: in definition of macro 'dynamic_dev_dbg'
      __dynamic_dev_dbg(&descriptor, dev, fmt, \
                                          ^~~
   drivers/crypto/atmel-sha.c:669:2: note: in expansion of macro 'dev_dbg'
     dev_dbg(dd->dev, "fast: digcnt: 0x%llx 0x%llx, bufcnt: %u, total: %u\n",
     ^~~~~~~
   drivers/crypto/atmel-sha.c:711:27: warning: format '%u' expects argument of type 'unsigned int', but argument 3 has type 'size_t {aka long unsigned int}' [-Wformat=]
       dev_err(dd->dev, "dma %u bytes error\n",
                              ^
   In file included from include/linux/printk.h:305:0,
                    from include/linux/kernel.h:13,
                    from drivers/crypto/atmel-sha.c:17:
   drivers/crypto/atmel-sha.c: In function 'atmel_sha_finish':
   drivers/crypto/atmel-sha.c:891:19: warning: format '%d' expects argument of type 'int', but argument 6 has type 'size_t {aka long unsigned int}' [-Wformat=]
     dev_dbg(dd->dev, "digcnt: 0x%llx 0x%llx, bufcnt: %d\n", ctx->digcnt[1],
                      ^
   include/linux/dynamic_debug.h:134:39: note: in definition of macro 'dynamic_dev_dbg'
      __dynamic_dev_dbg(&descriptor, dev, fmt, \
                                          ^~~
   drivers/crypto/atmel-sha.c:891:2: note: in expansion of macro 'dev_dbg'
     dev_dbg(dd->dev, "digcnt: 0x%llx 0x%llx, bufcnt: %d\n", ctx->digcnt[1],
     ^~~~~~~

vim +465 drivers/crypto/atmel-sha.c

ebc82efa Nicolas Royer   2012-07-01  459  			      size_t length, int final)
ebc82efa Nicolas Royer   2012-07-01  460  {
ebc82efa Nicolas Royer   2012-07-01  461  	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
ebc82efa Nicolas Royer   2012-07-01  462  	int count, len32;
ebc82efa Nicolas Royer   2012-07-01  463  	const u32 *buffer = (const u32 *)buf;
ebc82efa Nicolas Royer   2012-07-01  464  
d4905b38 Nicolas Royer   2013-02-20 @465  	dev_dbg(dd->dev, "xmit_cpu: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
d4905b38 Nicolas Royer   2013-02-20  466  		ctx->digcnt[1], ctx->digcnt[0], length, final);
ebc82efa Nicolas Royer   2012-07-01  467  
ebc82efa Nicolas Royer   2012-07-01  468  	atmel_sha_write_ctrl(dd, 0);
ebc82efa Nicolas Royer   2012-07-01  469  
ebc82efa Nicolas Royer   2012-07-01  470  	/* should be non-zero before next lines to disable clocks later */
d4905b38 Nicolas Royer   2013-02-20  471  	ctx->digcnt[0] += length;
d4905b38 Nicolas Royer   2013-02-20  472  	if (ctx->digcnt[0] < length)
d4905b38 Nicolas Royer   2013-02-20  473  		ctx->digcnt[1]++;
ebc82efa Nicolas Royer   2012-07-01  474  
ebc82efa Nicolas Royer   2012-07-01  475  	if (final)
ebc82efa Nicolas Royer   2012-07-01  476  		dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
ebc82efa Nicolas Royer   2012-07-01  477  
ebc82efa Nicolas Royer   2012-07-01  478  	len32 = DIV_ROUND_UP(length, sizeof(u32));
ebc82efa Nicolas Royer   2012-07-01  479  
ebc82efa Nicolas Royer   2012-07-01  480  	dd->flags |= SHA_FLAGS_CPU;
ebc82efa Nicolas Royer   2012-07-01  481  
ebc82efa Nicolas Royer   2012-07-01  482  	for (count = 0; count < len32; count++)
ebc82efa Nicolas Royer   2012-07-01  483  		atmel_sha_write(dd, SHA_REG_DIN(count), buffer[count]);
ebc82efa Nicolas Royer   2012-07-01  484  
ebc82efa Nicolas Royer   2012-07-01  485  	return -EINPROGRESS;
ebc82efa Nicolas Royer   2012-07-01  486  }
ebc82efa Nicolas Royer   2012-07-01  487  
ebc82efa Nicolas Royer   2012-07-01  488  static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
ebc82efa Nicolas Royer   2012-07-01  489  		size_t length1, dma_addr_t dma_addr2, size_t length2, int final)
ebc82efa Nicolas Royer   2012-07-01  490  {
ebc82efa Nicolas Royer   2012-07-01  491  	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
ebc82efa Nicolas Royer   2012-07-01  492  	int len32;
ebc82efa Nicolas Royer   2012-07-01  493  
d4905b38 Nicolas Royer   2013-02-20  494  	dev_dbg(dd->dev, "xmit_pdc: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
d4905b38 Nicolas Royer   2013-02-20  495  		ctx->digcnt[1], ctx->digcnt[0], length1, final);
ebc82efa Nicolas Royer   2012-07-01  496  
ebc82efa Nicolas Royer   2012-07-01  497  	len32 = DIV_ROUND_UP(length1, sizeof(u32));
ebc82efa Nicolas Royer   2012-07-01  498  	atmel_sha_write(dd, SHA_PTCR, SHA_PTCR_TXTDIS);
ebc82efa Nicolas Royer   2012-07-01  499  	atmel_sha_write(dd, SHA_TPR, dma_addr1);
ebc82efa Nicolas Royer   2012-07-01  500  	atmel_sha_write(dd, SHA_TCR, len32);
ebc82efa Nicolas Royer   2012-07-01  501  
ebc82efa Nicolas Royer   2012-07-01  502  	len32 = DIV_ROUND_UP(length2, sizeof(u32));
ebc82efa Nicolas Royer   2012-07-01  503  	atmel_sha_write(dd, SHA_TNPR, dma_addr2);
ebc82efa Nicolas Royer   2012-07-01  504  	atmel_sha_write(dd, SHA_TNCR, len32);
ebc82efa Nicolas Royer   2012-07-01  505  
ebc82efa Nicolas Royer   2012-07-01  506  	atmel_sha_write_ctrl(dd, 1);
ebc82efa Nicolas Royer   2012-07-01  507  
ebc82efa Nicolas Royer   2012-07-01  508  	/* should be non-zero before next lines to disable clocks later */
d4905b38 Nicolas Royer   2013-02-20  509  	ctx->digcnt[0] += length1;
d4905b38 Nicolas Royer   2013-02-20  510  	if (ctx->digcnt[0] < length1)
d4905b38 Nicolas Royer   2013-02-20  511  		ctx->digcnt[1]++;
ebc82efa Nicolas Royer   2012-07-01  512  
ebc82efa Nicolas Royer   2012-07-01  513  	if (final)
ebc82efa Nicolas Royer   2012-07-01  514  		dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
ebc82efa Nicolas Royer   2012-07-01  515  
ebc82efa Nicolas Royer   2012-07-01  516  	dd->flags |=  SHA_FLAGS_DMA_ACTIVE;
ebc82efa Nicolas Royer   2012-07-01  517  
ebc82efa Nicolas Royer   2012-07-01  518  	/* Start DMA transfer */
ebc82efa Nicolas Royer   2012-07-01  519  	atmel_sha_write(dd, SHA_PTCR, SHA_PTCR_TXTEN);
ebc82efa Nicolas Royer   2012-07-01  520  
ebc82efa Nicolas Royer   2012-07-01  521  	return -EINPROGRESS;
ebc82efa Nicolas Royer   2012-07-01  522  }
ebc82efa Nicolas Royer   2012-07-01  523  
d4905b38 Nicolas Royer   2013-02-20  524  static void atmel_sha_dma_callback(void *data)
d4905b38 Nicolas Royer   2013-02-20  525  {
d4905b38 Nicolas Royer   2013-02-20  526  	struct atmel_sha_dev *dd = data;
d4905b38 Nicolas Royer   2013-02-20  527  
b48b114c Cyrille Pitchen 2016-12-22  528  	dd->is_async = true;
b48b114c Cyrille Pitchen 2016-12-22  529  
d4905b38 Nicolas Royer   2013-02-20  530  	/* dma_lch_in - completed - wait DATRDY */
d4905b38 Nicolas Royer   2013-02-20  531  	atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY);
d4905b38 Nicolas Royer   2013-02-20  532  }
d4905b38 Nicolas Royer   2013-02-20  533  
d4905b38 Nicolas Royer   2013-02-20  534  static int atmel_sha_xmit_dma(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
d4905b38 Nicolas Royer   2013-02-20  535  		size_t length1, dma_addr_t dma_addr2, size_t length2, int final)
d4905b38 Nicolas Royer   2013-02-20  536  {
d4905b38 Nicolas Royer   2013-02-20  537  	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
d4905b38 Nicolas Royer   2013-02-20  538  	struct dma_async_tx_descriptor	*in_desc;
d4905b38 Nicolas Royer   2013-02-20  539  	struct scatterlist sg[2];
d4905b38 Nicolas Royer   2013-02-20  540  
d4905b38 Nicolas Royer   2013-02-20 @541  	dev_dbg(dd->dev, "xmit_dma: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
d4905b38 Nicolas Royer   2013-02-20  542  		ctx->digcnt[1], ctx->digcnt[0], length1, final);
d4905b38 Nicolas Royer   2013-02-20  543  
d4905b38 Nicolas Royer   2013-02-20  544  	dd->dma_lch_in.dma_conf.src_maxburst = 16;
d4905b38 Nicolas Royer   2013-02-20  545  	dd->dma_lch_in.dma_conf.dst_maxburst = 16;
d4905b38 Nicolas Royer   2013-02-20  546  
d4905b38 Nicolas Royer   2013-02-20  547  	dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
d4905b38 Nicolas Royer   2013-02-20  548  
d4905b38 Nicolas Royer   2013-02-20  549  	if (length2) {
d4905b38 Nicolas Royer   2013-02-20  550  		sg_init_table(sg, 2);
d4905b38 Nicolas Royer   2013-02-20  551  		sg_dma_address(&sg[0]) = dma_addr1;
d4905b38 Nicolas Royer   2013-02-20  552  		sg_dma_len(&sg[0]) = length1;
d4905b38 Nicolas Royer   2013-02-20  553  		sg_dma_address(&sg[1]) = dma_addr2;
d4905b38 Nicolas Royer   2013-02-20  554  		sg_dma_len(&sg[1]) = length2;
d4905b38 Nicolas Royer   2013-02-20  555  		in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, sg, 2,
d4905b38 Nicolas Royer   2013-02-20  556  			DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
d4905b38 Nicolas Royer   2013-02-20  557  	} else {
d4905b38 Nicolas Royer   2013-02-20  558  		sg_init_table(sg, 1);
d4905b38 Nicolas Royer   2013-02-20  559  		sg_dma_address(&sg[0]) = dma_addr1;
d4905b38 Nicolas Royer   2013-02-20  560  		sg_dma_len(&sg[0]) = length1;
d4905b38 Nicolas Royer   2013-02-20  561  		in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, sg, 1,
d4905b38 Nicolas Royer   2013-02-20  562  			DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
d4905b38 Nicolas Royer   2013-02-20  563  	}
d4905b38 Nicolas Royer   2013-02-20  564  	if (!in_desc)
b48b114c Cyrille Pitchen 2016-12-22  565  		atmel_sha_complete(dd, -EINVAL);
d4905b38 Nicolas Royer   2013-02-20  566  
d4905b38 Nicolas Royer   2013-02-20  567  	in_desc->callback = atmel_sha_dma_callback;
d4905b38 Nicolas Royer   2013-02-20  568  	in_desc->callback_param = dd;
d4905b38 Nicolas Royer   2013-02-20  569  
d4905b38 Nicolas Royer   2013-02-20  570  	atmel_sha_write_ctrl(dd, 1);
d4905b38 Nicolas Royer   2013-02-20  571  
d4905b38 Nicolas Royer   2013-02-20  572  	/* should be non-zero before next lines to disable clocks later */
d4905b38 Nicolas Royer   2013-02-20  573  	ctx->digcnt[0] += length1;
d4905b38 Nicolas Royer   2013-02-20  574  	if (ctx->digcnt[0] < length1)
d4905b38 Nicolas Royer   2013-02-20  575  		ctx->digcnt[1]++;
d4905b38 Nicolas Royer   2013-02-20  576  
d4905b38 Nicolas Royer   2013-02-20  577  	if (final)
d4905b38 Nicolas Royer   2013-02-20  578  		dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
d4905b38 Nicolas Royer   2013-02-20  579  
d4905b38 Nicolas Royer   2013-02-20  580  	dd->flags |=  SHA_FLAGS_DMA_ACTIVE;
d4905b38 Nicolas Royer   2013-02-20  581  
d4905b38 Nicolas Royer   2013-02-20  582  	/* Start DMA transfer */
d4905b38 Nicolas Royer   2013-02-20  583  	dmaengine_submit(in_desc);
d4905b38 Nicolas Royer   2013-02-20  584  	dma_async_issue_pending(dd->dma_lch_in.chan);
d4905b38 Nicolas Royer   2013-02-20  585  
d4905b38 Nicolas Royer   2013-02-20  586  	return -EINPROGRESS;
d4905b38 Nicolas Royer   2013-02-20  587  }
d4905b38 Nicolas Royer   2013-02-20  588  
d4905b38 Nicolas Royer   2013-02-20  589  static int atmel_sha_xmit_start(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
d4905b38 Nicolas Royer   2013-02-20  590  		size_t length1, dma_addr_t dma_addr2, size_t length2, int final)
d4905b38 Nicolas Royer   2013-02-20  591  {
d4905b38 Nicolas Royer   2013-02-20  592  	if (dd->caps.has_dma)
d4905b38 Nicolas Royer   2013-02-20  593  		return atmel_sha_xmit_dma(dd, dma_addr1, length1,
d4905b38 Nicolas Royer   2013-02-20  594  				dma_addr2, length2, final);
d4905b38 Nicolas Royer   2013-02-20  595  	else
d4905b38 Nicolas Royer   2013-02-20  596  		return atmel_sha_xmit_pdc(dd, dma_addr1, length1,
d4905b38 Nicolas Royer   2013-02-20  597  				dma_addr2, length2, final);
d4905b38 Nicolas Royer   2013-02-20  598  }
d4905b38 Nicolas Royer   2013-02-20  599  
ebc82efa Nicolas Royer   2012-07-01  600  static int atmel_sha_update_cpu(struct atmel_sha_dev *dd)
ebc82efa Nicolas Royer   2012-07-01  601  {
ebc82efa Nicolas Royer   2012-07-01  602  	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
ebc82efa Nicolas Royer   2012-07-01  603  	int bufcnt;
ebc82efa Nicolas Royer   2012-07-01  604  
ebc82efa Nicolas Royer   2012-07-01  605  	atmel_sha_append_sg(ctx);
ebc82efa Nicolas Royer   2012-07-01  606  	atmel_sha_fill_padding(ctx, 0);
ebc82efa Nicolas Royer   2012-07-01  607  	bufcnt = ctx->bufcnt;
ebc82efa Nicolas Royer   2012-07-01  608  	ctx->bufcnt = 0;
ebc82efa Nicolas Royer   2012-07-01  609  
ebc82efa Nicolas Royer   2012-07-01  610  	return atmel_sha_xmit_cpu(dd, ctx->buffer, bufcnt, 1);
ebc82efa Nicolas Royer   2012-07-01  611  }
ebc82efa Nicolas Royer   2012-07-01  612  
ebc82efa Nicolas Royer   2012-07-01  613  static int atmel_sha_xmit_dma_map(struct atmel_sha_dev *dd,
ebc82efa Nicolas Royer   2012-07-01  614  					struct atmel_sha_reqctx *ctx,
ebc82efa Nicolas Royer   2012-07-01  615  					size_t length, int final)
ebc82efa Nicolas Royer   2012-07-01  616  {
ebc82efa Nicolas Royer   2012-07-01  617  	ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer,
d4905b38 Nicolas Royer   2013-02-20  618  				ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
ebc82efa Nicolas Royer   2012-07-01  619  	if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
ebc82efa Nicolas Royer   2012-07-01 @620  		dev_err(dd->dev, "dma %u bytes error\n", ctx->buflen +
d4905b38 Nicolas Royer   2013-02-20  621  				ctx->block_size);
b48b114c Cyrille Pitchen 2016-12-22  622  		atmel_sha_complete(dd, -EINVAL);
ebc82efa Nicolas Royer   2012-07-01  623  	}

:::::: The code at line 465 was first introduced by commit
:::::: d4905b38d1f6b60761a6fd16f45ebd1fac8b6e1f crypto: atmel-sha - add support for latest release of the IP (0x410)

:::::: TO: Nicolas Royer <nicolas@eukrea.com>
:::::: CC: Herbert Xu <herbert@gondor.apana.org.au>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 47848 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox