Linux cryptographic layer development
 help / color / mirror / Atom feed
* [PATCH RESEND] crypto: gf128mul - remove dead gf128mul_64k_lle code
From: Alex Cope @ 2016-11-09  1:16 UTC (permalink / raw)
  To: linux-crypto; +Cc: Alex Cope

This code is unlikely to be useful in the future because transforms
don't know how often keys will be changed, new algorithms are unlikely
to use lle representation, and tables should be replaced with
carryless multiplication instructions when available.

Signed-off-by: Alex Cope <alexcope@google.com>
---
 crypto/gf128mul.c         | 55 -----------------------------------------------
 include/crypto/gf128mul.h | 13 ++++++-----
 2 files changed, 6 insertions(+), 62 deletions(-)

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index 5276607..57c85dd 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -263,48 +263,6 @@ EXPORT_SYMBOL(gf128mul_bbe);
  * t[1][BYTE] contains g*x^8*BYTE
  *  ..
  * t[15][BYTE] contains g*x^120*BYTE */
-struct gf128mul_64k *gf128mul_init_64k_lle(const be128 *g)
-{
-	struct gf128mul_64k *t;
-	int i, j, k;
-
-	t = kzalloc(sizeof(*t), GFP_KERNEL);
-	if (!t)
-		goto out;
-
-	for (i = 0; i < 16; i++) {
-		t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
-		if (!t->t[i]) {
-			gf128mul_free_64k(t);
-			t = NULL;
-			goto out;
-		}
-	}
-
-	t->t[0]->t[128] = *g;
-	for (j = 64; j > 0; j >>= 1)
-		gf128mul_x_lle(&t->t[0]->t[j], &t->t[0]->t[j + j]);
-
-	for (i = 0;;) {
-		for (j = 2; j < 256; j += j)
-			for (k = 1; k < j; ++k)
-				be128_xor(&t->t[i]->t[j + k],
-					  &t->t[i]->t[j], &t->t[i]->t[k]);
-
-		if (++i >= 16)
-			break;
-
-		for (j = 128; j > 0; j >>= 1) {
-			t->t[i]->t[j] = t->t[i - 1]->t[j];
-			gf128mul_x8_lle(&t->t[i]->t[j]);
-		}
-	}
-
-out:
-	return t;
-}
-EXPORT_SYMBOL(gf128mul_init_64k_lle);
-
 struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
 {
 	struct gf128mul_64k *t;
@@ -357,19 +315,6 @@ void gf128mul_free_64k(struct gf128mul_64k *t)
 }
 EXPORT_SYMBOL(gf128mul_free_64k);
 
-void gf128mul_64k_lle(be128 *a, struct gf128mul_64k *t)
-{
-	u8 *ap = (u8 *)a;
-	be128 r[1];
-	int i;
-
-	*r = t->t[0]->t[ap[0]];
-	for (i = 1; i < 16; ++i)
-		be128_xor(r, r, &t->t[i]->t[ap[i]]);
-	*a = *r;
-}
-EXPORT_SYMBOL(gf128mul_64k_lle);
-
 void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t)
 {
 	u8 *ap = (u8 *)a;
diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h
index da2530e..b611aa9 100644
--- a/include/crypto/gf128mul.h
+++ b/include/crypto/gf128mul.h
@@ -181,20 +181,19 @@ static inline void gf128mul_free_4k(struct gf128mul_4k *t)
 }
 
 
-/* 64k table optimization, implemented for lle and bbe */
+/* 64k table optimization, implemented for bbe */
 
 struct gf128mul_64k {
 	struct gf128mul_4k *t[16];
 };
 
-/* first initialize with the constant factor with which you
- * want to multiply and then call gf128_64k_lle with the other
- * factor in the first argument, the table in the second and a
- * scratch register in the third. Afterwards *a = *r. */
-struct gf128mul_64k *gf128mul_init_64k_lle(const be128 *g);
+/* First initialize with the constant factor with which you
+ * want to multiply and then call gf128mul_64k_bbe with the other
+ * factor in the first argument, and the table in the second.
+ * Afterwards, the result is stored in *a.
+ */
 struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g);
 void gf128mul_free_64k(struct gf128mul_64k *t);
-void gf128mul_64k_lle(be128 *a, struct gf128mul_64k *t);
 void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t);
 
 #endif /* _CRYPTO_GF128MUL_H */
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH] crypto: dh - Consistenly return negative error codes
From: Mat Martineau @ 2016-11-08 23:48 UTC (permalink / raw)
  To: linux-crypto, herbert; +Cc: Mat Martineau, salvatore.benedetto

Fix the single instance where a positive EINVAL was returned.

Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 crypto/dh.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/dh.c b/crypto/dh.c
index 9d19360..ddcb528 100644
--- a/crypto/dh.c
+++ b/crypto/dh.c
@@ -118,7 +118,7 @@ static int dh_compute_value(struct kpp_request *req)
 	if (req->src) {
 		base = mpi_read_raw_from_sgl(req->src, req->src_len);
 		if (!base) {
-			ret = EINVAL;
+			ret = -EINVAL;
 			goto err_free_val;
 		}
 	} else {
-- 
2.10.2

^ permalink raw reply related

* Re: [PATCH v4] poly1305: generic C can be faster on chips with slow unaligned access
From: Eric Biggers @ 2016-11-08 17:26 UTC (permalink / raw)
  To: Martin Willi
  Cc: Jason A. Donenfeld, Herbert Xu, David S. Miller, linux-crypto,
	linux-kernel, René van Dorst
In-Reply-To: <1478591559.5216.7.camel@strongswan.org>

On Tue, Nov 08, 2016 at 08:52:39AM +0100, Martin Willi wrote:
> 
> 
> Not sure what the exact alignment rules for key/iv are, but maybe we
> want to replace the same function in chacha20_generic.c as well?
> 
> Martin

chacha20-generic provides a blkcipher API and sets an alignmask of sizeof(u32)
- 1.  This applies to the key buffer for ->setkey() and to the mapped addresses
for the input/output buffers and IV during the blkcipher walk.  So it does not
appear to have the problems that poly1305 has.  

I do however see one small problem which is that
'u8 stream[CHACHA20_BLOCK_SIZE];' is, strictly speaking, not guaranteed to be
aligned to u32.

Eric

^ permalink raw reply

* Re: [PATCH 6/6] Add support for AEAD algos.
From: Harsh Jain @ 2016-11-08 14:21 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: dan.carpenter, herbert, linux-crypto, jlulla, atul.gupta,
	yeshaswi, hariprasad
In-Reply-To: <46208469.tzXHy1sfql@positron.chronox.de>



On 08-11-2016 18:29, Stephan Mueller wrote:
> Am Dienstag, 8. November 2016, 17:16:38 CET schrieb Harsh Jain:
>
> Hi Harsh,
>
>> On 08-11-2016 16:45, Stephan Mueller wrote:
>>> Am Donnerstag, 27. Oktober 2016, 15:36:08 CET schrieb Harsh Jain:
>>>
>>> Hi Harsh,
>>>
>>>>>> +static void chcr_verify_tag(struct aead_request *req, u8 *input, int
>>>>>> *err)
>>>>>> +{
>>>>>> +	u8 temp[SHA512_DIGEST_SIZE];
>>>>>> +	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
>>>>>> +	int authsize = crypto_aead_authsize(tfm);
>>>>>> +	struct cpl_fw6_pld *fw6_pld;
>>>>>> +	int cmp = 0;
>>>>>> +
>>>>>> +	fw6_pld = (struct cpl_fw6_pld *)input;
>>>>>> +	if ((get_aead_subtype(tfm) == CRYPTO_ALG_SUB_TYPE_AEAD_RFC4106) ||
>>>>>> +	    (get_aead_subtype(tfm) == CRYPTO_ALG_SUB_TYPE_AEAD_GCM)) {
>>>>>> +		cmp = memcmp(&fw6_pld->data[2], (fw6_pld + 1), authsize);
>>>>>> +	} else {
>>>>>> +
>>>>>> +		sg_pcopy_to_buffer(req->src, sg_nents(req->src), temp,
>>>>>> +				authsize, req->assoclen +
>>>>>> +				req->cryptlen - authsize);
>>>>> I am wondering whether the math is correct here in any case. It is
>>>>> permissible that we have an AAD size of 0 and even a zero-sized
>>>>> ciphertext. How is such scenario covered here?
>>>> Here we are trying to copy user supplied tag to local buffer(temp) for
>>>> decrypt operation only. relative index of tag in src sg list will not
>>>> change when AAD is zero and in decrypt operation cryptlen > authsize.
>>> I am just wondering where this is checked. Since all of these
>>> implementations are directly accessible from unprivileged user space, we
>>> should be careful.
>> chcr_verify_tag() will be called when req->verify is set to "VERIFY_SW", 
>> same will set in decrypt callback function of Algo(like chcr_aead_decrypt)
>> only. It will ensure calling of chcr_verify_tag() in de-crypt operation
>> only.
> I think that limiting to the decryption path may not be enough. What happens 
> if a caller sets some assoclen, but when invoking the decryption operation it 
> provides input data that is smaller than the assoclen? The API allows this 
> scenario.
If I understand correctly, in this case passed sg list will be smaller. We should return with error -EINVAL at entry point only (like create_gcm_wr), control should not reach to chcr_verify_tag().

>
> Ciao
> Stephan

^ permalink raw reply

* Re: [PATCH 6/6] Add support for AEAD algos.
From: Stephan Mueller @ 2016-11-08 12:59 UTC (permalink / raw)
  To: Harsh Jain
  Cc: dan.carpenter, herbert, linux-crypto, jlulla, atul.gupta,
	yeshaswi, hariprasad
In-Reply-To: <ab295a27-cb83-0ee7-0cf1-03d7c05b20c5@chelsio.com>

Am Dienstag, 8. November 2016, 17:16:38 CET schrieb Harsh Jain:

Hi Harsh,

> On 08-11-2016 16:45, Stephan Mueller wrote:
> > Am Donnerstag, 27. Oktober 2016, 15:36:08 CET schrieb Harsh Jain:
> > 
> > Hi Harsh,
> > 
> >>>> +static void chcr_verify_tag(struct aead_request *req, u8 *input, int
> >>>> *err)
> >>>> +{
> >>>> +	u8 temp[SHA512_DIGEST_SIZE];
> >>>> +	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
> >>>> +	int authsize = crypto_aead_authsize(tfm);
> >>>> +	struct cpl_fw6_pld *fw6_pld;
> >>>> +	int cmp = 0;
> >>>> +
> >>>> +	fw6_pld = (struct cpl_fw6_pld *)input;
> >>>> +	if ((get_aead_subtype(tfm) == CRYPTO_ALG_SUB_TYPE_AEAD_RFC4106) ||
> >>>> +	    (get_aead_subtype(tfm) == CRYPTO_ALG_SUB_TYPE_AEAD_GCM)) {
> >>>> +		cmp = memcmp(&fw6_pld->data[2], (fw6_pld + 1), authsize);
> >>>> +	} else {
> >>>> +
> >>>> +		sg_pcopy_to_buffer(req->src, sg_nents(req->src), temp,
> >>>> +				authsize, req->assoclen +
> >>>> +				req->cryptlen - authsize);
> >>> 
> >>> I am wondering whether the math is correct here in any case. It is
> >>> permissible that we have an AAD size of 0 and even a zero-sized
> >>> ciphertext. How is such scenario covered here?
> >> 
> >> Here we are trying to copy user supplied tag to local buffer(temp) for
> >> decrypt operation only. relative index of tag in src sg list will not
> >> change when AAD is zero and in decrypt operation cryptlen > authsize.
> > 
> > I am just wondering where this is checked. Since all of these
> > implementations are directly accessible from unprivileged user space, we
> > should be careful.
> chcr_verify_tag() will be called when req->verify is set to "VERIFY_SW", 
> same will set in decrypt callback function of Algo(like chcr_aead_decrypt)
> only. It will ensure calling of chcr_verify_tag() in de-crypt operation
> only.

I think that limiting to the decryption path may not be enough. What happens 
if a caller sets some assoclen, but when invoking the decryption operation it 
provides input data that is smaller than the assoclen? The API allows this 
scenario.

Ciao
Stephan

^ permalink raw reply

* Re: [PATCH] crypto: rsa: rename two rsa key files
From: Herbert Xu @ 2016-11-08 12:09 UTC (permalink / raw)
  To: yanjiang.jin; +Cc: davem, linux-kernel, linux-crypto, jinyanjiang
In-Reply-To: <1478240017-23117-1-git-send-email-yanjiang.jin@windriver.com>

yanjiang.jin@windriver.com wrote:
> From: Yanjiang Jin <yanjiang.jin@windriver.com>
> 
> This is to eliminate the below compile error:
> 
> crypto/rsa_helper.c:19:29: fatal error: rsaprivkey-asn1.h: No such file or directory
> #include "rsaprivkey-asn1.h"
>                             ^
> compilation terminated.
> 
> Signed-off-by: Yanjiang Jin <yanjiang.jin@windriver.com>

This patch is bogus.  The header files are meant to be generated.
Please find out why they are not being generated in your case.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH] crypto: gf128mul - remove dead gf128mul_64k_lle code
From: Herbert Xu @ 2016-11-08 12:04 UTC (permalink / raw)
  To: Alex Cope; +Cc: linux-crypto
In-Reply-To: <CA+cSK10Q4aLK+uXtRwWuUR4_vuZZWaVadPpofNpWf9G=QkGSyg@mail.gmail.com>

Alex Cope <alexcope@google.com> wrote:
> This code is unlikely to be useful in the future because transforms
> don't know how often keys will be changed, new algorithms are unlikely
> to use lle representation, and tables should be replaced with
> carryless multiplication instructions when available.
> 
> Signed-off-by: Alex Cope <alexcope@google.com>

Your patch doesn't apply.  Please fix it and resubmit.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH 6/6] Add support for AEAD algos.
From: Harsh Jain @ 2016-11-08 11:46 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: dan.carpenter, herbert, linux-crypto, jlulla, atul.gupta,
	yeshaswi, hariprasad
In-Reply-To: <3617075.SV0YkRBZ96@positron.chronox.de>



On 08-11-2016 16:45, Stephan Mueller wrote:
> Am Donnerstag, 27. Oktober 2016, 15:36:08 CET schrieb Harsh Jain:
>
> Hi Harsh,
>
>>>> +static void chcr_verify_tag(struct aead_request *req, u8 *input, int
>>>> *err)
>>>> +{
>>>> +	u8 temp[SHA512_DIGEST_SIZE];
>>>> +	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
>>>> +	int authsize = crypto_aead_authsize(tfm);
>>>> +	struct cpl_fw6_pld *fw6_pld;
>>>> +	int cmp = 0;
>>>> +
>>>> +	fw6_pld = (struct cpl_fw6_pld *)input;
>>>> +	if ((get_aead_subtype(tfm) == CRYPTO_ALG_SUB_TYPE_AEAD_RFC4106) ||
>>>> +	    (get_aead_subtype(tfm) == CRYPTO_ALG_SUB_TYPE_AEAD_GCM)) {
>>>> +		cmp = memcmp(&fw6_pld->data[2], (fw6_pld + 1), authsize);
>>>> +	} else {
>>>> +
>>>> +		sg_pcopy_to_buffer(req->src, sg_nents(req->src), temp,
>>>> +				authsize, req->assoclen +
>>>> +				req->cryptlen - authsize);
>>> I am wondering whether the math is correct here in any case. It is
>>> permissible that we have an AAD size of 0 and even a zero-sized
>>> ciphertext. How is such scenario covered here?
>> Here we are trying to copy user supplied tag to local buffer(temp) for
>> decrypt operation only. relative index of tag in src sg list will not
>> change when AAD is zero and in decrypt operation cryptlen > authsize.
> I am just wondering where this is checked. Since all of these implementations 
> are directly accessible from unprivileged user space, we should be careful.
chcr_verify_tag() will be called when req->verify is set to "VERIFY_SW",  same will set in decrypt callback function of Algo(like chcr_aead_decrypt) only. It will ensure calling of chcr_verify_tag() in de-crypt operation only.


>
>>>> +		cmp = memcmp(temp, (fw6_pld + 1), authsize);
>>> I would guess in both cases memcmp should be replaced with crypto_memneq
>> Yes can be done
>>
>>>> +	}
>>>> +	if (cmp)
>>>> +		*err = -EBADMSG;
>>>> +	else
>>>> +		*err = 0;
>>> What do you think about memzero_explicit(tmp)?
>> No Idea why we needs explicitly setting of zero for local variable.  Please
>> share some online resources to understand this.
> In dumps, the stack is also produced. Yet I see that stack memory is very 
> volatile and thus will be overwritten soon. Thus my common approach for 
> sensitive data is that heap variables must be zeroized. Stack variables are 
> suggested to be zeroized. As far as I understand the code, temp will hold a 
> copy of the tag value, i.e. a public piece of information. If this is correct, 
> that I concur that a memset may not be needed after all.
Yes, temp contains user supplied tag. We can ignore memset here. I will review the other function weather they need similar memset or not.
>
> Ciao
> Stephan

^ permalink raw reply

* Re: [PATCH 6/6] Add support for AEAD algos.
From: Stephan Mueller @ 2016-11-08 11:15 UTC (permalink / raw)
  To: Harsh Jain
  Cc: dan.carpenter, herbert, linux-crypto, jlulla, atul.gupta,
	yeshaswi, hariprasad
In-Reply-To: <3420937b-51f3-cc6e-8919-d698ba087170@chelsio.com>

Am Donnerstag, 27. Oktober 2016, 15:36:08 CET schrieb Harsh Jain:

Hi Harsh,

> >> +static void chcr_verify_tag(struct aead_request *req, u8 *input, int
> >> *err)
> >> +{
> >> +	u8 temp[SHA512_DIGEST_SIZE];
> >> +	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
> >> +	int authsize = crypto_aead_authsize(tfm);
> >> +	struct cpl_fw6_pld *fw6_pld;
> >> +	int cmp = 0;
> >> +
> >> +	fw6_pld = (struct cpl_fw6_pld *)input;
> >> +	if ((get_aead_subtype(tfm) == CRYPTO_ALG_SUB_TYPE_AEAD_RFC4106) ||
> >> +	    (get_aead_subtype(tfm) == CRYPTO_ALG_SUB_TYPE_AEAD_GCM)) {
> >> +		cmp = memcmp(&fw6_pld->data[2], (fw6_pld + 1), authsize);
> >> +	} else {
> >> +
> >> +		sg_pcopy_to_buffer(req->src, sg_nents(req->src), temp,
> >> +				authsize, req->assoclen +
> >> +				req->cryptlen - authsize);
> > 
> > I am wondering whether the math is correct here in any case. It is
> > permissible that we have an AAD size of 0 and even a zero-sized
> > ciphertext. How is such scenario covered here?
> 
> Here we are trying to copy user supplied tag to local buffer(temp) for
> decrypt operation only. relative index of tag in src sg list will not
> change when AAD is zero and in decrypt operation cryptlen > authsize.

I am just wondering where this is checked. Since all of these implementations 
are directly accessible from unprivileged user space, we should be careful.

> >> +		cmp = memcmp(temp, (fw6_pld + 1), authsize);
> > 
> > I would guess in both cases memcmp should be replaced with crypto_memneq
> 
> Yes can be done
> 
> >> +	}
> >> +	if (cmp)
> >> +		*err = -EBADMSG;
> >> +	else
> >> +		*err = 0;
> > 
> > What do you think about memzero_explicit(tmp)?
> 
> No Idea why we needs explicitly setting of zero for local variable.  Please
> share some online resources to understand this.

In dumps, the stack is also produced. Yet I see that stack memory is very 
volatile and thus will be overwritten soon. Thus my common approach for 
sensitive data is that heap variables must be zeroized. Stack variables are 
suggested to be zeroized. As far as I understand the code, temp will hold a 
copy of the tag value, i.e. a public piece of information. If this is correct, 
that I concur that a memset may not be needed after all.

Ciao
Stephan

^ permalink raw reply

* Re: [PATCH v4] poly1305: generic C can be faster on chips with slow unaligned access
From: Martin Willi @ 2016-11-08  7:52 UTC (permalink / raw)
  To: Jason A. Donenfeld, Herbert Xu, David S. Miller, linux-crypto,
	linux-kernel, Eric Biggers, René van Dorst
In-Reply-To: <20161107194709.20457-1-Jason@zx2c4.com>


> By using the unaligned access helpers, we drastically improve
> performance on small MIPS routers that have to go through the
> exception fix-up handler for these unaligned accesses.

I couldn't measure any slowdown here, so:

Acked-by: Martin Willi <martin@strongswan.org>

> -       dctx->s[0] = le32_to_cpuvp(key +  0);
> +       dctx->s[0] = get_unaligned_le32(key +  0);

Not sure what the exact alignment rules for key/iv are, but maybe we
want to replace the same function in chacha20_generic.c as well?

Martin

^ permalink raw reply

* Re: [PATCH v4] poly1305: generic C can be faster on chips with slow unaligned access
From: Eric Biggers @ 2016-11-07 20:40 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Herbert Xu, David S. Miller, linux-crypto, linux-kernel,
	Martin Willi, René van Dorst
In-Reply-To: <20161107194709.20457-1-Jason@zx2c4.com>

On Mon, Nov 07, 2016 at 08:47:09PM +0100, Jason A. Donenfeld wrote:
> By using the unaligned access helpers, we drastically improve
> performance on small MIPS routers that have to go through the exception
> fix-up handler for these unaligned accesses.
> 
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---

Reviewed-by: Eric Biggers <ebiggers@google.com>

Nit: the subject line is a little unclear about what was changed.
"make generic C faster on chips with slow unaligned access" would be better.

^ permalink raw reply

* [PATCH v4] poly1305: generic C can be faster on chips with slow unaligned access
From: Jason A. Donenfeld @ 2016-11-07 19:47 UTC (permalink / raw)
  To: Herbert Xu, David S. Miller, linux-crypto, linux-kernel,
	Martin Willi, Eric Biggers, René van Dorst
  Cc: Jason A. Donenfeld
In-Reply-To: <20161107191253.17998-1-Jason@zx2c4.com>

By using the unaligned access helpers, we drastically improve
performance on small MIPS routers that have to go through the exception
fix-up handler for these unaligned accesses.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 crypto/poly1305_generic.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
index 2df9835d..b1c2d57 100644
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -17,6 +17,7 @@
 #include <linux/crypto.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <asm/unaligned.h>
 
 static inline u64 mlt(u64 a, u64 b)
 {
@@ -33,11 +34,6 @@ static inline u32 and(u32 v, u32 mask)
 	return v & mask;
 }
 
-static inline u32 le32_to_cpuvp(const void *p)
-{
-	return le32_to_cpup(p);
-}
-
 int crypto_poly1305_init(struct shash_desc *desc)
 {
 	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
@@ -65,19 +61,19 @@ EXPORT_SYMBOL_GPL(crypto_poly1305_setkey);
 static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
 {
 	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
-	dctx->r[0] = (le32_to_cpuvp(key +  0) >> 0) & 0x3ffffff;
-	dctx->r[1] = (le32_to_cpuvp(key +  3) >> 2) & 0x3ffff03;
-	dctx->r[2] = (le32_to_cpuvp(key +  6) >> 4) & 0x3ffc0ff;
-	dctx->r[3] = (le32_to_cpuvp(key +  9) >> 6) & 0x3f03fff;
-	dctx->r[4] = (le32_to_cpuvp(key + 12) >> 8) & 0x00fffff;
+	dctx->r[0] = (get_unaligned_le32(key +  0) >> 0) & 0x3ffffff;
+	dctx->r[1] = (get_unaligned_le32(key +  3) >> 2) & 0x3ffff03;
+	dctx->r[2] = (get_unaligned_le32(key +  6) >> 4) & 0x3ffc0ff;
+	dctx->r[3] = (get_unaligned_le32(key +  9) >> 6) & 0x3f03fff;
+	dctx->r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff;
 }
 
 static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key)
 {
-	dctx->s[0] = le32_to_cpuvp(key +  0);
-	dctx->s[1] = le32_to_cpuvp(key +  4);
-	dctx->s[2] = le32_to_cpuvp(key +  8);
-	dctx->s[3] = le32_to_cpuvp(key + 12);
+	dctx->s[0] = get_unaligned_le32(key +  0);
+	dctx->s[1] = get_unaligned_le32(key +  4);
+	dctx->s[2] = get_unaligned_le32(key +  8);
+	dctx->s[3] = get_unaligned_le32(key + 12);
 }
 
 unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
@@ -137,11 +133,11 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
 	while (likely(srclen >= POLY1305_BLOCK_SIZE)) {
 
 		/* h += m[i] */
-		h0 += (le32_to_cpuvp(src +  0) >> 0) & 0x3ffffff;
-		h1 += (le32_to_cpuvp(src +  3) >> 2) & 0x3ffffff;
-		h2 += (le32_to_cpuvp(src +  6) >> 4) & 0x3ffffff;
-		h3 += (le32_to_cpuvp(src +  9) >> 6) & 0x3ffffff;
-		h4 += (le32_to_cpuvp(src + 12) >> 8) | hibit;
+		h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
+		h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
+		h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
+		h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
+		h4 += (get_unaligned_le32(src + 12) >> 8) | hibit;
 
 		/* h *= r */
 		d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
-- 
2.10.2

^ permalink raw reply related

* [PATCH v3] poly1305: generic C can be faster on chips with slow unaligned access
From: Jason A. Donenfeld @ 2016-11-07 19:43 UTC (permalink / raw)
  To: Herbert Xu, David S. Miller, linux-crypto, linux-kernel,
	Martin Willi, Eric Biggers, René van Dorst
  Cc: Jason A. Donenfeld
In-Reply-To: <20161107191253.17998-1-Jason@zx2c4.com>

By using the unaligned access helpers, we drastically improve
performance on small MIPS routers that have to go through the exception
fix-up handler for these unaligned accesses.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 crypto/poly1305_generic.c | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
index 2df9835d..7a77cfc 100644
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -33,11 +33,6 @@ static inline u32 and(u32 v, u32 mask)
 	return v & mask;
 }
 
-static inline u32 le32_to_cpuvp(const void *p)
-{
-	return le32_to_cpup(p);
-}
-
 int crypto_poly1305_init(struct shash_desc *desc)
 {
 	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
@@ -65,19 +60,19 @@ EXPORT_SYMBOL_GPL(crypto_poly1305_setkey);
 static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
 {
 	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
-	dctx->r[0] = (le32_to_cpuvp(key +  0) >> 0) & 0x3ffffff;
-	dctx->r[1] = (le32_to_cpuvp(key +  3) >> 2) & 0x3ffff03;
-	dctx->r[2] = (le32_to_cpuvp(key +  6) >> 4) & 0x3ffc0ff;
-	dctx->r[3] = (le32_to_cpuvp(key +  9) >> 6) & 0x3f03fff;
-	dctx->r[4] = (le32_to_cpuvp(key + 12) >> 8) & 0x00fffff;
+	dctx->r[0] = (get_unaligned_le32(key +  0) >> 0) & 0x3ffffff;
+	dctx->r[1] = (get_unaligned_le32(key +  3) >> 2) & 0x3ffff03;
+	dctx->r[2] = (get_unaligned_le32(key +  6) >> 4) & 0x3ffc0ff;
+	dctx->r[3] = (get_unaligned_le32(key +  9) >> 6) & 0x3f03fff;
+	dctx->r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff;
 }
 
 static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key)
 {
-	dctx->s[0] = le32_to_cpuvp(key +  0);
-	dctx->s[1] = le32_to_cpuvp(key +  4);
-	dctx->s[2] = le32_to_cpuvp(key +  8);
-	dctx->s[3] = le32_to_cpuvp(key + 12);
+	dctx->s[0] = get_unaligned_le32(key +  0);
+	dctx->s[1] = get_unaligned_le32(key +  4);
+	dctx->s[2] = get_unaligned_le32(key +  8);
+	dctx->s[3] = get_unaligned_le32(key + 12);
 }
 
 unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
@@ -137,11 +132,11 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
 	while (likely(srclen >= POLY1305_BLOCK_SIZE)) {
 
 		/* h += m[i] */
-		h0 += (le32_to_cpuvp(src +  0) >> 0) & 0x3ffffff;
-		h1 += (le32_to_cpuvp(src +  3) >> 2) & 0x3ffffff;
-		h2 += (le32_to_cpuvp(src +  6) >> 4) & 0x3ffffff;
-		h3 += (le32_to_cpuvp(src +  9) >> 6) & 0x3ffffff;
-		h4 += (le32_to_cpuvp(src + 12) >> 8) | hibit;
+		h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
+		h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
+		h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
+		h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
+		h4 += (get_unaligned_le32(src + 12) >> 8) | hibit;
 
 		/* h *= r */
 		d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
-- 
2.10.2

^ permalink raw reply related

* Re: [PATCH] poly1305: generic C can be faster on chips with slow unaligned access
From: Jason A. Donenfeld @ 2016-11-07 19:41 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Herbert Xu, Martin Willi, LKML, linux-crypto, David Miller,
	WireGuard mailing list
In-Reply-To: <20161107192505.GB34388@google.com>

On Mon, Nov 7, 2016 at 8:25 PM, Eric Biggers <ebiggers@google.com> wrote:
> No it does *not* buffer all incoming blocks, which is why the source pointer can
> fall out of alignment.  Yes, I actually tested this.  In fact this situation is
> even hit, in both possible places, in the self-tests.

Urgh! v3 coming right up...

^ permalink raw reply

* Re: [PATCH] poly1305: generic C can be faster on chips with slow unaligned access
From: Eric Biggers @ 2016-11-07 19:25 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Herbert Xu, Martin Willi, LKML, linux-crypto, David Miller,
	WireGuard mailing list
In-Reply-To: <CAHmME9rzBST8Lqapzykwf2rUyxLXuyOF0wX+Sy259Qtx9a4YSg@mail.gmail.com>

On Mon, Nov 07, 2016 at 08:02:35PM +0100, Jason A. Donenfeld wrote:
> On Mon, Nov 7, 2016 at 7:26 PM, Eric Biggers <ebiggers@google.com> wrote:
> >
> > I was not referring to any users in particular, only what users could do.  As an
> > example, if you did crypto_shash_update() with 32, 15, then 17 bytes, and the
> > underlying algorithm is poly1305-generic, the last block would end up
> > misaligned.  This doesn't appear possible with your pseudocode because it only
> > passes in multiples of the block size until the very end.  However I don't see
> > it claimed anywhere that shash API users have to do that.
> 
> Actually it appears that crypto/poly1305_generic.c already buffers
> incoming blocks to a buffer that definitely looks aligned, to prevent
> this condition!
> 

No it does *not* buffer all incoming blocks, which is why the source pointer can
fall out of alignment.  Yes, I actually tested this.  In fact this situation is
even hit, in both possible places, in the self-tests.

Eric

^ permalink raw reply

* [PATCH v2] poly1305: generic C can be faster on chips with slow unaligned access
From: Jason A. Donenfeld @ 2016-11-07 19:12 UTC (permalink / raw)
  To: Herbert Xu, David S. Miller, linux-crypto, linux-kernel,
	Martin Willi, Eric Biggers, René van Dorst
  Cc: Jason A. Donenfeld
In-Reply-To: <20161102175810.18647-1-Jason@zx2c4.com>

By using the unaligned access helpers, we drastically improve
performance on small MIPS routers that have to go through the exception
fix-up handler for these unaligned accesses.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 crypto/poly1305_generic.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
index 2df9835d..0b86f4e 100644
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -66,9 +66,9 @@ static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
 {
 	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
 	dctx->r[0] = (le32_to_cpuvp(key +  0) >> 0) & 0x3ffffff;
-	dctx->r[1] = (le32_to_cpuvp(key +  3) >> 2) & 0x3ffff03;
-	dctx->r[2] = (le32_to_cpuvp(key +  6) >> 4) & 0x3ffc0ff;
-	dctx->r[3] = (le32_to_cpuvp(key +  9) >> 6) & 0x3f03fff;
+	dctx->r[1] = (get_unaligned_le32(key +  3) >> 2) & 0x3ffff03;
+	dctx->r[2] = (get_unaligned_le32(key +  6) >> 4) & 0x3ffc0ff;
+	dctx->r[3] = (get_unaligned_le32(key +  9) >> 6) & 0x3f03fff;
 	dctx->r[4] = (le32_to_cpuvp(key + 12) >> 8) & 0x00fffff;
 }
 
@@ -138,9 +138,9 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
 
 		/* h += m[i] */
 		h0 += (le32_to_cpuvp(src +  0) >> 0) & 0x3ffffff;
-		h1 += (le32_to_cpuvp(src +  3) >> 2) & 0x3ffffff;
-		h2 += (le32_to_cpuvp(src +  6) >> 4) & 0x3ffffff;
-		h3 += (le32_to_cpuvp(src +  9) >> 6) & 0x3ffffff;
+		h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
+		h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
+		h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
 		h4 += (le32_to_cpuvp(src + 12) >> 8) | hibit;
 
 		/* h *= r */
-- 
2.10.2

^ permalink raw reply related

* Re: [PATCH] poly1305: generic C can be faster on chips with slow unaligned access
From: Jason A. Donenfeld @ 2016-11-07 19:02 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Herbert Xu, Martin Willi, LKML, linux-crypto, David Miller,
	WireGuard mailing list
In-Reply-To: <20161107182646.GA34388@google.com>

On Mon, Nov 7, 2016 at 7:26 PM, Eric Biggers <ebiggers@google.com> wrote:
>
> I was not referring to any users in particular, only what users could do.  As an
> example, if you did crypto_shash_update() with 32, 15, then 17 bytes, and the
> underlying algorithm is poly1305-generic, the last block would end up
> misaligned.  This doesn't appear possible with your pseudocode because it only
> passes in multiples of the block size until the very end.  However I don't see
> it claimed anywhere that shash API users have to do that.

Actually it appears that crypto/poly1305_generic.c already buffers
incoming blocks to a buffer that definitely looks aligned, to prevent
this condition!

I'll submit a v2 with only the inner unaligned operations changed.

^ permalink raw reply

* Re: [PATCH] poly1305: generic C can be faster on chips with slow unaligned access
From: Eric Biggers @ 2016-11-07 18:26 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Herbert Xu, Martin Willi, LKML, linux-crypto, David Miller,
	WireGuard mailing list
In-Reply-To: <CAHmME9oejs+USiGJX2P0TgvnR6XRzV1HYZPrq=c-CjGzq59=NQ@mail.gmail.com>

On Mon, Nov 07, 2016 at 07:08:22PM +0100, Jason A. Donenfeld wrote:
> Hmm... The general data flow that strikes me as most pertinent is
> something like:
> 
> struct sk_buff *skb = get_it_from_somewhere();
> skb = skb_share_check(skb, GFP_ATOMIC);
> num_frags = skb_cow_data(skb, ..., ...);
> struct scatterlist sg[num_frags];
> sg_init_table(sg, num_frags);
> skb_to_sgvec(skb, sg, ..., ...);
> blkcipher_walk_init(&walk, sg, sg, len);
> blkcipher_walk_virt_block(&desc, &walk, BLOCK_SIZE);
> while (walk.nbytes >= BLOCK_SIZE) {
>     size_t chunk_len = rounddown(walk.nbytes, BLOCK_SIZE);
>     poly1305_update(&poly1305_state, walk.src.virt.addr, chunk_len);
>     blkcipher_walk_done(&desc, &walk, walk.nbytes % BLOCK_SIZE);
> }
> if (walk.nbytes) {
>     poly1305_update(&poly1305_state, walk.src.virt.addr, walk.nbytes);
>     blkcipher_walk_done(&desc, &walk, 0);
> }
> 
> Is your suggestion that that in the final if block, walk.src.virt.addr
> might be unaligned? Like in the case of the last fragment being 67
> bytes long?

I was not referring to any users in particular, only what users could do.  As an
example, if you did crypto_shash_update() with 32, 15, then 17 bytes, and the
underlying algorithm is poly1305-generic, the last block would end up
misaligned.  This doesn't appear possible with your pseudocode because it only
passes in multiples of the block size until the very end.  However I don't see
it claimed anywhere that shash API users have to do that.

Eric

^ permalink raw reply

* Re: [PATCH] poly1305: generic C can be faster on chips with slow unaligned access
From: Jason A. Donenfeld @ 2016-11-07 18:23 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Herbert Xu, Martin Willi, LKML, linux-crypto, David Miller,
	WireGuard mailing list
In-Reply-To: <CAHmME9oejs+USiGJX2P0TgvnR6XRzV1HYZPrq=c-CjGzq59=NQ@mail.gmail.com>

On Mon, Nov 7, 2016 at 7:08 PM, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> Hmm... The general data flow that strikes me as most pertinent is
> something like:
>
> struct sk_buff *skb = get_it_from_somewhere();
> skb = skb_share_check(skb, GFP_ATOMIC);
> num_frags = skb_cow_data(skb, ..., ...);
> struct scatterlist sg[num_frags];
> sg_init_table(sg, num_frags);
> skb_to_sgvec(skb, sg, ..., ...);
> blkcipher_walk_init(&walk, sg, sg, len);
> blkcipher_walk_virt_block(&desc, &walk, BLOCK_SIZE);
> while (walk.nbytes >= BLOCK_SIZE) {
>     size_t chunk_len = rounddown(walk.nbytes, BLOCK_SIZE);
>     poly1305_update(&poly1305_state, walk.src.virt.addr, chunk_len);
>     blkcipher_walk_done(&desc, &walk, walk.nbytes % BLOCK_SIZE);
> }
> if (walk.nbytes) {
>     poly1305_update(&poly1305_state, walk.src.virt.addr, walk.nbytes);
>     blkcipher_walk_done(&desc, &walk, 0);
> }
>
> Is your suggestion that that in the final if block, walk.src.virt.addr
> might be unaligned? Like in the case of the last fragment being 67
> bytes long?

In fact, I'm not so sure this happens here. In the while loop, each
new walk.src.virt.addr will be aligned to BLOCK_SIZE or be aligned by
virtue of being at the start of a new page. In the subsequent if
block, walk.src.virt.addr will either be
some_aligned_address+BLOCK_SIZE, which will be aligned, or it will be
a start of a new page, which will be aligned.

So what did you have in mind exactly?

I don't think anybody is running code like:

for (size_t i = 0; i < len; i += 17)
    poly1305_update(&poly, &buffer[i], 17);

(And if so, those consumers should be fixed.)

^ permalink raw reply

* Re: [PATCH] poly1305: generic C can be faster on chips with slow unaligned access
From: Jason A. Donenfeld @ 2016-11-07 18:08 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Herbert Xu, Martin Willi, LKML, linux-crypto, David Miller,
	WireGuard mailing list
In-Reply-To: <20161104173723.GB34176@google.com>

Hi Eric,

On Fri, Nov 4, 2016 at 6:37 PM, Eric Biggers <ebiggers@google.com> wrote:
> I agree, and the current code is wrong; but do note that this proposal is
> correct for poly1305_setrkey() but not for poly1305_setskey() and
> poly1305_blocks().  In the latter two cases, 4-byte alignment of the source
> buffer is *not* guaranteed.  Although crypto_poly1305_update() will be called
> with a 4-byte aligned buffer due to the alignmask set on poly1305_alg, the
> algorithm operates on 16-byte blocks and therefore has to buffer partial blocks.
> If some number of bytes that is not 0 mod 4 is buffered, then the buffer will
> fall out of alignment on the next update call.  Hence, get_unaligned_le32() is
> actually needed on all the loads, since the buffer will, in general, be of
> unknown alignment.

Hmm... The general data flow that strikes me as most pertinent is
something like:

struct sk_buff *skb = get_it_from_somewhere();
skb = skb_share_check(skb, GFP_ATOMIC);
num_frags = skb_cow_data(skb, ..., ...);
struct scatterlist sg[num_frags];
sg_init_table(sg, num_frags);
skb_to_sgvec(skb, sg, ..., ...);
blkcipher_walk_init(&walk, sg, sg, len);
blkcipher_walk_virt_block(&desc, &walk, BLOCK_SIZE);
while (walk.nbytes >= BLOCK_SIZE) {
    size_t chunk_len = rounddown(walk.nbytes, BLOCK_SIZE);
    poly1305_update(&poly1305_state, walk.src.virt.addr, chunk_len);
    blkcipher_walk_done(&desc, &walk, walk.nbytes % BLOCK_SIZE);
}
if (walk.nbytes) {
    poly1305_update(&poly1305_state, walk.src.virt.addr, walk.nbytes);
    blkcipher_walk_done(&desc, &walk, 0);
}

Is your suggestion that that in the final if block, walk.src.virt.addr
might be unaligned? Like in the case of the last fragment being 67
bytes long?

If so, what a hassle. I hope the performance overhead isn't too
awful... I'll resubmit taking into account your suggestions.

By the way -- offlist benchmarks sent to me concluded that using the
unaligned load helpers like David suggested is just as fast as that
handrolled bit magic in the v1.

Regards,
Jason

^ permalink raw reply

* AW: [PATCH] crypto: caam: do not register AES-XTS mode on LP units
From: Sven Ebenfeld @ 2016-11-07 17:58 UTC (permalink / raw)
  To: 'Horia Geanta Neag', linux-crypto, linux-kernel
  Cc: herbert, davem, 'Cata Vasile'
In-Reply-To: <DB4PR04MB08471753F3A712A732373E0598A70@DB4PR04MB0847.eurprd04.prod.outlook.com>

[-- Attachment #1: Type: text/plain, Size: 1278 bytes --]

> -----Ursprüngliche Nachricht-----
> Von: Horia Geanta Neag [mailto:horia.geanta@nxp.com]
> Gesendet: Montag, 7. November 2016 08:14
> An: Sven Ebenfeld <sven.ebenfeld@gmail.com>; linux-
> crypto@vger.kernel.org; linux-kernel@vger.kernel.org
> Cc: herbert@gondor.apana.org.au; davem@davemloft.net; Cata Vasile
> <cata.vasile@nxp.com>
> Betreff: Re: [PATCH] crypto: caam: do not register AES-XTS mode on LP
units
> 
> On 11/5/2016 1:17 AM, Sven Ebenfeld wrote:
> > When using AES-XTS on a Wandboard, we receive a Mode error:
> > caam_jr 2102000.jr1: 20001311: CCB: desc idx 19: AES: Mode error.
> >
> > Due to the Security Reference Manual, the Low Power AES units
> s/Due to/According to
> 
> > of the i.MX6 do not support the XTS mode. Therefore we should try to
> > provide them them in the API.
> >
> Rephrase: Therefore we mustn't register XTS implementations to Crypto API
> in this case.
> 
> > Signed-off-by: Sven Ebenfeld <sven.ebenfeld@gmail.com>
> Reviewed-by: Horia Geantă <horia.geanta@nxp.com>
> 
> Please send the patch to -stable and mention the offending commit:
> Cc: <stable@vger.kernel.org> # 4.4+
> Fixes: c6415a6016bf "crypto: caam - add support for acipher xts(aes)"
> 
> Thanks,
> Horia

Thanks, I've sent a v2.
Sven


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 6016 bytes --]

^ permalink raw reply

* [PATCH v2] crypto: caam: do not register AES-XTS mode on LP units
From: Sven Ebenfeld @ 2016-11-07 17:51 UTC (permalink / raw)
  To: linux-crypto, linux-kernel, stable, horia.geanta
  Cc: herbert, davem, cata.vasile, Sven Ebenfeld

When using AES-XTS on a Wandboard, we receive a Mode error:
caam_jr 2102000.jr1: 20001311: CCB: desc idx 19: AES: Mode error.

According to the Security Reference Manual, the Low Power AES units
of the i.MX6 do not support the XTS mode. Therefore we must not
register XTS implementations in the Crypto API.

Signed-off-by: Sven Ebenfeld <sven.ebenfeld@gmail.com>
Reviewed-by: Horia Geantă <horia.geanta@nxp.com>

Cc: <stable@vger.kernel.org> # 4.4+
Fixes: c6415a6016bf "crypto: caam - add support for acipher xts(aes)"

---
 drivers/crypto/caam/caamalg.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 156aad1..f5a63ba 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -4583,6 +4583,15 @@ static int __init caam_algapi_init(void)
 		if (!aes_inst && (alg_sel == OP_ALG_ALGSEL_AES))
 				continue;
 
+		/*
+		 * Check support for AES modes not available
+		 * on LP devices.
+		 */
+		if ((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP)
+			if ((alg->class1_alg_type & OP_ALG_AAI_MASK) ==
+			     OP_ALG_AAI_XTS)
+				continue;
+
 		t_alg = caam_alg_alloc(alg);
 		if (IS_ERR(t_alg)) {
 			err = PTR_ERR(t_alg);
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH 1/2] fscrypto: don't use on-stack buffer for filename encryption
From: Christoph Hellwig @ 2016-11-07 15:44 UTC (permalink / raw)
  To: Kent Overstreet
  Cc: Eric Biggers, linux-fsdevel, linux-ext4, linux-f2fs-devel,
	linux-crypto, tytso, jaegeuk, richard, luto
In-Reply-To: <20161105151349.e5ap547uno3hfit7@kmo-pixel>

On Sat, Nov 05, 2016 at 07:13:49AM -0800, Kent Overstreet wrote:
> Vmalloc memory does have struct pages - you just need to use vmalloc_to_page()
> instead of virt_to_page. Look at drivers/md/bcache/util.c bch_bio_map() if you
> want an example.

That example seems to be clearly broken on virtually index caches
due to the lack of flush_kernel_vmap_range and
invalidate_kernel_vmap_range calls.

^ permalink raw reply

* Re: [PATCH] crypto: caam: do not register AES-XTS mode on LP units
From: Horia Geanta Neag @ 2016-11-07  7:13 UTC (permalink / raw)
  To: Sven Ebenfeld, linux-crypto@vger.kernel.org,
	linux-kernel@vger.kernel.org
  Cc: herbert@gondor.apana.org.au, davem@davemloft.net, Cata Vasile
In-Reply-To: <1478301447-75861-1-git-send-email-sven.ebenfeld@gmail.com>

On 11/5/2016 1:17 AM, Sven Ebenfeld wrote:
> When using AES-XTS on a Wandboard, we receive a Mode error:
> caam_jr 2102000.jr1: 20001311: CCB: desc idx 19: AES: Mode error.
> 
> Due to the Security Reference Manual, the Low Power AES units
s/Due to/According to

> of the i.MX6 do not support the XTS mode. Therefore we should
> try to provide them them in the API.
> 
Rephrase: Therefore we mustn't register XTS implementations to Crypto
API in this case.

> Signed-off-by: Sven Ebenfeld <sven.ebenfeld@gmail.com>
Reviewed-by: Horia Geantă <horia.geanta@nxp.com>

Please send the patch to -stable and mention the offending commit:
Cc: <stable@vger.kernel.org> # 4.4+
Fixes: c6415a6016bf "crypto: caam - add support for acipher xts(aes)"

Thanks,
Horia

> ---
>  drivers/crypto/caam/caamalg.c | 9 +++++++++
>  1 file changed, 9 insertions(+)
> 
> diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> index 156aad1..f5a63ba 100644
> --- a/drivers/crypto/caam/caamalg.c
> +++ b/drivers/crypto/caam/caamalg.c
> @@ -4583,6 +4583,15 @@ static int __init caam_algapi_init(void)
>  		if (!aes_inst && (alg_sel == OP_ALG_ALGSEL_AES))
>  				continue;
>  
> +		/*
> +		 * Check support for AES modes not available
> +		 * on LP devices.
> +		 */
> +		if ((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP)
> +			if ((alg->class1_alg_type & OP_ALG_AAI_MASK) ==
> +			     OP_ALG_AAI_XTS)
> +				continue;
> +
>  		t_alg = caam_alg_alloc(alg);
>  		if (IS_ERR(t_alg)) {
>  			err = PTR_ERR(t_alg);
> 


^ permalink raw reply

* Re: [PATCH 2/2] fscrypto: don't use on-stack buffer for key derivation
From: Richard Weinberger @ 2016-11-07 13:22 UTC (permalink / raw)
  To: Eric Biggers, linux-fsdevel
  Cc: linux-ext4, linux-f2fs-devel, linux-crypto, tytso, jaegeuk, luto
In-Reply-To: <1478210582-86338-2-git-send-email-ebiggers@google.com>

On 03.11.2016 23:03, Eric Biggers wrote:
> With the new (in 4.9) option to use a virtually-mapped stack
> (CONFIG_VMAP_STACK), stack buffers cannot be used as input/output for
> the scatterlist crypto API because they may not be directly mappable to
> struct page.  get_crypt_info() was using a stack buffer to hold the

See 1/2. :-)

> output from the encryption operation used to derive the per-file key.
> Fix it by using a heap buffer.
> 
> This bug could most easily be observed in a CONFIG_DEBUG_SG kernel
> because this allowed the BUG in sg_set_buf() to be triggered.
> 
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
>  fs/crypto/keyinfo.c | 16 +++++++++++++---
>  1 file changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
> index 82f0285..67fb6d8 100644
> --- a/fs/crypto/keyinfo.c
> +++ b/fs/crypto/keyinfo.c
> @@ -185,7 +185,7 @@ int get_crypt_info(struct inode *inode)
>  	struct crypto_skcipher *ctfm;
>  	const char *cipher_str;
>  	int keysize;
> -	u8 raw_key[FS_MAX_KEY_SIZE];
> +	u8 *raw_key = NULL;
>  	int res;
>  
>  	res = fscrypt_initialize();
> @@ -238,6 +238,15 @@ int get_crypt_info(struct inode *inode)
>  	if (res)
>  		goto out;
>  
> +	/*
> +	 * This cannot be a stack buffer because it is passed to the scatterlist
> +	 * crypto API as part of key derivation.
> +	 */
> +	res = -ENOMEM;
> +	raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS);
> +	if (!raw_key)
> +		goto out;
> +
>  	if (fscrypt_dummy_context_enabled(inode)) {
>  		memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
>  		goto got_key;
> @@ -276,7 +285,8 @@ int get_crypt_info(struct inode *inode)
>  	if (res)
>  		goto out;
>  
> -	memzero_explicit(raw_key, sizeof(raw_key));
> +	kzfree(raw_key);
> +	raw_key = NULL;
>  	if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) {
>  		put_crypt_info(crypt_info);
>  		goto retry;
> @@ -287,7 +297,7 @@ int get_crypt_info(struct inode *inode)
>  	if (res == -ENOKEY)
>  		res = 0;
>  	put_crypt_info(crypt_info);
> -	memzero_explicit(raw_key, sizeof(raw_key));
> +	kzfree(raw_key);
>  	return res;
>  }

Reviewed-by: Richard Weinberger <richard@nod.at>

Thanks,
//richard


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox