* [RFC 0/3] Experiments with AES-AltiVec
@ 2007-04-11 16:49 Sebastian Siewior
2007-04-11 16:49 ` [RFC 1/3] cryptoapi: AES with AltiVec support Sebastian Siewior
` (2 more replies)
0 siblings, 3 replies; 10+ messages in thread
From: Sebastian Siewior @ 2007-04-11 16:49 UTC (permalink / raw)
To: linuxppc-dev
I tried to use SIMD instructions for AES. This is stage 1, with AltiVec support;
the next stage will target the SPUs. Unfortunately it is slower than the generic
code on my PS3. It might be faster on some other machine. I'm interested in
hearing numbers from other people.
--
^ permalink raw reply [flat|nested] 10+ messages in thread
* [RFC 1/3] cryptoapi: AES with AltiVec support
2007-04-11 16:49 [RFC 0/3] Experiments with AES-AltiVec Sebastian Siewior
@ 2007-04-11 16:49 ` Sebastian Siewior
2007-04-11 18:24 ` Arnd Bergmann
2007-04-11 22:22 ` Benjamin Herrenschmidt
2007-04-11 16:49 ` [RFC 2/3] PowerPC: lazy altivec enabling in kernel Sebastian Siewior
2007-04-11 16:49 ` [RFC 3/3] cryptoapi: speed test Sebastian Siewior
2 siblings, 2 replies; 10+ messages in thread
From: Sebastian Siewior @ 2007-04-11 16:49 UTC (permalink / raw)
To: linuxppc-dev
That's the best I could do. I get
Average: 4728 msec, approx. 33840 kb/sec || 33 mb/sec
on encryption
Average: 5364 msec, approx. 29828 kb/sec || 29 mb/sec
on decryption
the generic module is faster:
Average: 3853 msec, approx. 41526 kb/sec || 40 mb/sec
on encryption
Average: 3736 msec, approx. 42826 kb/sec || 41 mb/sec
on decryption
The AltiVec measurements were done with the "kernel altivec patch" applied (see
the next patch). Without it, the AltiVec code is slower:
Average: 7079 msec, approx. 22602 kb/sec || 22 mb/sec
on encryption
Average: 10083 msec, approx. 15868 kb/sec || 15 mb/sec
on decryption
It would be nice if someone could play around with different machines. This has
been on a ps3.
Signed-off-by: Sebastian Siewior <bigeasy@linux.vnet.ibm.com>
Index: ps3-linux/crypto/aes-alti.c
===================================================================
--- /dev/null
+++ ps3-linux/crypto/aes-alti.c
@@ -0,0 +1,136 @@
+/*
+ * based on crypto/aes.c
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <asm/byteorder.h>
+#include <asm/system.h>
+
+#include "aes-altivec.h"
+
+#define AES_MIN_KEY_SIZE 16
+#define AES_MAX_KEY_SIZE 32
+
+#define AES_BLOCK_SIZE 16
+
+/* AES uses at most 14 rounds and every round needs one vector as round key
+ * (= 4 ints or 16 bytes); one more slot holds the given key, hence 15 slots.
+ */
+
+#define MAX_AES_ROUNDS 15
+#define MAX_AES_KEYSIZE_INT (MAX_AES_ROUNDS *4)
+#define MAX_AES_KEYSIZE_BYTE (MAX_AES_KEYSIZE_INT *4)
+
+/*
+ * Per-transform context: the expanded encryption and decryption key
+ * schedules (MAX_AES_ROUNDS slots of 16 bytes each) and the length of the
+ * user key in bytes as passed to aes_set_key().
+ *
+ * NOTE(review): aes_encrypt_altivec()/aes_decrypt_altivec() cast these byte
+ * arrays to vector pointers, so they presumably have to be 16-byte aligned.
+ * The alignment attributes below are commented out -- confirm that the
+ * context returned by crypto_tfm_ctx() is suitably aligned.
+ */
+struct aes_ctx {
+#ifdef KERN_EMU
+ unsigned char pad0;
+ unsigned char pad1;
+ unsigned char pad2;
+ unsigned char pad3;
+#endif
+ unsigned char key_enc_ch[MAX_AES_KEYSIZE_BYTE];// __attribute__ ((aligned (16)));
+ unsigned char key_dec_ch[MAX_AES_KEYSIZE_BYTE];// __attribute__ ((aligned (16)));
+ unsigned int key_length;
+};
+
+/*
+ * aes_set_key - validate the key length and expand the user key.
+ * @tfm:     transform whose context receives the expanded keys
+ * @in_key:  user supplied key
+ * @key_len: length of @in_key in bytes; must be 16, 24 or 32
+ *
+ * Returns 0 on success. An unsupported key length sets
+ * CRYPTO_TFM_RES_BAD_KEY_LEN in the transform flags and returns -EINVAL;
+ * otherwise the result of expand_key() is passed through.
+ */
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		unsigned int key_len)
+{
+	struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	int ret;
+
+	if (key_len != 16 && key_len != 24 && key_len != 32) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	/* The key expansion uses AltiVec, so preemption has to stay off. */
+	preempt_disable();
+	enable_kernel_altivec();
+
+	ctx->key_length = key_len;
+	/* expand_key() takes the key length in 32-bit words, not bytes. */
+	ret = expand_key(in_key, key_len / 4, ctx->key_enc_ch, ctx->key_dec_ch);
+
+	preempt_enable();
+	return ret;
+}
+
+/*
+ * Encrypt one block from @in to @out with the encryption key schedule stored
+ * in the transform context. Preemption is disabled while AltiVec is in use.
+ */
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ preempt_disable();
+ enable_kernel_altivec();
+
+ aes_encrypt_altivec(in, out, ctx->key_enc_ch, ctx->key_length);
+
+ preempt_enable();
+}
+
+/*
+ * Decrypt one block from @in to @out with the decryption key schedule stored
+ * in the transform context. Preemption is disabled while AltiVec is in use.
+ */
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ preempt_disable();
+ enable_kernel_altivec();
+
+ aes_decrypt_altivec(in, out, ctx->key_dec_ch, ctx->key_length);
+
+ preempt_enable();
+}
+
+
+/*
+ * Algorithm descriptor handed to crypto_register_alg(): a single-block
+ * cipher named "aes" (driver name "aes-altivec") using the set-key, encrypt
+ * and decrypt helpers defined above.
+ */
+static struct crypto_alg aes_alg = {
+ .cra_name = "aes",
+ .cra_driver_name = "aes-altivec",
+ .cra_priority = 123,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct aes_ctx),
+ .cra_alignmask = 15,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = aes_set_key,
+ .cia_encrypt = aes_encrypt,
+ .cia_decrypt = aes_decrypt
+ }
+ }
+};
+
+/*
+ * Module init: refuse to load with -ENODEV when the CPU has no AltiVec unit,
+ * otherwise register the algorithm and return the registration result.
+ */
+static int __init aes_init(void)
+{
+ if (!(cpu_has_feature(CPU_FTR_ALTIVEC))) {
+ printk("aes-alti: No altivec unit available\n");
+ return -ENODEV;
+ }
+
+ return crypto_register_alg(&aes_alg);
+}
+
+/* Module exit: unregister the algorithm registered by aes_init(). */
+static void __exit aes_fini(void)
+{
+ crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
+MODULE_LICENSE("GPL");
Index: ps3-linux/crypto/aes-altivec.c
===================================================================
--- /dev/null
+++ ps3-linux/crypto/aes-altivec.c
@@ -0,0 +1,706 @@
+/*
+ * AES implementation with AltiVec support.
+ * v.02
+ *
+ * Author:
+ * Sebastian Siewior (bigeasy _at_ breakpoint.cc)
+ * Arnd Bergmann (arnd _at_ arndb.de)
+ *
+ * License: GPL v2
+ *
+ * Code based on ideas from "Efficient Galois Field Arithmetic on SIMD Architectures" by
+ * Raghav Bhaskar, Pradeep K. Dubey, Vijay Kumar, Atri Rudra and Animesh Sharma.
+ *
+ * This implementation makes use of AltiVec and therefore assumes big endian (on the other
+ * hand, only Intel (still) gets it wrong (well, it probably made porting to 64bit a lot
+ * easier)).
+ * The tables for MixColumn() and InvMixColumn() are adjusted in order to omit ShiftRow in
+ * all but the last round.
+ */
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <altivec.h>
+#include <linux/autoconf.h>
+#include "aes-altivec.h"
+
+static const vector unsigned char imm_7Fh = {
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
+};
+
+/*
+ * These values are either defined in the AES standard or can be
+ * computed.
+ */
+static const unsigned int Rcon[] = {
+ 0x00000000, 0x01000000, 0x02000000, 0x04000000, 0x08000000,
+ 0x10000000, 0x20000000, 0x40000000, 0x80000000, 0x1b000000,
+ 0x36000000
+};
+
+static const vector unsigned char sbox_enc[16] = {
+ { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
+ 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 },
+ { 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+ 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 },
+ { 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
+ 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 },
+ { 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
+ 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 },
+ { 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+ 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 },
+ { 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
+ 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf },
+ { 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
+ 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 },
+ { 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+ 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 },
+ { 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
+ 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 },
+ { 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
+ 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb },
+ { 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+ 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 },
+ { 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
+ 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 },
+ { 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
+ 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a },
+ { 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+ 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e },
+ { 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
+ 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf },
+ { 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
+ 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }
+};
+
+static const vector unsigned char shift_round = {
+ 0x00, 0x05, 0x0a, 0x0f,
+ 0x04, 0x09, 0x0e, 0x03,
+ 0x08, 0x0d, 0x02, 0x07,
+ 0x0c, 0x01, 0x06, 0x0b
+};
+
+static const vector unsigned char pre_xor_s0 = {
+ 0x10, 0x00, 0x00, 0x10,
+ 0x14, 0x04, 0x04, 0x14,
+ 0x18, 0x08, 0x08, 0x18,
+ 0x1c, 0x0c, 0x0c, 0x1c
+};
+
+static const vector unsigned char pre_xor_s1 = {
+ 0x15, 0x15, 0x05, 0x00,
+ 0x19, 0x19, 0x09, 0x04,
+ 0x1d, 0x1d, 0x0d, 0x08,
+ 0x11, 0x11, 0x01, 0x0c
+};
+
+static const vector unsigned char pre_xor_s2 = {
+ 0x05, 0x1a, 0x1a, 0x05,
+ 0x09, 0x1e, 0x1e, 0x09,
+ 0x0d, 0x12, 0x12, 0x0d,
+ 0x01, 0x16, 0x16, 0x01
+};
+
+static const vector unsigned char pre_xor_s3 = {
+ 0x0a, 0x0a, 0x1f, 0x0a,
+ 0x0e, 0x0e, 0x13, 0x0e,
+ 0x02, 0x02, 0x17, 0x02,
+ 0x06, 0x06, 0x1b, 0x06
+};
+
+static const vector unsigned char pre_xor_s4 = {
+ 0x0f, 0x0f, 0x0f, 0x1f,
+ 0x03, 0x03, 0x03, 0x13,
+ 0x07, 0x07, 0x07, 0x17,
+ 0x0b, 0x0b, 0x0b, 0x1b
+};
+
+static const vector unsigned char sbox_dec[16] = {
+ { 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+ 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb },
+ { 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+ 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb },
+ { 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+ 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e },
+ { 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+ 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 },
+ { 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+ 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 },
+ { 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+ 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 },
+ { 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+ 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 },
+ { 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+ 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b },
+ { 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+ 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 },
+ { 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+ 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e },
+ { 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+ 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b },
+ { 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+ 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 },
+ { 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+ 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f },
+ { 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+ 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef },
+ { 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+ 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 },
+ { 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+ 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d }
+};
+
+static const vector unsigned char inv_shift_round = {
+ 0x00, 0x0d, 0x0a, 0x07,
+ 0x04, 0x01, 0x0e, 0x0B,
+ 0x08, 0x05, 0x02, 0x0f,
+ 0x0c, 0x09, 0x06, 0x03
+};
+
+static const vector unsigned char inv_select_0e_shifted = {
+ 0x00, 0x0d, 0x0a, 0x07,
+ 0x04, 0x01, 0x0e, 0x0B,
+ 0x08, 0x05, 0x02, 0x0f,
+ 0x0c, 0x09, 0x06, 0x03
+};
+
+static const vector unsigned char inv_select_0b_shifted = {
+ 0x0d, 0x0a, 0x07, 0x00,
+ 0x01, 0x0e, 0x0b, 0x04,
+ 0x05, 0x02, 0x0f, 0x08,
+ 0x09, 0x06, 0x03, 0x0c
+};
+
+static const vector unsigned char inv_select_0d_shifted = {
+ 0x0a, 0x07, 0x00, 0x0d,
+ 0x0e, 0x0b, 0x04, 0x01,
+ 0x02, 0x0f, 0x08, 0x05,
+ 0x06, 0x03, 0x0c, 0x09
+};
+
+static const vector unsigned char inv_select_09_shifted = {
+ 0x07, 0x00, 0x0d, 0x0a,
+ 0x0b, 0x04, 0x01, 0x0e,
+ 0x0f, 0x08, 0x05, 0x02,
+ 0x03, 0x0c, 0x09, 0x06
+};
+
+static const vector unsigned char inv_select_0e_norm = {
+ 0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b,
+ 0x0c, 0x0d, 0x0e, 0x0f
+};
+
+static const vector unsigned char inv_select_0b_norm = {
+ 0x01, 0x02, 0x03, 0x00,
+ 0x05, 0x06, 0x07, 0x04,
+ 0x09, 0x0a, 0x0b, 0x08,
+ 0x0d, 0x0e, 0x0f, 0x0c
+};
+
+static const vector unsigned char inv_select_0d_norm = {
+ 0x02, 0x03, 0x00, 0x01,
+ 0x06, 0x07, 0x04, 0x05,
+ 0x0a, 0x0b, 0x08, 0x09,
+ 0x0e, 0x0f, 0x0c, 0x0d
+};
+
+static const vector unsigned char inv_select_09_norm = {
+ 0x03, 0x00, 0x01, 0x02,
+ 0x07, 0x04, 0x05, 0x06,
+ 0x0b, 0x08, 0x09, 0x0a,
+ 0x0f, 0x0c, 0x0d, 0x0e
+};
+
+#ifdef CONFIG_CRYPTO_AES_ALTIVEC_TABLE
+/* small GF lookup table */
+static const vector unsigned char gf_mul_8_high = {
+ 0x00, 0x80, 0x1b, 0x9b, 0x36, 0xb6, 0x2d, 0xad,
+ 0x6c, 0xec, 0x77, 0xf7, 0x5a, 0xda, 0x41, 0xc1
+};
+static const vector unsigned char gf_mul_a_high = {
+ 0x00, 0xa0, 0x5b, 0xfb, 0xb6, 0x16, 0xed, 0x4d,
+ 0x77, 0xd7, 0x2c, 0x8c, 0xc1, 0x61, 0x9a, 0x3a
+};
+static const vector unsigned char gf_mul_c_high = {
+ 0x00, 0xc0, 0x9b, 0x5b, 0x2d, 0xed, 0xb6, 0x76,
+ 0x5a, 0x9a, 0xc1, 0x01, 0x77, 0xb7, 0xec, 0x2c
+};
+static const vector unsigned char gf_mul_e_high = {
+ 0x00, 0xe0, 0xdb, 0x3b, 0xad, 0x4d, 0x76, 0x96,
+ 0x41, 0xa1, 0x9a, 0x7a, 0xec, 0x0c, 0x37, 0xd7
+};
+static const vector unsigned char gf_mul_8_low = {
+ 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
+ 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78
+};
+static const vector unsigned char gf_mul_a_low = {
+ 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36,
+ 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66
+};
+static const vector unsigned char gf_mul_c_low = {
+ 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24,
+ 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44
+};
+static const vector unsigned char gf_mul_e_low = {
+ 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a,
+ 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a
+};
+#endif
+/* encryption code */
+
+/*
+ * ByteSub - apply the AES S-box to all 16 state bytes in parallel.
+ *
+ * sbox_enc holds the 256-entry table as 16 rows of 16 bytes. vec_perm() on a
+ * pair of rows resolves the low five bits of every state byte; the three
+ * remaining bits are then resolved by a tree of vec_sel() operations whose
+ * masks come from comparing the relevant bit (moved to the top) with 0x7f.
+ */
+static vector unsigned char ByteSub(vector unsigned char state)
+{
+ /* line of the s-box */
+ vector unsigned char line_01, line_23, line_45, line_67,
+ line_89, line_AB, line_CD, line_EF;
+ /* selector */
+ vector unsigned char sel1, sel2, sel7;
+ /* correct lines */
+ vector unsigned char cor_0123, cor_4567, cor_89AB, cor_CDEF,
+ cor_0to7, cor_8toF;
+ vector unsigned char ret_state;
+ vector unsigned char state_shift2, state_shift1;
+
+ /* look every byte up in each of the eight 32-byte row pairs */
+ line_01 = vec_perm(sbox_enc[0], sbox_enc[1], state);
+ line_23 = vec_perm(sbox_enc[2], sbox_enc[3], state);
+ line_45 = vec_perm(sbox_enc[4], sbox_enc[5], state);
+ line_67 = vec_perm(sbox_enc[6], sbox_enc[7], state);
+ line_89 = vec_perm(sbox_enc[8], sbox_enc[9], state);
+ line_AB = vec_perm(sbox_enc[10], sbox_enc[11], state);
+ line_CD = vec_perm(sbox_enc[12], sbox_enc[13], state);
+ line_EF = vec_perm(sbox_enc[14], sbox_enc[15], state);
+
+ /* bit 5 (shifted left by 2 it becomes the top bit) picks between neighbouring row pairs */
+ state_shift2 = vec_vslb(state, vec_splat_u8(2));
+ sel2 = (typeof (sel2)) vec_vcmpgtub(state_shift2, imm_7Fh);
+ cor_0123 = vec_sel(line_01, line_23, sel2);
+ cor_4567 = vec_sel(line_45, line_67, sel2);
+ cor_89AB = vec_sel(line_89, line_AB, sel2);
+ cor_CDEF = vec_sel(line_CD, line_EF, sel2);
+
+ /* bit 6 (shifted left by 1) picks between the groups of four rows */
+ state_shift1 = vec_vslb(state, vec_splat_u8(1));
+ sel1 = (typeof (sel1))vec_vcmpgtub(state_shift1, imm_7Fh);
+ cor_0to7 = vec_sel(cor_0123, cor_4567, sel1);
+ cor_8toF = vec_sel(cor_89AB, cor_CDEF, sel1);
+
+ /* bit 7 picks between the lower and the upper half of the table */
+ sel7 = (typeof (sel7))vec_vcmpgtub(state, imm_7Fh);
+ ret_state = vec_sel(cor_0to7, cor_8toF, sel7);
+
+ return ret_state;
+}
+
+/* Permute the state bytes according to the shift_round table. */
+static vector unsigned char ShiftRow(vector unsigned char state)
+{
+
+ return vec_perm(state, state, shift_round);
+}
+
+/*
+ * MixColumn - column mixing step of a normal encryption round.
+ *
+ * Computes xtimed = 2 * state in GF(2^8) for every byte, then uses the five
+ * pre_xor_s* permutations to gather bytes from state (selector values
+ * 0x00-0x0f) and xtimed (0x10-0x1f) and XORs the five operands together.
+ * Per the note at the top of this file the permutation tables are arranged
+ * so that no separate ShiftRow step is needed.
+ */
+static vector unsigned char MixColumn(vector unsigned char state)
+{
+ vector unsigned char imm_00h, imm_01h;
+ vector unsigned char need_add;
+ vector unsigned char shifted_vec, modul;
+ vector unsigned char toadd, xtimed;
+ vector unsigned char op1, op2, op3, op4, op5;
+ vector unsigned char xor_12, xor_34, xor_1234, ret;
+
+ imm_00h = vec_splat_u8(0x00);
+ imm_01h = vec_splat_u8(0x01);
+
+ /* vec_lvsr() on address 0 yields 0x10..0x1f; element 0x0b of that is 0x1b */
+ modul = vec_splat( vec_lvsr(0, (unsigned char *) 0), 0x0b); // 0x1b
+
+ /* bytes with the top bit set need the 0x1b reduction after the shift */
+ need_add = (vector unsigned char)vec_vcmpgtub(state, imm_7Fh);
+ shifted_vec = vec_vslb(state, imm_01h);
+
+ toadd = vec_sel(imm_00h, modul, need_add);
+
+ xtimed = vec_xor(toadd, shifted_vec);
+
+ op1 = vec_perm(state, xtimed, pre_xor_s0);
+ op2 = vec_perm(state, xtimed, pre_xor_s1);
+ op3 = vec_perm(state, xtimed, pre_xor_s2);
+ op4 = vec_perm(state, xtimed, pre_xor_s3);
+ op5 = vec_perm(state, xtimed, pre_xor_s4);
+
+ xor_12 = vec_xor(op1, op2);
+ xor_34 = vec_xor(op3, op4);
+ xor_1234 = vec_xor(xor_12, xor_34);
+ ret = vec_xor(xor_1234, op5);
+
+ return ret;
+}
+
+/* XOR the round key into the state. */
+static vector unsigned char AddRoundKey(vector unsigned char state,
+ vector unsigned char key)
+{
+ return vec_xor(state,key);
+}
+
+/*
+ * One non-final encryption round: ByteSub, MixColumn, AddRoundKey. There is
+ * no explicit ShiftRow call here; see the note at the top of this file.
+ */
+static vector unsigned char normalRound(vector unsigned char state, vector unsigned char key)
+{
+ vector unsigned char pstate;
+
+ pstate = ByteSub(state);
+ pstate = MixColumn(pstate);
+ pstate = AddRoundKey(pstate, key);
+ return pstate;
+}
+
+/* Final encryption round: ByteSub, ShiftRow, AddRoundKey (no MixColumn). */
+static vector unsigned char finalRound(vector unsigned char state, vector unsigned char key)
+{
+ vector unsigned char pstate;
+
+ pstate = ByteSub(state);
+ pstate = ShiftRow(pstate);
+ pstate = AddRoundKey(pstate, key);
+ return pstate;
+}
+
+/*
+ * aes_encrypt_altivec - encrypt one 16-byte block.
+ * @in:      source block (copied through an aligned bounce buffer)
+ * @out:     destination block
+ * @kp:      expanded encryption key, read as an array of vectors
+ * @key_len: user key length in bytes: 16, 24 or 32; anything else is a BUG()
+ *
+ * Always returns 0.
+ *
+ * NOTE(review): @kp is dereferenced as a vector pointer, so it presumably
+ * has to be 16-byte aligned -- confirm against the callers.
+ */
+int aes_encrypt_altivec(const unsigned char *in, unsigned char *out,
+ const unsigned char *kp, unsigned int key_len)
+{
+ unsigned char i;
+ vector unsigned char pstate;
+ const vector unsigned char *key;
+ unsigned char tmpbuf[16] __attribute__ ((aligned (16)));
+
+ memcpy(tmpbuf, in, sizeof(tmpbuf));
+ pstate = vec_ld(0, tmpbuf);
+ key = (const vector unsigned char*) kp;
+
+ /* initial AddRoundKey with the first key slot */
+ pstate = vec_xor(pstate, *key++);
+
+ /* the cases fall through on purpose: longer keys add rounds in front */
+ switch (key_len) {
+ case 32: /* 14 rounds */
+ pstate = normalRound(pstate, *key++);
+ pstate = normalRound(pstate, *key++);
+ /* fall through */
+
+ case 24: /* 12 rounds */
+ pstate = normalRound(pstate, *key++);
+ pstate = normalRound(pstate, *key++);
+ /* fall through */
+
+ case 16: /* 10 rounds */
+ for (i=0; i<9; i++)
+ pstate = normalRound(pstate, *key++);
+
+ break;
+
+ default:
+ /* unsupported */
+ BUG();
+ }
+
+ pstate = finalRound(pstate, *key);
+
+ vec_st(pstate, 0, tmpbuf);
+ memcpy(out, tmpbuf, 16);
+
+ return 0;
+}
+
+/* decryption code, alternative version */
+
+/*
+ * InvByteSub - apply the inverse AES S-box to all 16 state bytes in parallel.
+ *
+ * Same lookup scheme as ByteSub() but on sbox_dec: vec_perm() on a pair of
+ * 16-byte rows resolves the low five bits of every state byte, and a tree of
+ * vec_sel() operations resolves bits 5, 6 and 7.
+ */
+static vector unsigned char InvByteSub(vector unsigned char state)
+{
+ /* line of the s-box */
+ vector unsigned char line_01, line_23, line_45, line_67,
+ line_89, line_AB, line_CD, line_EF;
+ /* selector */
+ vector unsigned char sel1, sel2, sel7;
+ /* correct lines */
+ vector unsigned char cor_0123, cor_4567, cor_89AB, cor_CDEF,
+ cor_0to7, cor_8toF;
+ vector unsigned char ret_state;
+ vector unsigned char state_shift2, state_shift1;
+
+ /* look every byte up in each of the eight 32-byte row pairs */
+ line_01 = vec_perm(sbox_dec[0], sbox_dec[1], state);
+ line_23 = vec_perm(sbox_dec[2], sbox_dec[3], state);
+ line_45 = vec_perm(sbox_dec[4], sbox_dec[5], state);
+ line_67 = vec_perm(sbox_dec[6], sbox_dec[7], state);
+ line_89 = vec_perm(sbox_dec[8], sbox_dec[9], state);
+ line_AB = vec_perm(sbox_dec[10], sbox_dec[11], state);
+ line_CD = vec_perm(sbox_dec[12], sbox_dec[13], state);
+ line_EF = vec_perm(sbox_dec[14], sbox_dec[15], state);
+
+ /* bit 5 (shifted left by 2 it becomes the top bit) picks between neighbouring row pairs */
+ state_shift2 = vec_vslb(state, vec_splat_u8(2));
+ sel2 = (typeof (sel2)) vec_vcmpgtub(state_shift2, imm_7Fh);
+ cor_0123 = vec_sel(line_01, line_23, sel2);
+ cor_4567 = vec_sel(line_45, line_67, sel2);
+ cor_89AB = vec_sel(line_89, line_AB, sel2);
+ cor_CDEF = vec_sel(line_CD, line_EF, sel2);
+
+ /* bit 6 (shifted left by 1) picks between the groups of four rows */
+ state_shift1 = vec_vslb(state, vec_splat_u8(1));
+ sel1 = (typeof (sel1))vec_vcmpgtub(state_shift1, imm_7Fh);
+ cor_0to7 = vec_sel(cor_0123, cor_4567, sel1);
+ cor_8toF = vec_sel(cor_89AB, cor_CDEF, sel1);
+
+ /* bit 7 picks between the lower and the upper half of the table */
+ sel7 = (typeof (sel7))vec_vcmpgtub(state, imm_7Fh);
+ ret_state = vec_sel(cor_0to7, cor_8toF, sel7);
+
+ return ret_state;
+}
+
+/* Permute the state bytes according to the inv_shift_round table. */
+static vector unsigned char InvShiftRow(vector unsigned char state)
+{
+
+ return vec_perm(state, state, inv_shift_round);
+}
+
+/*
+ * InvMixColumn - inverse column mixing.
+ *
+ * For every state byte the GF(2^8) products with 0x0e, 0x0b, 0x0d and 0x09
+ * are computed (via lookup tables when CONFIG_CRYPTO_AES_ALTIVEC_TABLE is
+ * set, otherwise by repeated doubling). Each product vector is then permuted
+ * with the matching selector and the four results are XORed together.
+ *
+ * The selectors are parameters so that callers can pass either the
+ * "_shifted" tables (as InvNormalRound() does) or the "_norm" tables (as
+ * expand_key() does).
+ */
+static vector unsigned char InvMixColumn(vector unsigned char state,
+ vector unsigned char inv_select_0e, vector unsigned char inv_select_0b,
+ vector unsigned char inv_select_0d, vector unsigned char inv_select_09 )
+{
+ vector unsigned char op0, op1, op2, op3, op4, op5;
+ vector unsigned char mul_0e, mul_09, mul_0d, mul_0b;
+ vector unsigned char ret;
+
+#ifdef CONFIG_CRYPTO_AES_ALTIVEC_TABLE
+
+ vector unsigned char state_high, state_low;
+ vector unsigned char imm_04h, imm_0fh;
+ vector unsigned char mul_08, mul_0c, mul_0a;
+ vector unsigned char mul_08_hi, mul_08_lo, mul_0a_hi, mul_0a_lo, mul_0c_hi,
+ mul_0c_lo, mul_0e_hi, mul_0e_lo;
+
+ /* 19 operations, 1x 8 memory loads */
+ imm_04h = vec_splat_u8(0x04);
+ imm_0fh = vec_splat_u8(0x0f);
+
+ /* split every byte into its two nibbles and look each one up separately */
+ state_high = vec_sr(state, imm_04h);
+ state_low = vec_and(state, imm_0fh);
+
+ mul_08_hi = vec_perm(gf_mul_8_high, gf_mul_8_high, state_high);
+ mul_0a_hi = vec_perm(gf_mul_a_high, gf_mul_a_high, state_high);
+ mul_0c_hi = vec_perm(gf_mul_c_high, gf_mul_c_high, state_high);
+ mul_0e_hi = vec_perm(gf_mul_e_high, gf_mul_e_high, state_high);
+
+ mul_08_lo = vec_perm(gf_mul_8_low, gf_mul_8_low, state_low);
+ mul_0a_lo = vec_perm(gf_mul_a_low, gf_mul_a_low, state_low);
+ mul_0c_lo = vec_perm(gf_mul_c_low, gf_mul_c_low, state_low);
+ mul_0e_lo = vec_perm(gf_mul_e_low, gf_mul_e_low, state_low);
+
+ mul_08 = vec_xor(mul_08_hi, mul_08_lo);
+ mul_0a = vec_xor(mul_0a_hi, mul_0a_lo);
+ mul_0c = vec_xor(mul_0c_hi, mul_0c_lo);
+ mul_0e = vec_xor(mul_0e_hi, mul_0e_lo);
+
+ /* the odd multipliers are the even ones plus (XOR) the byte itself */
+ mul_09 = vec_xor(mul_08, state);
+ mul_0b = vec_xor(mul_0a, state);
+ mul_0d = vec_xor(mul_0c, state);
+
+#else
+
+ vector unsigned char imm_00h, imm_01h;
+ vector unsigned char need_add;
+ vector unsigned char shifted_vec, modul;
+ vector unsigned char toadd;
+ vector unsigned char mul_2, mul_4, mul_8;
+ vector unsigned char mul_2_4;
+
+ /* 21 operations, 3x 1 memory loads */
+ /* compute 0e, 0b, 0d, 09 in GF */
+ imm_00h = vec_splat_u8(0x00);
+ imm_01h = vec_splat_u8(0x01);
+
+ /* vec_lvsr() on address 0 yields 0x10..0x1f; element 0x0b of that is 0x1b */
+ modul = vec_splat( vec_lvsr(0, (unsigned char *) 0), 0x0b); // 0x1b
+
+ /* mul_2 = 2 * state: shift left, XOR 0x1b where the top bit was set */
+ need_add = (vector unsigned char)vec_vcmpgtub(state, imm_7Fh);
+ shifted_vec = vec_vslb(state, imm_01h);
+ toadd = vec_sel(imm_00h, modul, need_add);
+ mul_2 = vec_xor(toadd, shifted_vec);
+
+ /* mul_4 = 2 * mul_2 */
+ need_add = (vector unsigned char)vec_vcmpgtub(mul_2, imm_7Fh);
+ shifted_vec = vec_vslb(mul_2, imm_01h);
+ toadd = vec_sel(imm_00h, modul, need_add);
+ mul_4 = vec_xor(toadd, shifted_vec);
+
+ /* mul_8 = 2 * mul_4 */
+ need_add = (vector unsigned char)vec_vcmpgtub(mul_4, imm_7Fh);
+ shifted_vec = vec_vslb(mul_4, imm_01h);
+ toadd = vec_sel(imm_00h, modul, need_add);
+ mul_8 = vec_xor(toadd, shifted_vec);
+
+ mul_2_4 = vec_xor(mul_2, mul_4);
+ /* 09 = 8 ^ 1 */
+ mul_09 = vec_xor(mul_8, state);
+
+ /* 0e = 2 ^ 4 ^ 8 */
+ mul_0e = vec_xor(mul_2_4, mul_8);
+
+ /* 0b = 2 ^ 8 ^ 1 */
+ mul_0b = vec_xor(mul_2, mul_09);
+
+ /* 0d = 4 ^ 8 ^ 1 */
+ mul_0d = vec_xor(mul_4, mul_09);
+#endif
+
+ /* prepare vectors for add */
+
+ op0 = vec_perm(mul_0e, mul_0e, inv_select_0e);
+ op1 = vec_perm(mul_0b, mul_0b, inv_select_0b);
+ op2 = vec_perm(mul_0d, mul_0d, inv_select_0d);
+ op3 = vec_perm(mul_09, mul_09, inv_select_09);
+
+ op4 = vec_xor(op0, op1);
+ op5 = vec_xor(op2, op3);
+ ret = vec_xor(op4, op5);
+ return ret;
+}
+
+/*
+ * One non-final decryption round: InvByteSub, InvMixColumn with the
+ * "_shifted" selector tables, then AddRoundKey.
+ */
+static vector unsigned char InvNormalRound(vector unsigned char state,
+ vector unsigned char key)
+{
+ vector unsigned char pstate;
+
+ pstate = InvByteSub(state);
+ pstate = InvMixColumn(pstate, inv_select_0e_shifted, inv_select_0b_shifted,
+ inv_select_0d_shifted, inv_select_09_shifted);
+ pstate = AddRoundKey(pstate, key);
+ return pstate;
+}
+
+/* Final decryption round: InvByteSub, InvShiftRow, AddRoundKey (no InvMixColumn). */
+static vector unsigned char InvfinalRound(vector unsigned char state,
+ vector unsigned char key)
+{
+ vector unsigned char pstate;
+
+ pstate = InvByteSub(state);
+ pstate = InvShiftRow(pstate);
+ pstate = AddRoundKey(pstate, key);
+ return pstate;
+}
+
+/*
+ * aes_decrypt_altivec - decrypt one 16-byte block.
+ * @in:      source block (copied through an aligned bounce buffer)
+ * @out:     destination block
+ * @kp:      expanded decryption key, read as an array of vectors
+ * @key_len: user key length in bytes: 16, 24 or 32; anything else is a BUG()
+ *
+ * Always returns 0.
+ *
+ * NOTE(review): @key_len is an unsigned char here but an unsigned int in
+ * aes_encrypt_altivec(); the supported values fit either way.
+ * NOTE(review): @kp is dereferenced as a vector pointer, so it presumably
+ * has to be 16-byte aligned -- confirm against the callers.
+ */
+int aes_decrypt_altivec(const unsigned char *in, unsigned char *out,
+ const unsigned char *kp, unsigned char key_len)
+{
+ unsigned char i;
+ vector unsigned char pstate;
+ const vector unsigned char *key;
+ unsigned char tmpbuf[16] __attribute__ ((aligned (16)));
+
+ memcpy(tmpbuf, in, sizeof(tmpbuf));
+ pstate = vec_ld(0, tmpbuf);
+
+ key = (const vector unsigned char*) kp;
+
+ /* initial AddRoundKey with the first key slot */
+ pstate = vec_xor(pstate, *key++);
+
+ /* the cases fall through on purpose: longer keys add rounds in front */
+ switch (key_len) {
+ case 32: /* 14 rounds */
+ pstate = InvNormalRound(pstate, *key++);
+ pstate = InvNormalRound(pstate, *key++);
+ /* fall through */
+
+ case 24: /* 12 rounds */
+ pstate = InvNormalRound(pstate, *key++);
+ pstate = InvNormalRound(pstate, *key++);
+ /* fall through */
+
+ case 16: /* 10 rounds */
+ for (i=0; i<9; i++)
+ pstate = InvNormalRound(pstate, *key++);
+
+ break;
+
+ default:
+ BUG();
+ }
+
+ pstate = InvfinalRound(pstate, *key);
+
+ vec_st(pstate, 0, tmpbuf);
+ memcpy(out, tmpbuf, 16);
+ return 0;
+}
+
+/* expand key */
+
+/*
+ * SubWord - apply the AES S-box to each of the four bytes of a word.
+ * @in: word taken from the key schedule
+ *
+ * The word is spread over the first four bytes of a vector, most significant
+ * byte first, run through ByteSub() and reassembled in the same order.
+ *
+ * Returns the substituted word.
+ */
+static unsigned int SubWord(unsigned int in)
+{
+	/* Zeroed so that the 12 unused lanes are never loaded uninitialised. */
+	unsigned char buff[16] __attribute__ ((aligned (16))) = { 0 };
+	vector unsigned char vec_buf;
+
+	buff[0] = in >> 24;
+	buff[1] = (in >> 16) & 0xff;
+	buff[2] = (in >> 8) & 0xff;
+	buff[3] = in & 0xff;
+
+	vec_buf = vec_ld(0, buff);
+	vec_buf = ByteSub(vec_buf);
+	vec_st(vec_buf, 0, buff);
+
+	/*
+	 * Widen before shifting: a byte >= 0x80 promoted to int and shifted
+	 * left by 24 would overflow the signed type.
+	 */
+	return (unsigned int)buff[0] << 24 | (unsigned int)buff[1] << 16 |
+		(unsigned int)buff[2] << 8 | (unsigned int)buff[3];
+}
+
+/* Rotate a word left by one byte: the top byte wraps around to the bottom. */
+static unsigned int RotWord(unsigned int word)
+{
+	const unsigned int wrapped = word >> 24;
+	const unsigned int shifted = word << 8;
+
+	return shifted | wrapped;
+}
+
+/*
+ * expand_key - build the encryption and decryption key schedules.
+ * @key:         user key, @keylen 32-bit words long
+ * @keylen:      key length in 32-bit words: 4, 6 or 8
+ * @exp_enc_key: receives the round keys for encryption (15 * 16 bytes)
+ * @exp_dec_key: receives the round keys for decryption (15 * 16 bytes): the
+ *               encryption round keys in reverse order, with all but the
+ *               first and last one run through InvMixColumn() for the
+ *               "alternative decryption" mode
+ *
+ * Slots beyond the last round key of the selected key size are zero filled.
+ *
+ * Returns 0 on success or -EINVAL if @keylen is not supported.
+ */
+int expand_key(const unsigned char *key, unsigned int keylen,
+		unsigned char exp_enc_key[15 *4*4], unsigned char exp_dec_key[15*4*4])
+{
+	unsigned int tmp, i, rounds;
+	unsigned int expanded_key[15 *4] __attribute__ ((aligned (16)));
+	vector unsigned char expanded_dec_key[15];
+	vector unsigned char mixed_key;
+	vector unsigned char *cur_key;
+
+	switch (keylen) {
+	case 4:
+		rounds = 10;
+		break;
+
+	case 6:
+		rounds = 12;
+		break;
+
+	case 8:
+		rounds = 14;
+		break;
+
+	default:
+		/* wrong key size */
+		return -EINVAL;
+	}
+
+	/*
+	 * Only rounds + 1 key slots are generated below, but the complete
+	 * 15-slot buffers are copied out. Clear them first so that no
+	 * uninitialised stack contents end up in the caller's context.
+	 */
+	memset(expanded_key, 0, sizeof(expanded_key));
+	memset(expanded_dec_key, 0, sizeof(expanded_dec_key));
+
+	memcpy(expanded_key, key, keylen*4);
+
+	/* setup enc key */
+
+	for (i = keylen; i < 4 * (rounds+1); i++) {
+		tmp = expanded_key[i-1];
+
+		if (!(i % keylen)) {
+			tmp = RotWord(tmp);
+			tmp = SubWord(tmp);
+			tmp ^= Rcon[i / keylen ];
+		} else if (keylen > 6 && (i % keylen == 4)) {
+			/* extra substitution step used only for 256-bit keys */
+			tmp = SubWord(tmp);
+		}
+
+		expanded_key[i] = expanded_key[i-keylen] ^ tmp;
+	}
+
+	memcpy(exp_enc_key, expanded_key, sizeof(expanded_key));
+
+	/* setup dec key: the key is turned around and prepared for the
+	 * "alternative decryption" mode
+	 */
+
+	cur_key = (vector unsigned char*) expanded_key;
+
+	/* first and last round key just swap places, without InvMixColumn */
+	memcpy(&expanded_dec_key[rounds], &expanded_key[0], 4*4);
+	memcpy(&expanded_dec_key[0], &expanded_key[(rounds) *4], 4*4);
+
+	/* the round keys in between are reversed and inverse-mixed */
+	cur_key++;
+	for (i = (rounds-1); i > 0; i--) {
+		mixed_key = InvMixColumn(*cur_key++, inv_select_0e_norm, inv_select_0b_norm,
+				inv_select_0d_norm, inv_select_09_norm);
+		expanded_dec_key[i] = mixed_key;
+	}
+
+	memcpy(exp_dec_key, expanded_dec_key, sizeof(expanded_dec_key));
+	return 0;
+}
Index: ps3-linux/crypto/aes-altivec.h
===================================================================
--- /dev/null
+++ ps3-linux/crypto/aes-altivec.h
@@ -0,0 +1,12 @@
+#ifndef __AES_ALTIVEC_H__
+#define __AES_ALTIVEC_H__
+
+/*
+ * Expand @key (@keylen counted in 32-bit words: 4, 6 or 8) into the
+ * encryption and decryption round key buffers.
+ */
+extern int expand_key(const unsigned char *key, unsigned int keylen,
+ unsigned char exp_enc_key[15 *4*4], unsigned char expanded_dec_key[15*4*4]);
+
+/* Encrypt one 16-byte block; @key_len is the user key length in bytes. */
+extern int aes_encrypt_altivec(const unsigned char *in, unsigned char *out,
+ const unsigned char *kp, unsigned int key_len);
+
+/*
+ * Decrypt one 16-byte block; @key_len is the user key length in bytes (note
+ * the unsigned char type, unlike the encrypt prototype).
+ */
+extern int aes_decrypt_altivec(const unsigned char *in, unsigned char *out,
+ const unsigned char *kp, unsigned char key_len);
+#endif
Index: ps3-linux/crypto/Kconfig
===================================================================
--- ps3-linux.orig/crypto/Kconfig
+++ ps3-linux/crypto/Kconfig
@@ -325,6 +325,21 @@ config CRYPTO_AES_X86_64
See <http://csrc.nist.gov/encryption/aes/> for more information.
+config CRYPTO_AES_ALTIVEC
+ tristate "AES with AltiVec support"
+ select CRYPTO_ALGAPI
+ depends on ALTIVEC
+ help
+ AES cipher algorithms (FIPS-197). AES uses the Rijndael
+ algorithm. This implementation has AltiVec support.
+
+config CRYPTO_AES_ALTIVEC_TABLE
+ bool "Use table lookup for decryption"
+ depends on CRYPTO_AES_ALTIVEC
+ help
+ Use precomputed tables for decryption instead of computing
+ "by hand" in GF. This solution is slower.
+
config CRYPTO_CAST5
tristate "CAST5 (CAST-128) cipher algorithm"
select CRYPTO_ALGAPI
Index: ps3-linux/crypto/Makefile
===================================================================
--- ps3-linux.orig/crypto/Makefile
+++ ps3-linux/crypto/Makefile
@@ -48,3 +48,7 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += mich
obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
+
+CFLAGS_aes-altivec.o += -O3 -maltivec -mcpu=cell
+aes_altivec-objs := aes-alti.o aes-altivec.o
+obj-$(CONFIG_CRYPTO_AES_ALTIVEC) += aes_altivec.o
--
^ permalink raw reply [flat|nested] 10+ messages in thread
* [RFC 2/3] PowerPC: lazy altivec enabling in kernel
2007-04-11 16:49 [RFC 0/3] Experiments with AES-AltiVec Sebastian Siewior
2007-04-11 16:49 ` [RFC 1/3] cryptoapi: AES with AltiVec support Sebastian Siewior
@ 2007-04-11 16:49 ` Sebastian Siewior
2007-04-11 16:49 ` [RFC 3/3] cryptoapi: speed test Sebastian Siewior
2 siblings, 0 replies; 10+ messages in thread
From: Sebastian Siewior @ 2007-04-11 16:49 UTC (permalink / raw)
To: linuxppc-dev
This works correctly only on a 64-bit kernel and will break any 32-bit kernel.
Switching on AltiVec takes some time due to the MSR access. The speed-up is
about 50% in my AES code. It might be useful for the RAID module as well.
Signed-off-by: Sebastian Siewior <bigeasy@linux.vnet.ibm.com>
Index: ps3-linux/arch/powerpc/kernel/head_64.S
===================================================================
--- ps3-linux.orig/arch/powerpc/kernel/head_64.S
+++ ps3-linux/arch/powerpc/kernel/head_64.S
@@ -1229,6 +1229,13 @@ altivec_unavailable_common:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
bne .load_up_altivec /* if from user, just load it up */
+ /*
+ * the kernel is going to use AltiVec.
+ * hopefully enable_kernel_altivec() has been called
+ */
+ bl .altivec_enable_for_kernel_exception
+ b .ret_from_except
+
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
bl .save_nvgprs
Index: ps3-linux/arch/powerpc/kernel/misc_64.S
===================================================================
--- ps3-linux.orig/arch/powerpc/kernel/misc_64.S
+++ ps3-linux/arch/powerpc/kernel/misc_64.S
@@ -493,6 +493,8 @@ _GLOBAL(giveup_altivec)
mfmsr r5
oris r5,r5,MSR_VEC@h
mtmsrd r5 /* enable use of VMX now */
+
+giveup_user_altivec_save_vmx:
isync
cmpdi 0,r3,0
beqlr- /* if no previous owner, done */
@@ -516,6 +518,14 @@ _GLOBAL(giveup_altivec)
#endif /* CONFIG_SMP */
blr
+/*
+ * giveup_user_altivec(tsk)
+ * Same as giveup_altivec() but lets the exception handler
+ * enable AltiVec: giveup_altivec() is entered at the
+ * giveup_user_altivec_save_vmx label, i.e. behind the code
+ * that sets MSR_VEC.
+ */
+_GLOBAL(giveup_user_altivec)
+	b	giveup_user_altivec_save_vmx
+
#endif /* CONFIG_ALTIVEC */
_GLOBAL(kernel_execve)
Index: ps3-linux/arch/powerpc/kernel/process.c
===================================================================
--- ps3-linux.orig/arch/powerpc/kernel/process.c
+++ ps3-linux/arch/powerpc/kernel/process.c
@@ -119,15 +119,21 @@ int dump_task_fpu(struct task_struct *ts
#ifdef CONFIG_ALTIVEC
void enable_kernel_altivec(void)
{
- WARN_ON(preemptible());
+ BUG_ON(preemptible());
+ /*
+ * enable_kernel_altivec() will just save the current AltiVec registers (if needed) and
+ * return to the caller (with MSR_VEC unchanged (probably not set)). The first AltiVec
+ * instruction will raise an exception and the exception handler will enable AltiVec for
+ * the kernel. This is done to avoid the expensive "enable altivec" operation if it
+ * is already enabled. However, you have to disable preemption while you are using
+ * AltiVec.
+ */
#ifdef CONFIG_SMP
if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
- giveup_altivec(current);
- else
- giveup_altivec(NULL); /* just enable AltiVec for kernel - force */
+ giveup_user_altivec(current);
#else
- giveup_altivec(last_task_used_altivec);
+ giveup_user_altivec(last_task_used_altivec);
#endif /* CONFIG_SMP */
}
EXPORT_SYMBOL(enable_kernel_altivec);
Index: ps3-linux/arch/powerpc/kernel/traps.c
===================================================================
--- ps3-linux.orig/arch/powerpc/kernel/traps.c
+++ ps3-linux/arch/powerpc/kernel/traps.c
@@ -886,6 +886,12 @@ void altivec_unavailable_exception(struc
die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT);
}
+void altivec_enable_for_kernel_exception(struct pt_regs *regs)
+{
+ printk("altivec_enable_for_kernel_exception: AltiVec mode on for kernel\n"); /* trace message, emitted on every call */
+ regs->msr |= MSR_VEC; /* set MSR_VEC in the saved MSR; presumably re-enables AltiVec for the interrupted kernel code on return - TODO confirm */
+}
+
void performance_monitor_exception(struct pt_regs *regs)
{
perf_irq(regs);
Index: ps3-linux/include/asm-powerpc/system.h
===================================================================
--- ps3-linux.orig/include/asm-powerpc/system.h
+++ ps3-linux/include/asm-powerpc/system.h
@@ -129,6 +129,7 @@ extern void enable_kernel_fp(void);
extern void flush_fp_to_thread(struct task_struct *);
extern void enable_kernel_altivec(void);
extern void giveup_altivec(struct task_struct *);
+extern void giveup_user_altivec(struct task_struct *);
extern void load_up_altivec(struct task_struct *);
extern int emulate_altivec(struct pt_regs *);
extern void giveup_spe(struct task_struct *);
--
^ permalink raw reply [flat|nested] 10+ messages in thread
* [RFC 3/3] cryptoapi: speed test
2007-04-11 16:49 [RFC 0/3] Experiments with AES-AltiVec Sebastian Siewior
2007-04-11 16:49 ` [RFC 1/3] cryptoapi: AES with AltiVec support Sebastian Siewior
2007-04-11 16:49 ` [RFC 2/3] PowerPC: lazy altivec enabling in kernel Sebastian Siewior
@ 2007-04-11 16:49 ` Sebastian Siewior
2 siblings, 0 replies; 10+ messages in thread
From: Sebastian Siewior @ 2007-04-11 16:49 UTC (permalink / raw)
To: linuxppc-dev
This has been used for performance testing in my aes altivec code.
Signed-off-by: Sebastian Siewior <bigeasy@linux.vnet.ibm.com>
Index: ps3-linux/crypto/limi-speed.c
===================================================================
--- /dev/null
+++ ps3-linux/crypto/limi-speed.c
@@ -0,0 +1,138 @@
+/*
+ * Code derived from crypt/tcrypt.h
+ *
+ * Small speed test with time resolution in msec.
+ * Author: Sebastian Siewior (bigeasy _at_ breakpoint.cc)
+ * License: GPL v2
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/jiffies.h>
+#include <linux/types.h>
+
+/* Data buffer: allocated, processed in place and freed again by init(). */
+static char *in;
+
+/*
+ * Benchmark parameters. All of them are exported read-only (0444), so they
+ * can only be chosen at module load time.
+ */
+
+/* Size in bytes of the buffer that is encrypted/decrypted per iteration. */
+static unsigned int buff_size = 16 * 1024;
+module_param(buff_size, uint, 0444);
+MODULE_PARM_DESC(buff_size, "Buffer allocated by kmalloc()");
+
+/* Key length in bytes; init() only accepts 16, 24 or 32. */
+static unsigned int keylen = 16;
+module_param(keylen, uint, 0444);
+MODULE_PARM_DESC(keylen, "Length of the key (16, 24 or 32 bytes)");
+
+/* Direction of the benchmark: 0 encrypts, any other value decrypts. */
+static unsigned int mode = 0;
+module_param(mode, uint, 0444);
+MODULE_PARM_DESC(mode, "0 -> encryption else decryption");
+
+/* Number of timed runs that are averaged. */
+static unsigned int big_loops = 10;
+module_param(big_loops, uint, 0444);
+MODULE_PARM_DESC(big_loops, "Number of mensurations.");
+
+/* Number of cipher calls inside one timed run. */
+static unsigned int small_loops = 10000;
+module_param(small_loops, uint, 0444);
+MODULE_PARM_DESC(small_loops, "loops within one mesurement.");
+
+
+/*
+ * init() - module entry point; runs the cbc(aes) throughput benchmark once.
+ *
+ * Encrypts (mode == 0) or decrypts (mode != 0) a kmalloc()ed buffer of
+ * buff_size bytes in place, small_loops times per timed run, and repeats the
+ * timed run big_loops times. Every run and the average over all runs are
+ * reported through printk() in msec and kb/sec.
+ *
+ * Returns -EINVAL for an unsupported keylen, -ENOMEM if the buffer cannot be
+ * allocated and -ENODEV in every other case, including a completed benchmark
+ * (presumably on purpose, so that the module never stays loaded and can be
+ * loaded again for the next measurement).
+ */
+static int __init init(void)
+{
+	struct scatterlist sg[1];
+	struct crypto_blkcipher *tfm;
+	struct blkcipher_desc desc;
+	unsigned int i;
+	unsigned int j;
+	unsigned int iv_len;
+	unsigned int msec;
+	int ret;
+	unsigned long start;
+	unsigned long total = 0;
+	unsigned long size_kb;
+	unsigned char key[32] = { 1, 2, 3, 4, 5, 6 };
+
+	printk("Limi-speed: buff_size: %u, keylen: %u, mode: %s\n", buff_size, keylen,
+			mode ? "decryption" : "encryption");
+	printk("loops: %u, iterations: %u, ", big_loops, small_loops);
+	/* widen before multiplying so the product is computed in unsigned long */
+	size_kb = (unsigned long)small_loops * buff_size / 1024;
+	printk("=> %lu kb or %lu mb a loop\n", size_kb, size_kb / 1024);
+
+	if (keylen != 16 && keylen != 24 && keylen != 32) {
+		printk("Invalid keysize\n");
+		return -EINVAL;
+	}
+
+	in = kmalloc(buff_size, GFP_KERNEL);
+	if (in == NULL) {
+		printk("Failed to allocate memory.\n");
+		return -ENOMEM;
+	}
+
+	memset(in, 0x24, buff_size);
+	sg_set_buf(sg, in, buff_size);
+
+	tfm = crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm)) {
+		printk("failed to load transform for cbc(aes): %ld\n", PTR_ERR(tfm));
+		goto leave;
+	}
+
+	/* benchmarking with a key that was rejected would measure nothing useful */
+	ret = crypto_blkcipher_setkey(tfm, key, keylen);
+	if (ret) {
+		printk("setkey failed: %d\n", ret);
+		goto free_tfm;
+	}
+
+	/* the first bytes of the data buffer double as the IV */
+	iv_len = crypto_blkcipher_ivsize(tfm);
+	if (iv_len)
+		crypto_blkcipher_set_iv(tfm, in, iv_len);
+
+	desc.tfm = tfm;
+	desc.flags = 0;
+
+	/* enable_kernel_altivec() must be called with preemption disabled */
+	preempt_disable();
+	enable_kernel_altivec();
+
+	for (i = 0; i < big_loops; i++) {
+		start = jiffies;
+		ret = 0;
+
+		for (j = 0; j < small_loops && !ret; j++) {
+			if (!mode)
+				ret = crypto_blkcipher_encrypt(&desc, sg, sg, buff_size);
+			else
+				ret = crypto_blkcipher_decrypt(&desc, sg, sg, buff_size);
+		}
+
+		if (ret) {
+			printk("%s failed: %d after (i,j) (%u,%u) iterations\n",
+					mode ? "decryption" : "encryption", ret, i, j);
+			goto leave_loop;
+		}
+
+		/* unsigned subtraction stays correct across a jiffies wrap */
+		msec = jiffies_to_msecs(jiffies - start);
+		printk("Run: %u msec\n", msec);
+		total += msec;
+	}
+
+	/* no runs, or an average below one msec, would divide by zero below */
+	if (!big_loops || total < big_loops) {
+		printk("Runs too short to compute an average.\n");
+		goto leave_loop;
+	}
+
+	total /= big_loops;
+	size_kb = size_kb * 1000 / total;
+	printk("Average: %lu msec, approx. %lu kb/sec || %lu mb/sec \n", total,
+			size_kb, size_kb / 1024);
+
+leave_loop:
+	preempt_enable();
+free_tfm:
+	crypto_free_blkcipher(tfm);
+leave:
+	kfree(in);
+	in = NULL;
+	return -ENODEV;
+}
+
+static void __exit fini(void) { } /* nothing to undo: init() releases everything it allocated before returning */
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
Index: ps3-linux/crypto/Kconfig
===================================================================
--- ps3-linux.orig/crypto/Kconfig
+++ ps3-linux/crypto/Kconfig
@@ -462,6 +462,12 @@ config CRYPTO_TEST
help
Quick & dirty crypto test module.
+config CRYPTO_LIMI_SPEED
+ tristate "Crypto algorithm speed test with msec resolution"
+ help
+ insmod/modprobe the module, and watch dmesg for results.
+ Test is for aes only.
+
source "drivers/crypto/Kconfig"
endif # if CRYPTO
Index: ps3-linux/crypto/Makefile
===================================================================
--- ps3-linux.orig/crypto/Makefile
+++ ps3-linux/crypto/Makefile
@@ -52,3 +52,4 @@ obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
CFLAGS_aes-altivec.o += -O3 -maltivec -mcpu=cell
aes_altivec-objs := aes-alti.o aes-altivec.o
obj-$(CONFIG_CRYPTO_AES_ALTIVEC) += aes_altivec.o
+obj-$(CONFIG_CRYPTO_LIMI_SPEED) += limi-speed.o
--
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC 1/3] cryptoapi: AES with AltiVec support
2007-04-11 16:49 ` [RFC 1/3] cryptoapi: AES with AltiVec support Sebastian Siewior
@ 2007-04-11 18:24 ` Arnd Bergmann
2007-04-12 13:40 ` Sebastian Siewior
2007-04-11 22:22 ` Benjamin Herrenschmidt
1 sibling, 1 reply; 10+ messages in thread
From: Arnd Bergmann @ 2007-04-11 18:24 UTC (permalink / raw)
To: linuxppc-dev
Just a few things I noticed in this version:
On Wednesday 11 April 2007, Sebastian Siewior wrote:
> +#ifdef CONFIG_CRYPTO_AES_ALTIVEC_TABLE
> +
> + /* 19 operations, 1x 8 memory loads */
> + imm_04h = vec_splat_u8(0x04);
> + imm_0fh = vec_splat_u8(0x0f);
> +
> + state_high = vec_sr(state, imm_04h);
> + state_low = vec_and(state, imm_0fh);
Why do you need to and with 0x0f here? I thought vec_perm simply ignored
the high bits anyway.
> + mul_08_hi = vec_perm(gf_mul_8_high, gf_mul_8_high, state_high);
> + mul_0a_hi = vec_perm(gf_mul_a_high, gf_mul_a_high, state_high);
> + mul_0c_hi = vec_perm(gf_mul_c_high, gf_mul_c_high, state_high);
> + mul_0e_hi = vec_perm(gf_mul_e_high, gf_mul_e_high, state_high);
> +
> + mul_08_lo = vec_perm(gf_mul_8_low, gf_mul_8_low, state_low);
> + mul_0a_lo = vec_perm(gf_mul_a_low, gf_mul_a_low, state_low);
> + mul_0c_lo = vec_perm(gf_mul_c_low, gf_mul_c_low, state_low);
> + mul_0e_lo = vec_perm(gf_mul_e_low, gf_mul_e_low, state_low);
> +
> + mul_08 = vec_xor(mul_08_hi, mul_08_lo);
> + mul_0a = vec_xor(mul_0a_hi, mul_0a_lo);
> + mul_0c = vec_xor(mul_0c_hi, mul_0c_lo);
> + mul_0e = vec_xor(mul_0e_hi, mul_0e_lo);
> +
> + mul_09 = vec_xor(mul_08, state);
> + mul_0b = vec_xor(mul_0a, state);
> + mul_0d = vec_xor(mul_0c, state);
What are the last three xor used for? I'd think you can have the values
for 0x9, 0xb and 0xd in the table directly.
> +int aes_decrypt_altivec(const unsigned char *in, unsigned char *out,
> + const unsigned char *kp, unsigned char key_len)
> +{
> + unsigned char i;
> + vector unsigned char pstate;
> + const vector unsigned char *key;
> + unsigned char tmpbuf[16] __attribute__ ((aligned (16)));
My understanding is that gcc will not align the latter on the stack
nor does it warn about the fact that the attribute gets ignored.
Have you checked that the variable is really put into a 16 byte
aligned stack slot? Does it even make a difference?
> + memcpy(tmpbuf, in, sizeof(tmpbuf));
> + pstate = vec_ld(0, tmpbuf);
> +
> + key = (const vector unsigned char*) kp;
> +
> + pstate = vec_xor(pstate, *key++);
> +
> + switch (key_len) {
> + case 32: /* 14 rounds */
> + pstate = InvNormalRound(pstate, *key++);
> + pstate = InvNormalRound(pstate, *key++);
> +
> + case 24: /* 12 rounds */
> + pstate = InvNormalRound(pstate, *key++);
> + pstate = InvNormalRound(pstate, *key++);
> +
> + case 16: /* 10 rounds */
> + for (i=0; i<9; i++)
> + pstate = InvNormalRound(pstate, *key++);
> +
> + break;
> +
> + default:
> + BUG();
> + }
Did this manual partial unrolling actually make a difference compared
to this?
for (i=0; i<(5 + key_len / 8); i++)
pstate = InvNormalRound(pstate, *key++);
If they are the same speed, there should probably be no unrolling,
because the larger object code will be bad for the instruction
cache.
> ===================================================================
> --- ps3-linux.orig/crypto/Makefile
> +++ ps3-linux/crypto/Makefile
> @@ -48,3 +48,7 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += mich
> obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
>
> obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
> +
> +CFLAGS_aes-altivec.o += -O3 -maltivec -mcpu=cell
mcpu=cell probably breaks on most compilers. This needs some
experiments, but most systems should either leave this out
or specify the cpu they are actually compiling for.
Some time ago, I did a patch to extend the CPU selection in
Kconfig so you can choose sensible mcpu= and mtune= flags
semi-automatically.
Arnd <><
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC 1/3] cryptoapi: AES with AltiVec support
2007-04-11 16:49 ` [RFC 1/3] cryptoapi: AES with AltiVec support Sebastian Siewior
2007-04-11 18:24 ` Arnd Bergmann
@ 2007-04-11 22:22 ` Benjamin Herrenschmidt
2007-04-12 7:45 ` Sebastian Siewior
1 sibling, 1 reply; 10+ messages in thread
From: Benjamin Herrenschmidt @ 2007-04-11 22:22 UTC (permalink / raw)
To: Sebastian Siewior; +Cc: linuxppc-dev
> ==================================================================
> --- ps3-linux.orig/crypto/Makefile
> +++ ps3-linux/crypto/Makefile
> @@ -48,3 +48,7 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += mich
> obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
>
> obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
> +
> +CFLAGS_aes-altivec.o += -O3 -maltivec -mcpu=cell
> +aes_altivec-objs := aes-alti.o aes-altivec.o
> +obj-$(CONFIG_CRYPTO_AES_ALTIVEC) += aes_altivec.o
Ideally (and I know the RAID6 code isn't doing it), the
code that contains enable_kernel_altivec/disable_kernel_altivec should
-not- itself be compiled with -maltivec. You don't want the compiler to
"inadvertently" generate altivec instructions outside of those calls (in
the function prolog for example).
I noticed quite a bit of memcpy's around too... see if you can limit
usage of these.
Ben.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC 1/3] cryptoapi: AES with AltiVec support
2007-04-11 22:22 ` Benjamin Herrenschmidt
@ 2007-04-12 7:45 ` Sebastian Siewior
2007-04-12 8:39 ` Benjamin Herrenschmidt
0 siblings, 1 reply; 10+ messages in thread
From: Sebastian Siewior @ 2007-04-12 7:45 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
Benjamin Herrenschmidt wrote:
>> ==================================================================
>> --- ps3-linux.orig/crypto/Makefile
>> +++ ps3-linux/crypto/Makefile
>> @@ -48,3 +48,7 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += mich
>> obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
>>
>> obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
>> +
>> +CFLAGS_aes-altivec.o += -O3 -maltivec -mcpu=cell
>> +aes_altivec-objs := aes-alti.o aes-altivec.o
>> +obj-$(CONFIG_CRYPTO_AES_ALTIVEC) += aes_altivec.o
>
> Ideally (and I know the RAID6 code isn't doing it), the
> code that contains enable_kernel_altivec/disable_kernel_altivec should
> -not- itself be compiled with -maltivec. You don't want the compiler to
> "inadvertently" generate altivec instructions outside of those calls (in
> the function prolog for example).
Yes. aes-altivec.o contains the -maltivec flag and is the only module
using AltiVec. aes-alti.o uses enable_kernel_altivec &
disable_kernel_altivec and then calls the AltiVec module.
The raid6 code compiles the raid[1,2,4,8].c files with -maltivec flag
and uses the noinline keyword in the function that uses AltiVec.
Should I prefer the latter or were you confused with the - _ in the
filename and you recommend renaming them?
> I noticed quite a bit of memcpy's around too... see if you can limit
> usage of these.
I will check whether I can be sure that the data is always properly aligned.
>
> Ben.
>
Sebastian
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC 1/3] cryptoapi: AES with AltiVec support
2007-04-12 7:45 ` Sebastian Siewior
@ 2007-04-12 8:39 ` Benjamin Herrenschmidt
0 siblings, 0 replies; 10+ messages in thread
From: Benjamin Herrenschmidt @ 2007-04-12 8:39 UTC (permalink / raw)
To: Sebastian Siewior; +Cc: linuxppc-dev
> Yes. aes-altivec.o contains the -maltivec flag and is the only module
> using AltiVec. aes-alti.o uses enable_kernel_altivec &
> disable_kernel_altivec and then calls the AltiVec module.
> The raid6 code compiles the raid[1,2,4,8].c files with -maltivec flag
> and uses the noinline keyword in the function that uses AltiVec.
> Should I prefer the latter or were you confused with the - _ in the
> filename and you recommend renaming them?
Nah, your stuff looks fine, I got confused.
Ben.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC 1/3] cryptoapi: AES with AltiVec support
2007-04-11 18:24 ` Arnd Bergmann
@ 2007-04-12 13:40 ` Sebastian Siewior
0 siblings, 0 replies; 10+ messages in thread
From: Sebastian Siewior @ 2007-04-12 13:40 UTC (permalink / raw)
To: Arnd Bergmann; +Cc: linuxppc-dev
Arnd Bergmann wrote:
> Just a few things I noticed in this version:
>
>> + imm_04h = vec_splat_u8(0x04);
>> + imm_0fh = vec_splat_u8(0x0f);
>> +
>> + state_high = vec_sr(state, imm_04h);
>> + state_low = vec_and(state, imm_0fh);
>
> Why do you need to and with 0x0f here? I thought vec_perm simply ignored
> the high bits anyway.
Yes it does, my bad
>> + mul_09 = vec_xor(mul_08, state);
>> + mul_0b = vec_xor(mul_0a, state);
>> + mul_0d = vec_xor(mul_0c, state);
>
> What are the last three xor used for? I'd think you can have the values
> for 0x9, 0xb and 0xd in the table directly.
Yes. With the recomputed table the performance seems to be unchanged.
>> +int aes_decrypt_altivec(const unsigned char *in, unsigned char *out,
>> + const unsigned char *kp, unsigned char key_len)
>> +{
>> + unsigned char i;
>> + vector unsigned char pstate;
>> + const vector unsigned char *key;
>> + unsigned char tmpbuf[16] __attribute__ ((aligned (16)));
>
> My understanding is that gcc will not align the latter on the stack
> nor does it warn about the fact that the attribute gets ignored.
> Have you checked that the variable is really put into a 16 byte
> aligned stack slot? Does it even make a difference?
Quote from the PPC64 ABI [1]:
r1: The stack pointer (stored in r1) shall maintain quadword alignment.
It shall always point to the lowest allocated valid stack frame, and
grow toward low addresses. The contents of the word at that address
always point to the previously allocated stack frame.
...
PPC32 ABI[2] page 32 contains the same sentence about r1. The compiler
does not behave correctly on >16 byte alignment and should warn you.
>> + switch (key_len) {
>> + case 32: /* 14 rounds */
>> + pstate = InvNormalRound(pstate, *key++);
>> + pstate = InvNormalRound(pstate, *key++);
>> +
>> + case 24: /* 12 rounds */
>> + pstate = InvNormalRound(pstate, *key++);
>> + pstate = InvNormalRound(pstate, *key++);
>> +
>> + case 16: /* 10 rounds */
>> + for (i=0; i<9; i++)
>> + pstate = InvNormalRound(pstate, *key++);
>> +
>> + break;
>> +
>> + default:
>> + BUG();
>> + }
>
> Did this manual partial unrolling actually make a difference compared
> to this?
>
> for (i=0; i<(5 + key_len / 8); i++)
> pstate = InvNormalRound(pstate, *key++);
The numbers for encryption change from 33mb/sec to 23mb/sec and for
decryption from 29mb/sec to 26mb/sec. The table decryption, however,
improves from 19mb/sec to 23mb/sec.
> If they are the same speed, there should probably be no unrolling,
> because the larger object code will be bad for the instruction
> cache.
I don't force the compiler to unroll the code; there is no inline keyword there.
ppu-gcc from SDK 2.1 unrolls the code, gentoo's gcc 4.1.2 doesn't.
>> +CFLAGS_aes-altivec.o += -O3 -maltivec -mcpu=cell
>
> mcpu=cell probably breaks on most compilers. This needs some
> experiments, but most systems should either leave this out
> or specify the cpu they are actually compiling for.
>
> Some time ago, I did a patch to extend the CPU selection in
> Kconfig so you can choose sensible mcpu= and mtune= flags
> semi-automatically.
I have no cpu selection in my current Kconfig. Will take a look at that
patch.
>
> Arnd <><
Sebastian
[1] ftp://ftp.linuxppc64.org/pub/people/sjmunroe/PPC64-VMXabi.txt
[2]http://www.cloudcaptech.com/MPC555%20Resources/Programming%20Environment/SVR4abippc.pdf
^ permalink raw reply [flat|nested] 10+ messages in thread
* [RFC 3/3] cryptoapi: speed test
2007-04-17 11:52 [RFC 0/3] Experiments with AES-AltiVec, part 2 Sebastian Siewior
@ 2007-04-17 11:52 ` Sebastian Siewior
0 siblings, 0 replies; 10+ messages in thread
From: Sebastian Siewior @ 2007-04-17 11:52 UTC (permalink / raw)
To: linuxppc-dev
This has been used for performance testing of my aes altivec code.
Signed-off-by: Sebastian Siewior <bigeasy@linux.vnet.ibm.com>
Index: linux/crypto/limi-speed.c
===================================================================
--- /dev/null
+++ linux/crypto/limi-speed.c
@@ -0,0 +1,140 @@
+/*
+ * Code derived from crypt/tcrypt.h
+ *
+ * Small speed test with time resolution in msec.
+ * Author: Sebastian Siewior (bigeasy _at_ breakpoint.cc)
+ * License: GPL v2
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/jiffies.h>
+#include <linux/types.h>
+
+/* Data buffer: allocated, processed in place and freed again by init(). */
+static char *in;
+
+/*
+ * Benchmark parameters. All of them are exported read-only (0444), so they
+ * can only be chosen at module load time.
+ */
+
+/* Size in bytes of the buffer that is encrypted/decrypted per iteration. */
+static unsigned int buff_size = 16 * 1024;
+module_param(buff_size, uint, 0444);
+MODULE_PARM_DESC(buff_size, "Buffer allocated by kmalloc()");
+
+/* Key length in bytes; init() only accepts 16, 24 or 32. */
+static unsigned int keylen = 16;
+module_param(keylen, uint, 0444);
+MODULE_PARM_DESC(keylen, "Length of the key (16, 24 or 32 bytes)");
+
+/* Direction of the benchmark: 0 encrypts, any other value decrypts. */
+static unsigned int mode = 0;
+module_param(mode, uint, 0444);
+MODULE_PARM_DESC(mode, "0 -> encryption else decryption");
+
+/* Number of timed runs that are averaged. */
+static unsigned int big_loops = 10;
+module_param(big_loops, uint, 0444);
+MODULE_PARM_DESC(big_loops, "Number of mensurations.");
+
+/* Number of cipher calls inside one timed run. */
+static unsigned int small_loops = 10000;
+module_param(small_loops, uint, 0444);
+MODULE_PARM_DESC(small_loops, "loops within one mesurement.");
+
+/* Cipher mode under test: 0 selects ecb(aes), any other value cbc(aes). */
+static unsigned int alg = 1;
+module_param(alg, uint, 0444);
+MODULE_PARM_DESC(alg, "0 -> ecb(aes), else -> cbc(aes)");
+
+/*
+ * init() - module entry point; runs the AES throughput benchmark once.
+ *
+ * Uses ecb(aes) when alg == 0 and cbc(aes) otherwise. Encrypts (mode == 0)
+ * or decrypts (mode != 0) a kmalloc()ed buffer of buff_size bytes in place,
+ * small_loops times per timed run, and repeats the timed run big_loops
+ * times. Every run and the average over all runs are reported through
+ * printk() in msec and kb/sec.
+ *
+ * Returns -EINVAL for an unsupported keylen, -ENOMEM if the buffer cannot be
+ * allocated and -ENODEV in every other case, including a completed benchmark
+ * (presumably on purpose, so that the module never stays loaded and can be
+ * loaded again for the next measurement).
+ */
+static int __init init(void)
+{
+	struct scatterlist sg[1];
+	struct crypto_blkcipher *tfm;
+	struct blkcipher_desc desc;
+	unsigned int i;
+	unsigned int j;
+	unsigned int iv_len;
+	unsigned int msec;
+	int ret;
+	unsigned long start;
+	unsigned long total = 0;
+	unsigned long size_kb;
+	unsigned char key[32] = { 1, 2, 3, 4, 5, 6 };
+	const char *algname;
+
+	algname = alg ? "cbc(aes)" : "ecb(aes)";
+	printk("Limi-speed: %s buff_size: %u, keylen: %u, mode: %s\n", algname, buff_size, keylen,
+			mode ? "decryption" : "encryption");
+	printk("loops: %u, iterations: %u, ", big_loops, small_loops);
+	/* widen before multiplying so the product is computed in unsigned long */
+	size_kb = (unsigned long)small_loops * buff_size / 1024;
+	printk("=> %lu kb or %lu mb a loop\n", size_kb, size_kb / 1024);
+
+	if (keylen != 16 && keylen != 24 && keylen != 32) {
+		printk("Invalid keysize\n");
+		return -EINVAL;
+	}
+
+	in = kmalloc(buff_size, GFP_KERNEL);
+	if (in == NULL) {
+		printk("Failed to allocate memory.\n");
+		return -ENOMEM;
+	}
+
+	memset(in, 0x24, buff_size);
+	sg_set_buf(sg, in, buff_size);
+
+	tfm = crypto_alloc_blkcipher(algname, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm)) {
+		printk("failed to load transform for %s: %ld\n", algname, PTR_ERR(tfm));
+		goto leave;
+	}
+
+	/* benchmarking with a key that was rejected would measure nothing useful */
+	ret = crypto_blkcipher_setkey(tfm, key, keylen);
+	if (ret) {
+		printk("setkey failed: %d\n", ret);
+		goto leave_loop;
+	}
+
+	/* the first bytes of the data buffer double as the IV (if the mode has one) */
+	iv_len = crypto_blkcipher_ivsize(tfm);
+	if (iv_len)
+		crypto_blkcipher_set_iv(tfm, in, iv_len);
+
+	desc.tfm = tfm;
+	desc.flags = 0;
+
+	for (i = 0; i < big_loops; i++) {
+		start = jiffies;
+		ret = 0;
+
+		for (j = 0; j < small_loops && !ret; j++) {
+			if (!mode)
+				ret = crypto_blkcipher_encrypt(&desc, sg, sg, buff_size);
+			else
+				ret = crypto_blkcipher_decrypt(&desc, sg, sg, buff_size);
+		}
+
+		if (ret) {
+			printk("%s failed: %d after (i,j) (%u,%u) iterations\n",
+					mode ? "decryption" : "encryption", ret, i, j);
+			goto leave_loop;
+		}
+
+		/* unsigned subtraction stays correct across a jiffies wrap */
+		msec = jiffies_to_msecs(jiffies - start);
+		printk("Run: %u msec\n", msec);
+		total += msec;
+	}
+
+	/* no runs, or an average below one msec, would divide by zero below */
+	if (!big_loops || total < big_loops) {
+		printk("Runs too short to compute an average.\n");
+		goto leave_loop;
+	}
+
+	total /= big_loops;
+	size_kb = size_kb * 1000 / total;
+	printk("Average: %lu msec, approx. %lu kb/sec || %lu mb/sec \n", total,
+			size_kb, size_kb / 1024);
+
+leave_loop:
+	crypto_free_blkcipher(tfm);
+leave:
+	kfree(in);
+	in = NULL;
+	return -ENODEV;
+}
+
+static void __exit fini(void) { } /* nothing to undo: init() releases everything it allocated before returning */
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
Index: linux/crypto/Kconfig
===================================================================
--- linux.orig/crypto/Kconfig
+++ linux/crypto/Kconfig
@@ -462,6 +462,12 @@ config CRYPTO_TEST
help
Quick & dirty crypto test module.
+config CRYPTO_LIMI_SPEED
+ tristate "Crypto algorithm speed test with msec resolution"
+ help
+ insmod/modprobe the module, and watch dmesg for results.
+ Test is for aes only, see modinfo for options
+
source "drivers/crypto/Kconfig"
endif # if CRYPTO
Index: linux/crypto/Makefile
===================================================================
--- linux.orig/crypto/Makefile
+++ linux/crypto/Makefile
@@ -52,3 +52,4 @@ obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
CFLAGS_aes-altivec.o += -O3 -maltivec
aes_altivec-objs := aes-alti.o aes-altivec.o
obj-$(CONFIG_CRYPTO_AES_ALTIVEC) += aes_altivec.o
+obj-$(CONFIG_CRYPTO_LIMI_SPEED) += limi-speed.o
--
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2007-04-17 12:24 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-04-11 16:49 [RFC 0/3] Experiments with AES-AltiVec Sebastian Siewior
2007-04-11 16:49 ` [RFC 1/3] cryptoapi: AES with AltiVec support Sebastian Siewior
2007-04-11 18:24 ` Arnd Bergmann
2007-04-12 13:40 ` Sebastian Siewior
2007-04-11 22:22 ` Benjamin Herrenschmidt
2007-04-12 7:45 ` Sebastian Siewior
2007-04-12 8:39 ` Benjamin Herrenschmidt
2007-04-11 16:49 ` [RFC 2/3] PowerPC: lazy altivec enabling in kernel Sebastian Siewior
2007-04-11 16:49 ` [RFC 3/3] cryptoapi: speed test Sebastian Siewior
-- strict thread matches above, loose matches on Subject: below --
2007-04-17 11:52 [RFC 0/3] Experiments with AES-AltiVec, part 2 Sebastian Siewior
2007-04-17 11:52 ` [RFC 3/3] cryptoapi: speed test Sebastian Siewior
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).