From: Lukasz Stelmach <l.stelmach@samsung.com>
To: Eric Biggers <ebiggers@kernel.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>,
"David S. Miller" <davem@davemloft.net>,
linux-crypto@vger.kernel.org
Subject: Re: xor_blocks() assumptions
Date: Tue, 03 Jan 2023 12:13:30 +0100 [thread overview]
Message-ID: <dleftjbknfoopx.fsf%l.stelmach@samsung.com> (raw)
In-Reply-To: <Y7NizHFsWfMW/cC2@sol.localdomain> (Eric Biggers's message of "Mon, 2 Jan 2023 15:03:40 -0800")
[-- Attachment #1: Type: text/plain, Size: 6725 bytes --]
It was <2023-01-02 pon 15:03>, when Eric Biggers wrote:
> On Mon, Jan 02, 2023 at 11:44:35PM +0100, Lukasz Stelmach wrote:
>> I am researching possibility to use xor_blocks() in crypto_xor() and
>> crypto_xor_cpy(). What I've found already is that different architecture
>> dependent xor functions work on different blocks between 16 and 512
>> (Intel AVX) bytes long. There is a hint in the comment for
>> async_xor_offs() that src_cnt (as passed to do_sync_xor_offs()) counts
>> pages. Thus, it is assumed, that the smallest chunk xor_blocks() gets is
>> a single page. Am I right?
>>
>> Do you think adding block_len field to struct xor_block_template (and
>> maybe some information about required alignment) and using it to call
>> do_2 from crypto_xor() may work? I am thinking especially about disk
>> encryption where sectors of 512~4096 are handled.
>>
>
> Taking a step back, it sounds like you think the word-at-a-time XOR in
> crypto_xor() is not performant enough, so you want to use a SIMD (e.g. NEON,
> SSE, or AVX) implementation instead.
Yes.
> Have you tested that this would actually give a benefit on the input
> sizes in question,
--8<---------------cut here---------------start------------->8---
[ 0.938006] xor: measuring software checksum speed
[ 0.947383] crypto : 1052 MB/sec
[ 0.953299] arm4regs : 1689 MB/sec
[ 0.960674] 8regs : 1352 MB/sec
[ 0.968033] 32regs : 1352 MB/sec
[ 0.972078] neon : 2448 MB/sec
--8<---------------cut here---------------end--------------->8---
(Linux 6.2.0-rc1 running on Odroid XU3 board with Arm Cortex-A15)
The patch below copies, adapts and plugs in __crypto_xor() as
xor_block_crypto.do_2. You can see its results labeled "crypto" above.
Disk encryption is comparable to RAID5 checksumming so the results above
should be adequate.
> especially considering that SIMD can only be used in the kernel if
> kernel_fpu_begin() is executed first?
That depends on architecture. As far as I can tell this applies to Intel
only.
> It also would be worth considering just optimizing crypto_xor() by
> unrolling the word-at-a-time loop to 4x or so.
If I understand correctly the generic 8regs and 32regs implementations
in include/asm-generic/xor.h are what you mean. Using xor_blocks() in
crypto_xor() could enable them for free on architectures lacking SIMD or
vector instructions.
---
diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index 934b549905f5..ffb67b3cbcfc 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -142,6 +142,7 @@ static struct xor_block_template xor_block_arm4regs = {
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
+ xor_speed(&xor_block_crypto); \
xor_speed(&xor_block_arm4regs); \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h
index 44509d48fca2..a04f27607c65 100644
--- a/include/asm-generic/xor.h
+++ b/include/asm-generic/xor.h
@@ -6,6 +6,85 @@
*/
#include <linux/prefetch.h>
+#include <asm-generic/unaligned.h>
+
+// A.K.A. __crypto_xor()
+static void
+xor_crypto_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
+{
+ u8 *dst = (u8*)p1;
+ u8 *src1 = (u8*)p1;
+ u8 *src2 = (u8*)p2;
+ unsigned int len = bytes;
+ int relalign = 0;
+
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+ int size = sizeof(unsigned long);
+ int d = (((unsigned long)dst ^ (unsigned long)src1) |
+ ((unsigned long)dst ^ (unsigned long)src2)) &
+ (size - 1);
+
+ relalign = d ? 1 << __ffs(d) : size;
+
+ /*
+ * If we care about alignment, process as many bytes as
+ * needed to advance dst and src to values whose alignments
+ * equal their relative alignment. This will allow us to
+ * process the remainder of the input using optimal strides.
+ */
+ while (((unsigned long)dst & (relalign - 1)) && len > 0) {
+ *dst++ = *src1++ ^ *src2++;
+ len--;
+ }
+ }
+
+ while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) {
+ if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+ u64 l = get_unaligned((u64 *)src1) ^
+ get_unaligned((u64 *)src2);
+ put_unaligned(l, (u64 *)dst);
+ } else {
+ *(u64 *)dst = *(u64 *)src1 ^ *(u64 *)src2;
+ }
+ dst += 8;
+ src1 += 8;
+ src2 += 8;
+ len -= 8;
+ }
+
+ while (len >= 4 && !(relalign & 3)) {
+ if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+ u32 l = get_unaligned((u32 *)src1) ^
+ get_unaligned((u32 *)src2);
+ put_unaligned(l, (u32 *)dst);
+ } else {
+ *(u32 *)dst = *(u32 *)src1 ^ *(u32 *)src2;
+ }
+ dst += 4;
+ src1 += 4;
+ src2 += 4;
+ len -= 4;
+ }
+
+ while (len >= 2 && !(relalign & 1)) {
+ if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+ u16 l = get_unaligned((u16 *)src1) ^
+ get_unaligned((u16 *)src2);
+ put_unaligned(l, (u16 *)dst);
+ } else {
+ *(u16 *)dst = *(u16 *)src1 ^ *(u16 *)src2;
+ }
+ dst += 2;
+ src1 += 2;
+ src2 += 2;
+ len -= 2;
+ }
+
+ while (len--)
+ *dst++ = *src1++ ^ *src2++;
+}
+
static void
xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1,
@@ -697,6 +776,14 @@ xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
goto once_more;
}
+static struct xor_block_template xor_block_crypto = {
+ .name = "crypto",
+ .do_2 = xor_crypto_2,
+ .do_3 = NULL,
+ .do_4 = NULL,
+ .do_5 = NULL,
+};
+
static struct xor_block_template xor_block_8regs = {
.name = "8regs",
.do_2 = xor_8regs_2,
--
Łukasz Stelmach
Samsung R&D Institute Poland
Samsung Electronics
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 487 bytes --]
next prev parent reply other threads:[~2023-01-03 11:14 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CGME20230102224447eucas1p1dad1a2362030eee0d3890dd3546a1532@eucas1p1.samsung.com>
2023-01-02 22:44 ` xor_blocks() assumptions Lukasz Stelmach
2023-01-02 23:03 ` Eric Biggers
2023-01-03 11:13 ` Lukasz Stelmach [this message]
2023-01-03 14:01 ` Ard Biesheuvel
2023-01-04 7:46 ` Eric Biggers
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=dleftjbknfoopx.fsf%l.stelmach@samsung.com \
--to=l.stelmach@samsung.com \
--cc=davem@davemloft.net \
--cc=ebiggers@kernel.org \
--cc=herbert@gondor.apana.org.au \
--cc=linux-crypto@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox