* [PATCH 1/2] crpyto: support encryt and decrypt parallelly using thread pool
2024-11-28 10:51 [PATCH 0/2] support block encryption/decryption in parallel tugy
@ 2024-11-28 10:51 ` tugy
2024-11-28 10:51 ` [PATCH 2/2] qapi/crypto: support enable encryption/decryption in parallel tugy
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: tugy @ 2024-11-28 10:51 UTC (permalink / raw)
To: eblake, armbru, kwolf, hreitz, qemu-block; +Cc: qemu-devel, tugy
From: Guoyi Tu <tugy@chinatelecom.cn>
Currently, disk I/O encryption and decryption operations are performed sequentially
in the main thread or IOthread. When the number of I/O requests increases,
this becomes a performance bottleneck.
To address this issue, this patch using the thread pool to perform I/O encryption
and decryption in parallel to improving overall efficiency.
Signed-off-by: Guoyi Tu <tugy@chinatelecom.cn>
---
block/crypto.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 97 insertions(+), 6 deletions(-)
diff --git a/block/crypto.c b/block/crypto.c
index 80b2dba17a..c085f331ce 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -22,6 +22,7 @@
#include "block/block_int.h"
#include "block/qdict.h"
+#include "block/thread-pool.h"
#include "sysemu/block-backend.h"
#include "crypto/block.h"
#include "qapi/opts-visitor.h"
@@ -40,6 +41,7 @@ struct BlockCrypto {
QCryptoBlock *block;
bool updating_keys;
BdrvChild *header; /* Reference to the detached LUKS header */
+ bool encrypt_in_parallel;
};
@@ -460,6 +462,94 @@ static int block_crypto_reopen_prepare(BDRVReopenState *state,
return 0;
}
+
+typedef struct CryptoAIOData {
+ QCryptoBlock *block;
+ uint64_t offset;
+ uint8_t *buf;
+ size_t len;
+ bool encrypt;
+ Error **errp;
+} CryptoAIOData;
+
+
+static int handle_aiocb_encdec(void *opaque)
+{
+ CryptoAIOData *aiocb = opaque;
+
+ if (aiocb->encrypt) {
+ if (qcrypto_block_encrypt(aiocb->block, aiocb->offset,
+ aiocb->buf, aiocb->len, aiocb->errp) < 0) {
+ return -EIO;
+ }
+ } else {
+ if (qcrypto_block_decrypt(aiocb->block, aiocb->offset,
+ aiocb->buf, aiocb->len, aiocb->errp) < 0) {
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+
+static int coroutine_fn block_crypto_submit_co(BlockDriverState *bs, uint64_t offset,
+ uint8_t *buf, size_t len, bool encrypt,
+ Error **errp)
+{
+ BlockCrypto *crypto = bs->opaque;
+ CryptoAIOData acb;
+
+ acb = (CryptoAIOData) {
+ .block = crypto->block,
+ .offset = offset,
+ .buf = buf,
+ .len = len,
+ .encrypt = encrypt,
+ .errp = errp,
+ };
+ return thread_pool_submit_co(handle_aiocb_encdec, &acb);
+}
+
+
+static int coroutine_fn GRAPH_RDLOCK
+block_crypto_encrypt(BlockDriverState *bs, uint64_t offset,
+ uint8_t *buf, size_t len, Error **errp)
+{
+ BlockCrypto *crypto = bs->opaque;
+ int ret = 0;
+
+ if (crypto->encrypt_in_parallel) {
+ ret = block_crypto_submit_co(bs, offset, buf, len, true, errp);
+ } else {
+ if (qcrypto_block_encrypt(crypto->block, offset, buf, len, errp) < 0) {
+ ret = -EIO;
+ }
+ }
+
+ return ret;
+}
+
+
+static int coroutine_fn GRAPH_RDLOCK
+block_crypto_decrypt(BlockDriverState *bs, uint64_t offset,
+ uint8_t *buf, size_t len, Error **errp)
+{
+ BlockCrypto *crypto = bs->opaque;
+ int ret = 0;
+
+ if (crypto->encrypt_in_parallel) {
+ ret = block_crypto_submit_co(bs, offset, buf, len, false, errp);
+ } else {
+ if (qcrypto_block_decrypt(crypto->block, offset, buf, len, errp) < 0) {
+ ret = -EIO;
+ }
+ }
+
+ return ret;
+}
+
+
/*
* 1 MB bounce buffer gives good performance / memory tradeoff
* when using cache=none|directsync.
@@ -508,9 +598,10 @@ block_crypto_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
goto cleanup;
}
- if (qcrypto_block_decrypt(crypto->block, offset + bytes_done,
- cipher_data, cur_bytes, NULL) < 0) {
- ret = -EIO;
+ ret = block_crypto_decrypt(bs, offset + bytes_done,
+ cipher_data, cur_bytes, NULL);
+
+ if (ret < 0) {
goto cleanup;
}
@@ -565,9 +656,9 @@ block_crypto_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
qemu_iovec_to_buf(qiov, bytes_done, cipher_data, cur_bytes);
- if (qcrypto_block_encrypt(crypto->block, offset + bytes_done,
- cipher_data, cur_bytes, NULL) < 0) {
- ret = -EIO;
+ ret = block_crypto_encrypt(bs, offset + bytes_done,
+ cipher_data, cur_bytes, NULL);
+ if (ret < 0) {
goto cleanup;
}
--
2.17.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH 2/2] qapi/crypto: support enable encryption/decryption in parallel
2024-11-28 10:51 [PATCH 0/2] support block encryption/decryption in parallel tugy
2024-11-28 10:51 ` [PATCH 1/2] crpyto: support encryt and decrypt parallelly using thread pool tugy
@ 2024-11-28 10:51 ` tugy
2025-01-17 13:04 ` Daniel P. Berrangé
2024-12-13 8:26 ` [PATCH 0/2] support block " Guoyi Tu
2024-12-13 15:56 ` Daniel P. Berrangé
3 siblings, 1 reply; 9+ messages in thread
From: tugy @ 2024-11-28 10:51 UTC (permalink / raw)
To: eblake, armbru, kwolf, hreitz, qemu-block; +Cc: qemu-devel, tugy
From: Guoyi Tu <tugy@chinatelecom.cn>
add encrypt-in-parallel option to enable encryption/decryption
in parallel
Signed-off-by: Guoyi Tu <tugy@chinatelecom.cn>
---
block/crypto.c | 8 ++++++++
block/crypto.h | 9 +++++++++
qapi/block-core.json | 6 +++++-
qapi/crypto.json | 6 +++++-
4 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/block/crypto.c b/block/crypto.c
index c085f331ce..b02400fb26 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -212,6 +212,7 @@ static QemuOptsList block_crypto_runtime_opts_luks = {
.head = QTAILQ_HEAD_INITIALIZER(block_crypto_runtime_opts_luks.head),
.desc = {
BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(""),
+ BLOCK_CRYPTO_OPT_DEF_LUKS_ENCRYPT_IN_PARALLEL(""),
{ /* end of list */ }
},
};
@@ -347,6 +348,13 @@ static int block_crypto_open_generic(QCryptoBlockFormat format,
}
cryptoopts = qemu_opts_to_qdict(opts, NULL);
+
+ if (!g_strcmp0(qdict_get_try_str(cryptoopts,
+ BLOCK_CRYPTO_OPT_LUKS_ENCRYPT_IN_PARALLEL), "on") ||
+ qdict_get_try_bool(cryptoopts,
+ BLOCK_CRYPTO_OPT_LUKS_ENCRYPT_IN_PARALLEL, false)) {
+ crypto->encrypt_in_parallel = true;
+ }
qdict_put_str(cryptoopts, "format", QCryptoBlockFormat_str(format));
open_opts = block_crypto_open_opts_init(cryptoopts, errp);
diff --git a/block/crypto.h b/block/crypto.h
index dc3d2d5ed9..6729420941 100644
--- a/block/crypto.h
+++ b/block/crypto.h
@@ -46,6 +46,7 @@
#define BLOCK_CRYPTO_OPT_LUKS_STATE "state"
#define BLOCK_CRYPTO_OPT_LUKS_OLD_SECRET "old-secret"
#define BLOCK_CRYPTO_OPT_LUKS_NEW_SECRET "new-secret"
+#define BLOCK_CRYPTO_OPT_LUKS_ENCRYPT_IN_PARALLEL "encrypt-in-parallel"
#define BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(prefix) \
@@ -130,6 +131,14 @@
"Empty string to erase", \
}
+#define BLOCK_CRYPTO_OPT_DEF_LUKS_ENCRYPT_IN_PARALLEL(prefix) \
+ { \
+ .name = prefix BLOCK_CRYPTO_OPT_LUKS_ENCRYPT_IN_PARALLEL, \
+ .type = QEMU_OPT_BOOL, \
+ .help = "perform encryption and decryption through " \
+ "thread pool", \
+ }
+
QCryptoBlockCreateOptions *
block_crypto_create_opts_init(QDict *opts, Error **errp);
diff --git a/qapi/block-core.json b/qapi/block-core.json
index fd3bcc1c17..1e47b6ea80 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3365,12 +3365,16 @@
#
# @header: block device holding a detached LUKS header. (since 9.0)
#
+# @encrypt-in-parallel: perform encryption and decryption through
+# thread pool
+#
# Since: 2.9
##
{ 'struct': 'BlockdevOptionsLUKS',
'base': 'BlockdevOptionsGenericFormat',
'data': { '*key-secret': 'str',
- '*header': 'BlockdevRef'} }
+ '*header': 'BlockdevRef',
+ '*encrypt-in-parallel': 'bool'} }
##
# @BlockdevOptionsGenericCOWFormat:
diff --git a/qapi/crypto.json b/qapi/crypto.json
index c9d967d782..91963c693f 100644
--- a/qapi/crypto.json
+++ b/qapi/crypto.json
@@ -192,10 +192,14 @@
# decryption key. Mandatory except when probing image for
# metadata only.
#
+# @encrypt-in-parallel: perform encryption and decryption through
+# thread pool
+#
# Since: 2.6
##
{ 'struct': 'QCryptoBlockOptionsLUKS',
- 'data': { '*key-secret': 'str' }}
+ 'data': { '*key-secret': 'str',
+ '*encrypt-in-parallel': 'bool' }}
##
# @QCryptoBlockCreateOptionsLUKS:
--
2.17.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH 2/2] qapi/crypto: support enable encryption/decryption in parallel
2024-11-28 10:51 ` [PATCH 2/2] qapi/crypto: support enable encryption/decryption in parallel tugy
@ 2025-01-17 13:04 ` Daniel P. Berrangé
0 siblings, 0 replies; 9+ messages in thread
From: Daniel P. Berrangé @ 2025-01-17 13:04 UTC (permalink / raw)
To: tugy; +Cc: eblake, armbru, kwolf, hreitz, qemu-block, qemu-devel
On Thu, Nov 28, 2024 at 06:51:22PM +0800, tugy@chinatelecom.cn wrote:
> From: Guoyi Tu <tugy@chinatelecom.cn>
>
> add encrypt-in-parallel option to enable encryption/decryption
> in parallel
>
> Signed-off-by: Guoyi Tu <tugy@chinatelecom.cn>
> ---
> block/crypto.c | 8 ++++++++
> block/crypto.h | 9 +++++++++
> qapi/block-core.json | 6 +++++-
> qapi/crypto.json | 6 +++++-
> 4 files changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/block/crypto.c b/block/crypto.c
> index c085f331ce..b02400fb26 100644
> --- a/block/crypto.c
> +++ b/block/crypto.c
> @@ -212,6 +212,7 @@ static QemuOptsList block_crypto_runtime_opts_luks = {
> .head = QTAILQ_HEAD_INITIALIZER(block_crypto_runtime_opts_luks.head),
> .desc = {
> BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(""),
> + BLOCK_CRYPTO_OPT_DEF_LUKS_ENCRYPT_IN_PARALLEL(""),
> { /* end of list */ }
> },
> };
> @@ -347,6 +348,13 @@ static int block_crypto_open_generic(QCryptoBlockFormat format,
> }
>
> cryptoopts = qemu_opts_to_qdict(opts, NULL);
> +
> + if (!g_strcmp0(qdict_get_try_str(cryptoopts,
> + BLOCK_CRYPTO_OPT_LUKS_ENCRYPT_IN_PARALLEL), "on") ||
> + qdict_get_try_bool(cryptoopts,
> + BLOCK_CRYPTO_OPT_LUKS_ENCRYPT_IN_PARALLEL, false)) {
> + crypto->encrypt_in_parallel = true;
> + }
> qdict_put_str(cryptoopts, "format", QCryptoBlockFormat_str(format));
>
> open_opts = block_crypto_open_opts_init(cryptoopts, errp);
> diff --git a/block/crypto.h b/block/crypto.h
> index dc3d2d5ed9..6729420941 100644
> --- a/block/crypto.h
> +++ b/block/crypto.h
> @@ -46,6 +46,7 @@
> #define BLOCK_CRYPTO_OPT_LUKS_STATE "state"
> #define BLOCK_CRYPTO_OPT_LUKS_OLD_SECRET "old-secret"
> #define BLOCK_CRYPTO_OPT_LUKS_NEW_SECRET "new-secret"
> +#define BLOCK_CRYPTO_OPT_LUKS_ENCRYPT_IN_PARALLEL "encrypt-in-parallel"
>
>
> #define BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(prefix) \
> @@ -130,6 +131,14 @@
> "Empty string to erase", \
> }
>
> +#define BLOCK_CRYPTO_OPT_DEF_LUKS_ENCRYPT_IN_PARALLEL(prefix) \
> + { \
> + .name = prefix BLOCK_CRYPTO_OPT_LUKS_ENCRYPT_IN_PARALLEL, \
> + .type = QEMU_OPT_BOOL, \
> + .help = "perform encryption and decryption through " \
> + "thread pool", \
> + }
> +
> QCryptoBlockCreateOptions *
> block_crypto_create_opts_init(QDict *opts, Error **errp);
>
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index fd3bcc1c17..1e47b6ea80 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -3365,12 +3365,16 @@
> #
> # @header: block device holding a detached LUKS header. (since 9.0)
> #
> +# @encrypt-in-parallel: perform encryption and decryption through
> +# thread pool
> +#
> # Since: 2.9
> ##
> { 'struct': 'BlockdevOptionsLUKS',
> 'base': 'BlockdevOptionsGenericFormat',
> 'data': { '*key-secret': 'str',
> - '*header': 'BlockdevRef'} }
> + '*header': 'BlockdevRef',
> + '*encrypt-in-parallel': 'bool'} }
>
> ##
> # @BlockdevOptionsGenericCOWFormat:
> diff --git a/qapi/crypto.json b/qapi/crypto.json
> index c9d967d782..91963c693f 100644
> --- a/qapi/crypto.json
> +++ b/qapi/crypto.json
> @@ -192,10 +192,14 @@
> # decryption key. Mandatory except when probing image for
> # metadata only.
> #
> +# @encrypt-in-parallel: perform encryption and decryption through
> +# thread pool
> +#
> # Since: 2.6
> ##
> { 'struct': 'QCryptoBlockOptionsLUKS',
> - 'data': { '*key-secret': 'str' }}
> + 'data': { '*key-secret': 'str',
> + '*encrypt-in-parallel': 'bool' }}
Perhaps better named as 'use-thread-pool': 'bool', not least because
this applies to decrypt too, not just encrypt .
NB, both the qapi changes need a "(since 10.0.0)" annotation too
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 0/2] support block encryption/decryption in parallel
2024-11-28 10:51 [PATCH 0/2] support block encryption/decryption in parallel tugy
2024-11-28 10:51 ` [PATCH 1/2] crpyto: support encryt and decrypt parallelly using thread pool tugy
2024-11-28 10:51 ` [PATCH 2/2] qapi/crypto: support enable encryption/decryption in parallel tugy
@ 2024-12-13 8:26 ` Guoyi Tu
2024-12-13 15:56 ` Daniel P. Berrangé
3 siblings, 0 replies; 9+ messages in thread
From: Guoyi Tu @ 2024-12-13 8:26 UTC (permalink / raw)
To: eblake, armbru, kwolf, hreitz, qemu-block; +Cc: tugy, qemu-devel
Hi Kevin and Hanna, could you share your thoughts on this patch?
I’d greatly appreciate your feedback
--
Guoyi
On 2024/11/28 18:51, tugy@chinatelecom.cn wrote:
> From: Guoyi Tu <tugy@chinatelecom.cn>
>
> Currently, disk I/O encryption and decryption operations are performed sequentially
> in the main thread or IOthread. When the number of I/O requests increases,
> this becomes a performance bottleneck.
>
> To address this issue, this patch use thread pool to perform I/O encryption
> and decryption in parallel, improving overall efficiency.
>
> Test results show that enabling the thread pool for encryption and decryption
> significantly improve the performance of virtual machine storage devices.
>
>
> Test Case1: Disk read/write performance using fio in a virtual machine
>
> Virtual Machine: 8c16g, with a disk backing by a LUKS storage device and
> Ceph as storage backend.
> Test Method:
> fio -direct=1 -iodepth=32 -rw=xx -ioengine=libaio -bs=4k -size=10G -numjobs=x \
> -runtime=1000 -group_reporting -filename=/dev/vdb -name=xxx
>
> Runing the VM on the Intel Xeon 5218 server, The test results are as follows:
>
> | | Serial encryption | Thread pool encryption|
> | | and decryption | and decryption |
> | fio |-----------|---------|-----------|---------|
> | | BW(MiB/s) | IOPS(K) | BW(MiB/s) | IOPS(K) |
> |------------------------|-----------|---------|-----------|---------|
> | rw=read numjobs=2 | 499 | 128 | 605 | 155 |
> | rw=read numjobs=4 | 529 | 136 | 632 | 162 |
> | rw=write numjobs=2 | 493 | 126 | 617 | 158 |
> | rw=write numjobs=4 | 534 | 137 | 743 | 190 |
>
>
> Runing the VM on the HiSilicon Kunpeng-920 server, The test results are as follows:
>
> | | Serial encryption | Thread pool encryption|
> | | and decryption | and decryption |
> | fio |-----------|---------|-----------|---------|
> | | BW(MiB/s) | IOPS(K) | BW(MiB/s) | IOPS(K) |
> |------------------------|-----------|---------|-----------|---------|
> | rw=read numjobs=2 | 73.2 | 18.8 | 128 | 39.2 |
> | rw=read numjobs=4 | 77.9 | 19.9 | 246 | 62.9 |
> | rw=write numjobs=2 | 78 | 19 | 140 | 35.8 |
> | rw=write numjobs=4 | 78 | 20.2 | 270 | 69.1 |
>
>
> Test Case 2:
> In addition, performance comparisons were also conducted on the HiSilicon Kunpeng-920
> server, testing the conversion of a qcow2 image to a LUKS image using qemu-img convert.
> The results show that using thread pool to encryption and decryption all significantly
> improve the performance.
>
> Test Method: Create a 40GB qcow2 image and fill it with data, then convert it to a LUKS
> image using qemu-img
>
> * Serial encryption and decryption:
> time qemu-img convert -p -m 16 -W --image-opts file.filename=/home/tgy/data.qcow2 \
> --object secret,id=sec,data=password -n \
> --target-image-opts driver=luks,key-secret=sec,file.filename=/home/tgy/data.luks
>
> real 7m53.681s
> user 7m52.595s
> sys 0m11.248s
>
>
> * Thread pool encryption and decryption:
> time qemu-img convert -p -m 16 -W --image-opts file.filename=/home/tgy/data.qcow2 \
> --object secret,id=sec,data=password -n --target-image-opts \
> driver=luks,key-secret=sec,encrypt-in-parallel=on,file.filename=/home/tgy/data.luks
>
> real 1m43.101s
> user 10m30.239s
> sys 13m13.758s
>
> Guoyi Tu (2):
> crpyto: support encryt and decrypt parallelly using thread pool
> qapi/crypto: support enable encryption/decryption in parallel
>
> block/crypto.c | 111 ++++++++++++++++++++++++++++++++++++++++---
> block/crypto.h | 9 ++++
> qapi/block-core.json | 6 ++-
> qapi/crypto.json | 6 ++-
> 4 files changed, 124 insertions(+), 8 deletions(-)
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 0/2] support block encryption/decryption in parallel
2024-11-28 10:51 [PATCH 0/2] support block encryption/decryption in parallel tugy
` (2 preceding siblings ...)
2024-12-13 8:26 ` [PATCH 0/2] support block " Guoyi Tu
@ 2024-12-13 15:56 ` Daniel P. Berrangé
2025-01-16 12:37 ` Kevin Wolf
3 siblings, 1 reply; 9+ messages in thread
From: Daniel P. Berrangé @ 2024-12-13 15:56 UTC (permalink / raw)
To: tugy; +Cc: eblake, armbru, kwolf, hreitz, qemu-block, qemu-devel
On Thu, Nov 28, 2024 at 06:51:20PM +0800, tugy@chinatelecom.cn wrote:
> From: Guoyi Tu <tugy@chinatelecom.cn>
>
> Currently, disk I/O encryption and decryption operations are performed sequentially
> in the main thread or IOthread. When the number of I/O requests increases,
> this becomes a performance bottleneck.
>
> To address this issue, this patch use thread pool to perform I/O encryption
> and decryption in parallel, improving overall efficiency.
We already have support for parallel encryption through use of IO threads
since approximately this commit:
commit af206c284e4c1b17cdfb0f17e898b288c0fc1751
Author: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon May 27 11:58:50 2024 -0400
block/crypto: create ciphers on demand
Ciphers are pre-allocated by qcrypto_block_init_cipher() depending on
the given number of threads. The -device
virtio-blk-pci,iothread-vq-mapping= feature allows users to assign
multiple IOThreads to a virtio-blk device, but the association between
the virtio-blk device and the block driver happens after the block
driver is already open.
When the number of threads given to qcrypto_block_init_cipher() is
smaller than the actual number of threads at runtime, the
block->n_free_ciphers > 0 assertion in qcrypto_block_pop_cipher() can
fail.
Get rid of qcrypto_block_init_cipher() n_thread's argument and allocate
ciphers on demand.
Say we have QEMU pinned to 4 host CPUs, and we've setup 4 IO threads
for the disk, then encryption can max out 4 host CPUs worth of resource.
How is this new proposed way to use a thread pool going to do better
than that in an apples-to-apples comparison ? ie allow same number
of host CPUs for both. The fundamental limit is still the AES performance
of the host CPU(s) that you allow QEMU to execute work on. If the thread
pool is allowed to use 4 host CPUs, it shouldn't be significantly different
from allowing use of 4 host CPUs for I/O threads surely ?
Having multiple different ways to support parallel encryption is not
ideal. If there's something I/O threads can't do optimally right
now, is it practical to make them work better ?
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH 0/2] support block encryption/decryption in parallel
2024-12-13 15:56 ` Daniel P. Berrangé
@ 2025-01-16 12:37 ` Kevin Wolf
2025-01-17 12:55 ` Daniel P. Berrangé
0 siblings, 1 reply; 9+ messages in thread
From: Kevin Wolf @ 2025-01-16 12:37 UTC (permalink / raw)
To: Daniel P. Berrangé
Cc: tugy, eblake, armbru, hreitz, qemu-block, qemu-devel
Am 13.12.2024 um 16:56 hat Daniel P. Berrangé geschrieben:
> On Thu, Nov 28, 2024 at 06:51:20PM +0800, tugy@chinatelecom.cn wrote:
> > From: Guoyi Tu <tugy@chinatelecom.cn>
> >
> > Currently, disk I/O encryption and decryption operations are performed sequentially
> > in the main thread or IOthread. When the number of I/O requests increases,
> > this becomes a performance bottleneck.
> >
> > To address this issue, this patch use thread pool to perform I/O encryption
> > and decryption in parallel, improving overall efficiency.
>
> We already have support for parallel encryption through use of IO threads
> since approximately this commit:
>
> commit af206c284e4c1b17cdfb0f17e898b288c0fc1751
> Author: Stefan Hajnoczi <stefanha@redhat.com>
> Date: Mon May 27 11:58:50 2024 -0400
>
> block/crypto: create ciphers on demand
>
> Ciphers are pre-allocated by qcrypto_block_init_cipher() depending on
> the given number of threads. The -device
> virtio-blk-pci,iothread-vq-mapping= feature allows users to assign
> multiple IOThreads to a virtio-blk device, but the association between
> the virtio-blk device and the block driver happens after the block
> driver is already open.
>
> When the number of threads given to qcrypto_block_init_cipher() is
> smaller than the actual number of threads at runtime, the
> block->n_free_ciphers > 0 assertion in qcrypto_block_pop_cipher() can
> fail.
>
> Get rid of qcrypto_block_init_cipher() n_thread's argument and allocate
> ciphers on demand.
>
>
> Say we have QEMU pinned to 4 host CPUs, and we've setup 4 IO threads
> for the disk, then encryption can max out 4 host CPUs worth of resource.
This is a lot of "if"s. Even just that it requires explicit
configuration and doesn't work out of the box would be a strong point
for me why having something that works by default (like a thread pool)
is worth it.
You're assuming that it's even possible to setup 4 iothreads which share
the load evenly. That's not a given at all. The only device that can
even make use of more than one iothread is virtio-blk. (And if we're
looking at all devices that exist in QEMU, most devices can't even make
use of a single iothread!) But if you do have a virtio-blk device, then
that setup means one iothread per queue. In a Linux guest, if all I/O
comes from a single CPU, then it will use the same queue and we'll have
three idle iothreads and one that is overloaded.
So in order to achieve a similar effect with iothreads, you must be
using virtio-blk, you must explicitly configure four iothreads and four
mappings of virtqueues to iothreads, and you must run a workload in the
guest that performs I/O from four different threads running on different
CPUs.
There are certainly good use cases for iothreads, but with this number
of preconditions, I don't think we can assume that they make it
unnecessary to parallelise things in other ways, too.
> How is this new proposed way to use a thread pool going to do better
> than that in an apples-to-apples comparison ? ie allow same number
> of host CPUs for both. The fundamental limit is still the AES performance
> of the host CPU(s) that you allow QEMU to execute work on. If the thread
> pool is allowed to use 4 host CPUs, it shouldn't be significantly different
> from allowing use of 4 host CPUs for I/O threads surely ?
>
> Having multiple different ways to support parallel encryption is not
> ideal. If there's something I/O threads can't do optimally right
> now, is it practical to make them work better ?
The limitations inside the guest obviously can't be changed by QEMU.
We could in theory add iothread support to more devices, though if they
don't have a concept of multiple queues that could be processed by
different threads, it's pretty pointless (apart from working around
limitations in the backends like you're suggesting here).
And of course, the most interesting one would be solving the
out-of-the-box aspect. This is far from trivial, because the optimal
configuration really depends on your workload, and nothing on the host
can know automatically what will eventually run in the guest. So it will
be tough finding defaults that improve this case without also hurting
other cases.
Kevin
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 0/2] support block encryption/decryption in parallel
2025-01-16 12:37 ` Kevin Wolf
@ 2025-01-17 12:55 ` Daniel P. Berrangé
2025-01-18 14:39 ` Guoyi Tu
0 siblings, 1 reply; 9+ messages in thread
From: Daniel P. Berrangé @ 2025-01-17 12:55 UTC (permalink / raw)
To: Kevin Wolf; +Cc: tugy, eblake, armbru, hreitz, qemu-block, qemu-devel
On Thu, Jan 16, 2025 at 01:37:44PM +0100, Kevin Wolf wrote:
> Am 13.12.2024 um 16:56 hat Daniel P. Berrangé geschrieben:
> > On Thu, Nov 28, 2024 at 06:51:20PM +0800, tugy@chinatelecom.cn wrote:
> > > From: Guoyi Tu <tugy@chinatelecom.cn>
> > >
> > > Currently, disk I/O encryption and decryption operations are performed sequentially
> > > in the main thread or IOthread. When the number of I/O requests increases,
> > > this becomes a performance bottleneck.
> > >
> > > To address this issue, this patch use thread pool to perform I/O encryption
> > > and decryption in parallel, improving overall efficiency.
> >
> > We already have support for parallel encryption through use of IO threads
> > since approximately this commit:
> >
> > commit af206c284e4c1b17cdfb0f17e898b288c0fc1751
> > Author: Stefan Hajnoczi <stefanha@redhat.com>
> > Date: Mon May 27 11:58:50 2024 -0400
> >
> > block/crypto: create ciphers on demand
> >
> > Ciphers are pre-allocated by qcrypto_block_init_cipher() depending on
> > the given number of threads. The -device
> > virtio-blk-pci,iothread-vq-mapping= feature allows users to assign
> > multiple IOThreads to a virtio-blk device, but the association between
> > the virtio-blk device and the block driver happens after the block
> > driver is already open.
> >
> > When the number of threads given to qcrypto_block_init_cipher() is
> > smaller than the actual number of threads at runtime, the
> > block->n_free_ciphers > 0 assertion in qcrypto_block_pop_cipher() can
> > fail.
> >
> > Get rid of qcrypto_block_init_cipher() n_thread's argument and allocate
> > ciphers on demand.
> >
> >
> > Say we have QEMU pinned to 4 host CPUs, and we've setup 4 IO threads
> > for the disk, then encryption can max out 4 host CPUs worth of resource.
>
> This is a lot of "if"s. Even just that it requires explicit
> configuration and doesn't work out of the box would be a strong point
> for me why having something that works by default (like a thread pool)
> is worth it.
>
> You're assuming that it's even possible to setup 4 iothreads which share
> the load evenly. That's not a given at all. The only device that can
> even make use of more than one iothread is virtio-blk. (And if we're
> looking at all devices that exist in QEMU, most devices can't even make
> use of a single iothread!) But if you do have a virtio-blk device, then
> that setup means one iothread per queue. In a Linux guest, if all I/O
> comes from a single CPU, then it will use the same queue and we'll have
> three idle iothreads and one that is overloaded.
>
> So in order to achieve a similar effect with iothreads, you must be
> using virtio-blk, you must explicitly configure four iothreads and four
> mappings of virtqueues to iothreads, and you must run a workload in the
> guest that performs I/O from four different threads running on different
> CPUs.
>
> There are certainly good use cases for iothreads, but with this number
> of preconditions, I don't think we can assume that they make it
> unnecessary to parallelise things in other ways, too.
Ok thanks for the background, that all makes sense as a justification.
Lets capture a summary of this in the commit message.
> > How is this new proposed way to use a thread pool going to do better
> > than that in an apples-to-apples comparison ? ie allow same number
> > of host CPUs for both. The fundamental limit is still the AES performance
> > of the host CPU(s) that you allow QEMU to execute work on. If the thread
> > pool is allowed to use 4 host CPUs, it shouldn't be significantly different
> > from allowing use of 4 host CPUs for I/O threads surely ?
> >
> > Having multiple different ways to support parallel encryption is not
> > ideal. If there's something I/O threads can't do optimally right
> > now, is it practical to make them work better ?
>
> The limitations inside the guest obviously can't be changed by QEMU.
>
> We could in theory add iothread support to more devices, though if they
> don't have a concept of multiple queues that could be processed by
> different threads, it's pretty pointless (apart from working around
> limitations in the backends like you're suggesting here).
>
> And of course, the most interesting one would be solving the
> out-of-the-box aspect. This is far from trivial, because the optimal
> configuration really depends on your workload, and nothing on the host
> can know automatically what will eventually run in the guest. So it will
> be tough finding defaults that improve this case without also hurting
> other cases.
Yes, understood, I was missing the impact of the guest usage model.
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 0/2] support block encryption/decryption in parallel
2025-01-17 12:55 ` Daniel P. Berrangé
@ 2025-01-18 14:39 ` Guoyi Tu
0 siblings, 0 replies; 9+ messages in thread
From: Guoyi Tu @ 2025-01-18 14:39 UTC (permalink / raw)
To: Daniel P. Berrangé, Kevin Wolf
Cc: tugy, eblake, armbru, hreitz, qemu-block, qemu-devel
Thank you for the additional background and suggestions.
I will resend the second version.
On 2025/1/17 20:55, Daniel 【外部账号】P. Berrangé wrote:
> On Thu, Jan 16, 2025 at 01:37:44PM +0100, Kevin Wolf wrote:
>> Am 13.12.2024 um 16:56 hat Daniel P. Berrangé geschrieben:
>>> On Thu, Nov 28, 2024 at 06:51:20PM +0800, tugy@chinatelecom.cn wrote:
>>>> From: Guoyi Tu <tugy@chinatelecom.cn>
>>>>
>>>> Currently, disk I/O encryption and decryption operations are performed sequentially
>>>> in the main thread or IOthread. When the number of I/O requests increases,
>>>> this becomes a performance bottleneck.
>>>>
>>>> To address this issue, this patch use thread pool to perform I/O encryption
>>>> and decryption in parallel, improving overall efficiency.
>>>
>>> We already have support for parallel encryption through use of IO threads
>>> since approximately this commit:
>>>
>>> commit af206c284e4c1b17cdfb0f17e898b288c0fc1751
>>> Author: Stefan Hajnoczi <stefanha@redhat.com>
>>> Date: Mon May 27 11:58:50 2024 -0400
>>>
>>> block/crypto: create ciphers on demand
>>>
>>> Ciphers are pre-allocated by qcrypto_block_init_cipher() depending on
>>> the given number of threads. The -device
>>> virtio-blk-pci,iothread-vq-mapping= feature allows users to assign
>>> multiple IOThreads to a virtio-blk device, but the association between
>>> the virtio-blk device and the block driver happens after the block
>>> driver is already open.
>>>
>>> When the number of threads given to qcrypto_block_init_cipher() is
>>> smaller than the actual number of threads at runtime, the
>>> block->n_free_ciphers > 0 assertion in qcrypto_block_pop_cipher() can
>>> fail.
>>>
>>> Get rid of qcrypto_block_init_cipher() n_thread's argument and allocate
>>> ciphers on demand.
>>>
>>>
>>> Say we have QEMU pinned to 4 host CPUs, and we've setup 4 IO threads
>>> for the disk, then encryption can max out 4 host CPUs worth of resource.
>>
>> This is a lot of "if"s. Even just that it requires explicit
>> configuration and doesn't work out of the box would be a strong point
>> for me why having something that works by default (like a thread pool)
>> is worth it.
>>
>> You're assuming that it's even possible to setup 4 iothreads which share
>> the load evenly. That's not a given at all. The only device that can
>> even make use of more than one iothread is virtio-blk. (And if we're
>> looking at all devices that exist in QEMU, most devices can't even make
>> use of a single iothread!) But if you do have a virtio-blk device, then
>> that setup means one iothread per queue. In a Linux guest, if all I/O
>> comes from a single CPU, then it will use the same queue and we'll have
>> three idle iothreads and one that is overloaded.
>>
>> So in order to achieve a similar effect with iothreads, you must be
>> using virtio-blk, you must explicitly configure four iothreads and four
>> mappings of virtqueues to iothreads, and you must run a workload in the
>> guest that performs I/O from four different threads running on different
>> CPUs.
>>
>> There are certainly good use cases for iothreads, but with this number
>> of preconditions, I don't think we can assume that they make it
>> unnecessary to parallelise things in other ways, too.
>
> Ok thanks for the background, that all makes sense as a justification.
> Lets capture a summary of this in the commit message.
>
>>> How is this new proposed way to use a thread pool going to do better
>>> than that in an apples-to-apples comparison ? ie allow same number
>>> of host CPUs for both. The fundamental limit is still the AES performance
>>> of the host CPU(s) that you allow QEMU to execute work on. If the thread
>>> pool is allowed to use 4 host CPUs, it shouldn't be significantly different
>>> from allowing use of 4 host CPUs for I/O threads surely ?
>>>
>>> Having multiple different ways to support parallel encryption is not
>>> ideal. If there's something I/O threads can't do optimally right
>>> now, is it practical to make them work better ?
>>
>> The limitations inside the guest obviously can't be changed by QEMU.
>>
>> We could in theory add iothread support to more devices, though if they
>> don't have a concept of multiple queues that could be processed by
>> different threads, it's pretty pointless (apart from working around
>> limitations in the backends like you're suggesting here).
>>
>> And of course, the most interesting one would be solving the
>> out-of-the-box aspect. This is far from trivial, because the optimal
>> configuration really depends on your workload, and nothing on the host
>> can know automatically what will eventually run in the guest. So it will
>> be tough finding defaults that improve this case without also hurting
>> other cases.
>
> Yes, understood, I was missing the impact of the guest usage model.
>
> With regards,
> Daniel
^ permalink raw reply [flat|nested] 9+ messages in thread