[PATCH v3] migration/rdma: add x-rdma-chunk-size parameter

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
@ 2026-03-27  6:50 Samuel Zhang
  2026-03-27  9:45 ` Markus Armbruster
  2026-03-30 16:10 ` Peter Xu
  0 siblings, 2 replies; 14+ messages in thread
From: Samuel Zhang @ 2026-03-27  6:50 UTC (permalink / raw)
  To: qemu-devel
  Cc: peterx, farosas, lizhijian, eblake, armbru, Emily.Deng,
	Victor.Zhao, PengJu.Zhou, Qing.Ma, Samuel Zhang

The default 1MB RDMA chunk size causes slow live migration because
each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.

Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
faster migration.
Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`

Performance with RDMA live migration of 8GB RAM VM:

| x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
|-----------------------|----------|-------------------|
| 1M (default)          | 37.915   |  1,007            |
| 32M                   | 17.880   |  2,260            |
| 1024M                 |  4.368   | 17,529            |

Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
---
v2:
- Renamed x-rdma-chunk-shift to x-rdma-chunk-size (byte count)
- Added validation in migrate_params_check()
- Added hmp_migrate_set_parameter() support
- Added hmp_info_migrate_parameters() support
- Added migrate_mark_all_params_present()
- Use qemu_strtosz() for size suffix support
v3:
- Use visit_type_size() in HMP set parameter
- Use MiB/GiB constants

 migration/migration-hmp-cmds.c | 11 +++++++++++
 migration/options.c            | 33 ++++++++++++++++++++++++++++++++-
 migration/options.h            |  1 +
 migration/rdma.c               | 30 ++++++++++++++++--------------
 qapi/migration.json            | 12 ++++++++++--
 5 files changed, 70 insertions(+), 17 deletions(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 0a193b8f54..4f6c1dbf89 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -451,6 +451,13 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
                            params->direct_io ? "on" : "off");
         }
 
+        if (params->has_x_rdma_chunk_size) {
+            monitor_printf(mon, "%s: %" PRIu64 " bytes\n",
+                           MigrationParameter_str(
+                               MIGRATION_PARAMETER_X_RDMA_CHUNK_SIZE),
+                           params->x_rdma_chunk_size);
+        }
+
         assert(params->has_cpr_exec_command);
         monitor_print_cpr_exec_command(mon, params->cpr_exec_command);
     }
@@ -734,6 +741,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
         p->has_direct_io = true;
         visit_type_bool(v, param, &p->direct_io, &err);
         break;
+    case MIGRATION_PARAMETER_X_RDMA_CHUNK_SIZE:
+        p->has_x_rdma_chunk_size = true;
+        visit_type_size(v, param, &p->x_rdma_chunk_size, &err);
+        break;
     case MIGRATION_PARAMETER_CPR_EXEC_COMMAND: {
         /*
          * NOTE: g_autofree will only auto g_free() the strv array when
diff --git a/migration/options.c b/migration/options.c
index f33b297929..bc61c8665d 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -13,6 +13,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/error-report.h"
+#include "qemu/units.h"
 #include "exec/target_page.h"
 #include "qapi/clone-visitor.h"
 #include "qapi/error.h"
@@ -90,6 +91,7 @@ const PropertyInfo qdev_prop_StrOrNull;
 
 #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD     1000    /* milliseconds */
 #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT            1       /* MB/s */
+#define DEFAULT_MIGRATE_X_RDMA_CHUNK_SIZE           MiB     /* 1MB */
 
 const Property migration_properties[] = {
     DEFINE_PROP_BOOL("store-global-state", MigrationState,
@@ -183,6 +185,9 @@ const Property migration_properties[] = {
     DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState,
                        parameters.zero_page_detection,
                        ZERO_PAGE_DETECTION_MULTIFD),
+    DEFINE_PROP_UINT64("x-rdma-chunk-size", MigrationState,
+                      parameters.x_rdma_chunk_size,
+                      DEFAULT_MIGRATE_X_RDMA_CHUNK_SIZE),
 
     /* Migration capabilities */
     DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
@@ -993,6 +998,15 @@ ZeroPageDetection migrate_zero_page_detection(void)
     return s->parameters.zero_page_detection;
 }
 
+uint64_t migrate_rdma_chunk_size(void)
+{
+    MigrationState *s = migrate_get_current();
+    uint64_t size = s->parameters.x_rdma_chunk_size;
+
+    assert(MiB <= size && size <= GiB && is_power_of_2(size));
+    return size;
+}
+
 /* parameters helpers */
 
 AnnounceParameters *migrate_announce_params(void)
@@ -1055,7 +1069,7 @@ static void migrate_mark_all_params_present(MigrationParameters *p)
         &p->has_announce_step, &p->has_block_bitmap_mapping,
         &p->has_x_vcpu_dirty_limit_period, &p->has_vcpu_dirty_limit,
         &p->has_mode, &p->has_zero_page_detection, &p->has_direct_io,
-        &p->has_cpr_exec_command,
+        &p->has_x_rdma_chunk_size, &p->has_cpr_exec_command,
     };
 
     len = ARRAY_SIZE(has_fields);
@@ -1266,6 +1280,15 @@ bool migrate_params_check(MigrationParameters *params, Error **errp)
         return false;
     }
 
+    if (params->has_x_rdma_chunk_size &&
+        (params->x_rdma_chunk_size < MiB ||
+         params->x_rdma_chunk_size > GiB ||
+         !is_power_of_2(params->x_rdma_chunk_size))) {
+        error_setg(errp, "Option x_rdma_chunk_size expects "
+                   "a power of 2 in the range 1MiB to 1024MiB");
+        return false;
+    }
+
     return true;
 }
 
@@ -1391,6 +1414,10 @@ static void migrate_params_test_apply(MigrationParameters *params,
         dest->direct_io = params->direct_io;
     }
 
+    if (params->has_x_rdma_chunk_size) {
+        dest->x_rdma_chunk_size = params->x_rdma_chunk_size;
+    }
+
     if (params->has_cpr_exec_command) {
         dest->cpr_exec_command = params->cpr_exec_command;
     }
@@ -1517,6 +1544,10 @@ static void migrate_params_apply(MigrationParameters *params)
         s->parameters.direct_io = params->direct_io;
     }
 
+    if (params->has_x_rdma_chunk_size) {
+        s->parameters.x_rdma_chunk_size = params->x_rdma_chunk_size;
+    }
+
     if (params->has_cpr_exec_command) {
         qapi_free_strList(s->parameters.cpr_exec_command);
         s->parameters.cpr_exec_command =
diff --git a/migration/options.h b/migration/options.h
index b502871097..b46221998a 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -87,6 +87,7 @@ const char *migrate_tls_creds(void);
 const char *migrate_tls_hostname(void);
 uint64_t migrate_xbzrle_cache_size(void);
 ZeroPageDetection migrate_zero_page_detection(void);
+uint64_t migrate_rdma_chunk_size(void);
 
 /* parameters helpers */
 
diff --git a/migration/rdma.c b/migration/rdma.c
index 55ab85650a..3e37a1d440 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -45,10 +45,12 @@
 #define RDMA_RESOLVE_TIMEOUT_MS 10000
 
 /* Do not merge data if larger than this. */
-#define RDMA_MERGE_MAX (2 * 1024 * 1024)
-#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
+static inline uint64_t rdma_merge_max(void)
+{
+    return migrate_rdma_chunk_size() * 2;
+}
 
-#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
+#define RDMA_SIGNALED_SEND_MAX 512
 
 /*
  * This is only for non-live state being migrated.
@@ -527,21 +529,21 @@ static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
 static inline uint64_t ram_chunk_index(const uint8_t *start,
                                        const uint8_t *host)
 {
-    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
+    return ((uintptr_t) host - (uintptr_t) start) / migrate_rdma_chunk_size();
 }
 
 static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
                                        uint64_t i)
 {
     return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
-                                  (i << RDMA_REG_CHUNK_SHIFT));
+                                  (i * migrate_rdma_chunk_size()));
 }
 
 static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
                                      uint64_t i)
 {
     uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
-                                         (1UL << RDMA_REG_CHUNK_SHIFT);
+                                         migrate_rdma_chunk_size();
 
     if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
         result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
@@ -1841,6 +1843,7 @@ static int qemu_rdma_write_one(RDMAContext *rdma,
     struct ibv_send_wr *bad_wr;
     int reg_result_idx, ret, count = 0;
     uint64_t chunk, chunks;
+    uint64_t chunk_size = migrate_rdma_chunk_size();
     uint8_t *chunk_start, *chunk_end;
     RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
     RDMARegister reg;
@@ -1861,22 +1864,21 @@ retry:
     chunk_start = ram_chunk_start(block, chunk);
 
     if (block->is_ram_block) {
-        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
+        chunks = length / chunk_size;
 
-        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
+        if (chunks && ((length % chunk_size) == 0)) {
             chunks--;
         }
     } else {
-        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
+        chunks = block->length / chunk_size;
 
-        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
+        if (chunks && ((block->length % chunk_size) == 0)) {
             chunks--;
         }
     }
 
     trace_qemu_rdma_write_one_top(chunks + 1,
-                                  (chunks + 1) *
-                                  (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
+                                  (chunks + 1) * chunk_size / 1024 / 1024);
 
     chunk_end = ram_chunk_end(block, chunk + chunks);
 
@@ -2176,7 +2178,7 @@ static int qemu_rdma_write(RDMAContext *rdma,
     rdma->current_length += len;
 
     /* flush it if buffer is too large */
-    if (rdma->current_length >= RDMA_MERGE_MAX) {
+    if (rdma->current_length >= rdma_merge_max()) {
         return qemu_rdma_write_flush(rdma, errp);
     }
 
@@ -3522,7 +3524,7 @@ int rdma_registration_handle(QEMUFile *f)
                 } else {
                     chunk = reg->key.chunk;
                     host_addr = block->local_host_addr +
-                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
+                        (reg->key.chunk * migrate_rdma_chunk_size());
                     /* Check for particularly bad chunk value */
                     if (host_addr < (void *)block->local_host_addr) {
                         error_report("rdma: bad chunk for block %s"
diff --git a/qapi/migration.json b/qapi/migration.json
index 7134d4ce47..292d96c95a 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -806,7 +806,7 @@
 #
 # Features:
 #
-# @unstable: Members @x-checkpoint-delay and
+# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
 #     @x-vcpu-dirty-limit-period are experimental.
 #
 # Since: 2.4
@@ -831,6 +831,7 @@
            'mode',
            'zero-page-detection',
            'direct-io',
+           { 'name': 'x-rdma-chunk-size', 'features': [ 'unstable' ] },
            'cpr-exec-command'] }
 
 ##
@@ -1007,9 +1008,14 @@
 #     is @cpr-exec.  The first list element is the program's filename,
 #     the remainder its arguments.  (Since 10.2)
 #
+# @x-rdma-chunk-size: RDMA memory registration chunk size in bytes.
+#     Default is 1MiB.  Must be a power of 2 in the range
+#     [1MiB, 1024MiB].  Only takes effect for RDMA migration.
+#     (Since 11.1)
+#
 # Features:
 #
-# @unstable: Members @x-checkpoint-delay and
+# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
 #     @x-vcpu-dirty-limit-period are experimental.
 #
 # Since: 2.4
@@ -1046,6 +1052,8 @@
             '*mode': 'MigMode',
             '*zero-page-detection': 'ZeroPageDetection',
             '*direct-io': 'bool',
+            '*x-rdma-chunk-size': { 'type': 'uint64',
+                                    'features': [ 'unstable' ] },
             '*cpr-exec-command': [ 'str' ]} }
 
 ##
-- 
2.43.7



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-03-27  6:50 [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter Samuel Zhang
@ 2026-03-27  9:45 ` Markus Armbruster
  2026-03-27 10:27   ` Zhang, GuoQing (Sam)
  2026-03-30 16:10 ` Peter Xu
  1 sibling, 1 reply; 14+ messages in thread
From: Markus Armbruster @ 2026-03-27  9:45 UTC (permalink / raw)
  To: Samuel Zhang
  Cc: qemu-devel, peterx, farosas, lizhijian, eblake, Emily.Deng,
	Victor.Zhao, PengJu.Zhou, Qing.Ma

Samuel Zhang <guoqing.zhang@amd.com> writes:

> The default 1MB RDMA chunk size causes slow live migration because
> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>
> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
> faster migration.
> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>
> Performance with RDMA live migration of 8GB RAM VM:
>
> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
> |-----------------------|----------|-------------------|
> | 1M (default)          | 37.915   |  1,007            |
> | 32M                   | 17.880   |  2,260            |
> | 1024M                 |  4.368   | 17,529            |
>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>

[...]

> diff --git a/migration/options.c b/migration/options.c
> index f33b297929..bc61c8665d 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -13,6 +13,7 @@
>  
>  #include "qemu/osdep.h"
>  #include "qemu/error-report.h"
> +#include "qemu/units.h"
>  #include "exec/target_page.h"
>  #include "qapi/clone-visitor.h"
>  #include "qapi/error.h"
> @@ -90,6 +91,7 @@ const PropertyInfo qdev_prop_StrOrNull;
>  
>  #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD     1000    /* milliseconds */
>  #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT            1       /* MB/s */
> +#define DEFAULT_MIGRATE_X_RDMA_CHUNK_SIZE           MiB     /* 1MB */

The comment is now superfluous.

>  
>  const Property migration_properties[] = {
>      DEFINE_PROP_BOOL("store-global-state", MigrationState,

[...]

> diff --git a/qapi/migration.json b/qapi/migration.json
> index 7134d4ce47..292d96c95a 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -806,7 +806,7 @@
>  #
>  # Features:
>  #
> -# @unstable: Members @x-checkpoint-delay and
> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>  #     @x-vcpu-dirty-limit-period are experimental.
>  #
>  # Since: 2.4
> @@ -831,6 +831,7 @@
>             'mode',
>             'zero-page-detection',
>             'direct-io',
> +           { 'name': 'x-rdma-chunk-size', 'features': [ 'unstable' ] },
>             'cpr-exec-command'] }
>  
>  ##
> @@ -1007,9 +1008,14 @@
>  #     is @cpr-exec.  The first list element is the program's filename,
>  #     the remainder its arguments.  (Since 10.2)
>  #
> +# @x-rdma-chunk-size: RDMA memory registration chunk size in bytes.
> +#     Default is 1MiB.  Must be a power of 2 in the range
> +#     [1MiB, 1024MiB].  Only takes effect for RDMA migration.

I believe it applies to channels whose migration address type is
"rdma".  In MigrationChannel syntax

    {"channel-type": ..., "addr": {"transport": "rdma", ...}}

Correct?

> +#     (Since 11.1)
> +#
>  # Features:
>  #
> -# @unstable: Members @x-checkpoint-delay and
> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>  #     @x-vcpu-dirty-limit-period are experimental.
>  #
>  # Since: 2.4
> @@ -1046,6 +1052,8 @@
>              '*mode': 'MigMode',
>              '*zero-page-detection': 'ZeroPageDetection',
>              '*direct-io': 'bool',
> +            '*x-rdma-chunk-size': { 'type': 'uint64',
> +                                    'features': [ 'unstable' ] },
>              '*cpr-exec-command': [ 'str' ]} }
>  
>  ##



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-03-27  9:45 ` Markus Armbruster
@ 2026-03-27 10:27   ` Zhang, GuoQing (Sam)
  2026-03-27 11:24     ` Markus Armbruster
  0 siblings, 1 reply; 14+ messages in thread
From: Zhang, GuoQing (Sam) @ 2026-03-27 10:27 UTC (permalink / raw)
  To: Markus Armbruster, Samuel Zhang
  Cc: qemu-devel, peterx, farosas, lizhijian, eblake, Emily.Deng,
	Victor.Zhao, PengJu.Zhou, Qing.Ma


On 2026/3/27 17:45, Markus Armbruster wrote:
> Samuel Zhang <guoqing.zhang@amd.com> writes:
>
>> The default 1MB RDMA chunk size causes slow live migration because
>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>
>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>> faster migration.
>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>
>> Performance with RDMA live migration of 8GB RAM VM:
>>
>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>> |-----------------------|----------|-------------------|
>> | 1M (default)          | 37.915   |  1,007            |
>> | 32M                   | 17.880   |  2,260            |
>> | 1024M                 |  4.368   | 17,529            |
>>
>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> [...]
>
>> diff --git a/migration/options.c b/migration/options.c
>> index f33b297929..bc61c8665d 100644
>> --- a/migration/options.c
>> +++ b/migration/options.c
>> @@ -13,6 +13,7 @@
>>   
>>   #include "qemu/osdep.h"
>>   #include "qemu/error-report.h"
>> +#include "qemu/units.h"
>>   #include "exec/target_page.h"
>>   #include "qapi/clone-visitor.h"
>>   #include "qapi/error.h"
>> @@ -90,6 +91,7 @@ const PropertyInfo qdev_prop_StrOrNull;
>>   
>>   #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD     1000    /* milliseconds */
>>   #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT            1       /* MB/s */
>> +#define DEFAULT_MIGRATE_X_RDMA_CHUNK_SIZE           MiB     /* 1MB */
> The comment is now superfluous.
>
>>   
>>   const Property migration_properties[] = {
>>       DEFINE_PROP_BOOL("store-global-state", MigrationState,
> [...]
>
>> diff --git a/qapi/migration.json b/qapi/migration.json
>> index 7134d4ce47..292d96c95a 100644
>> --- a/qapi/migration.json
>> +++ b/qapi/migration.json
>> @@ -806,7 +806,7 @@
>>   #
>>   # Features:
>>   #
>> -# @unstable: Members @x-checkpoint-delay and
>> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>>   #     @x-vcpu-dirty-limit-period are experimental.
>>   #
>>   # Since: 2.4
>> @@ -831,6 +831,7 @@
>>              'mode',
>>              'zero-page-detection',
>>              'direct-io',
>> +           { 'name': 'x-rdma-chunk-size', 'features': [ 'unstable' ] },
>>              'cpr-exec-command'] }
>>   
>>   ##
>> @@ -1007,9 +1008,14 @@
>>   #     is @cpr-exec.  The first list element is the program's filename,
>>   #     the remainder its arguments.  (Since 10.2)
>>   #
>> +# @x-rdma-chunk-size: RDMA memory registration chunk size in bytes.
>> +#     Default is 1MiB.  Must be a power of 2 in the range
>> +#     [1MiB, 1024MiB].  Only takes effect for RDMA migration.
> I believe it applies to channels whose migration address type is
> "rdma".  In MigrationChannel syntax
>
>      {"channel-type": ..., "addr": {"transport": "rdma", ...}}
>
> Correct?


Correct! Is it OK to update the doc to the following one? Thank you!

`Only takes effect when migration address transport is "rdma".`


>
>> +#     (Since 11.1)
>> +#
>>   # Features:
>>   #
>> -# @unstable: Members @x-checkpoint-delay and
>> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>>   #     @x-vcpu-dirty-limit-period are experimental.
>>   #
>>   # Since: 2.4
>> @@ -1046,6 +1052,8 @@
>>               '*mode': 'MigMode',
>>               '*zero-page-detection': 'ZeroPageDetection',
>>               '*direct-io': 'bool',
>> +            '*x-rdma-chunk-size': { 'type': 'uint64',
>> +                                    'features': [ 'unstable' ] },
>>               '*cpr-exec-command': [ 'str' ]} }
>>   
>>   ##


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-03-27 10:27   ` Zhang, GuoQing (Sam)
@ 2026-03-27 11:24     ` Markus Armbruster
  0 siblings, 0 replies; 14+ messages in thread
From: Markus Armbruster @ 2026-03-27 11:24 UTC (permalink / raw)
  To: Zhang, GuoQing (Sam)
  Cc: Samuel Zhang, qemu-devel, peterx, farosas, lizhijian, eblake,
	Emily.Deng, Victor.Zhao, PengJu.Zhou, Qing.Ma

"Zhang, GuoQing (Sam)" <guoqzhan@amd.com> writes:

> On 2026/3/27 17:45, Markus Armbruster wrote:
>> Samuel Zhang <guoqing.zhang@amd.com> writes:
>>
>>> The default 1MB RDMA chunk size causes slow live migration because
>>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>>
>>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>>> faster migration.
>>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>>
>>> Performance with RDMA live migration of 8GB RAM VM:
>>>
>>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>>> |-----------------------|----------|-------------------|
>>> | 1M (default)          | 37.915   |  1,007            |
>>> | 32M                   | 17.880   |  2,260            |
>>> | 1024M                 |  4.368   | 17,529            |
>>>
>>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>

[...]

>>> diff --git a/qapi/migration.json b/qapi/migration.json
>>> index 7134d4ce47..292d96c95a 100644
>>> --- a/qapi/migration.json
>>> +++ b/qapi/migration.json
>>> @@ -806,7 +806,7 @@
>>>  #
>>>  # Features:
>>>  #
>>> -# @unstable: Members @x-checkpoint-delay and
>>> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>>>  #     @x-vcpu-dirty-limit-period are experimental.
>>>  #
>>>  # Since: 2.4
>>> @@ -831,6 +831,7 @@
>>>             'mode',
>>>             'zero-page-detection',
>>>             'direct-io',
>>> +           { 'name': 'x-rdma-chunk-size', 'features': [ 'unstable' ] },
>>>             'cpr-exec-command'] }
>>>  
>>>  ##
>>> @@ -1007,9 +1008,14 @@
>>>  #     is @cpr-exec.  The first list element is the program's filename,
>>>  #     the remainder its arguments.  (Since 10.2)
>>>  #
>>> +# @x-rdma-chunk-size: RDMA memory registration chunk size in bytes.
>>> +#     Default is 1MiB.  Must be a power of 2 in the range
>>> +#     [1MiB, 1024MiB].  Only takes effect for RDMA migration.
>>
>> I believe it applies to channels whose migration address type is
>> "rdma".  In MigrationChannel syntax
>>
>>      {"channel-type": ..., "addr": {"transport": "rdma", ...}}
>>
>> Correct?
>
>
> Correct! Is it OK to update the doc to the following one? Thank you!
>
> `Only takes effect when migration address transport is "rdma".`

The phrasing in the patch feels okay as is.  Perhaps "Only applies when
migrating via RDMA" to more closely match the description of
MigrationAddressType @rdma: Migrate via RDMA.

Either way, QAPI schema
Acked-by: Markus Armbruster <armbru@redhat.com>

>>> +#     (Since 11.1)
>>> +#
>>>  # Features:
>>>  #
>>> -# @unstable: Members @x-checkpoint-delay and
>>> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>>>  #     @x-vcpu-dirty-limit-period are experimental.
>>>  #
>>>  # Since: 2.4
>>> @@ -1046,6 +1052,8 @@
>>>              '*mode': 'MigMode',
>>>              '*zero-page-detection': 'ZeroPageDetection',
>>>              '*direct-io': 'bool',
>>> +            '*x-rdma-chunk-size': { 'type': 'uint64',
>>> +                                    'features': [ 'unstable' ] },
>>>              '*cpr-exec-command': [ 'str' ]} }
>>  
>>>  ##



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-03-27  6:50 [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter Samuel Zhang
  2026-03-27  9:45 ` Markus Armbruster
@ 2026-03-30 16:10 ` Peter Xu
  2026-03-31  3:30   ` Zhijian Li (Fujitsu)
  2026-03-31 11:06   ` Markus Armbruster
  1 sibling, 2 replies; 14+ messages in thread
From: Peter Xu @ 2026-03-30 16:10 UTC (permalink / raw)
  To: Samuel Zhang
  Cc: qemu-devel, farosas, lizhijian, eblake, armbru, Emily.Deng,
	Victor.Zhao, PengJu.Zhou, Qing.Ma

Hi, Samuel,

On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
> The default 1MB RDMA chunk size causes slow live migration because
> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
> 
> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
> faster migration.
> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
> 
> Performance with RDMA live migration of 8GB RAM VM:
> 
> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
> |-----------------------|----------|-------------------|
> | 1M (default)          | 37.915   |  1,007            |

This is the default. It surprised me a bit knowing it can only reach 1GB/s
throughput with the current code base.  Do you know why?  I thought RDMA
should be much faster than this on throughput with whatever hardware setup.

> | 32M                   | 17.880   |  2,260            |
> | 1024M                 |  4.368   | 17,529            |
> 
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>

One thing to mention is that RDMA migration is in the odd-fixes stage; it
doesn't actually have a real maintainer, so it is kind of "orphaned".  In this
case, I wouldn't suggest that we add any new knobs for performance reasons.

Do you have a strong reason to propose this patch to land upstream?  Is it
used in production systems and it solves some real problems for you?

I also wonder what Zhijian would say on this.

Thanks,

-- 
Peter Xu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-03-30 16:10 ` Peter Xu
@ 2026-03-31  3:30   ` Zhijian Li (Fujitsu)
  2026-03-31 10:33     ` Zhang, GuoQing (Sam)
  2026-03-31 11:06   ` Markus Armbruster
  1 sibling, 1 reply; 14+ messages in thread
From: Zhijian Li (Fujitsu) @ 2026-03-31  3:30 UTC (permalink / raw)
  To: Peter Xu, Samuel Zhang
  Cc: qemu-devel@nongnu.org, farosas@suse.de, eblake@redhat.com,
	armbru@redhat.com, Emily.Deng@amd.com, Victor.Zhao@amd.com,
	PengJu.Zhou@amd.com, Qing.Ma@amd.com



On 31/03/2026 00:10, Peter Xu wrote:
> Hi, Samuel,
> 
> On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
>> The default 1MB RDMA chunk size causes slow live migration because
>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>
>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>> faster migration.
>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>
>> Performance with RDMA live migration of 8GB RAM VM:
>>
>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>> |-----------------------|----------|-------------------|
>> | 1M (default)          | 37.915   |  1,007            |
> 
> This is the default. It surprised me a bit knowing it can only reach 1GB/s
> throughput with the current code base.  Do you know why?  I thought RDMA
> should be much faster than this on throughput with whatever hardware setup.

  
Regarding the baseline performance, Samuel's numbers look reasonable. I checked
some of my old test data on a ConnectX-4 Lx card years ago, and the throughput
was around 10 Gbps (~1.25 GB/s), which is consistent with the 1 GB/s he reported.

> 
>> | 32M                   | 17.880   |  2,260            |
>> | 1024M                 |  4.368   | 17,529            |

My guess for the dramatic performance improvement is that a larger chunk size
allows qemu_rdma_write() to batch more *contiguous dirty pages* into a single,
more efficient RDMA send operation.

Are there any workloads running on the guest during the migration, or is it just an idle guest? @Samuel

Given the significant benefit and the fact that the patch itself is straightforward,
I think it's a worthwhile addition.
  
Acked-by: Li Zhijian <lizhijian@fujitsu.com>



>>
>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> 
> One thing to mention is RDMA migration is in odd-fixes stage, actually it
> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
> I actually won't suggest we add any new knobs for performance reasons.
> 
> Do you have a strong reason to propose this patch to land upstream?  Is it
> used in production systems and it solves some real problems for you?
> 
> I also wonder what Zhijian would say on this.
> 
> Thanks,
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-03-31  3:30   ` Zhijian Li (Fujitsu)
@ 2026-03-31 10:33     ` Zhang, GuoQing (Sam)
  2026-03-31 11:29       ` Zhijian Li (Fujitsu)
  2026-04-01 15:56       ` Peter Xu
  0 siblings, 2 replies; 14+ messages in thread
From: Zhang, GuoQing (Sam) @ 2026-03-31 10:33 UTC (permalink / raw)
  To: Zhijian Li (Fujitsu), Peter Xu, Samuel Zhang
  Cc: qemu-devel@nongnu.org, farosas@suse.de, eblake@redhat.com,
	armbru@redhat.com, Emily.Deng@amd.com, Victor.Zhao@amd.com,
	PengJu.Zhou@amd.com, Qing.Ma@amd.com


On 2026/3/31 11:30, Zhijian Li (Fujitsu) wrote:
> [Some people who received this message don't often get email from lizhijian@fujitsu.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>
> On 31/03/2026 00:10, Peter Xu wrote:
>> Hi, Samuel,
>>
>> On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
>>> The default 1MB RDMA chunk size causes slow live migration because
>>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>>
>>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>>> faster migration.
>>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>>
>>> Performance with RDMA live migration of 8GB RAM VM:
>>>
>>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>>> |-----------------------|----------|-------------------|
>>> | 1M (default)          | 37.915   |  1,007            |
>> This is the default. It surprised me a bit knowing it can only reach 1GB/s
>> throughput with the current code base.  Do you know why?  I thought RDMA
>> should be much faster than this on throughput with whatever hardware setup.
>
> Regarding the baseline performance, Samuel's numbers look reasonable. I checked
> some of my old test data on a ConnectX-4 Lx card years ago, and the throughput
> was around 10 Gbps (~1.25 GB/s), which is consistent with the 1 GB/s he reported.
>
>>> | 32M                   | 17.880   |  2,260            |
>>> | 1024M                 |  4.368   | 17,529            |
> My guess for the dramatic performance improvement is that a larger chunk size
> allows qemu_rdma_write() to batch more *contiguous dirty pages* into a single,
> more efficient RDMA send operation.

The `throughput` data is collected from `info migrate` qemu monitor 
command after live-migration.

Yes, Zhijian is right. As each chunk triggers a write_flush and each 
flush involves posting an RDMA WRITE and WAITING for completion, there's 
software overhead here.

For 8GB RAM VM migration, 1MB chunk size produces ~15000 flushes. The 
software overhead adds up and prevents the RDMA hardware from sustaining 
high throughput.

When chunk size is 1GB, there are ~3700 flushes. Reduced flush count 
means reduced software overhead and improved overall throughput.



>
> Is there any workloads running on the guest during the migration, or just an idle guest? @Samuel


The guest is idle when I test the migration and collect the data.


>
> Given the significant benefit and the fact that the patch itself is straightforward,
> I think it's a worthwhile addition.
>
> Acked-by: Li Zhijian <lizhijian@fujitsu.com>


Thank you for the ack, Zhijian!


>
>
>
>>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
>> One thing to mention is RDMA migration is in odd-fixes stage, actually it
>> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
>> I actually won't suggest we add any new knobs for performance reasons.
>>
>> Do you have a strong reason to propose this patch to land upstream?  Is it
>> used in production systems and it solves some real problems for you?


We have VMs with large RAM and find that TCP live migration is not fast 
enough; we expected RDMA migration to be faster.

But we found that migration in RDMA mode is slower than in TCP mode. See 
the following data.


8GB RAM idle VM live-migration performance:
| transport mode       | time (s) | throughput (MB/s) |
|----------------------|----------|-------------------|
| TCP                  | 36.89    |  1,081            |
| RDMA, 1MB chunk size | 37.915   |  1,007            |
| RDMA, 1GB chunk size |  4.368   | 17,529            |

This patch allows us to use larger chunk size for faster RDMA migration.


Regards
Sam


>>
>> I also wonder what Zhijian would say on this.
>>
>> Thanks,
>>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-03-31 10:33     ` Zhang, GuoQing (Sam)
@ 2026-03-31 11:29       ` Zhijian Li (Fujitsu)
  2026-04-01 15:56       ` Peter Xu
  1 sibling, 0 replies; 14+ messages in thread
From: Zhijian Li (Fujitsu) @ 2026-03-31 11:29 UTC (permalink / raw)
  To: Zhang, GuoQing (Sam), Peter Xu, Samuel Zhang
  Cc: qemu-devel@nongnu.org, farosas@suse.de, eblake@redhat.com,
	armbru@redhat.com, Emily.Deng@amd.com, Victor.Zhao@amd.com,
	PengJu.Zhou@amd.com, Qing.Ma@amd.com

On 31/03/2026 18:33, Zhang, GuoQing (Sam) wrote:
>>
>>>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
>>> One thing to mention is RDMA migration is in odd-fixes stage, actually it
>>> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
>>> I actually won't suggest we add any new knobs for performance reasons.
>>>
>>> Do you have a strong reason to propose this patch to land upstream?  Is it
>>> used in production systems and it solves some real problems for you?
> 
> 
> We have VMs with large RAM and find TCP live-migration is not fast
> enough and expect RDMA migration can be faster.
> 
> But we found the rdma mode migration speed is slower than tcp mode. See
> following data.
> 
> 
> 8GB RAM idle VM live-migration performance:
> | transport mode       | time (s) | throughput (MB/s) |
> |----------------------|----------|-------------------|
> | TCP                  | 36.89    |  1,081            |
> | RDMA, 1MB chunk size | 37.915   |  1,007            |
> | RDMA, 1GB chunk size |  4.368   | 17,529            |
> 
> This patch allows us to use larger chunk size for faster RDMA migration.

Hi Samuel,

Thanks for sharing this comparison data.

 From the fast completion time in your test (4.3s for 8GB), it looks like the VM was
mostly idle. This means after the first full memory pass, very few new dirty pages were
generated, allowing the migration to complete quickly. This scenario is perfect for highlighting
the benefit of large chunks when memory is dirtied in large, contiguous blocks (as it is
during the initial full scan).

To make the case for this new knob even stronger, it would be very helpful to also see data
from a more realistic workload scenario, especially one that generates randomly dirty pages.

In such a case, even with a large chunk size, `qemu_rdma_write()` would only be able to send
small, discontiguous blocks of dirty pages in each iteration.

I suspect the throughput gain will still be significant, but likely different from the idle case.
Having data from a workload that simulates random writes (e.g., using `stress-ng`, `memhog`, etc.) would
provide a more complete picture and help us understand the benefits in a worst-case scenario.

Thanks,
Zhijian

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-03-31 10:33     ` Zhang, GuoQing (Sam)
  2026-03-31 11:29       ` Zhijian Li (Fujitsu)
@ 2026-04-01 15:56       ` Peter Xu
  2026-04-03  6:15         ` Zhang, GuoQing (Sam)
  1 sibling, 1 reply; 14+ messages in thread
From: Peter Xu @ 2026-04-01 15:56 UTC (permalink / raw)
  To: Zhang, GuoQing (Sam)
  Cc: Zhijian Li (Fujitsu), Samuel Zhang, qemu-devel@nongnu.org,
	farosas@suse.de, eblake@redhat.com, armbru@redhat.com,
	Emily.Deng@amd.com, Victor.Zhao@amd.com, PengJu.Zhou@amd.com,
	Qing.Ma@amd.com

On Tue, Mar 31, 2026 at 06:33:23PM +0800, Zhang, GuoQing (Sam) wrote:
> 
> On 2026/3/31 11:30, Zhijian Li (Fujitsu) wrote:
> > [Some people who received this message don't often get email from lizhijian@fujitsu.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> > 
> > On 31/03/2026 00:10, Peter Xu wrote:
> > > Hi, Samuel,
> > > 
> > > On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
> > > > The default 1MB RDMA chunk size causes slow live migration because
> > > > each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
> > > > 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
> > > > 
> > > > Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
> > > > faster migration.
> > > > Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
> > > > 
> > > > Performance with RDMA live migration of 8GB RAM VM:
> > > > 
> > > > | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
> > > > |-----------------------|----------|-------------------|
> > > > | 1M (default)          | 37.915   |  1,007            |
> > > This is the default. It surprised me a bit knowing it can only reach 1GB/s
> > > throughput with the current code base.  Do you know why?  I thought RDMA
> > > should be much faster than this on throughput with whatever hardware setup.
> > 
> > Regarding the baseline performance, Samuel's numbers look reasonable. I checked
> > some of my old test data on a ConnectX-4 Lx card years ago, and the throughput
> > was around 10 Gbps (~1.25 GB/s), which is consistent with the 1 GB/s he reported.
> > 
> > > > | 32M                   | 17.880   |  2,260            |
> > > > | 1024M                 |  4.368   | 17,529            |
> > My guess for the dramatic performance improvement is that a larger chunk size
> > allows qemu_rdma_write() to batch more *contiguous dirty pages* into a single,
> > more efficient RDMA send operation.
> 
> The `throughput` data is collected from `info migrate` qemu monitor command
> after live-migration.
> 
> Yes, Zhijian is right. As each chunk triggers a write_flush and each flush
> involves posting an RDMA WRITE and WAITING for completion, there's software
> overhead here.
> 
> For 8GB RAM VM migration, 1MB chunk size produces ~15000 flushes. The
> software overhead adds up and prevents the RDMA hardware from sustaining
> high throughput.
> 
> When chunk size is 1GB, there are ~3700 flushes. Reduced flush count means
> reduced software overhead and improved overall throughput.
> 

OK, thanks both.

> 
> > 
> > Is there any workloads running on the guest during the migration, or just an idle guest? @Samuel
> 
> 
> The guest is idle when I test the migration and collect the data.
> 
> 
> > 
> > Given the significant benefit and the fact that the patch itself is straightforward,
> > I think it's a worthwhile addition.
> > 
> > Acked-by: Li Zhijian <lizhijian@fujitsu.com>
> 
> 
> Thank you for the ack, Zhijian!
> 
> 
> > 
> > 
> > 
> > > > Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> > > One thing to mention is RDMA migration is in odd-fixes stage, actually it
> > > doesn't have a real maintainer so it is kind of "orphaned".  In this case,
> > > I actually won't suggest we add any new knobs for performance reasons.
> > > 
> > > Do you have a strong reason to propose this patch to land upstream?  Is it
> > > used in production systems and it solves some real problems for you?
> 
> 
> We have VMs with large RAM and find TCP live-migration is not fast enough
> and expect RDMA migration can be faster.
> 
> But we found the rdma mode migration speed is slower than tcp mode. See
> following data.
> 
> 
> 8GB RAM idle VM live-migration performance:
> | transport mode       | time (s) | throughput (MB/s) |
> |----------------------|----------|-------------------|
> | TCP                  | 36.89    |  1,081            |

What is the NIC setup?  Did you try to enable multifd to offload zeropage
detections?  Or is that not feasible due to some reason?

> | RDMA, 1MB chunk size | 37.915   |  1,007            |
> | RDMA, 1GB chunk size |  4.368   | 17,529            |
> 
> This patch allows us to use larger chunk size for faster RDMA migration.

Sure, Zhijian's point is reasonable.  If he's fine, I'm OK.

Thanks,

> 
> 
> Regards
> Sam
> 
> 
> > > 
> > > I also wonder what Zhijian would say on this.
> > > 
> > > Thanks,
> > > 
> 

-- 
Peter Xu



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-04-01 15:56       ` Peter Xu
@ 2026-04-03  6:15         ` Zhang, GuoQing (Sam)
  2026-04-03  9:39           ` Zhijian Li (Fujitsu)
  0 siblings, 1 reply; 14+ messages in thread
From: Zhang, GuoQing (Sam) @ 2026-04-03  6:15 UTC (permalink / raw)
  To: Peter Xu
  Cc: Zhijian Li (Fujitsu), Samuel Zhang, qemu-devel@nongnu.org,
	farosas@suse.de, eblake@redhat.com, armbru@redhat.com,
	Emily.Deng@amd.com, Victor.Zhao@amd.com, PengJu.Zhou@amd.com,
	Qing.Ma@amd.com, Guoqing Zhang

Hi Peter,

The NIC is `Mellanox Technologies MT43244 BlueField-3 integrated 
ConnectX-7 network controller`. The two servers are directly connected 
with an RDMA cable.

I tried to enable multifd with the following monitor commands on the 
source VM, but migration failed with the error `Failed to connect to 
'192.168.100.3:4440': Connection timed out`. I don't know why.

```
migrate_set_capability multifd on
migrate_set_parameter multifd-channels 8
```

Hi Zhijian,

Below is the perf data you requested.

NIC: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 
network controller
8GB RAM VM.
workload in guest: `stress-ng --vm 4 --vm-bytes {size}G --vm-method 
rand-set --timeout 0`

| transport | chunk size | stress-ng size (G) | time(ms) | throughput |
|-----------|------------|--------------------|----------|------------|
| tcp       | n/a        | 1                  | 44955    | 1245       |
| rdma      | 1m         | 1                  | 46893    | 1240       |
| rdma      | 2m         | 1                  | 47374    | 1308       |
| rdma      | 8m         | 1                  | 38271    | 1542       |
| rdma      | 16m        | 1                  | 38901    | 1558       |
| rdma      | 32m        | 1                  | 13895    | 3762       |
| rdma      | 64m        | 1                  | 4288     | 14357      |
| rdma      | 64m        | 2                  | 5405     | 13719      |
| rdma      | 64m        | 4                  | 9976     | 9475       |
| rdma      | 128m       | 4                  | 3345     | 27428      |
| rdma      | 256m       | 4                  | 3537     | 26097      |
| rdma      | 512m       | 4                  | 3417     | 28440      |
| rdma      | 1024m      | 4                  | 3412     | 27637      |

Regards
Sam


On 2026/4/1 23:56, Peter Xu wrote:
> [You don't often get email from peterx@redhat.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>
> On Tue, Mar 31, 2026 at 06:33:23PM +0800, Zhang, GuoQing (Sam) wrote:
>> On 2026/3/31 11:30, Zhijian Li (Fujitsu) wrote:
>>> [Some people who received this message don't often get email from lizhijian@fujitsu.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>>
>>> On 31/03/2026 00:10, Peter Xu wrote:
>>>> Hi, Samuel,
>>>>
>>>> On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
>>>>> The default 1MB RDMA chunk size causes slow live migration because
>>>>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>>>>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>>>>
>>>>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>>>>> faster migration.
>>>>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>>>>
>>>>> Performance with RDMA live migration of 8GB RAM VM:
>>>>>
>>>>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>>>>> |-----------------------|----------|-------------------|
>>>>> | 1M (default)          | 37.915   |  1,007            |
>>>> This is the default. It surprised me a bit knowing it can only reach 1GB/s
>>>> throughput with the current code base.  Do you know why?  I thought RDMA
>>>> should be much faster than this on throughput with whatever hardware setup.
>>> Regarding the baseline performance, Samuel's numbers look reasonable. I checked
>>> some of my old test data on a ConnectX-4 Lx card years ago, and the throughput
>>> was around 10 Gbps (~1.25 GB/s), which is consistent with the 1 GB/s he reported.
>>>
>>>>> | 32M                   | 17.880   |  2,260            |
>>>>> | 1024M                 |  4.368   | 17,529            |
>>> My guess for the dramatic performance improvement is that a larger chunk size
>>> allows qemu_rdma_write() to batch more *contiguous dirty pages* into a single,
>>> more efficient RDMA send operation.
>> The `throughput` data is collected from `info migrate` qemu monitor command
>> after live-migration.
>>
>> Yes, Zhijian is right. As each chunk triggers a write_flush and each flush
>> involves posting an RDMA WRITE and WAITING for completion, there's software
>> overhead here.
>>
>> For 8GB RAM VM migration, 1MB chunk size produces ~15000 flushes. The
>> software overhead adds up and prevents the RDMA hardware from sustaining
>> high throughput.
>>
>> When chunk size is 1GB, there are ~3700 flushes. Reduced flush count means
>> reduced software overhead and improved overall throughput.
>>
> OK, thanks both.
>
>>> Is there any workloads running on the guest during the migration, or just an idle guest? @Samuel
>>
>> The guest is idle when I test the migration and collect the data.
>>
>>
>>> Given the significant benefit and the fact that the patch itself is straightforward,
>>> I think it's a worthwhile addition.
>>>
>>> Acked-by: Li Zhijian <lizhijian@fujitsu.com>
>>
>> Thank you for the ack, Zhijian!
>>
>>
>>>
>>>
>>>>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
>>>> One thing to mention is RDMA migration is in odd-fixes stage, actually it
>>>> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
>>>> I actually won't suggest we add any new knobs for performance reasons.
>>>>
>>>> Do you have a strong reason to propose this patch to land upstream?  Is it
>>>> used in production systems and it solves some real problems for you?
>>
>> We have VMs with large RAM and find TCP live-migration is not fast enough
>> and expect RDMA migration can be faster.
>>
>> But we found the rdma mode migration speed is slower than tcp mode. See
>> following data.
>>
>>
>> 8GB RAM idle VM live-migration performance:
>> | transport mode       | time (s) | throughput (MB/s) |
>> |----------------------|----------|-------------------|
>> | TCP                  | 36.89    |  1,081            |
> What is the NIC setup?  Did you try to enable multifd to offload zeropage
> detections?  Or is that not feasible due to some reason?
>
>> | RDMA, 1MB chunk size | 37.915   |  1,007            |
>> | RDMA, 1GB chunk size |  4.368   | 17,529            |
>>
>> This patch allows us to use larger chunk size for faster RDMA migration.
> Sure, Zhijian's point is reasonable.  If he's fine, I'm OK.
>
> Thanks,
>
>>
>> Regards
>> Sam
>>
>>
>>>> I also wonder what Zhijian would say on this.
>>>>
>>>> Thanks,
>>>>
> --
> Peter Xu
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-04-03  6:15         ` Zhang, GuoQing (Sam)
@ 2026-04-03  9:39           ` Zhijian Li (Fujitsu)
  2026-04-03  9:59             ` Zhang, GuoQing (Sam)
  0 siblings, 1 reply; 14+ messages in thread
From: Zhijian Li (Fujitsu) @ 2026-04-03  9:39 UTC (permalink / raw)
  To: Zhang, GuoQing (Sam), Peter Xu
  Cc: Samuel Zhang, qemu-devel@nongnu.org, farosas@suse.de,
	eblake@redhat.com, armbru@redhat.com, Emily.Deng@amd.com,
	Victor.Zhao@amd.com, PengJu.Zhou@amd.com, Qing.Ma@amd.com,
	Guoqing Zhang



On 03/04/2026 14:15, Zhang, GuoQing (Sam) wrote:
> [You don't often get email from guoqzhan@amd.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> 
> Hi Peter,
> 
> The NIC is `Mellanox Technologies MT43244 BlueField-3 integrated
> ConnectX-7 network controller`. 2 Servers directly connected with the
> RDMA cable.
> 
> I tried to enable multifd with following monitor cmd on source VM, but
> migration failed with error `Failed to connect to '192.168.100.3:4440':
> Connection timed out`. I don't know why.
> 
> ```
> migrate_set_capability multifd on
> migrate_set_parameter multifd-channels 8
> ```
> 
> Hi Zhijian,
> 
> Follow is the perf data you requested.


Hello Sam

I applied your V4 patch on v11.0.0-rc2, but encountered the following error during migration.

(qemu) migrate_set_parameter x-rdma-chunk-size 2m # only 1m worked
(qemu) migrate -d rdma:192.168.33.1:6666
(qemu) qemu-system-x86_64: rdma migration: send polling control error
qemu-system-x86_64: RDMA is in an error state waiting migration to abort!
qemu-system-x86_64: failed to save SaveStateEntry with id(name): 1(ram): -1
qemu-system-x86_64: Channel error: Operation not permitted
qemu-system-x86_64: warning: Early error. Sending error.
qemu-system-x86_64: warning: rdma migration: send polling control error

Any idea ?



> 
> NIC: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7
> network controller
> 8GB RAM VM.
> workload in guest: `stress-ng --vm 4 --vm-bytes {size}G --vm-method
> rand-set --timeout 0`
> 
> | transport | chunk size | stress-ng size (G) | time(ms) | throughput |
> |-----------|------------|--------------------|----------|------------|
> | tcp       | n/a        | 1                  | 44955    | 1245      |
> | rdma      | 1m         | 1                  | 46893    | 1240      |
> | rdma      | 2m         | 1                  | 47374    | 1308      |
> | rdma      | 8m         | 1                  | 38271    | 1542      |
> | rdma      | 16m        | 1                  | 38901    | 1558      |
> | rdma      | 32m        | 1                  | 13895    | 3762      |
> | rdma      | 64m        | 1                  | 4288     | 14357      |
> | rdma      | 64m        | 2                  | 5405     | 13719      |
> | rdma      | 64m        | 4                  | 9976     | 9475      |
> | rdma      | 128m       | 4                  | 3345     | 27428      |
> | rdma      | 256m       | 4                  | 3537     | 26097      |
> | rdma      | 512m       | 4                  | 3417     | 28440      |
> | rdma      | 1024m      | 4                  | 3412     | 27637      |
> 
> Regards
> Sam
> 
> 
> On 2026/4/1 23:56, Peter Xu wrote:
>> [You don't often get email from peterx@redhat.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>
>> On Tue, Mar 31, 2026 at 06:33:23PM +0800, Zhang, GuoQing (Sam) wrote:
>>> On 2026/3/31 11:30, Zhijian Li (Fujitsu) wrote:
>>>> [Some people who received this message don't often get email from lizhijian@fujitsu.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>>>
>>>> On 31/03/2026 00:10, Peter Xu wrote:
>>>>> Hi, Samuel,
>>>>>
>>>>> On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
>>>>>> The default 1MB RDMA chunk size causes slow live migration because
>>>>>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>>>>>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>>>>>
>>>>>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>>>>>> faster migration.
>>>>>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>>>>>
>>>>>> Performance with RDMA live migration of 8GB RAM VM:
>>>>>>
>>>>>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>>>>>> |-----------------------|----------|-------------------|
>>>>>> | 1M (default)          | 37.915   |  1,007            |
>>>>> This is the default. It surprised me a bit knowing it can only reach 1GB/s
>>>>> throughput with the current code base.  Do you know why?  I thought RDMA
>>>>> should be much faster than this on throughput with whatever hardware setup.
>>>> Regarding the baseline performance, Samuel's numbers look reasonable. I checked
>>>> some of my old test data on a ConnectX-4 Lx card years ago, and the throughput
>>>> was around 10 Gbps (~1.25 GB/s), which is consistent with the 1 GB/s he reported.
>>>>
>>>>>> | 32M                   | 17.880   |  2,260            |
>>>>>> | 1024M                 |  4.368   | 17,529            |
>>>> My guess for the dramatic performance improvement is that a larger chunk size
>>>> allows qemu_rdma_write() to batch more *contiguous dirty pages* into a single,
>>>> more efficient RDMA send operation.
>>> The `throughput` data is collected from `info migrate` qemu monitor command
>>> after live-migration.
>>>
>>> Yes, Zhijian is right. As each chunk triggers a write_flush and each flush
>>> involves posting an RDMA WRITE and WAITING for completion, there's software
>>> overhead here.
>>>
>>> For 8GB RAM VM migration, 1MB chunk size produces ~15000 flushes. The
>>> software overhead adds up and prevents the RDMA hardware from sustaining
>>> high throughput.
>>>
>>> When chunk size is 1GB, there are ~3700 flushes. Reduced flush count means
>>> reduced software overhead and improved overall throughput.
>>>
>> OK, thanks both.
>>
>>>> Is there any workloads running on the guest during the migration, or just an idle guest? @Samuel
>>>
>>> The guest is idle when I test the migration and collect the data.
>>>
>>>
>>>> Given the significant benefit and the fact that the patch itself is straightforward,
>>>> I think it's a worthwhile addition.
>>>>
>>>> Acked-by: Li Zhijian <lizhijian@fujitsu.com>
>>>
>>> Thank you for the ack, Zhijian!
>>>
>>>
>>>>
>>>>
>>>>>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
>>>>> One thing to mention is RDMA migration is in odd-fixes stage, actually it
>>>>> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
>>>>> I actually won't suggest we add any new knobs for performance reasons.
>>>>>
>>>>> Do you have a strong reason to propose this patch to land upstream?  Is it
>>>>> used in production systems and it solves some real problems for you?
>>>
>>> We have VMs with large RAM and find TCP live-migration is not fast enough
>>> and expect RDMA migration can be faster.
>>>
>>> But we found the rdma mode migration speed is slower than tcp mode. See
>>> following data.
>>>
>>>
>>> 8GB RAM idle VM live-migration performance:
>>> | transport mode       | time (s) | throughput (MB/s) |
>>> |----------------------|----------|-------------------|
>>> | TCP                  | 36.89    |  1,081            |
>> What is the NIC setup?  Did you try to enable multifd to offload zeropage
>> detections?  Or is that not feasible due to some reason?
>>
>>> | RDMA, 1MB chunk size | 37.915   |  1,007            |
>>> | RDMA, 1GB chunk size |  4.368   | 17,529            |
>>>
>>> This patch allows us to use larger chunk size for faster RDMA migration.
>> Sure, Zhijian's point is reasonable.  If he's fine, I'm OK.
>>
>> Thanks,
>>
>>>
>>> Regards
>>> Sam
>>>
>>>
>>>>> I also wonder what Zhijian would say on this.
>>>>>
>>>>> Thanks,
>>>>>
>> -- 
>> Peter Xu
>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-04-03  9:39           ` Zhijian Li (Fujitsu)
@ 2026-04-03  9:59             ` Zhang, GuoQing (Sam)
  2026-04-07  6:15               ` Zhijian Li (Fujitsu)
  0 siblings, 1 reply; 14+ messages in thread
From: Zhang, GuoQing (Sam) @ 2026-04-03  9:59 UTC (permalink / raw)
  To: Zhijian Li (Fujitsu), Peter Xu
  Cc: Samuel Zhang, qemu-devel@nongnu.org, farosas@suse.de,
	eblake@redhat.com, armbru@redhat.com, Emily.Deng@amd.com,
	Victor.Zhao@amd.com, PengJu.Zhou@amd.com, Qing.Ma@amd.com,
	Guoqing Zhang


On 2026/4/3 17:39, Zhijian Li (Fujitsu) wrote:
> [Some people who received this message don't often get email from lizhijian@fujitsu.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>
> On 03/04/2026 14:15, Zhang, GuoQing (Sam) wrote:
>> [You don't often get email from guoqzhan@amd.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>
>> Hi Peter,
>>
>> The NIC is `Mellanox Technologies MT43244 BlueField-3 integrated
>> ConnectX-7 network controller`. 2 Servers directly connected with the
>> RDMA cable.
>>
>> I tried to enable multifd with following monitor cmd on source VM, but
>> migration failed with error `Failed to connect to '192.168.100.3:4440':
>> Connection timed out`. I don't know why.
>>
>> ```
>> migrate_set_capability multifd on
>> migrate_set_parameter multifd-channels 8
>> ```
>>
>> Hi Zhijian,
>>
>> Follow is the perf data you requested.
>
> Hello Sam
>
> I applied your V4 patch on v11.0.0-rc2, but encountered following error during migration.
>
> (qemu) migrate_set_parameter x-rdma-chunk-size 2m # only 1m worked
> (qemu) migrate -d rdma:192.168.33.1:6666
> (qemu) qemu-system-x86_64: rdma migration: send polling control error
> qemu-system-x86_64: RDMA is in an error state waiting migration to abort!
> qemu-system-x86_64: failed to save SaveStateEntry with id(name): 1(ram): -1
> qemu-system-x86_64: Channel error: Operation not permitted
> qemu-system-x86_64: warning: Early error. Sending error.
> qemu-system-x86_64: warning: rdma migration: send polling control error
>
> Any idea ?


Hi Zhijian,

The perf data is based on commit 
fff352b9b6080e580aa1fadd29b4eccf4cb2922a. I just tried v11.0.0-rc2, and 
it also works on my side.

One thing to note is that `migrate_set_parameter x-rdma-chunk-size 2m` 
should be run on both the source and destination QEMU before migration starts.

Regards
Sam


>
>
>
>> NIC: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7
>> network controller
>> 8GB RAM VM.
>> workload in guest: `stress-ng --vm 4 --vm-bytes {size}G --vm-method
>> rand-set --timeout 0`
>>
>> | transport | chunk size | stress-ng size (G) | time(ms) | throughput |
>> |-----------|------------|--------------------|----------|------------|
>> | tcp       | n/a        | 1                  | 44955    | 1245      |
>> | rdma      | 1m         | 1                  | 46893    | 1240      |
>> | rdma      | 2m         | 1                  | 47374    | 1308      |
>> | rdma      | 8m         | 1                  | 38271    | 1542      |
>> | rdma      | 16m        | 1                  | 38901    | 1558      |
>> | rdma      | 32m        | 1                  | 13895    | 3762      |
>> | rdma      | 64m        | 1                  | 4288     | 14357      |
>> | rdma      | 64m        | 2                  | 5405     | 13719      |
>> | rdma      | 64m        | 4                  | 9976     | 9475      |
>> | rdma      | 128m       | 4                  | 3345     | 27428      |
>> | rdma      | 256m       | 4                  | 3537     | 26097      |
>> | rdma      | 512m       | 4                  | 3417     | 28440      |
>> | rdma      | 1024m      | 4                  | 3412     | 27637      |
>>
>> Regards
>> Sam
>>
>>
>> On 2026/4/1 23:56, Peter Xu wrote:
>>> [You don't often get email from peterx@redhat.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>>
>>> On Tue, Mar 31, 2026 at 06:33:23PM +0800, Zhang, GuoQing (Sam) wrote:
>>>> On 2026/3/31 11:30, Zhijian Li (Fujitsu) wrote:
>>>>> [Some people who received this message don't often get email from lizhijian@fujitsu.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>>>>
>>>>> On 31/03/2026 00:10, Peter Xu wrote:
>>>>>> Hi, Samuel,
>>>>>>
>>>>>> On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
>>>>>>> The default 1MB RDMA chunk size causes slow live migration because
>>>>>>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>>>>>>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>>>>>>
>>>>>>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>>>>>>> faster migration.
>>>>>>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>>>>>>
>>>>>>> Performance with RDMA live migration of 8GB RAM VM:
>>>>>>>
>>>>>>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>>>>>>> |-----------------------|----------|-------------------|
>>>>>>> | 1M (default)          | 37.915   |  1,007            |
>>>>>> This is the default. It surprised me a bit knowing it can only reach 1GB/s
>>>>>> throughput with the current code base.  Do you know why?  I thought RDMA
>>>>>> should be much faster than this on throughput with whatever hardware setup.
>>>>> Regarding the baseline performance, Samuel's numbers look reasonable. I checked
>>>>> some of my old test data on a ConnectX-4 Lx card years ago, and the throughput
>>>>> was around 10 Gbps (~1.25 GB/s), which is consistent with the 1 GB/s he reported.
>>>>>
>>>>>>> | 32M                   | 17.880   |  2,260            |
>>>>>>> | 1024M                 |  4.368   | 17,529            |
>>>>> My guess for the dramatic performance improvement is that a larger chunk size
>>>>> allows qemu_rdma_write() to batch more *contiguous dirty pages* into a single,
>>>>> more efficient RDMA send operation.
>>>> The `throughput` data is collected from `info migrate` qemu monitor command
>>>> after live-migration.
>>>>
>>>> Yes, Zhijian is right. As each chunk triggers a write_flush and each flush
>>>> involves posting an RDMA WRITE and WAITING for completion, there's software
>>>> overhead here.
>>>>
>>>> For 8GB RAM VM migration, 1MB chunk size produces ~15000 flushes. The
>>>> software overhead adds up and prevents the RDMA hardware from sustaining
>>>> high throughput.
>>>>
>>>> When chunk size is 1GB, there are ~3700 flushes. Reduced flush count means
>>>> reduced software overhead and improved overall throughput.
>>>>
>>> OK, thanks both.
>>>
>>>>> Is there any workloads running on the guest during the migration, or just an idle guest? @Samuel
>>>> The guest is idle when I test the migration and collect the data.
>>>>
>>>>
>>>>> Given the significant benefit and the fact that the patch itself is straightforward,
>>>>> I think it's a worthwhile addition.
>>>>>
>>>>> Acked-by: Li Zhijian <lizhijian@fujitsu.com>
>>>> Thank you for the ack, Zhijian!
>>>>
>>>>
>>>>>
>>>>>>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
>>>>>> One thing to mention is RDMA migration is in odd-fixes stage, actually it
>>>>>> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
>>>>>> I actually won't suggest we add any new knobs for performance reasons.
>>>>>>
>>>>>> Do you have a strong reason to propose this patch to land upstream?  Is it
>>>>>> used in production systems and it solves some real problems for you?
>>>> We have VMs with large RAM and find TCP live-migration is not fast enough
>>>> and expect RDMA migration can be faster.
>>>>
>>>> But we found the rdma mode migration speed is slower than tcp mode. See
>>>> following data.
>>>>
>>>>
>>>> 8GB RAM idle VM live-migration performance:
>>>> | transport mode       | time (s) | throughput (MB/s) |
>>>> |----------------------|----------|-------------------|
>>>> | TCP                  | 36.89    |  1,081            |
>>> What is the NIC setup?  Did you try to enable multifd to offload zeropage
>>> detections?  Or is that not feasible due to some reason?
>>>
>>>> | RDMA, 1MB chunk size | 37.915   |  1,007            |
>>>> | RDMA, 1GB chunk size |  4.368   | 17,529            |
>>>>
>>>> This patch allows us to use larger chunk size for faster RDMA migration.
>>> Sure, Zhijian's point is reasonable.  If he's fine, I'm OK.
>>>
>>> Thanks,
>>>
>>>> Regards
>>>> Sam
>>>>
>>>>
>>>>>> I also wonder what Zhijian would say on this.
>>>>>>
>>>>>> Thanks,
>>>>>>
>>> --
>>> Peter Xu
>>>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-04-03  9:59             ` Zhang, GuoQing (Sam)
@ 2026-04-07  6:15               ` Zhijian Li (Fujitsu)
  0 siblings, 0 replies; 14+ messages in thread
From: Zhijian Li (Fujitsu) @ 2026-04-07  6:15 UTC (permalink / raw)
  To: Zhang, GuoQing (Sam), Peter Xu
  Cc: Samuel Zhang, qemu-devel@nongnu.org, farosas@suse.de,
	eblake@redhat.com, armbru@redhat.com, Emily.Deng@amd.com,
	Victor.Zhao@amd.com, PengJu.Zhou@amd.com, Qing.Ma@amd.com,
	Guoqing Zhang



On 03/04/2026 17:59, Zhang, GuoQing (Sam) wrote:
> 
> On 2026/4/3 17:39, Zhijian Li (Fujitsu) wrote:
>> [Some people who received this message don't often get email from lizhijian@fujitsu.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>
>> On 03/04/2026 14:15, Zhang, GuoQing (Sam) wrote:
>>> [You don't often get email from guoqzhan@amd.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>>
>>> Hi Peter,
>>>
>>> The NIC is `Mellanox Technologies MT43244 BlueField-3 integrated
>>> ConnectX-7 network controller`. 2 Servers directly connected with the
>>> RDMA cable.
>>>
>>> I tried to enable multifd with following monitor cmd on source VM, but
>>> migration failed with error `Failed to connect to '192.168.100.3:4440':
>>> Connection timed out`. I don't know why.
>>>
>>> ```
>>> migrate_set_capability multifd on
>>> migrate_set_parameter multifd-channels 8
>>> ```
>>>
>>> Hi Zhijian,
>>>
>>> Following is the perf data you requested.
>>
>> Hello Sam
>>
>> I applied your V4 patch on v11.0.0-rc2, but encountered the following error during migration.
>>
>> (qemu) migrate_set_parameter x-rdma-chunk-size 2m # only 1m worked
>> (qemu) migrate -d rdma:192.168.33.1:6666
>> (qemu) qemu-system-x86_64: rdma migration: send polling control error
>> qemu-system-x86_64: RDMA is in an error state waiting migration to abort!
>> qemu-system-x86_64: failed to save SaveStateEntry with id(name): 1(ram): -1
>> qemu-system-x86_64: Channel error: Operation not permitted
>> qemu-system-x86_64: warning: Early error. Sending error.
>> qemu-system-x86_64: warning: rdma migration: send polling control error
>>
>> Any idea ?
> 
> 
> Hi Zhijian,
> 
> the perf data is based on commit fff352b9b6080e580aa1fadd29b4eccf4cb2922a. I just tried v11.0.0-rc2, it's also working on my side.
> 
> one thing to notice is that `migrate_set_parameter x-rdma-chunk-size 2m` should be run on both source and dest qemu before migration starts.

You were right, I missed setting this parameter on the destination side. It works now, thanks for the hint!
  
Here is my test environment and some observations:
  
ENV:
Mellanox Technologies MT27710 Family [ConnectX-4 Lx]
VM mem: 4g
workload: stress-ng --vm 4 --vm-bytes {size}G --vm-method rand-set --timeout 0
  
- For migrations that could complete (with stress-ng using <2G memory), the performance improvement is significant (10x+).
- For migrations that were unable to complete (still active after 10 minutes), the throughput reported by `info migrate`
   fluctuates wildly, ranging from 1x to over 20x, as shown in [1].
  
So, feel free to add to v4:
Tested-by: Li Zhijian <lizhijian@fujitsu.com>



[1]
(qemu) info migrate
Status:                 active
Time (ms):              total=237411, setup=5, exp_down=29519
RAM info:
   Throughput (Mbps):    1076.76
   Sizes:                pagesize=4 KiB, total=4.02 GiB
   Transfers:            transferred=57.1 GiB, remain=3.32 GiB
     Channels:           precopy=57.1 GiB, multifd=0 B, postcopy=0 B
     Page Types:         normal=14961923, zero=33
   Page Rates (pps):     transfer=32860, dirty=45497
   Others:               dirty_syncs=17
(qemu)
(qemu)
(qemu) info migrate
Status:                 active
Time (ms):              total=238506, setup=5, exp_down=6674
RAM info:
   Throughput (Mbps):    4762.50
   Sizes:                pagesize=4 KiB, total=4.02 GiB
   Transfers:            transferred=57.3 GiB, remain=3.07 GiB
     Channels:           precopy=57.3 GiB, multifd=0 B, postcopy=0 B
     Page Types:         normal=15027630, zero=33
   Page Rates (pps):     transfer=145340, dirty=45497
   Others:               dirty_syncs=17
(qemu) info migrate
Status:                 active
Time (ms):              total=309989, setup=5, exp_down=2207
RAM info:
   Throughput (Mbps):    14233.11
   Sizes:                pagesize=4 KiB, total=4.02 GiB
   Transfers:            transferred=73.3 GiB, remain=1.58 GiB
     Channels:           precopy=73.3 GiB, multifd=0 B, postcopy=0 B
     Page Types:         normal=19211335, zero=33
   Page Rates (pps):     transfer=434360, dirty=67389
   Others:               dirty_syncs=21
(qemu) info migrate
Status:                 active
Time (ms):              total=321980, setup=5, exp_down=29503
RAM info:
   Throughput (Mbps):    1079.05
   Sizes:                pagesize=4 KiB, total=4.02 GiB
   Transfers:            transferred=75.3 GiB, remain=3.21 GiB
     Channels:           precopy=75.3 GiB, multifd=0 B, postcopy=0 B
     Page Types:         normal=19746487, zero=33
   Page Rates (pps):     transfer=32930, dirty=66398
   Others:               dirty_syncs=22
(qemu) info migrate
Status:                 active
Time (ms):              total=326092, setup=5, exp_down=12821
RAM info:
   Throughput (Mbps):    2482.98
   Sizes:                pagesize=4 KiB, total=4.02 GiB
   Transfers:            transferred=77.7 GiB, remain=849 MiB
     Channels:           precopy=77.7 GiB, multifd=0 B, postcopy=0 B
     Page Types:         normal=20370821, zero=33
   Page Rates (pps):     transfer=75774, dirty=66398
   Others:               dirty_syncs=22


Thanks
Zhijian

>>
>>
>>> NIC: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7
>>> network controller
>>> 8GB RAM VM.
>>> workload in guest: `stress-ng --vm 4 --vm-bytes {size}G --vm-method
>>> rand-set --timeout 0`
>>>
>>> | transport | chunk size | stress-ng size (G) | time(ms) | throughput |
>>> |-----------|------------|--------------------|----------|------------|
>>> | tcp       | n/a        | 1                  | 44955    | 1245       |
>>> | rdma      | 1m         | 1                  | 46893    | 1240       |
>>> | rdma      | 2m         | 1                  | 47374    | 1308       |
>>> | rdma      | 8m         | 1                  | 38271    | 1542       |
>>> | rdma      | 16m        | 1                  | 38901    | 1558       |
>>> | rdma      | 32m        | 1                  | 13895    | 3762       |
>>> | rdma      | 64m        | 1                  | 4288     | 14357      |
>>> | rdma      | 64m        | 2                  | 5405     | 13719      |
>>> | rdma      | 64m        | 4                  | 9976     | 9475       |
>>> | rdma      | 128m       | 4                  | 3345     | 27428      |
>>> | rdma      | 256m       | 4                  | 3537     | 26097      |
>>> | rdma      | 512m       | 4                  | 3417     | 28440      |
>>> | rdma      | 1024m      | 4                  | 3412     | 27637      |
>>>
>>> Regards
>>> Sam
>>>
>>>
>>> On 2026/4/1 23:56, Peter Xu wrote:
>>>> [You don't often get email from peterx@redhat.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>>>
>>>> On Tue, Mar 31, 2026 at 06:33:23PM +0800, Zhang, GuoQing (Sam) wrote:
>>>>> On 2026/3/31 11:30, Zhijian Li (Fujitsu) wrote:
>>>>>> [Some people who received this message don't often get email from lizhijian@fujitsu.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>>>>>
>>>>>> On 31/03/2026 00:10, Peter Xu wrote:
>>>>>>> Hi, Samuel,
>>>>>>>
>>>>>>> On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
>>>>>>>> The default 1MB RDMA chunk size causes slow live migration because
>>>>>>>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>>>>>>>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>>>>>>>
>>>>>>>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>>>>>>>> faster migration.
>>>>>>>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>>>>>>>
>>>>>>>> Performance with RDMA live migration of 8GB RAM VM:
>>>>>>>>
>>>>>>>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>>>>>>>> |-----------------------|----------|-------------------|
>>>>>>>> | 1M (default)          | 37.915   |  1,007            |
>>>>>>> This is the default. It surprised me a bit knowing it can only reach 1GB/s
>>>>>>> throughput with the current code base.  Do you know why?  I thought RDMA
>>>>>>> should be much faster than this on throughput with whatever hardware setup.
>>>>>> Regarding the baseline performance, Samuel's numbers look reasonable. I checked
>>>>>> some of my old test data on a ConnectX-4 Lx card years ago, and the throughput
>>>>>> was around 10 Gbps (~1.25 GB/s), which is consistent with the 1 GB/s he reported.
>>>>>>
>>>>>>>> | 32M                   | 17.880   |  2,260            |
>>>>>>>> | 1024M                 |  4.368   | 17,529            |
>>>>>> My guess for the dramatic performance improvement is that a larger chunk size
>>>>>> allows qemu_rdma_write() to batch more *contiguous dirty pages* into a single,
>>>>>> more efficient RDMA send operation.
>>>>> The `throughput` data is collected from `info migrate` qemu monitor command
>>>>> after live-migration.
>>>>>
>>>>> Yes, Zhijian is right. As each chunk triggers a write_flush and each flush
>>>>> involves posting an RDMA WRITE and WAITING for completion, there's software
>>>>> overhead here.
>>>>>
>>>>> For 8GB RAM VM migration, 1MB chunk size produces ~15000 flushes. The
>>>>> software overhead adds up and prevents the RDMA hardware from sustaining
>>>>> high throughput.
>>>>>
>>>>> When chunk size is 1GB, there are ~3700 flushes. Reduced flush count means
>>>>> reduced software overhead and improved overall throughput.
>>>>>
>>>> OK, thanks both.
>>>>
>>>>>> Are there any workloads running on the guest during the migration, or just an idle guest? @Samuel
>>>>> The guest is idle when I test the migration and collect the data.
>>>>>
>>>>>
>>>>>> Given the significant benefit and the fact that the patch itself is straightforward,
>>>>>> I think it's a worthwhile addition.
>>>>>>
>>>>>> Acked-by: Li Zhijian <lizhijian@fujitsu.com>
>>>>> Thank you for the ack, Zhijian!
>>>>>
>>>>>
>>>>>>
>>>>>>>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
>>>>>>> One thing to mention is RDMA migration is in odd-fixes stage, actually it
>>>>>>> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
>>>>>>> I actually won't suggest we add any new knobs for performance reasons.
>>>>>>>
>>>>>>> Do you have a strong reason to propose this patch to land upstream?  Is it
>>>>>>> used in production systems and it solves some real problems for you?
>>>>> We have VMs with large RAM and find TCP live-migration is not fast enough
>>>>> and expect RDMA migration can be faster.
>>>>>
>>>>> But we found the rdma mode migration speed is slower than tcp mode. See
>>>>> following data.
>>>>>
>>>>>
>>>>> 8GB RAM idle VM live-migration performance:
>>>>> | transport mode       | time (s) | throughput (MB/s) |
>>>>> |----------------------|----------|-------------------|
>>>>> | TCP                  | 36.89    |  1,081            |
>>>> What is the NIC setup?  Did you try to enable multifd to offload zeropage
>>>> detections?  Or is that not feasible due to some reason?
>>>>
>>>>> | RDMA, 1MB chunk size | 37.915   |  1,007            |
>>>>> | RDMA, 1GB chunk size |  4.368   | 17,529            |
>>>>>
>>>>> This patch allows us to use larger chunk size for faster RDMA migration.
>>>> Sure, Zhijian's point is reasonable.  If he's fine, I'm OK.
>>>>
>>>> Thanks,
>>>>
>>>>> Regards
>>>>> Sam
>>>>>
>>>>>
>>>>>>> I also wonder what Zhijian would say on this.
>>>>>>>
>>>>>>> Thanks,
>>>>>>>
>>>> -- 
>>>> Peter Xu
>>>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
  2026-03-30 16:10 ` Peter Xu
  2026-03-31  3:30   ` Zhijian Li (Fujitsu)
@ 2026-03-31 11:06   ` Markus Armbruster
  1 sibling, 0 replies; 14+ messages in thread
From: Markus Armbruster @ 2026-03-31 11:06 UTC (permalink / raw)
  To: Peter Xu
  Cc: Samuel Zhang, qemu-devel, farosas, lizhijian, eblake, Emily.Deng,
	Victor.Zhao, PengJu.Zhou, Qing.Ma

Peter Xu <peterx@redhat.com> writes:

[...]

> One thing to mention is RDMA migration is in odd-fixes stage, actually it
> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
> I actually won't suggest we add any new knobs for performance reasons.

Good point.

> Do you have a strong reason to propose this patch to land upstream?  Is it
> used in production systems and it solves some real problems for you?

If you use it in production, finding an upstream maintainer is in your
best interest.  Give it a thought.

> I also wonder what Zhijian would say on this.
>
> Thanks,



^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2026-04-07 19:25 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --)
2026-03-27  6:50 [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter Samuel Zhang
2026-03-27  9:45 ` Markus Armbruster
2026-03-27 10:27   ` Zhang, GuoQing (Sam)
2026-03-27 11:24     ` Markus Armbruster
2026-03-30 16:10 ` Peter Xu
2026-03-31  3:30   ` Zhijian Li (Fujitsu)
2026-03-31 10:33     ` Zhang, GuoQing (Sam)
2026-03-31 11:29       ` Zhijian Li (Fujitsu)
2026-04-01 15:56       ` Peter Xu
2026-04-03  6:15         ` Zhang, GuoQing (Sam)
2026-04-03  9:39           ` Zhijian Li (Fujitsu)
2026-04-03  9:59             ` Zhang, GuoQing (Sam)
2026-04-07  6:15               ` Zhijian Li (Fujitsu)
2026-03-31 11:06   ` Markus Armbruster

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.