public inbox for qemu-devel@nongnu.org
 help / color / mirror / Atom feed
From: Markus Armbruster <armbru@redhat.com>
To: Samuel Zhang <guoqing.zhang@amd.com>
Cc: <qemu-devel@nongnu.org>,  <Emily.Deng@amd.com>,  <PengJu.Zhou@amd.com>
Subject: Re: [PATCH] migration/rdma: add x-rdma-chunk-shift parameter
Date: Thu, 19 Mar 2026 09:48:30 +0100	[thread overview]
Message-ID: <874imcciht.fsf@pond.sub.org> (raw)
In-Reply-To: <20260316062308.1240426-1-guoqing.zhang@amd.com> (Samuel Zhang's message of "Mon, 16 Mar 2026 14:23:08 +0800")

Samuel Zhang <guoqing.zhang@amd.com> writes:

> The default 1MB RDMA chunk size causes slow live migration because
> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
> 1MB chunks produce ~15000 flushes vs ~3700 with 1GB chunks.
>
> Add x-rdma-chunk-shift parameter to configure the RDMA chunk size
> (2^N bytes) for faster migration.
> Usage: -global migration.x-rdma-chunk-shift=30
>
> Performance with RDMA live migration of 8GB RAM VM:
>
> | x-rdma-chunk-shift | chunk size | time (s) | throughput (Mbps) |
> |--------------------|------------|----------|-------------------|
> | 20 (default)       | 1 MB       | 37.915   | 1,007             |
> | 25                 | 32 MB      | 17.880   | 2,260             |
> | 30                 | 1 GB       |  4.368   | 17,529            |
>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> ---
>  migration/options.c | 13 +++++++++++++
>  migration/options.h |  1 +
>  migration/rdma.c    | 37 ++++++++++++++++++++++---------------
>  qapi/migration.json |  9 ++++++++-
>  4 files changed, 44 insertions(+), 16 deletions(-)
>
> diff --git a/migration/options.c b/migration/options.c
> index f33b297929..1503ae35a2 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -90,6 +90,7 @@ const PropertyInfo qdev_prop_StrOrNull;
>  
>  #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD     1000    /* milliseconds */
>  #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT            1       /* MB/s */
> +#define DEFAULT_MIGRATE_X_RDMA_CHUNK_SHIFT          20      /* 1MB */
>  
>  const Property migration_properties[] = {
>      DEFINE_PROP_BOOL("store-global-state", MigrationState,
> @@ -183,6 +184,9 @@ const Property migration_properties[] = {
>      DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState,
>                         parameters.zero_page_detection,
>                         ZERO_PAGE_DETECTION_MULTIFD),
> +    DEFINE_PROP_UINT8("x-rdma-chunk-shift", MigrationState,
> +                      parameters.x_rdma_chunk_shift,
> +                      DEFAULT_MIGRATE_X_RDMA_CHUNK_SHIFT),
>  
>      /* Migration capabilities */
>      DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
> @@ -993,6 +997,15 @@ ZeroPageDetection migrate_zero_page_detection(void)
>      return s->parameters.zero_page_detection;
>  }
>  
> +uint8_t migrate_rdma_chunk_shift(void)
> +{
> +    MigrationState *s = migrate_get_current();
> +    uint8_t chunk_shift = s->parameters.x_rdma_chunk_shift;
> +
> +    assert(20 <= chunk_shift && chunk_shift <= 30);

Where is this ensured?

> +    return chunk_shift;
> +}
> +
>  /* parameters helpers */
>  
>  AnnounceParameters *migrate_announce_params(void)
> diff --git a/migration/options.h b/migration/options.h
> index b502871097..3f214465a3 100644
> --- a/migration/options.h
> +++ b/migration/options.h
> @@ -87,6 +87,7 @@ const char *migrate_tls_creds(void);
>  const char *migrate_tls_hostname(void);
>  uint64_t migrate_xbzrle_cache_size(void);
>  ZeroPageDetection migrate_zero_page_detection(void);
> +uint8_t migrate_rdma_chunk_shift(void);
>  
>  /* parameters helpers */
>  
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 55ab85650a..d914a7cd3b 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -44,11 +44,18 @@
>  
>  #define RDMA_RESOLVE_TIMEOUT_MS 10000
>  
> -/* Do not merge data if larger than this. */
> -#define RDMA_MERGE_MAX (2 * 1024 * 1024)
> -#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
> +#define RDMA_SIGNALED_SEND_MAX 512
> +
> +static inline uint64_t rdma_chunk_size(void)
> +{
> +    return 1UL << migrate_rdma_chunk_shift();
> +}
>  
> -#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
> +/* Do not merge data if larger than this. */
> +static inline uint64_t rdma_merge_max(void)
> +{
> +    return rdma_chunk_size() * 2;
> +}
>  
>  /*
>   * This is only for non-live state being migrated.
> @@ -527,21 +534,21 @@ static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
>  static inline uint64_t ram_chunk_index(const uint8_t *start,
>                                         const uint8_t *host)
>  {
> -    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
> +    return ((uintptr_t) host - (uintptr_t) start) >> migrate_rdma_chunk_shift();
>  }
>  
>  static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
>                                         uint64_t i)
>  {
>      return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
> -                                  (i << RDMA_REG_CHUNK_SHIFT));
> +                                  (i << migrate_rdma_chunk_shift()));
>  }
>  
>  static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
>                                       uint64_t i)
>  {
>      uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
> -                                         (1UL << RDMA_REG_CHUNK_SHIFT);
> +                                         rdma_chunk_size();
>  
>      if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
>          result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
> @@ -1841,6 +1848,7 @@ static int qemu_rdma_write_one(RDMAContext *rdma,
>      struct ibv_send_wr *bad_wr;
>      int reg_result_idx, ret, count = 0;
>      uint64_t chunk, chunks;
> +    uint64_t chunk_size = rdma_chunk_size();
>      uint8_t *chunk_start, *chunk_end;
>      RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
>      RDMARegister reg;
> @@ -1861,22 +1869,21 @@ retry:
>      chunk_start = ram_chunk_start(block, chunk);
>  
>      if (block->is_ram_block) {
> -        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
> +        chunks = length / chunk_size;
>  
> -        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
> +        if (chunks && ((length % chunk_size) == 0)) {
>              chunks--;
>          }
>      } else {
> -        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
> +        chunks = block->length / chunk_size;
>  
> -        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
> +        if (chunks && ((block->length % chunk_size) == 0)) {
>              chunks--;
>          }
>      }
>  
>      trace_qemu_rdma_write_one_top(chunks + 1,
> -                                  (chunks + 1) *
> -                                  (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
> +                                  (chunks + 1) * chunk_size / 1024 / 1024);
>  
>      chunk_end = ram_chunk_end(block, chunk + chunks);
>  
> @@ -2176,7 +2183,7 @@ static int qemu_rdma_write(RDMAContext *rdma,
>      rdma->current_length += len;
>  
>      /* flush it if buffer is too large */
> -    if (rdma->current_length >= RDMA_MERGE_MAX) {
> +    if (rdma->current_length >= rdma_merge_max()) {
>          return qemu_rdma_write_flush(rdma, errp);
>      }
>  
> @@ -3522,7 +3529,7 @@ int rdma_registration_handle(QEMUFile *f)
>                  } else {
>                      chunk = reg->key.chunk;
>                      host_addr = block->local_host_addr +
> -                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
> +                        (reg->key.chunk * rdma_chunk_size());
>                      /* Check for particularly bad chunk value */
>                      if (host_addr < (void *)block->local_host_addr) {
>                          error_report("rdma: bad chunk for block %s"
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 7134d4ce47..0521bf3d69 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -1007,9 +1007,14 @@
>  #     is @cpr-exec.  The first list element is the program's filename,
>  #     the remainder its arguments.  (Since 10.2)
>  #
> +# @x-rdma-chunk-shift: RDMA memory registration chunk shift.
> +#     The chunk size is 2^N bytes where N is the value.

The value of what?  Oh, the value of @x-rdma-chunk-shift.

Acceptable range?  I doubt 0 or 255 work :)

Would this be easier to document if we make it a byte count
@x-rdma-chunk-size, must be a power of two?

> +#     Defaults to 20 (1 MiB).  Only takes effect for RDMA migration.

> +#     (Since 10.2)

11.0 right now, but realistically 11.1.

> +#
>  # Features:
>  #
> -# @unstable: Members @x-checkpoint-delay and
> +# @unstable: Members @x-rdma-chunk-shift, @x-checkpoint-delay and
>  #     @x-vcpu-dirty-limit-period are experimental.

Keep the list of members sorted:

   # @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-shift, and
   #     @x-vcpu-dirty-limit-period are experimental.

>  #
>  # Since: 2.4
> @@ -1045,6 +1050,8 @@
>              '*vcpu-dirty-limit': 'uint64',
>              '*mode': 'MigMode',
>              '*zero-page-detection': 'ZeroPageDetection',
> +            '*x-rdma-chunk-shift': { 'type': 'uint8',
> +                                     'features': [ 'unstable' ] },
>              '*direct-io': 'bool',
>              '*cpr-exec-command': [ 'str' ]} }



  reply	other threads:[~2026-03-19  8:49 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-16  6:23 [PATCH] migration/rdma: add x-rdma-chunk-shift parameter Samuel Zhang
2026-03-19  8:48 ` Markus Armbruster [this message]
2026-03-20  3:39   ` Zhang, GuoQing (Sam)
2026-03-20  7:42     ` Markus Armbruster
2026-03-26  2:58 ` [PATCH v2] migration/rdma: add x-rdma-chunk-size parameter Samuel Zhang
2026-03-26  6:57   ` Markus Armbruster
2026-03-27  4:21     ` Zhang, GuoQing (Sam)
2026-03-27  6:05       ` Markus Armbruster

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=874imcciht.fsf@pond.sub.org \
    --to=armbru@redhat.com \
    --cc=Emily.Deng@amd.com \
    --cc=PengJu.Zhou@amd.com \
    --cc=guoqing.zhang@amd.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox.