All of lore.kernel.org
 help / color / mirror / Atom feed
From: Markus Armbruster <armbru@redhat.com>
To: Samuel Zhang <guoqing.zhang@amd.com>
Cc: <qemu-devel@nongnu.org>,  <Emily.Deng@amd.com>,  <PengJu.Zhou@amd.com>
Subject: Re: [PATCH] migration/rdma: add x-rdma-chunk-shift parameter
Date: Thu, 19 Mar 2026 09:48:30 +0100	[thread overview]
Message-ID: <874imcciht.fsf@pond.sub.org> (raw)
In-Reply-To: <20260316062308.1240426-1-guoqing.zhang@amd.com> (Samuel Zhang's message of "Mon, 16 Mar 2026 14:23:08 +0800")

Samuel Zhang <guoqing.zhang@amd.com> writes:

> The default 1MB RDMA chunk size causes slow live migration because
> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
> 1MB chunks produce ~15000 flushes vs ~3700 with 1GB chunks.
>
> Add x-rdma-chunk-shift parameter to configure the RDMA chunk size
> (2^N bytes) for faster migration.
> Usage: -global migration.x-rdma-chunk-shift=30
>
> Performance with RDMA live migration of 8GB RAM VM:
>
> | x-rdma-chunk-shift | chunk size | time (s) | throughput (Mbps) |
> |--------------------|------------|----------|-------------------|
> | 20 (default)       | 1 MB       | 37.915   | 1,007             |
> | 25                 | 32 MB      | 17.880   | 2,260             |
> | 30                 | 1 GB       |  4.368   | 17,529            |
>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> ---
>  migration/options.c | 13 +++++++++++++
>  migration/options.h |  1 +
>  migration/rdma.c    | 37 ++++++++++++++++++++++---------------
>  qapi/migration.json |  9 ++++++++-
>  4 files changed, 44 insertions(+), 16 deletions(-)
>
> diff --git a/migration/options.c b/migration/options.c
> index f33b297929..1503ae35a2 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -90,6 +90,7 @@ const PropertyInfo qdev_prop_StrOrNull;
>  
>  #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD     1000    /* milliseconds */
>  #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT            1       /* MB/s */
> +#define DEFAULT_MIGRATE_X_RDMA_CHUNK_SHIFT          20      /* 1MB */
>  
>  const Property migration_properties[] = {
>      DEFINE_PROP_BOOL("store-global-state", MigrationState,
> @@ -183,6 +184,9 @@ const Property migration_properties[] = {
>      DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState,
>                         parameters.zero_page_detection,
>                         ZERO_PAGE_DETECTION_MULTIFD),
> +    DEFINE_PROP_UINT8("x-rdma-chunk-shift", MigrationState,
> +                      parameters.x_rdma_chunk_shift,
> +                      DEFAULT_MIGRATE_X_RDMA_CHUNK_SHIFT),
>  
>      /* Migration capabilities */
>      DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
> @@ -993,6 +997,15 @@ ZeroPageDetection migrate_zero_page_detection(void)
>      return s->parameters.zero_page_detection;
>  }
>  
> +uint8_t migrate_rdma_chunk_shift(void)
> +{
> +    MigrationState *s = migrate_get_current();
> +    uint8_t chunk_shift = s->parameters.x_rdma_chunk_shift;
> +
> +    assert(20 <= chunk_shift && chunk_shift <= 30);

Where is this ensured?

> +    return chunk_shift;
> +}
> +
>  /* parameters helpers */
>  
>  AnnounceParameters *migrate_announce_params(void)
> diff --git a/migration/options.h b/migration/options.h
> index b502871097..3f214465a3 100644
> --- a/migration/options.h
> +++ b/migration/options.h
> @@ -87,6 +87,7 @@ const char *migrate_tls_creds(void);
>  const char *migrate_tls_hostname(void);
>  uint64_t migrate_xbzrle_cache_size(void);
>  ZeroPageDetection migrate_zero_page_detection(void);
> +uint8_t migrate_rdma_chunk_shift(void);
>  
>  /* parameters helpers */
>  
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 55ab85650a..d914a7cd3b 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -44,11 +44,18 @@
>  
>  #define RDMA_RESOLVE_TIMEOUT_MS 10000
>  
> -/* Do not merge data if larger than this. */
> -#define RDMA_MERGE_MAX (2 * 1024 * 1024)
> -#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
> +#define RDMA_SIGNALED_SEND_MAX 512
> +
> +static inline uint64_t rdma_chunk_size(void)
> +{
> +    return 1UL << migrate_rdma_chunk_shift();
> +}
>  
> -#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
> +/* Do not merge data if larger than this. */
> +static inline uint64_t rdma_merge_max(void)
> +{
> +    return rdma_chunk_size() * 2;
> +}
>  
>  /*
>   * This is only for non-live state being migrated.
> @@ -527,21 +534,21 @@ static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
>  static inline uint64_t ram_chunk_index(const uint8_t *start,
>                                         const uint8_t *host)
>  {
> -    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
> +    return ((uintptr_t) host - (uintptr_t) start) >> migrate_rdma_chunk_shift();
>  }
>  
>  static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
>                                         uint64_t i)
>  {
>      return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
> -                                  (i << RDMA_REG_CHUNK_SHIFT));
> +                                  (i << migrate_rdma_chunk_shift()));
>  }
>  
>  static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
>                                       uint64_t i)
>  {
>      uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
> -                                         (1UL << RDMA_REG_CHUNK_SHIFT);
> +                                         rdma_chunk_size();
>  
>      if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
>          result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
> @@ -1841,6 +1848,7 @@ static int qemu_rdma_write_one(RDMAContext *rdma,
>      struct ibv_send_wr *bad_wr;
>      int reg_result_idx, ret, count = 0;
>      uint64_t chunk, chunks;
> +    uint64_t chunk_size = rdma_chunk_size();
>      uint8_t *chunk_start, *chunk_end;
>      RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
>      RDMARegister reg;
> @@ -1861,22 +1869,21 @@ retry:
>      chunk_start = ram_chunk_start(block, chunk);
>  
>      if (block->is_ram_block) {
> -        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
> +        chunks = length / chunk_size;
>  
> -        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
> +        if (chunks && ((length % chunk_size) == 0)) {
>              chunks--;
>          }
>      } else {
> -        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
> +        chunks = block->length / chunk_size;
>  
> -        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
> +        if (chunks && ((block->length % chunk_size) == 0)) {
>              chunks--;
>          }
>      }
>  
>      trace_qemu_rdma_write_one_top(chunks + 1,
> -                                  (chunks + 1) *
> -                                  (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
> +                                  (chunks + 1) * chunk_size / 1024 / 1024);
>  
>      chunk_end = ram_chunk_end(block, chunk + chunks);
>  
> @@ -2176,7 +2183,7 @@ static int qemu_rdma_write(RDMAContext *rdma,
>      rdma->current_length += len;
>  
>      /* flush it if buffer is too large */
> -    if (rdma->current_length >= RDMA_MERGE_MAX) {
> +    if (rdma->current_length >= rdma_merge_max()) {
>          return qemu_rdma_write_flush(rdma, errp);
>      }
>  
> @@ -3522,7 +3529,7 @@ int rdma_registration_handle(QEMUFile *f)
>                  } else {
>                      chunk = reg->key.chunk;
>                      host_addr = block->local_host_addr +
> -                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
> +                        (reg->key.chunk * rdma_chunk_size());
>                      /* Check for particularly bad chunk value */
>                      if (host_addr < (void *)block->local_host_addr) {
>                          error_report("rdma: bad chunk for block %s"
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 7134d4ce47..0521bf3d69 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -1007,9 +1007,14 @@
>  #     is @cpr-exec.  The first list element is the program's filename,
>  #     the remainder its arguments.  (Since 10.2)
>  #
> +# @x-rdma-chunk-shift: RDMA memory registration chunk shift.
> +#     The chunk size is 2^N bytes where N is the value.

The value of what?  Oh, the value of @x-rdma-chunk-shift.

Acceptable range?  I doubt 0 or 255 work :)

Would this be easier to document if we made it a byte count,
@x-rdma-chunk-size, which must be a power of two?

> +#     Defaults to 20 (1 MiB).  Only takes effect for RDMA migration.

> +#     (Since 10.2)

11.0 right now, but realistically 11.1.

> +#
>  # Features:
>  #
> -# @unstable: Members @x-checkpoint-delay and
> +# @unstable: Members @x-rdma-chunk-shift, @x-checkpoint-delay and
>  #     @x-vcpu-dirty-limit-period are experimental.

Keep the list of members sorted:

   # @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-shift, and
   #     @x-vcpu-dirty-limit-period are experimental.

>  #
>  # Since: 2.4
> @@ -1045,6 +1050,8 @@
>              '*vcpu-dirty-limit': 'uint64',
>              '*mode': 'MigMode',
>              '*zero-page-detection': 'ZeroPageDetection',
> +            '*x-rdma-chunk-shift': { 'type': 'uint8',
> +                                     'features': [ 'unstable' ] },
>              '*direct-io': 'bool',
>              '*cpr-exec-command': [ 'str' ]} }



  reply	other threads:[~2026-03-19  8:49 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-16  6:23 [PATCH] migration/rdma: add x-rdma-chunk-shift parameter Samuel Zhang
2026-03-19  8:48 ` Markus Armbruster [this message]
2026-03-20  3:39   ` Zhang, GuoQing (Sam)
2026-03-20  7:42     ` Markus Armbruster
2026-03-26  2:58 ` [PATCH v2] migration/rdma: add x-rdma-chunk-size parameter Samuel Zhang
2026-03-26  6:57   ` Markus Armbruster
2026-03-27  4:21     ` Zhang, GuoQing (Sam)
2026-03-27  6:05       ` Markus Armbruster

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=874imcciht.fsf@pond.sub.org \
    --to=armbru@redhat.com \
    --cc=Emily.Deng@amd.com \
    --cc=PengJu.Zhou@amd.com \
    --cc=guoqing.zhang@amd.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.