qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Eric Blake <eblake@redhat.com>
To: qemu-devel@nongnu.org
Cc: qemu-block@nongnu.org, alex@alex.org.uk,
	Paolo Bonzini <pbonzini@redhat.com>,
	Kevin Wolf <kwolf@redhat.com>, Max Reitz <mreitz@redhat.com>
Subject: [Qemu-devel] [PATCH v3 44/44] nbd: Implement NBD_OPT_BLOCK_SIZE on client
Date: Fri, 22 Apr 2016 17:40:52 -0600	[thread overview]
Message-ID: <1461368452-10389-45-git-send-email-eblake@redhat.com> (raw)
In-Reply-To: <1461368452-10389-1-git-send-email-eblake@redhat.com>

The upstream NBD Protocol has defined a new extension to allow
the server to advertise block sizes to the client, as well as
a way for the client to inform the server that it intends to
obey block sizes.

Pass any received sizes on to the block layer.

Use the minimum block size as the sector size we pass to the
kernel - which also has the nice effect of cooperating with
(non-qemu) servers that don't do read-modify-write when exposing
a block device with 4k sectors; it can also allow us to visit a
file larger than 2T on a 32-bit kernel.

Signed-off-by: Eric Blake <eblake@redhat.com>
---
 include/block/nbd.h |  3 +++
 block/nbd-client.c  |  3 +++
 block/nbd.c         | 17 +++++++++---
 nbd/client.c        | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 87 insertions(+), 10 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index a5c68df..27a6854 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -133,6 +133,9 @@ enum {
 struct NbdExportInfo {
     uint64_t size;
     uint16_t flags;
+    uint32_t min_block;
+    uint32_t opt_block;
+    uint32_t max_block;
 };
 typedef struct NbdExportInfo NbdExportInfo;

diff --git a/block/nbd-client.c b/block/nbd-client.c
index 2b6ac27..602a8ab 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -443,6 +443,9 @@ int nbd_client_init(BlockDriverState *bs,
         logout("Failed to negotiate with the NBD server\n");
         return ret;
     }
+    if (client->info.min_block > bs->request_alignment) {
+        bs->request_alignment = client->info.min_block;
+    }

     qemu_co_mutex_init(&client->send_mutex);
     qemu_co_mutex_init(&client->free_sema);
diff --git a/block/nbd.c b/block/nbd.c
index 5172039..bb7df55 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -407,9 +407,20 @@ static int nbd_co_flush(BlockDriverState *bs)

 static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
-    bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS;
-    bs->bl.max_write_zeroes = UINT32_MAX >> BDRV_SECTOR_BITS;
-    bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS;
+    NbdClientSession *s = nbd_get_client_session(bs);
+    int max = UINT32_MAX >> BDRV_SECTOR_BITS;
+
+    if (s->info.max_block) {
+        max = s->info.max_block >> BDRV_SECTOR_BITS;
+    }
+    bs->bl.max_discard = max;
+    bs->bl.max_write_zeroes = max;
+    bs->bl.max_transfer_length = max;
+
+    if (s->info.opt_block &&
+        s->info.opt_block >> BDRV_SECTOR_BITS > bs->bl.opt_transfer_length) {
+        bs->bl.opt_transfer_length = s->info.opt_block >> BDRV_SECTOR_BITS;
+    }
 }

 static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
diff --git a/nbd/client.c b/nbd/client.c
index dac4f29..24f6b0b 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -232,6 +232,11 @@ static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply,
                    reply->option);
         break;

+    case NBD_REP_ERR_BLOCK_SIZE_REQD:
+        error_setg(errp, "Server wants OPT_BLOCK_SIZE before option %" PRIx32,
+                   reply->option);
+        break;
+
     default:
         error_setg(errp, "Unknown error code when asking for option %" PRIx32,
                    reply->option);
@@ -333,6 +338,21 @@ static int nbd_opt_go(QIOChannel *ioc, const char *wantname,
      * flags still 0 is a witness of a broken server. */
     info->flags = 0;

+    /* Some servers use NBD_OPT_GO to advertise non-default block
+     * sizes, and require that we first use NBD_OPT_BLOCK_SIZE to
+     * agree to that. */
+    TRACE("Attempting NBD_OPT_BLOCK_SIZE");
+    if (nbd_send_option_request(ioc, NBD_OPT_BLOCK_SIZE, 0, NULL, errp) < 0) {
+        return -1;
+    }
+    if (nbd_receive_option_reply(ioc, NBD_OPT_BLOCK_SIZE, &reply, errp) < 0) {
+        return -1;
+    }
+    error = nbd_handle_reply_err(ioc, &reply, errp);
+    if (error < 0) {
+        return error;
+    }
+
     TRACE("Attempting NBD_OPT_GO for export '%s'", wantname);
     if (nbd_send_option_request(ioc, NBD_OPT_GO, -1, wantname, errp) < 0) {
         return -1;
@@ -402,6 +422,45 @@ static int nbd_opt_go(QIOChannel *ioc, const char *wantname,
                   info->size, info->flags);
             break;

+        case NBD_INFO_BLOCK_SIZE:
+            if (len != sizeof(info->min_block) * 3) {
+                error_setg(errp, "remaining export info len %" PRIu32
+                           " is unexpected size", len);
+                return -1;
+            }
+            if (read_sync(ioc, &info->min_block, sizeof(info->min_block)) !=
+                sizeof(info->min_block)) {
+                error_setg(errp, "failed to read info minimum block size");
+                return -1;
+            }
+            be32_to_cpus(&info->min_block);
+            if (!is_power_of_2(info->min_block)) {
+                error_setg(errp, "server minimum block size %" PRId32
+                           "is not a power of two", info->min_block);
+                return -1;
+            }
+            if (read_sync(ioc, &info->opt_block, sizeof(info->opt_block)) !=
+                sizeof(info->opt_block)) {
+                error_setg(errp, "failed to read info preferred block size");
+                return -1;
+            }
+            be32_to_cpus(&info->opt_block);
+            if (!is_power_of_2(info->opt_block) ||
+                info->opt_block < info->min_block) {
+                error_setg(errp, "server preferred block size %" PRId32
+                           "is not valid", info->opt_block);
+                return -1;
+            }
+            if (read_sync(ioc, &info->max_block, sizeof(info->max_block)) !=
+                sizeof(info->max_block)) {
+                error_setg(errp, "failed to read info maximum block size");
+                return -1;
+            }
+            be32_to_cpus(&info->max_block);
+            TRACE("Block sizes are 0x%" PRIx32 ", 0x%" PRIx32 ", 0x%" PRIx32,
+                  info->min_block, info->opt_block, info->max_block);
+            break;
+
         default:
             TRACE("ignoring unknown export info %" PRIu16, type);
             if (drop_sync(ioc, len) != len) {
@@ -710,8 +769,9 @@ fail:
 #ifdef __linux__
 int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info)
 {
-    unsigned long sectors = info->size / BDRV_SECTOR_SIZE;
-    if (info->size / BDRV_SECTOR_SIZE != sectors) {
+    unsigned long sector_size = MAX(BDRV_SECTOR_SIZE, info->min_block);
+    unsigned long sectors = info->size / sector_size;
+    if (info->size / sector_size != sectors) {
         LOG("Export size %" PRId64 " too large for 32-bit kernel", info->size);
         return -E2BIG;
     }
@@ -724,18 +784,18 @@ int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info)
         return -serrno;
     }

-    TRACE("Setting block size to %lu", (unsigned long)BDRV_SECTOR_SIZE);
+    TRACE("Setting block size to %lu", sector_size);

-    if (ioctl(fd, NBD_SET_BLKSIZE, (unsigned long)BDRV_SECTOR_SIZE) < 0) {
+    if (ioctl(fd, NBD_SET_BLKSIZE, sector_size) < 0) {
         int serrno = errno;
         LOG("Failed setting NBD block size");
         return -serrno;
     }

     TRACE("Setting size to %lu block(s)", sectors);
-    if (size % BDRV_SECTOR_SIZE) {
-        TRACE("Ignoring trailing %d bytes of export",
-              (int) (size % BDRV_SECTOR_SIZE));
+    if (info->size % sector_size) {
+        TRACE("Ignoring trailing %" PRId64 " bytes of export",
+              info->size % sector_size);
     }

     if (ioctl(fd, NBD_SET_SIZE_BLOCKS, sectors) < 0) {
-- 
2.5.5

  parent reply	other threads:[~2016-04-22 23:41 UTC|newest]

Thread overview: 67+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-04-22 23:40 [Qemu-devel] [PATCH v3 00/44] NBD protocol additions Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 01/44] nbd: More debug typo fixes, use correct formats Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 02/44] nbd: Quit server after any write error Eric Blake
2016-04-25  9:21   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 03/44] nbd: Improve server handling of bogus commands Eric Blake
2016-04-25  9:29   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 04/44] nbd: Reject unknown request flags Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 05/44] nbd: Group all Linux-specific ioctl code in one place Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 06/44] nbd: Clean up ioctl handling of qemu-nbd -c Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 07/44] nbd: Limit nbdflags to 16 bits Eric Blake
2016-04-25  9:24   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 08/44] nbd: Add qemu-nbd -D for human-readable description Eric Blake
2016-04-25  9:25   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 09/44] block: Allow BDRV_REQ_FUA through blk_pwrite() Eric Blake
2016-04-23  8:12   ` Denis V. Lunev
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 10/44] fdc: Switch to byte-based block access Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 11/44] nand: " Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 12/44] onenand: " Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 13/44] pflash: " Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 14/44] sd: " Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 15/44] m25p80: " Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 16/44] atapi: " Eric Blake
2016-04-25 21:36   ` John Snow
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 17/44] nbd: " Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 18/44] qemu-img: " Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 19/44] qemu-io: " Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 20/44] block: Switch blk_read_unthrottled() to byte interface Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 21/44] block: Switch blk_write_zeroes() " Eric Blake
2016-04-23  8:12   ` Denis V. Lunev
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 22/44] block: Kill blk_write(), blk_read() Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 23/44] qemu-io: Add missing option documentation Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 24/44] qemu-io: Add 'write -f' to test FUA flag Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 25/44] qemu-io: Add 'open -u' to set BDRV_O_UNMAP after the fact Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 26/44] qemu-io: Add 'write -z -u' to test MAY_UNMAP flag Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 27/44] nbd: Use BDRV_REQ_FUA for better FUA where supported Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 28/44] nbd: Detect servers that send unexpected error values Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 29/44] nbd: Avoid magic number for NBD max name size Eric Blake
2016-04-25  9:32   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 30/44] nbd: Treat flags vs. command type as separate fields Eric Blake
2016-04-25  9:34   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 31/44] nbd: Share common reply-sending code in server Eric Blake
2016-04-25  9:34   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 32/44] nbd: Share common option-sending code in client Eric Blake
2016-04-25  9:37   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 33/44] nbd: Let client skip portions of server reply Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 34/44] nbd: Less allocation during NBD_OPT_LIST Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 35/44] nbd: Support shorter handshake Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 36/44] nbd: Improve handling of shutdown requests Eric Blake
2016-04-25  9:47   ` Alex Bligh
2016-04-25 19:20     ` Eric Blake
2016-04-25 19:40       ` Alex Bligh
2016-04-25 19:48         ` Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 37/44] nbd: Create struct for tracking export info Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 38/44] block: Add blk_get_opt_transfer_length() Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 39/44] nbd: Implement NBD_OPT_GO on server Eric Blake
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 40/44] nbd: Implement NBD_OPT_GO on client Eric Blake
2016-04-25 10:31   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 41/44] nbd: Implement NBD_CMD_WRITE_ZEROES on server Eric Blake
2016-04-23  9:00   ` Pavel Borzenkov
2016-04-25 12:11   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 42/44] nbd: Implement NBD_CMD_WRITE_ZEROES on client Eric Blake
2016-04-25 12:12   ` Alex Bligh
2016-04-22 23:40 ` [Qemu-devel] [PATCH v3 43/44] nbd: Implement NBD_OPT_BLOCK_SIZE on server Eric Blake
2016-04-25 12:16   ` Alex Bligh
2016-04-22 23:40 ` Eric Blake [this message]
2016-04-25 12:19   ` [Qemu-devel] [PATCH v3 44/44] nbd: Implement NBD_OPT_BLOCK_SIZE on client Alex Bligh
2016-04-25 19:16     ` Eric Blake

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1461368452-10389-45-git-send-email-eblake@redhat.com \
    --to=eblake@redhat.com \
    --cc=alex@alex.org.uk \
    --cc=kwolf@redhat.com \
    --cc=mreitz@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).