From: Roger Pau Monne <roger.pau@citrix.com>
To: qemu-devel@nongnu.org
Cc: Anthony PERARD <anthony.perard@citrix.com>,
xen-devel@lists.xen.org,
Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>,
Roger Pau Monne <roger.pau@citrix.com>
Subject: [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
Date: Mon, 31 Dec 2012 13:16:14 +0100 [thread overview]
Message-ID: <1356956174-23548-4-git-send-email-roger.pau@citrix.com> (raw)
In-Reply-To: <1356956174-23548-1-git-send-email-roger.pau@citrix.com>
This protocol extension reuses the same set of grant pages for all
transactions between the front/back drivers, avoiding expensive tlb
flushes, grant table lock contention and switches between userspace
and kernel space. The full description of the protocol can be found in
the public blkif.h header.
Speed improvement with 15 guests performing I/O is ~450%.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Cc: xen-devel@lists.xen.org
Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
Cc: Anthony PERARD <anthony.perard@citrix.com>
---
Performance comparison with the previous implementation can be seen in
the followign graph:
http://xenbits.xen.org/people/royger/persistent_read_qemu.png
---
hw/xen_disk.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 138 insertions(+), 17 deletions(-)
diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index 1eb485a..bafeceb 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -52,6 +52,11 @@ static int max_requests = 32;
#define BLOCK_SIZE 512
#define IOCB_COUNT (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
+struct persistent_gnt {
+ void *page;
+ struct XenBlkDev *blkdev;
+};
+
struct ioreq {
blkif_request_t req;
int16_t status;
@@ -69,6 +74,7 @@ struct ioreq {
int prot;
void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
void *pages;
+ int num_unmap;
/* aio status */
int aio_inflight;
@@ -105,6 +111,12 @@ struct XenBlkDev {
int requests_inflight;
int requests_finished;
+ /* Persistent grants extension */
+ gboolean feature_persistent;
+ GTree *persistent_gnts;
+ unsigned int persistent_gnt_c;
+ unsigned int max_grants;
+
/* qemu block driver */
DriveInfo *dinfo;
BlockDriverState *bs;
@@ -138,6 +150,29 @@ static void ioreq_reset(struct ioreq *ioreq)
qemu_iovec_reset(&ioreq->v);
}
+static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+ uint ua = GPOINTER_TO_UINT(a);
+ uint ub = GPOINTER_TO_UINT(b);
+ return (ua > ub) - (ua < ub);
+}
+
+static void destroy_grant(gpointer pgnt)
+{
+ struct persistent_gnt *grant = pgnt;
+ XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
+
+ if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
+ xen_be_printf(&grant->blkdev->xendev, 0,
+ "xc_gnttab_munmap failed: %s\n",
+ strerror(errno));
+ }
+ grant->blkdev->persistent_gnt_c--;
+ xen_be_printf(&grant->blkdev->xendev, 3,
+ "unmapped grant %p\n", grant->page);
+ g_free(grant);
+}
+
static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
{
struct ioreq *ioreq = NULL;
@@ -266,21 +301,21 @@ static void ioreq_unmap(struct ioreq *ioreq)
XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
int i;
- if (ioreq->v.niov == 0 || ioreq->mapped == 0) {
+ if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
return;
}
if (batch_maps) {
if (!ioreq->pages) {
return;
}
- if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
+ if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
strerror(errno));
}
- ioreq->blkdev->cnt_map -= ioreq->v.niov;
+ ioreq->blkdev->cnt_map -= ioreq->num_unmap;
ioreq->pages = NULL;
} else {
- for (i = 0; i < ioreq->v.niov; i++) {
+ for (i = 0; i < ioreq->num_unmap; i++) {
if (!ioreq->page[i]) {
continue;
}
@@ -298,41 +333,107 @@ static void ioreq_unmap(struct ioreq *ioreq)
static int ioreq_map(struct ioreq *ioreq)
{
XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
- int i;
+ uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ int i, j, new_maps = 0;
+ struct persistent_gnt *grant;
if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
return 0;
}
- if (batch_maps) {
+ if (ioreq->blkdev->feature_persistent) {
+ for (i = 0; i < ioreq->v.niov; i++) {
+ grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
+ GUINT_TO_POINTER(ioreq->refs[i]));
+
+ if (grant != NULL) {
+ page[i] = grant->page;
+ xen_be_printf(&ioreq->blkdev->xendev, 3,
+ "using persistent-grant %" PRIu32 "\n",
+ ioreq->refs[i]);
+ } else {
+ /* Add the grant to the list of grants that
+ * should be mapped
+ */
+ domids[new_maps] = ioreq->domids[i];
+ refs[new_maps] = ioreq->refs[i];
+ page[i] = NULL;
+ new_maps++;
+ }
+ }
+ /* Set the protection to RW, since grants may be reused later
+ * with a different protection than the one needed for this request
+ */
+ ioreq->prot = PROT_WRITE | PROT_READ;
+ } else {
+ /* All grants in the request should be mapped */
+ memcpy(refs, ioreq->refs, sizeof(refs));
+ memcpy(domids, ioreq->domids, sizeof(domids));
+ memset(page, 0, sizeof(page));
+ new_maps = ioreq->v.niov;
+ }
+
+ if (batch_maps && new_maps) {
ioreq->pages = xc_gnttab_map_grant_refs
- (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
+ (gnt, new_maps, domids, refs, ioreq->prot);
if (ioreq->pages == NULL) {
xen_be_printf(&ioreq->blkdev->xendev, 0,
"can't map %d grant refs (%s, %d maps)\n",
- ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
+ new_maps, strerror(errno), ioreq->blkdev->cnt_map);
return -1;
}
- for (i = 0; i < ioreq->v.niov; i++) {
- ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
- (uintptr_t)ioreq->v.iov[i].iov_base;
+ for (i = 0, j = 0; i < ioreq->v.niov; i++) {
+ if (page[i] == NULL)
+ page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
}
- ioreq->blkdev->cnt_map += ioreq->v.niov;
- } else {
- for (i = 0; i < ioreq->v.niov; i++) {
+ ioreq->blkdev->cnt_map += new_maps;
+ } else if (new_maps) {
+ for (i = 0; i < new_maps; i++) {
ioreq->page[i] = xc_gnttab_map_grant_ref
- (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
+ (gnt, domids[i], refs[i], ioreq->prot);
if (ioreq->page[i] == NULL) {
xen_be_printf(&ioreq->blkdev->xendev, 0,
"can't map grant ref %d (%s, %d maps)\n",
- ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
+ refs[i], strerror(errno), ioreq->blkdev->cnt_map);
ioreq_unmap(ioreq);
return -1;
}
- ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
ioreq->blkdev->cnt_map++;
}
+ for (i = 0, j = 0; i < ioreq->v.niov; i++) {
+ if (page[i] == NULL)
+ page[i] = ioreq->page[j++];
+ }
+ }
+ if (ioreq->blkdev->feature_persistent) {
+ while((ioreq->blkdev->persistent_gnt_c < ioreq->blkdev->max_grants) &&
+ new_maps) {
+ /* Go through the list of newly mapped grants and add as many
+ * as possible to the list of persistently mapped grants
+ */
+ grant = g_malloc0(sizeof(*grant));
+ new_maps--;
+ if (batch_maps)
+ grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
+ else
+ grant->page = ioreq->page[new_maps];
+ grant->blkdev = ioreq->blkdev;
+ xen_be_printf(&ioreq->blkdev->xendev, 3,
+ "adding grant %" PRIu32 " page: %p\n",
+ refs[new_maps], grant->page);
+ g_tree_insert(ioreq->blkdev->persistent_gnts,
+ GUINT_TO_POINTER(refs[new_maps]),
+ grant);
+ ioreq->blkdev->persistent_gnt_c++;
+ }
+ }
+ for (i = 0; i < ioreq->v.niov; i++) {
+ ioreq->v.iov[i].iov_base = page[i] +
+ (uintptr_t)ioreq->v.iov[i].iov_base;
}
ioreq->mapped = 1;
+ ioreq->num_unmap = new_maps;
return 0;
}
@@ -690,6 +791,7 @@ static int blk_init(struct XenDevice *xendev)
/* fill info */
xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
+ xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
xenstore_write_be_int(&blkdev->xendev, "info", info);
xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk);
xenstore_write_be_int(&blkdev->xendev, "sectors",
@@ -713,6 +815,7 @@ out_error:
static int blk_connect(struct XenDevice *xendev)
{
struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
+ int pers;
if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
return -1;
@@ -721,6 +824,11 @@ static int blk_connect(struct XenDevice *xendev)
&blkdev->xendev.remote_port) == -1) {
return -1;
}
+ if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
+ blkdev->feature_persistent = FALSE;
+ } else {
+ blkdev->feature_persistent = !!pers;
+ }
blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
if (blkdev->xendev.protocol) {
@@ -764,6 +872,15 @@ static int blk_connect(struct XenDevice *xendev)
}
}
+ if (blkdev->feature_persistent) {
+ /* Init persistent grants */
+ blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
+ NULL, NULL,
+ (GDestroyNotify)destroy_grant);
+ blkdev->persistent_gnt_c = 0;
+ }
+
xen_be_bind_evtchn(&blkdev->xendev);
xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
@@ -804,6 +921,10 @@ static int blk_free(struct XenDevice *xendev)
blk_disconnect(xendev);
}
+ /* Free persistent grants */
+ if (blkdev->feature_persistent)
+ g_tree_destroy(blkdev->persistent_gnts);
+
while (!QLIST_EMPTY(&blkdev->freelist)) {
ioreq = QLIST_FIRST(&blkdev->freelist);
QLIST_REMOVE(ioreq, list);
--
1.7.7.5 (Apple Git-26)
next prev parent reply other threads:[~2012-12-31 12:16 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-12-31 12:16 [Qemu-devel] [PATCH RFC 0/3] xen pv disk persistent grants implementation Roger Pau Monne
2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs Roger Pau Monne
2013-01-03 14:21 ` [Qemu-devel] [Xen-devel] " Konrad Rzeszutek Wilk
2013-01-03 14:28 ` Ian Campbell
2013-01-04 14:54 ` Stefano Stabellini
2013-01-04 15:05 ` Roger Pau Monné
2013-01-04 15:30 ` Stefano Stabellini
2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 2/3] xen_disk: fix memory leak Roger Pau Monne
2013-01-04 15:06 ` Stefano Stabellini
2012-12-31 12:16 ` Roger Pau Monne [this message]
2013-01-04 16:42 ` [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend Stefano Stabellini
2013-01-04 17:37 ` Roger Pau Monné
2013-01-04 18:02 ` Stefano Stabellini
2013-01-04 21:01 ` Blue Swirl
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1356956174-23548-4-git-send-email-roger.pau@citrix.com \
--to=roger.pau@citrix.com \
--cc=Stefano.Stabellini@eu.citrix.com \
--cc=anthony.perard@citrix.com \
--cc=qemu-devel@nongnu.org \
--cc=xen-devel@lists.xen.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).