From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:39143) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1X5YW2-0003xc-IW for qemu-devel@nongnu.org; Fri, 11 Jul 2014 06:57:27 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1X5YVx-0002m0-Vc for qemu-devel@nongnu.org; Fri, 11 Jul 2014 06:57:22 -0400 Received: from lputeaux-656-01-25-125.w80-12.abo.wanadoo.fr ([80.12.84.125]:54607 helo=paradis.irqsave.net) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1X5YVx-0002kh-Mu for qemu-devel@nongnu.org; Fri, 11 Jul 2014 06:57:17 -0400 Date: Fri, 11 Jul 2014 12:56:53 +0200 From: =?iso-8859-1?Q?Beno=EEt?= Canet Message-ID: <20140711105653.GB4493@irqsave.net> References: <1405047682-26102-1-git-send-email-namei.unix@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=iso-8859-1 Content-Disposition: inline In-Reply-To: <1405047682-26102-1-git-send-email-namei.unix@gmail.com> Content-Transfer-Encoding: quoted-printable Subject: Re: [Qemu-devel] [PATCH] block/quorum: add simple read pattern support List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Liu Yuan Cc: Kevin Wolf , qemu-devel@nongnu.org, Stefan Hajnoczi The Friday 11 Jul 2014 =E0 11:01:22 (+0800), Liu Yuan wrote : > This patch adds single read pattern to quorum driver and quorum vote is= default > pattern. >=20 > For now we do a quorum vote on all the reads, it is designed for unreli= able > underlying storage such as non-redundant NFS to make sure data integrit= y at the > cost of the read performance. >=20 > For some use cases as following: >=20 > VM > -------------- > | | > v v > A B >=20 > Both A and B has hardware raid storage to justify the data integrity on= its own. > So it would help performance if we do a single read instead of on all t= he nodes. > Further, if we run VM on either of the storage node, we can make a loca= l read > request for better performance. >=20 > This patch generalize the above 2 nodes case in the N nodes. That is, >=20 > vm -> write to all the N nodes, read just one of them. If single read f= ails, we > try to read next node in FIFO order specified by the startup command. >=20 > The 2 nodes case is very similar to DRBD[1] though lack of auto-sync > functionality in the single device/node failure for now. But compared w= ith DRBD > we still have some advantages over it: >=20 > - Suppose we have 20 VMs running on one(assume A) of 2 nodes' DRBD back= ed > storage. And if A crashes, we need to restart all the VMs on node B. Bu= t for > practice case, we can't because B might not have enough resources to se= tup 20 VMs > at once. So if we run our 20 VMs with quorum driver, and scatter the re= plicated > images over the data center, we can very likely restart 20 VMs without = any > resource problem. >=20 > After all, I think we can build a more powerful replicated image functi= onality > on quorum and block jobs(block mirror) to meet various High Availibilit= y needs. >=20 > E.g, Enable single read pattern on 2 children, >=20 > -drive driver=3Dquorum,children.0.file.filename=3D0.qcow2,\ > children.1.file.filename=3D1.qcow2,read-pattern=3Dsingle,vote-threshold= =3D1 >=20 > [1] http://en.wikipedia.org/wiki/Distributed_Replicated_Block_Device >=20 > Cc: Benoit Canet > Cc: Kevin Wolf > Cc: Stefan Hajnoczi > Signed-off-by: Liu Yuan > --- > block/quorum.c | 174 +++++++++++++++++++++++++++++++++++++++++--------= -------- > 1 file changed, 125 insertions(+), 49 deletions(-) >=20 > diff --git a/block/quorum.c b/block/quorum.c > index d5ee9c0..2f18755 100644 > --- a/block/quorum.c > +++ b/block/quorum.c > @@ -24,6 +24,7 @@ > #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" > #define QUORUM_OPT_BLKVERIFY "blkverify" > #define QUORUM_OPT_REWRITE "rewrite-corrupted" > +#define QUORUM_OPT_READ_PATTERN "read-pattern" > =20 > /* This union holds a vote hash value */ > typedef union QuorumVoteValue { > @@ -74,6 +75,16 @@ typedef struct BDRVQuorumState { > bool rewrite_corrupted;/* true if the driver must rewrite-on-read = corrupted > * block if Quorum is reached. > */ > + > +#define READ_PATTERN_QUORUM 0 /* default */ > +#define READ_PATTERN_SINGLE 1 Why not making these choices an enum ? Would READ_PATTERN_SINGLE be more accuratelly described by READ_PATTERN_R= OUND_ROBIN ? More generaly would s/single/round-robin/ be more descriptive. > + int read_pattern; /* single: read a single child and try firs= t one > + * first. If error, try next child = in an > + * FIFO order specifed by command l= ine. > + * Return error if no child read su= cceeds. > + * quorum: read all the children and do a q= uorum > + * vote on reads. > + */ > } BDRVQuorumState; > =20 > typedef struct QuorumAIOCB QuorumAIOCB; > @@ -117,6 +128,7 @@ struct QuorumAIOCB { > =20 > bool is_read; > int vote_ret; > + int child_iter; /* which child to read in single patte= rn */ > }; > =20 > static bool quorum_vote(QuorumAIOCB *acb); > @@ -256,6 +268,21 @@ static void quorum_rewrite_aio_cb(void *opaque, in= t ret) > quorum_aio_finalize(acb); > } > =20 > +static BlockDriverAIOCB *read_single_child(QuorumAIOCB *acb); > + > +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > +{ > + int i; > + assert(dest->niov =3D=3D source->niov); > + assert(dest->size =3D=3D source->size); > + for (i =3D 0; i < source->niov; i++) { > + assert(dest->iov[i].iov_len =3D=3D source->iov[i].iov_len); > + memcpy(dest->iov[i].iov_base, > + source->iov[i].iov_base, > + source->iov[i].iov_len); > + } > +} > + > static void quorum_aio_cb(void *opaque, int ret) > { > QuorumChildRequest *sacb =3D opaque; > @@ -263,6 +290,21 @@ static void quorum_aio_cb(void *opaque, int ret) > BDRVQuorumState *s =3D acb->common.bs->opaque; > bool rewrite =3D false; > =20 > + if (acb->is_read && s->read_pattern =3D=3D READ_PATTERN_SINGLE) { > + /* We try to read next child in FIFO order if we fail to read = */ > + if (ret < 0 && ++acb->child_iter < s->num_children) { > + read_single_child(acb); Here the previous failed read is never finalized/freed. > + return; > + } > + > + if (ret =3D=3D 0) { > + quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qi= ov); > + } > + acb->vote_ret =3D ret; > + quorum_aio_finalize(acb); > + return; > + } > + > sacb->ret =3D ret; > acb->count++; > if (ret =3D=3D 0) { > @@ -276,7 +318,6 @@ static void quorum_aio_cb(void *opaque, int ret) > return; > } > =20 > - /* Do the vote on read */ Why removing this comment ? > if (acb->is_read) { > rewrite =3D quorum_vote(acb); > } else { > @@ -343,19 +384,6 @@ static bool quorum_rewrite_bad_versions(BDRVQuorum= State *s, QuorumAIOCB *acb, > return count; > } > =20 > -static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > -{ > - int i; > - assert(dest->niov =3D=3D source->niov); > - assert(dest->size =3D=3D source->size); > - for (i =3D 0; i < source->niov; i++) { > - assert(dest->iov[i].iov_len =3D=3D source->iov[i].iov_len); > - memcpy(dest->iov[i].iov_base, > - source->iov[i].iov_base, > - source->iov[i].iov_len); > - } > -} > - > static void quorum_count_vote(QuorumVotes *votes, > QuorumVoteValue *value, > int index) > @@ -615,34 +643,61 @@ free_exit: > return rewrite; > } > =20 > -static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > - int64_t sector_num, > - QEMUIOVector *qiov, > - int nb_sectors, > - BlockDriverCompletionFunc *cb= , > - void *opaque) > +static BlockDriverAIOCB *read_all_children(QuorumAIOCB *acb) > { > - BDRVQuorumState *s =3D bs->opaque; > - QuorumAIOCB *acb =3D quorum_aio_get(s, bs, qiov, sector_num, > - nb_sectors, cb, opaque); > + BDRVQuorumState *s =3D acb->common.bs->opaque; > int i; > =20 > - acb->is_read =3D true; > - > for (i =3D 0; i < s->num_children; i++) { > - acb->qcrs[i].buf =3D qemu_blockalign(s->bs[i], qiov->size); > - qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov); > - qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf); > + acb->qcrs[i].buf =3D qemu_blockalign(s->bs[i], acb->qiov->size= ); > + qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov); > + qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].b= uf); > } > =20 > for (i =3D 0; i < s->num_children; i++) { > - bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_se= ctors, > - quorum_aio_cb, &acb->qcrs[i]); > + bdrv_aio_readv(s->bs[i], acb->sector_num, &acb->qcrs[i].qiov, > + acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]); > } > =20 > return &acb->common; > } > =20 > +static BlockDriverAIOCB *read_single_child(QuorumAIOCB *acb) > +{ > + BDRVQuorumState *s =3D acb->common.bs->opaque; > + > + acb->qcrs[acb->child_iter].buf =3D qemu_blockalign(s->bs[acb->chil= d_iter], > + acb->qiov->size); > + qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov)= ; > + qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, > + acb->qcrs[acb->child_iter].buf); > + bdrv_aio_readv(s->bs[acb->child_iter], acb->sector_num, > + &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, > + quorum_aio_cb, &acb->qcrs[acb->child_iter]); > + > + return &acb->common; > +} > + > +static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > + int64_t sector_num, > + QEMUIOVector *qiov, > + int nb_sectors, > + BlockDriverCompletionFunc *c= b, > + void *opaque) > +{ > + BDRVQuorumState *s =3D bs->opaque; > + QuorumAIOCB *acb =3D quorum_aio_get(s, bs, qiov, sector_num, > + nb_sectors, cb, opaque); > + acb->is_read =3D true; > + > + if (s->read_pattern =3D=3D READ_PATTERN_QUORUM) { > + return read_all_children(acb); > + } > + > + acb->child_iter =3D 0; > + return read_single_child(acb); > +} > + > static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs, > int64_t sector_num, > QEMUIOVector *qiov, > @@ -782,6 +837,11 @@ static QemuOptsList quorum_runtime_opts =3D { > .type =3D QEMU_OPT_BOOL, > .help =3D "Rewrite corrupted block on read quorum", > }, > + { > + .name =3D QUORUM_OPT_READ_PATTERN, > + .type =3D QEMU_OPT_STRING, > + .help =3D "Allowed pattern: quorum, single. Quorum is defa= ult", > + }, > { /* end of list */ } > }, > }; > @@ -796,6 +856,7 @@ static int quorum_open(BlockDriverState *bs, QDict = *options, int flags, > QDict *sub =3D NULL; > QList *list =3D NULL; > const QListEntry *lentry; > + const char *read_pattern; > int i; > int ret =3D 0; > =20 > @@ -827,28 +888,43 @@ static int quorum_open(BlockDriverState *bs, QDic= t *options, int flags, > } > =20 > s->threshold =3D qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHO= LD, 0); > - > - /* and validate it against s->num_children */ > - ret =3D quorum_valid_threshold(s->threshold, s->num_children, &loc= al_err); > - if (ret < 0) { > - goto exit; > + read_pattern =3D qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN); > + if (read_pattern) { > + if (strcmp(read_pattern, "single") =3D=3D 0) { > + s->read_pattern =3D READ_PATTERN_SINGLE; > + } else if (strcmp(read_pattern, "quorum") =3D=3D 0) { > + s->read_pattern =3D READ_PATTERN_QUORUM; > + } else { > + error_setg(&local_err, > + "Please set read-pattern as single or quorum\n"= ); > + ret =3D -EINVAL; > + goto exit; > + } > } > =20 > - /* is the driver in blkverify mode */ > - if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && > - s->num_children =3D=3D 2 && s->threshold =3D=3D 2) { > - s->is_blkverify =3D true; > - } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { > - fprintf(stderr, "blkverify mode is set by setting blkverify=3D= on " > - "and using two files with vote_threshold=3D2\n"); > - } > + if (s->read_pattern =3D=3D READ_PATTERN_QUORUM) { > + /* and validate it against s->num_children */ > + ret =3D quorum_valid_threshold(s->threshold, s->num_children, = &local_err); > + if (ret < 0) { > + goto exit; > + } > =20 > - s->rewrite_corrupted =3D qemu_opt_get_bool(opts, QUORUM_OPT_REWRIT= E, false); > - if (s->rewrite_corrupted && s->is_blkverify) { > - error_setg(&local_err, > - "rewrite-corrupted=3Don cannot be used with blkveri= fy=3Don"); > - ret =3D -EINVAL; > - goto exit; > + /* is the driver in blkverify mode */ > + if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && > + s->num_children =3D=3D 2 && s->threshold =3D=3D 2) { > + s->is_blkverify =3D true; > + } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false= )) { > + fprintf(stderr, "blkverify mode is set by setting blkverif= y=3Don " > + "and using two files with vote_threshold=3D2\n"); > + } > + > + s->rewrite_corrupted =3D qemu_opt_get_bool(opts, QUORUM_OPT_RE= WRITE, false); > + if (s->rewrite_corrupted && s->is_blkverify) { > + error_setg(&local_err, > + "rewrite-corrupted=3Don cannot be used with blk= verify=3Don"); > + ret =3D -EINVAL; > + goto exit; > + } > } > =20 > /* allocate the children BlockDriverState array */ > --=20 > 1.9.1 >=20