From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:38016) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1WS7UE-0001vb-Um for qemu-devel@nongnu.org; Mon, 24 Mar 2014 12:12:36 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1WS7U9-0005W4-US for qemu-devel@nongnu.org; Mon, 24 Mar 2014 12:12:30 -0400 Received: from mx1.redhat.com ([209.132.183.28]:36018) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1WS7U9-0005Vs-J7 for qemu-devel@nongnu.org; Mon, 24 Mar 2014 12:12:25 -0400 Message-ID: <533052DA.9010209@redhat.com> Date: Mon, 24 Mar 2014 16:44:26 +0100 From: Paolo Bonzini MIME-Version: 1.0 References: <1395675241-18859-1-git-send-email-pl@kamp.de> In-Reply-To: <1395675241-18859-1-git-send-email-pl@kamp.de> Content-Type: text/plain; charset=ISO-8859-15; format=flowed Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [RFC PATCH] block/iscsi: speed up read for unallocated sectors List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Peter Lieven , qemu-devel@nongnu.org Cc: kwolf@redhat.com, ronniesahlberg@gmail.com, stefanha@redhat.com Il 24/03/2014 16:34, Peter Lieven ha scritto: > this patch implements a cache that tracks if a page on the > iscsi target is allocated or not. The cache is implemented in > a way that it allows for false positives > (e.g. pretending a page is allocated, but it isn't), but > no false negatives. > > The cached allocation info is then used to speed up the > read process for unallocated sectors by issueing a GET_LBA_STATUS > request for all sectors that are not yet known to be allocated. > If the read request is confirmed to fall into an unallocated > range we directly return zeroes and do not transfer the > data over the wire. > > Tests have shown that a relatively small amount of GET_LBA_STATUS > requests happens a vServer boot time to fill the allocation cache > (all those blocks are not queried again). > > Not to transfer all the data of unallocated sectors saves a lot > of time, bandwidth and storage I/O load during block jobs or storage > migration and it saves a lot of bandwidth as well for any big sequential > read of the whole disk (e.g. block copy or speed tests) if a significant > number of blocks is unallocated. > > Signed-off-by: Peter Lieven Nice---but this should only be done if BDRV_O_NOCACHE is clear. Paolo > --- > block/iscsi.c | 281 ++++++++++++++++++++++++++++++++++++--------------------- > 1 file changed, 179 insertions(+), 102 deletions(-) > > diff --git a/block/iscsi.c b/block/iscsi.c > index b490e98..72f0f58 100644 > --- a/block/iscsi.c > +++ b/block/iscsi.c > @@ -30,6 +30,8 @@ > #include "qemu-common.h" > #include "qemu/config-file.h" > #include "qemu/error-report.h" > +#include "qemu/bitops.h" > +#include "qemu/bitmap.h" > #include "block/block_int.h" > #include "trace.h" > #include "block/scsi.h" > @@ -59,6 +61,8 @@ typedef struct IscsiLun { > struct scsi_inquiry_logical_block_provisioning lbp; > struct scsi_inquiry_block_limits bl; > unsigned char *zeroblock; > + unsigned long *allocationmap; > + int cluster_sectors; > } IscsiLun; > > typedef struct IscsiTask { > @@ -91,6 +95,7 @@ typedef struct IscsiAIOCB { > #define NOP_INTERVAL 5000 > #define MAX_NOP_FAILURES 3 > #define ISCSI_CMD_RETRIES 5 > +#define ISCSI_CHECKALLOC_THRES 63 > > static void > iscsi_bh_cb(void *p) > @@ -273,6 +278,22 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, > return 1; > } > > +static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num, int nb_sectors) { > + if (!iscsilun->cluster_sectors) { > + return; > + } > + bitmap_set(iscsilun->allocationmap, sector_num / iscsilun->cluster_sectors, > + DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); > +} > + > +static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num, int nb_sectors) { > + if (!iscsilun->cluster_sectors) { > + return; > + } > + bitmap_clear(iscsilun->allocationmap, sector_num / iscsilun->cluster_sectors, > + DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); > +} > + > static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, > int64_t sector_num, int nb_sectors, > QEMUIOVector *iov) > @@ -336,9 +357,120 @@ retry: > return -EIO; > } > > + iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); > + > return 0; > } > > + > +static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num, int nb_sectors) { > + unsigned long size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); > + > + if (!iscsilun->cluster_sectors) { > + return true; > + } > + > + return !(find_next_bit(iscsilun->allocationmap, size, > + sector_num / iscsilun->cluster_sectors) == size); > +} > + > + > +#if defined(LIBISCSI_FEATURE_IOVECTOR) > + > +static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, > + int64_t sector_num, > + int nb_sectors, int *pnum) > +{ > + IscsiLun *iscsilun = bs->opaque; > + struct scsi_get_lba_status *lbas = NULL; > + struct scsi_lba_status_descriptor *lbasd = NULL; > + struct IscsiTask iTask; > + int64_t ret; > + > + iscsi_co_init_iscsitask(iscsilun, &iTask); > + > + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { > + ret = -EINVAL; > + goto out; > + } > + > + /* default to all sectors allocated */ > + ret = BDRV_BLOCK_DATA; > + ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; > + *pnum = nb_sectors; > + > + /* LUN does not support logical block provisioning */ > + if (iscsilun->lbpme == 0) { > + goto out; > + } > + > +retry: > + if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, > + sector_qemu2lun(sector_num, iscsilun), > + 8 + 16, iscsi_co_generic_cb, > + &iTask) == NULL) { > + ret = -ENOMEM; > + goto out; > + } > + > + while (!iTask.complete) { > + iscsi_set_events(iscsilun); > + qemu_coroutine_yield(); > + } > + > + if (iTask.do_retry) { > + if (iTask.task != NULL) { > + scsi_free_scsi_task(iTask.task); > + iTask.task = NULL; > + } > + iTask.complete = 0; > + goto retry; > + } > + > + if (iTask.status != SCSI_STATUS_GOOD) { > + /* in case the get_lba_status_callout fails (i.e. > + * because the device is busy or the cmd is not > + * supported) we pretend all blocks are allocated > + * for backwards compatibility */ > + goto out; > + } > + > + lbas = scsi_datain_unmarshall(iTask.task); > + if (lbas == NULL) { > + ret = -EIO; > + goto out; > + } > + > + lbasd = &lbas->descriptors[0]; > + > + if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { > + ret = -EIO; > + goto out; > + } > + > + *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); > + if (*pnum > nb_sectors) { > + *pnum = nb_sectors; > + } > + > + if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || > + lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { > + ret &= ~BDRV_BLOCK_DATA; > + if (iscsilun->lbprz) { > + ret |= BDRV_BLOCK_ZERO; > + } > + } > + > +out: > + if (iTask.task != NULL) { > + scsi_free_scsi_task(iTask.task); > + } > + return ret; > +} > + > +#endif /* LIBISCSI_FEATURE_IOVECTOR */ > + > + > static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, > int64_t sector_num, int nb_sectors, > QEMUIOVector *iov) > @@ -355,6 +487,27 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, > return -EINVAL; > } > > +#if defined(LIBISCSI_FEATURE_IOVECTOR) > + if (iscsilun->lbprz && nb_sectors > ISCSI_CHECKALLOC_THRES && > + !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { > + int64_t ret; > + int pnum; > + ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum); > + if (ret < 0) { > + return ret; > + } > + if (ret & BDRV_BLOCK_ZERO) { > + iscsi_allocationmap_clear(iscsilun, sector_num, pnum); > + if (pnum >= nb_sectors) { > + qemu_iovec_memset(iov, 0, 0x00, iov->size); > + return 0; > + } > + } else { > + iscsi_allocationmap_set(iscsilun, sector_num, pnum); > + } > + } > +#endif > + > lba = sector_qemu2lun(sector_num, iscsilun); > num_sectors = sector_qemu2lun(nb_sectors, iscsilun); > > @@ -639,101 +792,6 @@ iscsi_getlength(BlockDriverState *bs) > return len; > } > > -#if defined(LIBISCSI_FEATURE_IOVECTOR) > - > -static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, > - int64_t sector_num, > - int nb_sectors, int *pnum) > -{ > - IscsiLun *iscsilun = bs->opaque; > - struct scsi_get_lba_status *lbas = NULL; > - struct scsi_lba_status_descriptor *lbasd = NULL; > - struct IscsiTask iTask; > - int64_t ret; > - > - iscsi_co_init_iscsitask(iscsilun, &iTask); > - > - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { > - ret = -EINVAL; > - goto out; > - } > - > - /* default to all sectors allocated */ > - ret = BDRV_BLOCK_DATA; > - ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; > - *pnum = nb_sectors; > - > - /* LUN does not support logical block provisioning */ > - if (iscsilun->lbpme == 0) { > - goto out; > - } > - > -retry: > - if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, > - sector_qemu2lun(sector_num, iscsilun), > - 8 + 16, iscsi_co_generic_cb, > - &iTask) == NULL) { > - ret = -ENOMEM; > - goto out; > - } > - > - while (!iTask.complete) { > - iscsi_set_events(iscsilun); > - qemu_coroutine_yield(); > - } > - > - if (iTask.do_retry) { > - if (iTask.task != NULL) { > - scsi_free_scsi_task(iTask.task); > - iTask.task = NULL; > - } > - iTask.complete = 0; > - goto retry; > - } > - > - if (iTask.status != SCSI_STATUS_GOOD) { > - /* in case the get_lba_status_callout fails (i.e. > - * because the device is busy or the cmd is not > - * supported) we pretend all blocks are allocated > - * for backwards compatibility */ > - goto out; > - } > - > - lbas = scsi_datain_unmarshall(iTask.task); > - if (lbas == NULL) { > - ret = -EIO; > - goto out; > - } > - > - lbasd = &lbas->descriptors[0]; > - > - if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { > - ret = -EIO; > - goto out; > - } > - > - *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); > - if (*pnum > nb_sectors) { > - *pnum = nb_sectors; > - } > - > - if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || > - lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { > - ret &= ~BDRV_BLOCK_DATA; > - if (iscsilun->lbprz) { > - ret |= BDRV_BLOCK_ZERO; > - } > - } > - > -out: > - if (iTask.task != NULL) { > - scsi_free_scsi_task(iTask.task); > - } > - return ret; > -} > - > -#endif /* LIBISCSI_FEATURE_IOVECTOR */ > - > static int > coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, > int nb_sectors) > @@ -859,6 +917,12 @@ retry: > return -EIO; > } > > + if (flags & BDRV_REQ_MAY_UNMAP) { > + iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); > + } else { > + iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); > + } > + > return 0; > } > > @@ -1288,6 +1352,19 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, > timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); > #endif > > + /* Guess the internal cluster (page) size of the iscsi target by the means > + * of opt_unmap_gran. Transfer the unmap granularity only if it has a > + * reasonable size */ > + if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && > + iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { > + iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * iscsilun->block_size) >> BDRV_SECTOR_BITS; > +#if defined(LIBISCSI_FEATURE_IOVECTOR) > + if (iscsilun->lbprz) { > + iscsilun->allocationmap = bitmap_new(DIV_ROUND_UP(bs->total_sectors, iscsilun->cluster_sectors)); > + } > +#endif > + } > + > out: > qemu_opts_del(opts); > if (initiator_name != NULL) { > @@ -1321,6 +1398,7 @@ static void iscsi_close(BlockDriverState *bs) > qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL); > iscsi_destroy_context(iscsi); > g_free(iscsilun->zeroblock); > + g_free(iscsilun->allocationmap); > memset(iscsilun, 0, sizeof(IscsiLun)); > } > > @@ -1379,6 +1457,11 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset) > return -EINVAL; > } > > + if (iscsilun->allocationmap) { > + g_free(iscsilun->allocationmap); > + iscsilun->allocationmap = bitmap_new(DIV_ROUND_UP(bs->total_sectors, iscsilun->cluster_sectors)); > + } > + > return 0; > } > > @@ -1441,13 +1524,7 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) > IscsiLun *iscsilun = bs->opaque; > bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; > bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; > - /* Guess the internal cluster (page) size of the iscsi target by the means > - * of opt_unmap_gran. Transfer the unmap granularity only if it has a > - * reasonable size for bdi->cluster_size */ > - if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && > - iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { > - bdi->cluster_size = iscsilun->bl.opt_unmap_gran * iscsilun->block_size; > - } > + bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; > return 0; > } > >