From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:41288) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1WS7kD-0002H7-1k for qemu-devel@nongnu.org; Mon, 24 Mar 2014 12:29:06 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1WS7k6-0002oK-U5 for qemu-devel@nongnu.org; Mon, 24 Mar 2014 12:29:00 -0400 Received: from mx-v6.kamp.de ([2a02:248:0:51::16]:53935 helo=mx01.kamp.de) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1WS7k6-0002mm-Ft for qemu-devel@nongnu.org; Mon, 24 Mar 2014 12:28:54 -0400 Message-ID: <53305D41.5010909@kamp.de> Date: Mon, 24 Mar 2014 17:28:49 +0100 From: Peter Lieven MIME-Version: 1.0 References: <1395675241-18859-1-git-send-email-pl@kamp.de> <533052DA.9010209@redhat.com> In-Reply-To: <533052DA.9010209@redhat.com> Content-Type: text/plain; charset=ISO-8859-15 Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [RFC PATCH] block/iscsi: speed up read for unallocated sectors List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Paolo Bonzini , qemu-devel@nongnu.org Cc: kwolf@redhat.com, ronniesahlberg@gmail.com, stefanha@redhat.com Am 24.03.2014 16:44, schrieb Paolo Bonzini: > Il 24/03/2014 16:34, Peter Lieven ha scritto: >> this patch implements a cache that tracks if a page on the >> iscsi target is allocated or not. The cache is implemented in >> a way that it allows for false positives >> (e.g. pretending a page is allocated, but it isn't), but >> no false negatives. >> >> The cached allocation info is then used to speed up the >> read process for unallocated sectors by issueing a GET_LBA_STATUS >> request for all sectors that are not yet known to be allocated. >> If the read request is confirmed to fall into an unallocated >> range we directly return zeroes and do not transfer the >> data over the wire. >> >> Tests have shown that a relatively small amount of GET_LBA_STATUS >> requests happens a vServer boot time to fill the allocation cache >> (all those blocks are not queried again). >> >> Not to transfer all the data of unallocated sectors saves a lot >> of time, bandwidth and storage I/O load during block jobs or storage >> migration and it saves a lot of bandwidth as well for any big sequential >> read of the whole disk (e.g. block copy or speed tests) if a significant >> number of blocks is unallocated. >> >> Signed-off-by: Peter Lieven > > Nice---but this should only be done if BDRV_O_NOCACHE is clear. Good point. 2 other things I found meanwhile: - I should set and clear the allocationmap in iscsi_co_get_block_status. Otherwise we would call GET_LBA_STATUS twice in qemu-img convert for unallocated blocks. - I should also clear the allocation map after a successful iscsi_co_discard Peter > > Paolo > >> --- >> block/iscsi.c | 281 ++++++++++++++++++++++++++++++++++++--------------------- >> 1 file changed, 179 insertions(+), 102 deletions(-) >> >> diff --git a/block/iscsi.c b/block/iscsi.c >> index b490e98..72f0f58 100644 >> --- a/block/iscsi.c >> +++ b/block/iscsi.c >> @@ -30,6 +30,8 @@ >> #include "qemu-common.h" >> #include "qemu/config-file.h" >> #include "qemu/error-report.h" >> +#include "qemu/bitops.h" >> +#include "qemu/bitmap.h" >> #include "block/block_int.h" >> #include "trace.h" >> #include "block/scsi.h" >> @@ -59,6 +61,8 @@ typedef struct IscsiLun { >> struct scsi_inquiry_logical_block_provisioning lbp; >> struct scsi_inquiry_block_limits bl; >> unsigned char *zeroblock; >> + unsigned long *allocationmap; >> + int cluster_sectors; >> } IscsiLun; >> >> typedef struct IscsiTask { >> @@ -91,6 +95,7 @@ typedef struct IscsiAIOCB { >> #define NOP_INTERVAL 5000 >> #define MAX_NOP_FAILURES 3 >> #define ISCSI_CMD_RETRIES 5 >> +#define ISCSI_CHECKALLOC_THRES 63 >> >> static void >> iscsi_bh_cb(void *p) >> @@ -273,6 +278,22 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, >> return 1; >> } >> >> +static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num, int nb_sectors) { >> + if (!iscsilun->cluster_sectors) { >> + return; >> + } >> + bitmap_set(iscsilun->allocationmap, sector_num / iscsilun->cluster_sectors, >> + DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); >> +} >> + >> +static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num, int nb_sectors) { >> + if (!iscsilun->cluster_sectors) { >> + return; >> + } >> + bitmap_clear(iscsilun->allocationmap, sector_num / iscsilun->cluster_sectors, >> + DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); >> +} >> + >> static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, >> int64_t sector_num, int nb_sectors, >> QEMUIOVector *iov) >> @@ -336,9 +357,120 @@ retry: >> return -EIO; >> } >> >> + iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); >> + >> return 0; >> } >> >> + >> +static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num, int nb_sectors) { >> + unsigned long size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); >> + >> + if (!iscsilun->cluster_sectors) { >> + return true; >> + } >> + >> + return !(find_next_bit(iscsilun->allocationmap, size, >> + sector_num / iscsilun->cluster_sectors) == size); >> +} >> + >> + >> +#if defined(LIBISCSI_FEATURE_IOVECTOR) >> + >> +static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, >> + int64_t sector_num, >> + int nb_sectors, int *pnum) >> +{ >> + IscsiLun *iscsilun = bs->opaque; >> + struct scsi_get_lba_status *lbas = NULL; >> + struct scsi_lba_status_descriptor *lbasd = NULL; >> + struct IscsiTask iTask; >> + int64_t ret; >> + >> + iscsi_co_init_iscsitask(iscsilun, &iTask); >> + >> + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { >> + ret = -EINVAL; >> + goto out; >> + } >> + >> + /* default to all sectors allocated */ >> + ret = BDRV_BLOCK_DATA; >> + ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; >> + *pnum = nb_sectors; >> + >> + /* LUN does not support logical block provisioning */ >> + if (iscsilun->lbpme == 0) { >> + goto out; >> + } >> + >> +retry: >> + if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, >> + sector_qemu2lun(sector_num, iscsilun), >> + 8 + 16, iscsi_co_generic_cb, >> + &iTask) == NULL) { >> + ret = -ENOMEM; >> + goto out; >> + } >> + >> + while (!iTask.complete) { >> + iscsi_set_events(iscsilun); >> + qemu_coroutine_yield(); >> + } >> + >> + if (iTask.do_retry) { >> + if (iTask.task != NULL) { >> + scsi_free_scsi_task(iTask.task); >> + iTask.task = NULL; >> + } >> + iTask.complete = 0; >> + goto retry; >> + } >> + >> + if (iTask.status != SCSI_STATUS_GOOD) { >> + /* in case the get_lba_status_callout fails (i.e. >> + * because the device is busy or the cmd is not >> + * supported) we pretend all blocks are allocated >> + * for backwards compatibility */ >> + goto out; >> + } >> + >> + lbas = scsi_datain_unmarshall(iTask.task); >> + if (lbas == NULL) { >> + ret = -EIO; >> + goto out; >> + } >> + >> + lbasd = &lbas->descriptors[0]; >> + >> + if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { >> + ret = -EIO; >> + goto out; >> + } >> + >> + *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); >> + if (*pnum > nb_sectors) { >> + *pnum = nb_sectors; >> + } >> + >> + if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || >> + lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { >> + ret &= ~BDRV_BLOCK_DATA; >> + if (iscsilun->lbprz) { >> + ret |= BDRV_BLOCK_ZERO; >> + } >> + } >> + >> +out: >> + if (iTask.task != NULL) { >> + scsi_free_scsi_task(iTask.task); >> + } >> + return ret; >> +} >> + >> +#endif /* LIBISCSI_FEATURE_IOVECTOR */ >> + >> + >> static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, >> int64_t sector_num, int nb_sectors, >> QEMUIOVector *iov) >> @@ -355,6 +487,27 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, >> return -EINVAL; >> } >> >> +#if defined(LIBISCSI_FEATURE_IOVECTOR) >> + if (iscsilun->lbprz && nb_sectors > ISCSI_CHECKALLOC_THRES && >> + !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { >> + int64_t ret; >> + int pnum; >> + ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum); >> + if (ret < 0) { >> + return ret; >> + } >> + if (ret & BDRV_BLOCK_ZERO) { >> + iscsi_allocationmap_clear(iscsilun, sector_num, pnum); >> + if (pnum >= nb_sectors) { >> + qemu_iovec_memset(iov, 0, 0x00, iov->size); >> + return 0; >> + } >> + } else { >> + iscsi_allocationmap_set(iscsilun, sector_num, pnum); >> + } >> + } >> +#endif >> + >> lba = sector_qemu2lun(sector_num, iscsilun); >> num_sectors = sector_qemu2lun(nb_sectors, iscsilun); >> >> @@ -639,101 +792,6 @@ iscsi_getlength(BlockDriverState *bs) >> return len; >> } >> >> -#if defined(LIBISCSI_FEATURE_IOVECTOR) >> - >> -static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, >> - int64_t sector_num, >> - int nb_sectors, int *pnum) >> -{ >> - IscsiLun *iscsilun = bs->opaque; >> - struct scsi_get_lba_status *lbas = NULL; >> - struct scsi_lba_status_descriptor *lbasd = NULL; >> - struct IscsiTask iTask; >> - int64_t ret; >> - >> - iscsi_co_init_iscsitask(iscsilun, &iTask); >> - >> - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { >> - ret = -EINVAL; >> - goto out; >> - } >> - >> - /* default to all sectors allocated */ >> - ret = BDRV_BLOCK_DATA; >> - ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; >> - *pnum = nb_sectors; >> - >> - /* LUN does not support logical block provisioning */ >> - if (iscsilun->lbpme == 0) { >> - goto out; >> - } >> - >> -retry: >> - if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, >> - sector_qemu2lun(sector_num, iscsilun), >> - 8 + 16, iscsi_co_generic_cb, >> - &iTask) == NULL) { >> - ret = -ENOMEM; >> - goto out; >> - } >> - >> - while (!iTask.complete) { >> - iscsi_set_events(iscsilun); >> - qemu_coroutine_yield(); >> - } >> - >> - if (iTask.do_retry) { >> - if (iTask.task != NULL) { >> - scsi_free_scsi_task(iTask.task); >> - iTask.task = NULL; >> - } >> - iTask.complete = 0; >> - goto retry; >> - } >> - >> - if (iTask.status != SCSI_STATUS_GOOD) { >> - /* in case the get_lba_status_callout fails (i.e. >> - * because the device is busy or the cmd is not >> - * supported) we pretend all blocks are allocated >> - * for backwards compatibility */ >> - goto out; >> - } >> - >> - lbas = scsi_datain_unmarshall(iTask.task); >> - if (lbas == NULL) { >> - ret = -EIO; >> - goto out; >> - } >> - >> - lbasd = &lbas->descriptors[0]; >> - >> - if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { >> - ret = -EIO; >> - goto out; >> - } >> - >> - *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); >> - if (*pnum > nb_sectors) { >> - *pnum = nb_sectors; >> - } >> - >> - if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || >> - lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { >> - ret &= ~BDRV_BLOCK_DATA; >> - if (iscsilun->lbprz) { >> - ret |= BDRV_BLOCK_ZERO; >> - } >> - } >> - >> -out: >> - if (iTask.task != NULL) { >> - scsi_free_scsi_task(iTask.task); >> - } >> - return ret; >> -} >> - >> -#endif /* LIBISCSI_FEATURE_IOVECTOR */ >> - >> static int >> coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, >> int nb_sectors) >> @@ -859,6 +917,12 @@ retry: >> return -EIO; >> } >> >> + if (flags & BDRV_REQ_MAY_UNMAP) { >> + iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); >> + } else { >> + iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); >> + } >> + >> return 0; >> } >> >> @@ -1288,6 +1352,19 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, >> timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); >> #endif >> >> + /* Guess the internal cluster (page) size of the iscsi target by the means >> + * of opt_unmap_gran. Transfer the unmap granularity only if it has a >> + * reasonable size */ >> + if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && >> + iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { >> + iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * iscsilun->block_size) >> BDRV_SECTOR_BITS; >> +#if defined(LIBISCSI_FEATURE_IOVECTOR) >> + if (iscsilun->lbprz) { >> + iscsilun->allocationmap = bitmap_new(DIV_ROUND_UP(bs->total_sectors, iscsilun->cluster_sectors)); >> + } >> +#endif >> + } >> + >> out: >> qemu_opts_del(opts); >> if (initiator_name != NULL) { >> @@ -1321,6 +1398,7 @@ static void iscsi_close(BlockDriverState *bs) >> qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL); >> iscsi_destroy_context(iscsi); >> g_free(iscsilun->zeroblock); >> + g_free(iscsilun->allocationmap); >> memset(iscsilun, 0, sizeof(IscsiLun)); >> } >> >> @@ -1379,6 +1457,11 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset) >> return -EINVAL; >> } >> >> + if (iscsilun->allocationmap) { >> + g_free(iscsilun->allocationmap); >> + iscsilun->allocationmap = bitmap_new(DIV_ROUND_UP(bs->total_sectors, iscsilun->cluster_sectors)); >> + } >> + >> return 0; >> } >> >> @@ -1441,13 +1524,7 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) >> IscsiLun *iscsilun = bs->opaque; >> bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; >> bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; >> - /* Guess the internal cluster (page) size of the iscsi target by the means >> - * of opt_unmap_gran. Transfer the unmap granularity only if it has a >> - * reasonable size for bdi->cluster_size */ >> - if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && >> - iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { >> - bdi->cluster_size = iscsilun->bl.opt_unmap_gran * iscsilun->block_size; >> - } >> + bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; >> return 0; >> } >> >> >