From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([140.186.70.92]:34542) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Qqo5G-0005RU-U9 for qemu-devel@nongnu.org; Tue, 09 Aug 2011 11:19:13 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1Qqo5E-0008Ff-Gc for qemu-devel@nongnu.org; Tue, 09 Aug 2011 11:19:10 -0400 Received: from mail-yw0-f45.google.com ([209.85.213.45]:39988) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Qqo5E-0008FX-0p for qemu-devel@nongnu.org; Tue, 09 Aug 2011 11:19:08 -0400 Received: by ywf9 with SMTP id 9so85504ywf.4 for ; Tue, 09 Aug 2011 08:19:07 -0700 (PDT) MIME-Version: 1.0 In-Reply-To: <1312863472-6901-4-git-send-email-wuzhy@linux.vnet.ibm.com> References: <1312863472-6901-1-git-send-email-wuzhy@linux.vnet.ibm.com> <1312863472-6901-4-git-send-email-wuzhy@linux.vnet.ibm.com> Date: Tue, 9 Aug 2011 16:19:07 +0100 Message-ID: From: Stefan Hajnoczi Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: quoted-printable Subject: Re: [Qemu-devel] [PATCH v5 3/4] block: add block timer and block throttling algorithm List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Zhi Yong Wu Cc: kwolf@redhat.com, pair@us.ibm.com, stefanha@linux.vnet.ibm.com, kvm@vger.kernel.org, mtosatti@redhat.com, qemu-devel@nongnu.org, zwu.kernel@gmail.com, ryanh@us.ibm.com, luowenj@cn.ibm.com On Tue, Aug 9, 2011 at 5:17 AM, Zhi Yong Wu wrot= e: > Note: > =A0 =A0 =A01.) When bps/iops limits are specified to a small value such a= s 511 bytes/s, this VM will hang up. We are considering how to handle this = senario. If an I/O request is larger than the limit itself then I think we should let it through with a warning that the limit is too low. This reflects the fact that we don't split I/O requests into smaller requests and the guest may give us 128 KB (or larger?) requests. In practice the lowest feasible limit is probably 256 KB or 512 KB. > diff --git a/block.c b/block.c > index 24a25d5..8fd6643 100644 > --- a/block.c > +++ b/block.c > @@ -29,6 +29,9 @@ > =A0#include "module.h" > =A0#include "qemu-objects.h" > > +#include "qemu-timer.h" > +#include "block/blk-queue.h" > + > =A0#ifdef CONFIG_BSD > =A0#include > =A0#include > @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t = sector_num, > =A0static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0const uint8_t *buf, in= t nb_sectors); > > +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, > + =A0 =A0 =A0 =A0bool is_write, double elapsed_time, uint64_t *wait); > +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, > + =A0 =A0 =A0 =A0double elapsed_time, uint64_t *wait); > +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, > + =A0 =A0 =A0 =A0bool is_write, uint64_t *wait); > + > =A0static QTAILQ_HEAD(, BlockDriverState) bdrv_states =3D > =A0 =A0 QTAILQ_HEAD_INITIALIZER(bdrv_states); > > @@ -90,6 +100,68 @@ int is_windows_drive(const char *filename) > =A0} > =A0#endif > > +/* throttling disk I/O limits */ > +void bdrv_io_limits_disable(BlockDriverState *bs) > +{ > + =A0 =A0bs->io_limits_enabled =3D false; > + =A0 =A0bs->req_from_queue =A0 =A0=3D false; > + > + =A0 =A0if (bs->block_queue) { > + =A0 =A0 =A0 =A0qemu_block_queue_flush(bs->block_queue); > + =A0 =A0 =A0 =A0qemu_del_block_queue(bs->block_queue); When you fix the acb lifecycle in block-queue.c this will no longer be safe. Requests that are dispatched still have acbs that belong to this queue. It is simplest to keep the queue for the lifetime of the BlockDriverState - it's just a list so it doesn't take up much memory. > + =A0 =A0} > + > + =A0 =A0if (bs->block_timer) { > + =A0 =A0 =A0 =A0qemu_del_timer(bs->block_timer); > + =A0 =A0 =A0 =A0qemu_free_timer(bs->block_timer); To prevent double frees: bs->block_timer =3D NULL; > + =A0 =A0} > + > + =A0 =A0bs->slice_start[0] =A0 =3D 0; > + =A0 =A0bs->slice_start[1] =A0 =3D 0; > + > + =A0 =A0bs->slice_end[0] =A0 =A0 =3D 0; > + =A0 =A0bs->slice_end[1] =A0 =A0 =3D 0; > +} > + > +static void bdrv_block_timer(void *opaque) > +{ > + =A0 =A0BlockDriverState *bs =3D opaque; > + =A0 =A0BlockQueue *queue =3D bs->block_queue; > + > + =A0 =A0qemu_block_queue_flush(queue); > +} > + > +void bdrv_io_limits_enable(BlockDriverState *bs) > +{ > + =A0 =A0bs->req_from_queue =3D false; > + > + =A0 =A0bs->block_queue =A0 =A0=3D qemu_new_block_queue(); > + =A0 =A0bs->block_timer =A0 =A0=3D qemu_new_timer_ns(vm_clock, bdrv_bloc= k_timer, bs); > + > + =A0 =A0bs->slice_start[BLOCK_IO_LIMIT_READ] =A0=3D qemu_get_clock_ns(vm= _clock); > + =A0 =A0bs->slice_start[BLOCK_IO_LIMIT_WRITE] =3D qemu_get_clock_ns(vm_c= lock); > + > + =A0 =A0bs->slice_end[BLOCK_IO_LIMIT_READ] =A0 =A0=3D > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0qemu_get_clock_ns(vm_clock) = + BLOCK_IO_SLICE_TIME; > + =A0 =A0bs->slice_end[BLOCK_IO_LIMIT_WRITE] =A0 =3D > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0qemu_get_clock_ns(vm_clock) = + BLOCK_IO_SLICE_TIME; The slice times could just be initialized to 0 here. When bdrv_exceed_io_limits() is called it will start a new slice. > +} > + > +bool bdrv_io_limits_enabled(BlockDriverState *bs) > +{ > + =A0 =A0BlockIOLimit *io_limits =3D &bs->io_limits; > + =A0 =A0if ((io_limits->bps[BLOCK_IO_LIMIT_READ] =3D=3D 0) > + =A0 =A0 =A0 =A0 && (io_limits->bps[BLOCK_IO_LIMIT_WRITE] =3D=3D 0) > + =A0 =A0 =A0 =A0 && (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] =3D=3D 0) > + =A0 =A0 =A0 =A0 && (io_limits->iops[BLOCK_IO_LIMIT_READ] =3D=3D 0) > + =A0 =A0 =A0 =A0 && (io_limits->iops[BLOCK_IO_LIMIT_WRITE] =3D=3D 0) > + =A0 =A0 =A0 =A0 && (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] =3D=3D 0)) { > + =A0 =A0 =A0 =A0return false; > + =A0 =A0} > + > + =A0 =A0return true; > +} > + > =A0/* check if the path starts with ":" */ > =A0static int path_has_protocol(const char *path) > =A0{ > @@ -642,6 +714,11 @@ int bdrv_open(BlockDriverState *bs, const char *file= name, int flags, > =A0 =A0 =A0 =A0 =A0 =A0 bs->change_cb(bs->change_opaque, CHANGE_MEDIA); > =A0 =A0 } > > + =A0 =A0/* throttling disk I/O limits */ > + =A0 =A0if (bs->io_limits_enabled) { > + =A0 =A0 =A0 =A0bdrv_io_limits_enable(bs); > + =A0 =A0} > + > =A0 =A0 return 0; > > =A0unlink_and_fail: > @@ -680,6 +757,16 @@ void bdrv_close(BlockDriverState *bs) > =A0 =A0 =A0 =A0 if (bs->change_cb) > =A0 =A0 =A0 =A0 =A0 =A0 bs->change_cb(bs->change_opaque, CHANGE_MEDIA); > =A0 =A0 } > + > + =A0 =A0/* throttling disk I/O limits */ > + =A0 =A0if (bs->block_queue) { > + =A0 =A0 =A0 =A0qemu_del_block_queue(bs->block_queue); > + =A0 =A0} > + > + =A0 =A0if (bs->block_timer) { > + =A0 =A0 =A0 =A0qemu_del_timer(bs->block_timer); > + =A0 =A0 =A0 =A0qemu_free_timer(bs->block_timer); > + =A0 =A0} > =A0} > > =A0void bdrv_close_all(void) > @@ -1312,6 +1399,15 @@ void bdrv_get_geometry_hint(BlockDriverState *bs, > =A0 =A0 *psecs =3D bs->secs; > =A0} > > +/* throttling disk io limits */ > +void bdrv_set_io_limits(BlockDriverState *bs, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0BlockIOLimit *io= _limits) > +{ > + =A0 =A0memset(&bs->io_limits, 0, sizeof(BlockIOLimit)); The assignment from *io_limits overwrites all of bs->io_limits, the memset() can be removed. > + =A0 =A0bs->io_limits =3D *io_limits; > + =A0 =A0bs->io_limits_enabled =3D bdrv_io_limits_enabled(bs); > +} > + > =A0/* Recognize floppy formats */ > =A0typedef struct FDFormat { > =A0 =A0 FDriveType drive; > @@ -1707,6 +1803,16 @@ static void bdrv_print_dict(QObject *obj, void *op= aque) > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 qdict_get_bool(qd= ict, "ro"), > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 qdict_get_str(qdi= ct, "drv"), > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 qdict_get_bool(qd= ict, "encrypted")); > + > + =A0 =A0 =A0 =A0monitor_printf(mon, " bps=3D%" PRId64 " bps_rd=3D%" PRId= 64 > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0" bps_wr=3D%" PR= Id64 " iops=3D%" PRId64 > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0" iops_rd=3D%" P= RId64 " iops_wr=3D%" PRId64, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0qdict_get_int(qd= ict, "bps"), > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0qdict_get_int(qd= ict, "bps_rd"), > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0qdict_get_int(qd= ict, "bps_wr"), > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0qdict_get_int(qd= ict, "iops"), > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0qdict_get_int(qd= ict, "iops_rd"), > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0qdict_get_int(qd= ict, "iops_wr")); > =A0 =A0 } else { > =A0 =A0 =A0 =A0 monitor_printf(mon, " [not inserted]"); > =A0 =A0 } > @@ -1739,10 +1845,22 @@ void bdrv_info(Monitor *mon, QObject **ret_data) > =A0 =A0 =A0 =A0 =A0 =A0 QDict *bs_dict =3D qobject_to_qdict(bs_obj); > > =A0 =A0 =A0 =A0 =A0 =A0 obj =3D qobject_from_jsonf("{ 'file': %s, 'ro': %= i, 'drv': %s, " > - =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= "'encrypted': %i }", > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= "'encrypted': %i, " > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= "'bps': %" PRId64 "," > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= "'bps_rd': %" PRId64 "," > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= "'bps_wr': %" PRId64 "," > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= "'iops': %" PRId64 "," > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= "'iops_rd': %" PRId64 "," > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= "'iops_wr': %" PRId64 "}", > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 = =A0bs->filename, bs->read_only, > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 = =A0bs->drv->format_name, > - =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= bdrv_is_encrypted(bs)); > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= bdrv_is_encrypted(bs), > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL], > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= bs->io_limits.bps[BLOCK_IO_LIMIT_READ], > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE], > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL], > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= bs->io_limits.iops[BLOCK_IO_LIMIT_READ], > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]); > =A0 =A0 =A0 =A0 =A0 =A0 if (bs->backing_file[0] !=3D '\0') { > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 QDict *qdict =3D qobject_to_qdict(obj); > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 qdict_put(qdict, "backing_file", > @@ -2111,6 +2229,165 @@ char *bdrv_snapshot_dump(char *buf, int buf_size,= QEMUSnapshotInfo *sn) > =A0 =A0 return buf; > =A0} > > +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 bool is_write, double elapsed_time, uin= t64_t *wait) { > + =A0 =A0uint64_t bps_limit =3D 0; > + =A0 =A0double =A0 bytes_limit, bytes_disp, bytes_res; > + =A0 =A0double =A0 slice_time, wait_time; > + > + =A0 =A0if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { > + =A0 =A0 =A0 =A0bps_limit =3D bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; > + =A0 =A0} else if (bs->io_limits.bps[is_write]) { > + =A0 =A0 =A0 =A0bps_limit =3D bs->io_limits.bps[is_write]; > + =A0 =A0} else { > + =A0 =A0 =A0 =A0if (wait) { > + =A0 =A0 =A0 =A0 =A0 =A0*wait =3D 0; > + =A0 =A0 =A0 =A0} > + > + =A0 =A0 =A0 =A0return false; > + =A0 =A0} > + > + =A0 =A0slice_time =3D bs->slice_end[is_write] - bs->slice_start[is_writ= e]; > + =A0 =A0slice_time /=3D (BLOCK_IO_SLICE_TIME * 10.0); > + =A0 =A0bytes_limit =3D bps_limit * slice_time; > + =A0 =A0bytes_disp =A0=3D bs->io_disps.bytes[is_write]; > + =A0 =A0if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { > + =A0 =A0 =A0 =A0bytes_disp +=3D bs->io_disps.bytes[!is_write]; > + =A0 =A0} > + > + =A0 =A0bytes_res =A0 =3D (unsigned) nb_sectors * BDRV_SECTOR_SIZE; > + > + =A0 =A0if (bytes_disp + bytes_res <=3D bytes_limit) { > + =A0 =A0 =A0 =A0if (wait) { > + =A0 =A0 =A0 =A0 =A0 =A0*wait =3D 0; > + =A0 =A0 =A0 =A0} > + > + =A0 =A0 =A0 =A0return false; > + =A0 =A0} > + > + =A0 =A0/* Calc approx time to dispatch */ > + =A0 =A0wait_time =3D (bytes_disp + bytes_res) / bps_limit - elapsed_tim= e; > + > + =A0 =A0if (wait) { > + =A0 =A0 =A0 =A0*wait =3D wait_time * BLOCK_IO_SLICE_TIME * 10; > + =A0 =A0} > + > + =A0 =A0return true; > +} > + > +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 double elapsed_= time, uint64_t *wait) { > + =A0 =A0uint64_t iops_limit =3D 0; > + =A0 =A0double =A0 ios_limit, ios_disp; > + =A0 =A0double =A0 slice_time, wait_time; > + > + =A0 =A0if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + =A0 =A0 =A0 =A0iops_limit =3D bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; > + =A0 =A0} else if (bs->io_limits.iops[is_write]) { > + =A0 =A0 =A0 =A0iops_limit =3D bs->io_limits.iops[is_write]; > + =A0 =A0} else { > + =A0 =A0 =A0 =A0if (wait) { > + =A0 =A0 =A0 =A0 =A0 =A0*wait =3D 0; > + =A0 =A0 =A0 =A0} > + > + =A0 =A0 =A0 =A0return false; > + =A0 =A0} > + > + =A0 =A0slice_time =3D bs->slice_end[is_write] - bs->slice_start[is_writ= e]; > + =A0 =A0slice_time /=3D (BLOCK_IO_SLICE_TIME * 10.0); > + =A0 =A0ios_limit =A0=3D iops_limit * slice_time; > + =A0 =A0ios_disp =A0 =3D bs->io_disps.ios[is_write]; > + =A0 =A0if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + =A0 =A0 =A0 =A0ios_disp +=3D bs->io_disps.ios[!is_write]; > + =A0 =A0} > + > + =A0 =A0if (ios_disp + 1 <=3D ios_limit) { > + =A0 =A0 =A0 =A0if (wait) { > + =A0 =A0 =A0 =A0 =A0 =A0*wait =3D 0; > + =A0 =A0 =A0 =A0} > + > + =A0 =A0 =A0 =A0return false; > + =A0 =A0} > + > + =A0 =A0/* Calc approx time to dispatch */ > + =A0 =A0wait_time =3D (ios_disp + 1) / iops_limit; > + =A0 =A0if (wait_time > elapsed_time) { > + =A0 =A0 =A0 =A0wait_time =3D wait_time - elapsed_time; > + =A0 =A0} else { > + =A0 =A0 =A0 =A0wait_time =3D 0; > + =A0 =A0} > + > + =A0 =A0if (wait) { > + =A0 =A0 =A0 =A0*wait =3D wait_time * BLOCK_IO_SLICE_TIME * 10; > + =A0 =A0} > + > + =A0 =A0return true; > +} > + > +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 bool is_write, uint= 64_t *wait) { > + =A0 =A0int64_t =A0real_time, real_slice; > + =A0 =A0uint64_t bps_wait =3D 0, iops_wait =3D 0, max_wait; > + =A0 =A0double =A0 elapsed_time; > + =A0 =A0int =A0 =A0 =A0bps_ret, iops_ret; > + > + =A0 =A0real_time =3D qemu_get_clock_ns(vm_clock); Please call it 'now' instead of 'real_time'. There is a "real-time clock" called rt_clock and this variable name could be confusing. > + =A0 =A0real_slice =3D bs->slice_end[is_write] - bs->slice_start[is_writ= e]; > + =A0 =A0if ((bs->slice_start[is_write] < real_time) > + =A0 =A0 =A0 =A0&& (bs->slice_end[is_write] > real_time)) { > + =A0 =A0 =A0 =A0bs->slice_end[is_write] =A0 =3D real_time + BLOCK_IO_SLI= CE_TIME; > + =A0 =A0} else { > + =A0 =A0 =A0 =A0bs->slice_start[is_write] =A0 =A0 =3D real_time; > + =A0 =A0 =A0 =A0bs->slice_end[is_write] =A0 =A0 =A0 =3D real_time + BLOC= K_IO_SLICE_TIME; > + > + =A0 =A0 =A0 =A0bs->io_disps.bytes[is_write] =A0=3D 0; > + =A0 =A0 =A0 =A0bs->io_disps.bytes[!is_write] =3D 0; > + > + =A0 =A0 =A0 =A0bs->io_disps.ios[is_write] =A0 =A0=3D 0; > + =A0 =A0 =A0 =A0bs->io_disps.ios[!is_write] =A0 =3D 0; > + =A0 =A0} > + > + =A0 =A0/* If a limit was exceeded, immediately queue this request */ > + =A0 =A0if (!bs->req_from_queue > + =A0 =A0 =A0 =A0&& !QTAILQ_EMPTY(&bs->block_queue->requests)) { bs->req_from_queue seems to be a way to prevent requests unconditionally being queued during flush. Please define a BlockQueue interface instead, something like this: /* If a limit was exceeded, immediately queue this request */ if (qemu_block_queue_has_pending(bs->block_queue)) { Then req_from_queue can be hidden inside BlockQueue. I'd rename it to bool flushing. qemu_block_queue_flush() will set it to true at the start of the function and false at the end of the function. > + =A0 =A0 =A0 =A0if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] > + =A0 =A0 =A0 =A0 =A0 =A0|| bs->io_limits.bps[is_write] || bs->io_limits.= iops[is_write] > + =A0 =A0 =A0 =A0 =A0 =A0|| bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + =A0 =A0 =A0 =A0 =A0 =A0if (wait) { > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0*wait =3D 0; No estimated wait time? That's okay only if we stop setting the timer whenever a request is queued - more comments about this below. > + =A0 =A0 =A0 =A0 =A0 =A0} > + > + =A0 =A0 =A0 =A0 =A0 =A0return true; > + =A0 =A0 =A0 =A0} > + =A0 =A0} > + > + =A0 =A0elapsed_time =A0=3D real_time - bs->slice_start[is_write]; > + =A0 =A0elapsed_time =A0/=3D (BLOCK_IO_SLICE_TIME * 10.0); Why * 10.0? > + > + =A0 =A0bps_ret =A0=3D bdrv_exceed_bps_limits(bs, nb_sectors, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= =A0is_write, elapsed_time, &bps_wait); > + =A0 =A0iops_ret =3D bdrv_exceed_iops_limits(bs, is_write, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= =A0elapsed_time, &iops_wait); > + =A0 =A0if (bps_ret || iops_ret) { > + =A0 =A0 =A0 =A0max_wait =3D bps_wait > iops_wait ? bps_wait : iops_wait= ; > + =A0 =A0 =A0 =A0if (wait) { > + =A0 =A0 =A0 =A0 =A0 =A0*wait =3D max_wait; > + =A0 =A0 =A0 =A0} > + > + =A0 =A0 =A0 =A0real_time =3D qemu_get_clock_ns(vm_clock); > + =A0 =A0 =A0 =A0if (bs->slice_end[is_write] < real_time + max_wait) { > + =A0 =A0 =A0 =A0 =A0 =A0bs->slice_end[is_write] =3D real_time + max_wait= ; > + =A0 =A0 =A0 =A0} Why is this necessary? We have already extended the slice at the top of the function. > + > + =A0 =A0 =A0 =A0return true; > + =A0 =A0} > + > + =A0 =A0if (wait) { > + =A0 =A0 =A0 =A0*wait =3D 0; > + =A0 =A0} > + > + =A0 =A0return false; > +} > > =A0/**************************************************************/ > =A0/* async I/Os */ > @@ -2121,13 +2398,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState= *bs, int64_t sector_num, > =A0{ > =A0 =A0 BlockDriver *drv =3D bs->drv; > =A0 =A0 BlockDriverAIOCB *ret; > + =A0 =A0uint64_t wait_time =3D 0; > > =A0 =A0 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); > > - =A0 =A0if (!drv) > - =A0 =A0 =A0 =A0return NULL; > - =A0 =A0if (bdrv_check_request(bs, sector_num, nb_sectors)) > + =A0 =A0if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) { > + =A0 =A0 =A0 =A0if (bs->io_limits_enabled) { > + =A0 =A0 =A0 =A0 =A0 =A0bs->req_from_queue =3D false; > + =A0 =A0 =A0 =A0} > =A0 =A0 =A0 =A0 return NULL; > + =A0 =A0} > + > + =A0 =A0/* throttling disk read I/O */ > + =A0 =A0if (bs->io_limits_enabled) { > + =A0 =A0 =A0 =A0if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_t= ime)) { > + =A0 =A0 =A0 =A0 =A0 =A0ret =3D qemu_block_queue_enqueue(bs->block_queue= , bs, bdrv_aio_readv, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0sector_num, = qiov, nb_sectors, cb, opaque); > + =A0 =A0 =A0 =A0 =A0 =A0qemu_mod_timer(bs->block_timer, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 wait_time + qemu_get_clock_= ns(vm_clock)); If a guest keeps sending I/O requests, say every millisecond, then we will delay dispatch forever. In practice we hope the storage interface (virtio-blk, LSI SCSI, etc) has a resource limit that prevents this. But the point remains that this delays requests that should be dispatched for too long. Once the timer has been set we should not set it again. > + =A0 =A0 =A0 =A0 =A0 =A0bs->req_from_queue =3D false; > + =A0 =A0 =A0 =A0 =A0 =A0return ret; > + =A0 =A0 =A0 =A0} > + =A0 =A0} > > =A0 =A0 ret =3D drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cb, opaque); > @@ -2136,6 +2428,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState = *bs, int64_t sector_num, > =A0 =A0 =A0 =A0/* Update stats even though technically transfer has not h= appened. */ > =A0 =A0 =A0 =A0bs->rd_bytes +=3D (unsigned) nb_sectors * BDRV_SECTOR_SIZE= ; > =A0 =A0 =A0 =A0bs->rd_ops ++; > + > + =A0 =A0 =A0 =A0if (bs->io_limits_enabled) { > + =A0 =A0 =A0 =A0 =A0 =A0bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=3D > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(unsigned) n= b_sectors * BDRV_SECTOR_SIZE; > + =A0 =A0 =A0 =A0 =A0 =A0bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++; > + =A0 =A0 =A0 =A0} > + =A0 =A0} > + > + =A0 =A0if (bs->io_limits_enabled) { > + =A0 =A0 =A0 =A0bs->req_from_queue =3D false; > =A0 =A0 } > > =A0 =A0 return ret; > @@ -2184,15 +2486,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverStat= e *bs, int64_t sector_num, > =A0 =A0 BlockDriver *drv =3D bs->drv; > =A0 =A0 BlockDriverAIOCB *ret; > =A0 =A0 BlockCompleteData *blk_cb_data; > + =A0 =A0uint64_t wait_time =3D 0; > > =A0 =A0 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); > > - =A0 =A0if (!drv) > - =A0 =A0 =A0 =A0return NULL; > - =A0 =A0if (bs->read_only) > - =A0 =A0 =A0 =A0return NULL; > - =A0 =A0if (bdrv_check_request(bs, sector_num, nb_sectors)) > + =A0 =A0if (!drv || bs->read_only > + =A0 =A0 =A0 =A0|| bdrv_check_request(bs, sector_num, nb_sectors)) { > + =A0 =A0 =A0 =A0if (bs->io_limits_enabled) { > + =A0 =A0 =A0 =A0 =A0 =A0bs->req_from_queue =3D false; > + =A0 =A0 =A0 =A0} > + > =A0 =A0 =A0 =A0 return NULL; > + =A0 =A0} > > =A0 =A0 if (bs->dirty_bitmap) { > =A0 =A0 =A0 =A0 blk_cb_data =3D blk_dirty_cb_alloc(bs, sector_num, nb_sec= tors, cb, > @@ -2201,6 +2506,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState= *bs, int64_t sector_num, > =A0 =A0 =A0 =A0 opaque =3D blk_cb_data; > =A0 =A0 } > > + =A0 =A0/* throttling disk write I/O */ > + =A0 =A0if (bs->io_limits_enabled) { > + =A0 =A0 =A0 =A0if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_ti= me)) { > + =A0 =A0 =A0 =A0 =A0 =A0ret =3D qemu_block_queue_enqueue(bs->block_queue= , bs, bdrv_aio_writev, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0sect= or_num, qiov, nb_sectors, cb, opaque); > + =A0 =A0 =A0 =A0 =A0 =A0qemu_mod_timer(bs->block_timer, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0wait= _time + qemu_get_clock_ns(vm_clock)); > + =A0 =A0 =A0 =A0 =A0 =A0bs->req_from_queue =3D false; > + =A0 =A0 =A0 =A0 =A0 =A0return ret; > + =A0 =A0 =A0 =A0} > + =A0 =A0} > + > =A0 =A0 ret =3D drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0cb, opaque= ); > > @@ -2211,6 +2528,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState= *bs, int64_t sector_num, > =A0 =A0 =A0 =A0 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) = { > =A0 =A0 =A0 =A0 =A0 =A0 bs->wr_highest_sector =3D sector_num + nb_sectors= - 1; > =A0 =A0 =A0 =A0 } > + > + =A0 =A0 =A0 =A0if (bs->io_limits_enabled) { > + =A0 =A0 =A0 =A0 =A0 =A0bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=3D > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 (unsigned) = nb_sectors * BDRV_SECTOR_SIZE; > + =A0 =A0 =A0 =A0 =A0 =A0bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++; > + =A0 =A0 =A0 =A0} > + =A0 =A0} > + > + =A0 =A0if (bs->io_limits_enabled) { > + =A0 =A0 =A0 =A0bs->req_from_queue =3D false; > =A0 =A0 } > > =A0 =A0 return ret; > diff --git a/block.h b/block.h > index 859d1d9..3d02902 100644 > --- a/block.h > +++ b/block.h > @@ -57,6 +57,11 @@ void bdrv_info(Monitor *mon, QObject **ret_data); > =A0void bdrv_stats_print(Monitor *mon, const QObject *data); > =A0void bdrv_info_stats(Monitor *mon, QObject **ret_data); > > +/* disk I/O throttling */ > +void bdrv_io_limits_enable(BlockDriverState *bs); > +void bdrv_io_limits_disable(BlockDriverState *bs); > +bool bdrv_io_limits_enabled(BlockDriverState *bs); > + > =A0void bdrv_init(void); > =A0void bdrv_init_with_whitelist(void); > =A0BlockDriver *bdrv_find_protocol(const char *filename); > @@ -97,7 +102,6 @@ int bdrv_change_backing_file(BlockDriverState *bs, > =A0 =A0 const char *backing_file, const char *backing_fmt); > =A0void bdrv_register(BlockDriver *bdrv); > > - > =A0typedef struct BdrvCheckResult { > =A0 =A0 int corruptions; > =A0 =A0 int leaks; > diff --git a/block_int.h b/block_int.h > index 1e265d2..a4cd458 100644 > --- a/block_int.h > +++ b/block_int.h > @@ -27,10 +27,17 @@ > =A0#include "block.h" > =A0#include "qemu-option.h" > =A0#include "qemu-queue.h" > +#include "block/blk-queue.h" > > =A0#define BLOCK_FLAG_ENCRYPT =A0 =A0 1 > =A0#define BLOCK_FLAG_COMPAT6 =A0 =A0 4 > > +#define BLOCK_IO_LIMIT_READ =A0 =A0 0 > +#define BLOCK_IO_LIMIT_WRITE =A0 =A01 > +#define BLOCK_IO_LIMIT_TOTAL =A0 =A02 > + > +#define BLOCK_IO_SLICE_TIME =A0 =A0 100000000 Please add a comment indicating the units: /* nanoseconds */ > + > =A0#define BLOCK_OPT_SIZE =A0 =A0 =A0 =A0 =A0"size" > =A0#define BLOCK_OPT_ENCRYPT =A0 =A0 =A0 "encryption" > =A0#define BLOCK_OPT_COMPAT6 =A0 =A0 =A0 "compat6" > @@ -46,6 +53,16 @@ typedef struct AIOPool { > =A0 =A0 BlockDriverAIOCB *free_aiocb; > =A0} AIOPool; > > +typedef struct BlockIOLimit { > + =A0 =A0uint64_t bps[3]; > + =A0 =A0uint64_t iops[3]; > +} BlockIOLimit; > + > +typedef struct BlockIODisp { > + =A0 =A0uint64_t bytes[2]; > + =A0 =A0uint64_t ios[2]; > +} BlockIODisp; > + > =A0struct BlockDriver { > =A0 =A0 const char *format_name; > =A0 =A0 int instance_size; > @@ -175,6 +192,16 @@ struct BlockDriverState { > > =A0 =A0 void *sync_aiocb; > > + =A0 =A0/* the time for latest disk I/O */ > + =A0 =A0int64_t slice_start[2]; > + =A0 =A0int64_t slice_end[2]; > + =A0 =A0BlockIOLimit io_limits; > + =A0 =A0BlockIODisp =A0io_disps; > + =A0 =A0BlockQueue =A0 *block_queue; > + =A0 =A0QEMUTimer =A0 =A0*block_timer; > + =A0 =A0bool =A0 =A0 =A0 =A0 io_limits_enabled; > + =A0 =A0bool =A0 =A0 =A0 =A0 req_from_queue; > + > =A0 =A0 /* I/O stats (display with "info blockstats"). */ > =A0 =A0 uint64_t rd_bytes; > =A0 =A0 uint64_t wr_bytes; > @@ -222,6 +249,9 @@ void qemu_aio_release(void *p); > > =A0void *qemu_blockalign(BlockDriverState *bs, size_t size); > > +void bdrv_set_io_limits(BlockDriverState *bs, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0BlockIOLimit *io= _limits); > + > =A0#ifdef _WIN32 > =A0int is_windows_drive(const char *filename); > =A0#endif > -- > 1.7.2.3 > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at =A0http://vger.kernel.org/majordomo-info.html >