Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Kevin Wolf <kwolf@redhat.com>
To: Zhi Yong Wu <zwu.kernel@gmail.com>
Cc: ryanh@us.ibm.com, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>,
	qemu-devel@nongnu.org, stefanha@linux.vnet.ibm.com
Subject: Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
Date: Tue, 08 Nov 2011 09:41:06 +0100	[thread overview]
Message-ID: <4EB8EB22.7060707@redhat.com> (raw)
In-Reply-To: <CAEH94Lg6OmK4qWVARzvY+t4sfa3HFgYeK4T6LqFqf=SU3kpq8g@mail.gmail.com>

Am 08.11.2011 05:34, schrieb Zhi Yong Wu:
> On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
>>> ---
>>>  block.c     |  220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  block.h     |    1 +
>>>  block_int.h |    1 +
>>>  3 files changed, 222 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/block.c b/block.c
>>> index 79e7f09..b2af48f 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>>>                                                 bool is_write);
>>>  static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>>
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>> +        double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>> +        bool is_write, int64_t *wait);
>>> +
>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>
>>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>>>  #endif
>>>
>>>  /* throttling disk I/O limits */
>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>> +{
>>> +    bs->io_limits_enabled = false;
>>> +
>>> +    while (qemu_co_queue_next(&bs->throttled_reqs));
>>> +
>>> +    if (bs->block_timer) {
>>> +        qemu_del_timer(bs->block_timer);
>>> +        qemu_free_timer(bs->block_timer);
>>> +        bs->block_timer = NULL;
>>> +    }
>>> +
>>> +    bs->slice_start = 0;
>>> +    bs->slice_end   = 0;
>>> +    bs->slice_time  = 0;
>>> +    memset(&bs->io_base, 0, sizeof(bs->io_base));
>>> +}
>>> +
>>>  static void bdrv_block_timer(void *opaque)
>>>  {
>>>      BlockDriverState *bs = opaque;
>>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>>           || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>>>  }
>>>
>>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>>> +                                     bool is_write, int nb_sectors)
>>> +{
>>> +    int64_t wait_time = -1;
>>> +
>>> +    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>>> +        qemu_co_queue_wait(&bs->throttled_reqs);
>>> +    }
>>> +
>>> +    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
>>> +     * throttled requests will not be dequeued until the current request is
>>> +     * allowed to be serviced. So if the current request still exceeds the
>>> +     * limits, it will be inserted to the head. All requests followed it will
>>> +     * be still in throttled_reqs queue.
>>> +     */
>>> +
>>> +    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>>> +        qemu_mod_timer(bs->block_timer,
>>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>>> +        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>>> +    }
>>> +
>>> +    qemu_co_queue_next(&bs->throttled_reqs);
>>> +}
>>> +
>>>  /* check if the path starts with "<protocol>:" */
>>>  static int path_has_protocol(const char *path)
>>>  {
>>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>          bdrv_dev_change_media_cb(bs, true);
>>>      }
>>>
>>> +    /* throttling disk I/O limits */
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_enable(bs);
>>> +    }
>>> +
>>>      return 0;
>>>
>>>  unlink_and_fail:
>>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>>
>>>          bdrv_dev_change_media_cb(bs, false);
>>>      }
>>> +
>>> +    /*throttling disk I/O limits*/
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_disable(bs);
>>> +    }
>>>  }
>>>
>>>  void bdrv_close_all(void)
>>> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>>>          return -EIO;
>>>      }
>>>
>>> +    /* throttling disk read I/O */
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_intercept(bs, false, nb_sectors);
>>> +    }
>>> +
>>>      return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>>>  }
>>>
>>> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>>>          return -EIO;
>>>      }
>>>
>>> +    /* throttling disk write I/O */
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_intercept(bs, true, nb_sectors);
>>> +    }
>>> +
>>>      ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>>
>>>      if (bs->dirty_bitmap) {
>>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>>>      acb->pool->cancel(acb);
>>>  }
>>>
>>> +/* block I/O throttling */
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>>> +    uint64_t bps_limit = 0;
>>> +    double   bytes_limit, bytes_base, bytes_res;
>>> +    double   slice_time, wait_time;
>>> +
>>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>>> +    } else if (bs->io_limits.bps[is_write]) {
>>> +        bps_limit = bs->io_limits.bps[is_write];
>>> +    } else {
>>> +        if (wait) {
>>> +            *wait = 0;
>>> +        }
>>> +
>>> +        return false;
>>> +    }
>>> +
>>> +    slice_time = bs->slice_end - bs->slice_start;
>>> +    slice_time /= (NANOSECONDS_PER_SECOND);
>>> +    bytes_limit = bps_limit * slice_time;
>>> +    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>> +        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
>>> +    }
>>> +
>>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>>> +
>>> +    if (bytes_base + bytes_res <= bytes_limit) {
>>> +        if (wait) {
>>> +            *wait = 0;
>>> +        }
>>> +
>>> +        return false;
>>> +    }
>>> +
>>> +    /* Calc approx time to dispatch */
>>> +    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>> +
>>> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>>> +    if (wait) {
>>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>> +    }
>>
>> I'm not quite sure what bs->slice_end really is and what these
>> calculations do exactly. Looks like magic. Can you add some comments
>> that explain why slice_end is increased?
> As you know, when the I/O rate at runtime exceeds the limits,
> bs->slice_end needs to be extended so that the current statistics
> can be kept until the timer fires, so it is increased and tuned
> based on the results of experiments.
> 
>> and how you estimate *wait?
> The wait time is calculated based on the historical bps and iops statistics.
> 
> bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
> 
> 1.) bytes_base is the number of bytes that have already been read/written;
> it is obtained from the historical statistics.
> 2.) bytes_res is the remaining number of bytes that need to be read/written.
> 3.) (bytes_base + bytes_res) / bps_limit: this expression is used
> to calculate the total time needed to finish reading/writing all the data.
> 
> I'm not sure whether this is clear to you.

Yes, I think this makes sense to me.

However, I don't understand why things like 10 * BLOCK_IO_SLICE_TIME or
3 * BLOCK_IO_SLICE_TIME appear in the code. These numbers are magic to
me. Are they more or less arbitrary values that happen to work well?

Kevin

next prev parent reply	other threads:[~2011-11-08  8:38 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-11-03  8:57 [Qemu-devel] [PATCH v12 0/5] The intro to QEMU block I/O throttling Zhi Yong Wu
2011-11-03  8:57 ` [Qemu-devel] [PATCH v12 1/5] block: add the blockio limits command line support Zhi Yong Wu
2011-11-03  8:57 ` [Qemu-devel] [PATCH v12 2/5] CoQueue: introduce qemu_co_queue_wait_insert_head Zhi Yong Wu
2011-11-03  8:57 ` [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm Zhi Yong Wu
2011-11-07 15:18   ` Kevin Wolf
2011-11-08  4:34     ` Zhi Yong Wu
2011-11-08  8:41       ` Kevin Wolf [this message]
2011-11-08  8:57         ` Zhi Yong Wu
2011-11-03  8:57 ` [Qemu-devel] [PATCH v12 4/5] hmp/qmp: add block_set_io_throttle Zhi Yong Wu
2011-11-07 15:26   ` Kevin Wolf
2011-11-08  2:21     ` Zhi Yong Wu
2011-11-03  8:57 ` [Qemu-devel] [PATCH v12 5/5] block: perf testing report based on block I/O throttling Zhi Yong Wu
2011-11-07 15:27   ` Kevin Wolf
2011-11-08  2:15     ` Zhi Yong Wu
  -- strict thread matches above, loose matches on Subject: below --
2011-11-08  5:00 [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm Zhi Yong Wu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4EB8EB22.7060707@redhat.com \
    --to=kwolf@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=ryanh@us.ibm.com \
    --cc=stefanha@linux.vnet.ibm.com \
    --cc=wuzhy@linux.vnet.ibm.com \
    --cc=zwu.kernel@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.