* [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
@ 2011-11-08 5:00 Zhi Yong Wu
0 siblings, 0 replies; 6+ messages in thread
From: Zhi Yong Wu @ 2011-11-08 5:00 UTC (permalink / raw)
To: kwolf; +Cc: zwu.kernel, ryanh, Zhi Yong Wu, qemu-devel, stefanha
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
block.c | 234 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
block.h | 1 +
block_int.h | 1 +
3 files changed, 236 insertions(+), 0 deletions(-)
diff --git a/block.c b/block.c
index 79e7f09..3d0ec23 100644
--- a/block.c
+++ b/block.c
@@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+ double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, int64_t *wait);
+
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
#endif
/* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+ bs->io_limits_enabled = false;
+
+ while (qemu_co_queue_next(&bs->throttled_reqs));
+
+ if (bs->block_timer) {
+ qemu_del_timer(bs->block_timer);
+ qemu_free_timer(bs->block_timer);
+ bs->block_timer = NULL;
+ }
+
+ bs->slice_start = 0;
+ bs->slice_end = 0;
+ bs->slice_time = 0;
+ memset(&bs->io_base, 0, sizeof(bs->io_base));
+}
+
static void bdrv_block_timer(void *opaque)
{
BlockDriverState *bs = opaque;
@@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
|| io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+ bool is_write, int nb_sectors)
+{
+ int64_t wait_time = -1;
+
+ if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
+ qemu_co_queue_wait(&bs->throttled_reqs);
+ }
+
+ /* Keep each request's timing in FIFO order: the next throttled request is
+  * not dequeued until the current request has been allowed to be serviced.
+  * So if the current request still exceeds the limits, it is re-inserted at
+  * the head of the queue, and all requests following it remain in the
+  * throttled_reqs queue.
+  */
+
+ while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
+ qemu_mod_timer(bs->block_timer,
+ wait_time + qemu_get_clock_ns(vm_clock));
+ qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
+ }
+
+ qemu_co_queue_next(&bs->throttled_reqs);
+}
+
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
@@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
bdrv_dev_change_media_cb(bs, true);
}
+ /* throttling disk I/O limits */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_enable(bs);
+ }
+
return 0;
unlink_and_fail:
@@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
bdrv_dev_change_media_cb(bs, false);
}
+
+ /* throttling disk I/O limits */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_disable(bs);
+ }
}
void bdrv_close_all(void)
@@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
return -EIO;
}
+ /* throttling disk read I/O */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, false, nb_sectors);
+ }
+
return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
}
@@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
return -EIO;
}
+ /* throttling disk write I/O */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, true, nb_sectors);
+ }
+
ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
if (bs->dirty_bitmap) {
@@ -2512,6 +2582,170 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
acb->pool->cancel(acb);
}
+/* block I/O throttling */
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, double elapsed_time, uint64_t *wait)
+{
+ uint64_t bps_limit = 0;
+ double bytes_limit, bytes_base, bytes_res;
+ double slice_time, wait_time;
+
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+ bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+ } else if (bs->io_limits.bps[is_write]) {
+ bps_limit = bs->io_limits.bps[is_write];
+ } else {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ slice_time = bs->slice_end - bs->slice_start;
+ slice_time /= (NANOSECONDS_PER_SECOND);
+ bytes_limit = bps_limit * slice_time;
+ bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+ bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
+ }
+
+ /* bytes_base: the number of bytes that have already been read/written in
+  * this slice; it is obtained from the history statistics.
+  * bytes_res: the remaining bytes of data which need to be read/written.
+  * (bytes_base + bytes_res) / bps_limit: the total time needed to complete
+  * reading/writing all of the data.
+  */
+ bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+ if (bytes_base + bytes_res <= bytes_limit) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ /* Calc approx time to dispatch */
+ wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
+
+ /* When the runtime I/O rate exceeds the limits, bs->slice_end needs to be
+  * extended so that the current statistics are kept until the timer fires;
+  * the amount of the extension was tuned based on experiments.
+  */
+ bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+ bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
+ if (wait) {
+ *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+ }
+
+ return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+ double elapsed_time, uint64_t *wait)
+{
+ uint64_t iops_limit = 0;
+ double ios_limit, ios_base;
+ double slice_time, wait_time;
+
+ if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+ } else if (bs->io_limits.iops[is_write]) {
+ iops_limit = bs->io_limits.iops[is_write];
+ } else {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ slice_time = bs->slice_end - bs->slice_start;
+ slice_time /= (NANOSECONDS_PER_SECOND);
+ ios_limit = iops_limit * slice_time;
+ ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
+ if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
+ }
+
+ if (ios_base + 1 <= ios_limit) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ /* Calc approx time to dispatch */
+ wait_time = (ios_base + 1) / iops_limit;
+ if (wait_time > elapsed_time) {
+ wait_time = wait_time - elapsed_time;
+ } else {
+ wait_time = 0;
+ }
+
+ bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+ bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
+ if (wait) {
+ *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+ }
+
+ return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, int64_t *wait)
+{
+ int64_t now, max_wait;
+ uint64_t bps_wait = 0, iops_wait = 0;
+ double elapsed_time;
+ int bps_ret, iops_ret;
+
+ now = qemu_get_clock_ns(vm_clock);
+ if ((bs->slice_start < now)
+ && (bs->slice_end > now)) {
+ bs->slice_end = now + bs->slice_time;
+ } else {
+ bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
+ bs->slice_start = now;
+ bs->slice_end = now + bs->slice_time;
+
+ bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
+ bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
+
+ bs->io_base.ios[is_write] = bs->nr_ops[is_write];
+ bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
+ }
+
+ elapsed_time = now - bs->slice_start;
+ elapsed_time /= (NANOSECONDS_PER_SECOND);
+
+ bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
+ is_write, elapsed_time, &bps_wait);
+ iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+ elapsed_time, &iops_wait);
+ if (bps_ret || iops_ret) {
+ max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+ if (wait) {
+ *wait = max_wait;
+ }
+
+ now = qemu_get_clock_ns(vm_clock);
+ if (bs->slice_end < now + max_wait) {
+ bs->slice_end = now + max_wait;
+ }
+
+ return true;
+ }
+
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+}
/**************************************************************/
/* async block device emulation */
diff --git a/block.h b/block.h
index bc8315d..9b5b35f 100644
--- a/block.h
+++ b/block.h
@@ -91,6 +91,7 @@ void bdrv_info_stats(Monitor *mon, QObject **ret_data);
/* disk I/O throttling */
void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_disable(BlockDriverState *bs);
bool bdrv_io_limits_enabled(BlockDriverState *bs);
void bdrv_init(void);
diff --git a/block_int.h b/block_int.h
index 7315e0d..69418fe 100644
--- a/block_int.h
+++ b/block_int.h
@@ -39,6 +39,7 @@
#define BLOCK_IO_LIMIT_TOTAL 2
#define BLOCK_IO_SLICE_TIME 100000000
+#define NANOSECONDS_PER_SECOND 1000000000.0
#define BLOCK_OPT_SIZE "size"
#define BLOCK_OPT_ENCRYPT "encryption"
--
1.7.6
* [Qemu-devel] [PATCH v12 0/5] The intro to QEMU block I/O throttling
@ 2011-11-03 8:57 Zhi Yong Wu
2011-11-03 8:57 ` [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm Zhi Yong Wu
0 siblings, 1 reply; 6+ messages in thread
From: Zhi Yong Wu @ 2011-11-03 8:57 UTC (permalink / raw)
To: kwolf; +Cc: zwu.kernel, ryanh, Zhi Yong Wu, qemu-devel, stefanha
The main goal of this patch series is to effectively cap the disk I/O speed or request count of a single VM. It is still a draft, so it unavoidably has some drawbacks; if you catch any, please let me know.
The series mainly introduces a block I/O throttling algorithm, a timer, and a block queue for each drive that has I/O limits enabled.
When a block request comes in, the throttling algorithm checks whether its I/O rate or request count exceeds the limits; if so, the request is enqueued on the block queue, and the timer later dispatches the queued requests.
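To make the dispatch decision concrete, here is a small standalone sketch (plain C, no QEMU APIs; all names and numbers in it are made up for illustration) of the bps half of that check, loosely mirroring bdrv_exceed_bps_limits() from patch 3:

#include <stdio.h>
#include <stdbool.h>

#define SLICE_TIME_NS 100000000.0   /* 0.1 s, like BLOCK_IO_SLICE_TIME */
#define NS_PER_SEC    1000000000.0

/* Return true (and a wait time in seconds) if dispatching bytes_req now
 * would push the current slice over bps_limit. */
static bool exceed_bps(double bps_limit, double bytes_done, double bytes_req,
                       double slice_ns, double elapsed_s, double *wait_s)
{
    double bytes_limit = bps_limit * (slice_ns / NS_PER_SEC);

    if (bytes_done + bytes_req <= bytes_limit) {
        *wait_s = 0;    /* still fits in the current slice: dispatch now */
        return false;
    }
    /* Time at which the whole amount is allowed, minus time already spent */
    *wait_s = (bytes_done + bytes_req) / bps_limit - elapsed_s;
    return true;
}

int main(void)
{
    double wait;
    /* 1 MB/s limit, 90000 bytes already done, a new 65536-byte request,
     * 0.08 s into a 0.1 s slice */
    bool throttled = exceed_bps(1000000.0, 90000.0, 65536.0,
                                SLICE_TIME_NS, 0.08, &wait);
    printf("throttled=%d wait=%.3f s\n", throttled, wait);
    return 0;
}

Compiled on its own, this prints throttled=1 with a wait of about 0.076 s for that input.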
Some available features are listed below (an example command line combining them follows the list):
(1) global bps limit.
-drive bps=xxx in bytes/s
(2) only read bps limit
-drive bps_rd=xxx in bytes/s
(3) only write bps limit
-drive bps_wr=xxx in bytes/s
(4) global iops limit
-drive iops=xxx in ios/s
(5) only read iops limit
-drive iops_rd=xxx in ios/s
(6) only write iops limit
-drive iops_wr=xxx in ios/s
(7) the combination of some limits.
-drive bps=xxx,iops=xxx
Known Limitations:
(1) #1 cannot coexist with #2, #3
(2) #4 cannot coexist with #5, #6
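For example, a hypothetical invocation (the file name and numbers are illustrative; the option names are the ones introduced by patch 1 of this series) that caps a virtio disk at 1 MB/s of total bandwidth and 100 read operations per second:
-drive file=disk.img,if=virtio,bps=1048576,iops_rd=100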
Changes since v11:
Made some changes based on Kevin's comments.
v11: Made some minimal changes based on Stefan's and Ryan's comments.
Added a perf report for block I/O throttling.
v10: Greatly simplified the logic and rebased the request queue onto CoQueue based on Stefan's comments.
v9: Made a lot of changes based on Kevin's comments.
slice_time is dynamically adjusted based on wait_time.
Rebased onto the latest qemu upstream.
v8: Fixed the per-patch build based on Stefan's comments.
v7: Mainly simplified the block queue.
Adjusted the code based on Stefan's comments.
v6: Mainly fixed the AIO callback issue for the block queue.
Adjusted the code based on Ram Pai's comments.
v5: Added QMP/HMP support.
Adjusted the code based on Stefan's comments.
qmp/hmp: add block_set_io_throttle
v4: Fixed a memory leak based on Ryan's feedback.
v3: Added the code for extending slice time, and modified the method to compute the wait time for the timer.
v2: The v2 code for QEMU disk I/O limits.
Modified the code mainly based on Stefan's comments.
v1: Initial submission of the QEMU disk I/O limits code.
Zhi Yong Wu (5):
block: add the blockio limits command line support
CoQueue: introduce qemu_co_queue_wait_insert_head
block: add I/O throttling algorithm
hmp/qmp: add block_set_io_throttle
block: perf testing report based on block I/O throttling
10mbps.dat | 310 ++++++++++++++++++++++++++++++++++++++++++++
1mbps.dat | 339 +++++++++++++++++++++++++++++++++++++++++++++++++
block.c | 274 +++++++++++++++++++++++++++++++++++++++
block.h | 5 +
block_int.h | 30 +++++
blockdev.c | 103 +++++++++++++++
blockdev.h | 2 +
hmp-commands.hx | 15 ++
hmp.c | 10 ++
qapi-schema.json | 16 ++-
qemu-config.c | 24 ++++
qemu-coroutine-lock.c | 8 +
qemu-coroutine.h | 6 +
qemu-options.hx | 1 +
qerror.c | 4 +
qerror.h | 3 +
qmp-commands.hx | 53 ++++++++-
17 files changed, 1201 insertions(+), 2 deletions(-)
create mode 100644 10mbps.dat
create mode 100644 1mbps.dat
--
1.7.6
* [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
2011-11-03 8:57 [Qemu-devel] [PATCH v12 0/5] The intro to QEMU block I/O throttling Zhi Yong Wu
@ 2011-11-03 8:57 ` Zhi Yong Wu
2011-11-07 15:18 ` Kevin Wolf
0 siblings, 1 reply; 6+ messages in thread
From: Zhi Yong Wu @ 2011-11-03 8:57 UTC (permalink / raw)
To: kwolf; +Cc: zwu.kernel, ryanh, Zhi Yong Wu, qemu-devel, stefanha
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
block.c | 220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
block.h | 1 +
block_int.h | 1 +
3 files changed, 222 insertions(+), 0 deletions(-)
diff --git a/block.c b/block.c
index 79e7f09..b2af48f 100644
--- a/block.c
+++ b/block.c
@@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+ double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, int64_t *wait);
+
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
#endif
/* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+ bs->io_limits_enabled = false;
+
+ while (qemu_co_queue_next(&bs->throttled_reqs));
+
+ if (bs->block_timer) {
+ qemu_del_timer(bs->block_timer);
+ qemu_free_timer(bs->block_timer);
+ bs->block_timer = NULL;
+ }
+
+ bs->slice_start = 0;
+ bs->slice_end = 0;
+ bs->slice_time = 0;
+ memset(&bs->io_base, 0, sizeof(bs->io_base));
+}
+
static void bdrv_block_timer(void *opaque)
{
BlockDriverState *bs = opaque;
@@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
|| io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+ bool is_write, int nb_sectors)
+{
+ int64_t wait_time = -1;
+
+ if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
+ qemu_co_queue_wait(&bs->throttled_reqs);
+ }
+
+ /* In fact, we hope to keep each request's timing, in FIFO mode. The next
+ * throttled requests will not be dequeued until the current request is
+ * allowed to be serviced. So if the current request still exceeds the
+ * limits, it will be inserted to the head. All requests followed it will
+ * be still in throttled_reqs queue.
+ */
+
+ while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
+ qemu_mod_timer(bs->block_timer,
+ wait_time + qemu_get_clock_ns(vm_clock));
+ qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
+ }
+
+ qemu_co_queue_next(&bs->throttled_reqs);
+}
+
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
@@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
bdrv_dev_change_media_cb(bs, true);
}
+ /* throttling disk I/O limits */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_enable(bs);
+ }
+
return 0;
unlink_and_fail:
@@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
bdrv_dev_change_media_cb(bs, false);
}
+
+ /*throttling disk I/O limits*/
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_disable(bs);
+ }
}
void bdrv_close_all(void)
@@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
return -EIO;
}
+ /* throttling disk read I/O */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, false, nb_sectors);
+ }
+
return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
}
@@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
return -EIO;
}
+ /* throttling disk write I/O */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, true, nb_sectors);
+ }
+
ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
if (bs->dirty_bitmap) {
@@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
acb->pool->cancel(acb);
}
+/* block I/O throttling */
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, double elapsed_time, uint64_t *wait) {
+ uint64_t bps_limit = 0;
+ double bytes_limit, bytes_base, bytes_res;
+ double slice_time, wait_time;
+
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+ bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+ } else if (bs->io_limits.bps[is_write]) {
+ bps_limit = bs->io_limits.bps[is_write];
+ } else {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ slice_time = bs->slice_end - bs->slice_start;
+ slice_time /= (NANOSECONDS_PER_SECOND);
+ bytes_limit = bps_limit * slice_time;
+ bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+ bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
+ }
+
+ bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+ if (bytes_base + bytes_res <= bytes_limit) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ /* Calc approx time to dispatch */
+ wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
+
+ bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+ bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
+ if (wait) {
+ *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+ }
+
+ return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+ double elapsed_time, uint64_t *wait) {
+ uint64_t iops_limit = 0;
+ double ios_limit, ios_base;
+ double slice_time, wait_time;
+
+ if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+ } else if (bs->io_limits.iops[is_write]) {
+ iops_limit = bs->io_limits.iops[is_write];
+ } else {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ slice_time = bs->slice_end - bs->slice_start;
+ slice_time /= (NANOSECONDS_PER_SECOND);
+ ios_limit = iops_limit * slice_time;
+ ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
+ if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
+ }
+
+ if (ios_base + 1 <= ios_limit) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ /* Calc approx time to dispatch */
+ wait_time = (ios_base + 1) / iops_limit;
+ if (wait_time > elapsed_time) {
+ wait_time = wait_time - elapsed_time;
+ } else {
+ wait_time = 0;
+ }
+
+ bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+ bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
+ if (wait) {
+ *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+ }
+
+ return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, int64_t *wait) {
+ int64_t now, max_wait;
+ uint64_t bps_wait = 0, iops_wait = 0;
+ double elapsed_time;
+ int bps_ret, iops_ret;
+
+ now = qemu_get_clock_ns(vm_clock);
+ if ((bs->slice_start < now)
+ && (bs->slice_end > now)) {
+ bs->slice_end = now + bs->slice_time;
+ } else {
+ bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
+ bs->slice_start = now;
+ bs->slice_end = now + bs->slice_time;
+
+ bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
+ bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
+
+ bs->io_base.ios[is_write] = bs->nr_ops[is_write];
+ bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
+ }
+
+ elapsed_time = now - bs->slice_start;
+ elapsed_time /= (NANOSECONDS_PER_SECOND);
+
+ bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
+ is_write, elapsed_time, &bps_wait);
+ iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+ elapsed_time, &iops_wait);
+ if (bps_ret || iops_ret) {
+ max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+ if (wait) {
+ *wait = max_wait;
+ }
+
+ now = qemu_get_clock_ns(vm_clock);
+ if (bs->slice_end < now + max_wait) {
+ bs->slice_end = now + max_wait;
+ }
+
+ return true;
+ }
+
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+}
/**************************************************************/
/* async block device emulation */
diff --git a/block.h b/block.h
index bc8315d..9b5b35f 100644
--- a/block.h
+++ b/block.h
@@ -91,6 +91,7 @@ void bdrv_info_stats(Monitor *mon, QObject **ret_data);
/* disk I/O throttling */
void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_disable(BlockDriverState *bs);
bool bdrv_io_limits_enabled(BlockDriverState *bs);
void bdrv_init(void);
diff --git a/block_int.h b/block_int.h
index 7315e0d..69418fe 100644
--- a/block_int.h
+++ b/block_int.h
@@ -39,6 +39,7 @@
#define BLOCK_IO_LIMIT_TOTAL 2
#define BLOCK_IO_SLICE_TIME 100000000
+#define NANOSECONDS_PER_SECOND 1000000000.0
#define BLOCK_OPT_SIZE "size"
#define BLOCK_OPT_ENCRYPT "encryption"
--
1.7.6
* Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
2011-11-03 8:57 ` [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm Zhi Yong Wu
@ 2011-11-07 15:18 ` Kevin Wolf
2011-11-08 4:34 ` Zhi Yong Wu
0 siblings, 1 reply; 6+ messages in thread
From: Kevin Wolf @ 2011-11-07 15:18 UTC (permalink / raw)
To: Zhi Yong Wu; +Cc: zwu.kernel, ryanh, qemu-devel, stefanha
On 03.11.2011 09:57, Zhi Yong Wu wrote:
> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
> block.c | 220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> block.h | 1 +
> block_int.h | 1 +
> 3 files changed, 222 insertions(+), 0 deletions(-)
>
> diff --git a/block.c b/block.c
> index 79e7f09..b2af48f 100644
> --- a/block.c
> +++ b/block.c
> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
> bool is_write);
> static void coroutine_fn bdrv_co_do_rw(void *opaque);
>
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> + bool is_write, double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> + double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> + bool is_write, int64_t *wait);
> +
> static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
> QTAILQ_HEAD_INITIALIZER(bdrv_states);
>
> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
> #endif
>
> /* throttling disk I/O limits */
> +void bdrv_io_limits_disable(BlockDriverState *bs)
> +{
> + bs->io_limits_enabled = false;
> +
> + while (qemu_co_queue_next(&bs->throttled_reqs));
> +
> + if (bs->block_timer) {
> + qemu_del_timer(bs->block_timer);
> + qemu_free_timer(bs->block_timer);
> + bs->block_timer = NULL;
> + }
> +
> + bs->slice_start = 0;
> + bs->slice_end = 0;
> + bs->slice_time = 0;
> + memset(&bs->io_base, 0, sizeof(bs->io_base));
> +}
> +
> static void bdrv_block_timer(void *opaque)
> {
> BlockDriverState *bs = opaque;
> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
> || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
> }
>
> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
> + bool is_write, int nb_sectors)
> +{
> + int64_t wait_time = -1;
> +
> + if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
> + qemu_co_queue_wait(&bs->throttled_reqs);
> + }
> +
> + /* In fact, we hope to keep each request's timing, in FIFO mode. The next
> + * throttled requests will not be dequeued until the current request is
> + * allowed to be serviced. So if the current request still exceeds the
> + * limits, it will be inserted to the head. All requests followed it will
> + * be still in throttled_reqs queue.
> + */
> +
> + while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
> + qemu_mod_timer(bs->block_timer,
> + wait_time + qemu_get_clock_ns(vm_clock));
> + qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
> + }
> +
> + qemu_co_queue_next(&bs->throttled_reqs);
> +}
> +
> /* check if the path starts with "<protocol>:" */
> static int path_has_protocol(const char *path)
> {
> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
> bdrv_dev_change_media_cb(bs, true);
> }
>
> + /* throttling disk I/O limits */
> + if (bs->io_limits_enabled) {
> + bdrv_io_limits_enable(bs);
> + }
> +
> return 0;
>
> unlink_and_fail:
> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>
> bdrv_dev_change_media_cb(bs, false);
> }
> +
> + /*throttling disk I/O limits*/
> + if (bs->io_limits_enabled) {
> + bdrv_io_limits_disable(bs);
> + }
> }
>
> void bdrv_close_all(void)
> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
> return -EIO;
> }
>
> + /* throttling disk read I/O */
> + if (bs->io_limits_enabled) {
> + bdrv_io_limits_intercept(bs, false, nb_sectors);
> + }
> +
> return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
> }
>
> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
> return -EIO;
> }
>
> + /* throttling disk write I/O */
> + if (bs->io_limits_enabled) {
> + bdrv_io_limits_intercept(bs, true, nb_sectors);
> + }
> +
> ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>
> if (bs->dirty_bitmap) {
> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
> acb->pool->cancel(acb);
> }
>
> +/* block I/O throttling */
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> + bool is_write, double elapsed_time, uint64_t *wait) {
> + uint64_t bps_limit = 0;
> + double bytes_limit, bytes_base, bytes_res;
> + double slice_time, wait_time;
> +
> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
> + } else if (bs->io_limits.bps[is_write]) {
> + bps_limit = bs->io_limits.bps[is_write];
> + } else {
> + if (wait) {
> + *wait = 0;
> + }
> +
> + return false;
> + }
> +
> + slice_time = bs->slice_end - bs->slice_start;
> + slice_time /= (NANOSECONDS_PER_SECOND);
> + bytes_limit = bps_limit * slice_time;
> + bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> + bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
> + }
> +
> + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +
> + if (bytes_base + bytes_res <= bytes_limit) {
> + if (wait) {
> + *wait = 0;
> + }
> +
> + return false;
> + }
> +
> + /* Calc approx time to dispatch */
> + wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
> +
> + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
> + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
> + if (wait) {
> + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
> + }
I'm not quite sure what bs->slice_end really is and what these
calculations do exactly. Looks like magic. Can you add some comments
that explain why slice_end is increased and how you estimate *wait?
> +
> + return true;
> +}
> +
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> + double elapsed_time, uint64_t *wait) {
Coding style requires the brace on its own line.
> + uint64_t iops_limit = 0;
> + double ios_limit, ios_base;
> + double slice_time, wait_time;
> +
> + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
> + } else if (bs->io_limits.iops[is_write]) {
> + iops_limit = bs->io_limits.iops[is_write];
> + } else {
> + if (wait) {
> + *wait = 0;
> + }
> +
> + return false;
> + }
> +
> + slice_time = bs->slice_end - bs->slice_start;
> + slice_time /= (NANOSECONDS_PER_SECOND);
> + ios_limit = iops_limit * slice_time;
> + ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
> + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> + ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
> + }
> +
> + if (ios_base + 1 <= ios_limit) {
> + if (wait) {
> + *wait = 0;
> + }
> +
> + return false;
> + }
> +
> + /* Calc approx time to dispatch */
> + wait_time = (ios_base + 1) / iops_limit;
> + if (wait_time > elapsed_time) {
> + wait_time = wait_time - elapsed_time;
> + } else {
> + wait_time = 0;
> + }
> +
> + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
> + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
> + if (wait) {
> + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
> + }
> +
> + return true;
> +}
> +
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> + bool is_write, int64_t *wait) {
Same here.
Kevin
* Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
2011-11-07 15:18 ` Kevin Wolf
@ 2011-11-08 4:34 ` Zhi Yong Wu
2011-11-08 8:41 ` Kevin Wolf
0 siblings, 1 reply; 6+ messages in thread
From: Zhi Yong Wu @ 2011-11-08 4:34 UTC (permalink / raw)
To: Kevin Wolf; +Cc: ryanh, Zhi Yong Wu, qemu-devel, stefanha
On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
>> ---
>> block.c | 220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> block.h | 1 +
>> block_int.h | 1 +
>> 3 files changed, 222 insertions(+), 0 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index 79e7f09..b2af48f 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>> bool is_write);
>> static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> + bool is_write, double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> + double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> + bool is_write, int64_t *wait);
>> +
>> static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>> QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>
>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>> #endif
>>
>> /* throttling disk I/O limits */
>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>> +{
>> + bs->io_limits_enabled = false;
>> +
>> + while (qemu_co_queue_next(&bs->throttled_reqs));
>> +
>> + if (bs->block_timer) {
>> + qemu_del_timer(bs->block_timer);
>> + qemu_free_timer(bs->block_timer);
>> + bs->block_timer = NULL;
>> + }
>> +
>> + bs->slice_start = 0;
>> + bs->slice_end = 0;
>> + bs->slice_time = 0;
>> + memset(&bs->io_base, 0, sizeof(bs->io_base));
>> +}
>> +
>> static void bdrv_block_timer(void *opaque)
>> {
>> BlockDriverState *bs = opaque;
>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>> || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>> }
>>
>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>> + bool is_write, int nb_sectors)
>> +{
>> + int64_t wait_time = -1;
>> +
>> + if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>> + qemu_co_queue_wait(&bs->throttled_reqs);
>> + }
>> +
>> + /* In fact, we hope to keep each request's timing, in FIFO mode. The next
>> + * throttled requests will not be dequeued until the current request is
>> + * allowed to be serviced. So if the current request still exceeds the
>> + * limits, it will be inserted to the head. All requests followed it will
>> + * be still in throttled_reqs queue.
>> + */
>> +
>> + while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>> + qemu_mod_timer(bs->block_timer,
>> + wait_time + qemu_get_clock_ns(vm_clock));
>> + qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>> + }
>> +
>> + qemu_co_queue_next(&bs->throttled_reqs);
>> +}
>> +
>> /* check if the path starts with "<protocol>:" */
>> static int path_has_protocol(const char *path)
>> {
>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>> bdrv_dev_change_media_cb(bs, true);
>> }
>>
>> + /* throttling disk I/O limits */
>> + if (bs->io_limits_enabled) {
>> + bdrv_io_limits_enable(bs);
>> + }
>> +
>> return 0;
>>
>> unlink_and_fail:
>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>
>> bdrv_dev_change_media_cb(bs, false);
>> }
>> +
>> + /*throttling disk I/O limits*/
>> + if (bs->io_limits_enabled) {
>> + bdrv_io_limits_disable(bs);
>> + }
>> }
>>
>> void bdrv_close_all(void)
>> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>> return -EIO;
>> }
>>
>> + /* throttling disk read I/O */
>> + if (bs->io_limits_enabled) {
>> + bdrv_io_limits_intercept(bs, false, nb_sectors);
>> + }
>> +
>> return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>> }
>>
>> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>> return -EIO;
>> }
>>
>> + /* throttling disk write I/O */
>> + if (bs->io_limits_enabled) {
>> + bdrv_io_limits_intercept(bs, true, nb_sectors);
>> + }
>> +
>> ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>
>> if (bs->dirty_bitmap) {
>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>> acb->pool->cancel(acb);
>> }
>>
>> +/* block I/O throttling */
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> + bool is_write, double elapsed_time, uint64_t *wait) {
>> + uint64_t bps_limit = 0;
>> + double bytes_limit, bytes_base, bytes_res;
>> + double slice_time, wait_time;
>> +
>> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>> + } else if (bs->io_limits.bps[is_write]) {
>> + bps_limit = bs->io_limits.bps[is_write];
>> + } else {
>> + if (wait) {
>> + *wait = 0;
>> + }
>> +
>> + return false;
>> + }
>> +
>> + slice_time = bs->slice_end - bs->slice_start;
>> + slice_time /= (NANOSECONDS_PER_SECOND);
>> + bytes_limit = bps_limit * slice_time;
>> + bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> + bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
>> + }
>> +
>> + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +
>> + if (bytes_base + bytes_res <= bytes_limit) {
>> + if (wait) {
>> + *wait = 0;
>> + }
>> +
>> + return false;
>> + }
>> +
>> + /* Calc approx time to dispatch */
>> + wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>> +
>> + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>> + if (wait) {
>> + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> + }
>
> I'm not quire sure what bs->slice_end really is and what these
> calculations do exactly. Looks like magic. Can you add some comments
> that explain why slice_end is increased?
As you know, when the runtime I/O rate exceeds the limits,
bs->slice_end needs to be extended so that the current statistics
can be kept until the timer fires; the amount of the extension was
tuned based on the results of experiments.
> and how you estimate *wait?
The wait time is calculated based on the history info of bps and iops.
bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
1.) bytes_base is the number of bytes that have already been read/written;
it is obtained from the history statistics.
2.) bytes_res is the remaining number of bytes which need to be read/written.
3.) (bytes_base + bytes_res) / bps_limit gives the total time needed to
complete reading/writing all of the data.
I'm not sure whether this makes it clear.
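To make that concrete with made-up numbers: assume bps_limit = 1,000,000
bytes/s, bytes_base = 900,000 bytes already transferred in the current slice,
a new request with bytes_res = 200,000 bytes, and elapsed_time = 0.8 s. Then
wait_time = (900,000 + 200,000) / 1,000,000 - 0.8 = 0.3, i.e. the request has
to wait roughly 0.3 s before it can be dispatched without exceeding the limit.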
>
>> +
>> + return true;
>> +}
>> +
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> + double elapsed_time, uint64_t *wait) {
>
> Coding style requires the brace on its own line.
>
>> + uint64_t iops_limit = 0;
>> + double ios_limit, ios_base;
>> + double slice_time, wait_time;
>> +
>> + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
>> + } else if (bs->io_limits.iops[is_write]) {
>> + iops_limit = bs->io_limits.iops[is_write];
>> + } else {
>> + if (wait) {
>> + *wait = 0;
>> + }
>> +
>> + return false;
>> + }
>> +
>> + slice_time = bs->slice_end - bs->slice_start;
>> + slice_time /= (NANOSECONDS_PER_SECOND);
>> + ios_limit = iops_limit * slice_time;
>> + ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
>> + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> + ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
>> + }
>> +
>> + if (ios_base + 1 <= ios_limit) {
>> + if (wait) {
>> + *wait = 0;
>> + }
>> +
>> + return false;
>> + }
>> +
>> + /* Calc approx time to dispatch */
>> + wait_time = (ios_base + 1) / iops_limit;
>> + if (wait_time > elapsed_time) {
>> + wait_time = wait_time - elapsed_time;
>> + } else {
>> + wait_time = 0;
>> + }
>> +
>> + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>> + if (wait) {
>> + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> + }
>> +
>> + return true;
>> +}
>> +
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> + bool is_write, int64_t *wait) {
>
> Same here.
>
> Kevin
>
--
Regards,
Zhi Yong Wu
* Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
2011-11-08 4:34 ` Zhi Yong Wu
@ 2011-11-08 8:41 ` Kevin Wolf
2011-11-08 8:57 ` Zhi Yong Wu
0 siblings, 1 reply; 6+ messages in thread
From: Kevin Wolf @ 2011-11-08 8:41 UTC (permalink / raw)
To: Zhi Yong Wu; +Cc: ryanh, Zhi Yong Wu, qemu-devel, stefanha
On 08.11.2011 05:34, Zhi Yong Wu wrote:
> On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
>>> ---
>>> block.c | 220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>> block.h | 1 +
>>> block_int.h | 1 +
>>> 3 files changed, 222 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/block.c b/block.c
>>> index 79e7f09..b2af48f 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>>> bool is_write);
>>> static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>>
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> + bool is_write, double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>> + double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>> + bool is_write, int64_t *wait);
>>> +
>>> static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>> QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>
>>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>>> #endif
>>>
>>> /* throttling disk I/O limits */
>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>> +{
>>> + bs->io_limits_enabled = false;
>>> +
>>> + while (qemu_co_queue_next(&bs->throttled_reqs));
>>> +
>>> + if (bs->block_timer) {
>>> + qemu_del_timer(bs->block_timer);
>>> + qemu_free_timer(bs->block_timer);
>>> + bs->block_timer = NULL;
>>> + }
>>> +
>>> + bs->slice_start = 0;
>>> + bs->slice_end = 0;
>>> + bs->slice_time = 0;
>>> + memset(&bs->io_base, 0, sizeof(bs->io_base));
>>> +}
>>> +
>>> static void bdrv_block_timer(void *opaque)
>>> {
>>> BlockDriverState *bs = opaque;
>>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>> || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>>> }
>>>
>>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>>> + bool is_write, int nb_sectors)
>>> +{
>>> + int64_t wait_time = -1;
>>> +
>>> + if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>>> + qemu_co_queue_wait(&bs->throttled_reqs);
>>> + }
>>> +
>>> + /* In fact, we hope to keep each request's timing, in FIFO mode. The next
>>> + * throttled requests will not be dequeued until the current request is
>>> + * allowed to be serviced. So if the current request still exceeds the
>>> + * limits, it will be inserted to the head. All requests followed it will
>>> + * be still in throttled_reqs queue.
>>> + */
>>> +
>>> + while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>>> + qemu_mod_timer(bs->block_timer,
>>> + wait_time + qemu_get_clock_ns(vm_clock));
>>> + qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>>> + }
>>> +
>>> + qemu_co_queue_next(&bs->throttled_reqs);
>>> +}
>>> +
>>> /* check if the path starts with "<protocol>:" */
>>> static int path_has_protocol(const char *path)
>>> {
>>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>> bdrv_dev_change_media_cb(bs, true);
>>> }
>>>
>>> + /* throttling disk I/O limits */
>>> + if (bs->io_limits_enabled) {
>>> + bdrv_io_limits_enable(bs);
>>> + }
>>> +
>>> return 0;
>>>
>>> unlink_and_fail:
>>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>>
>>> bdrv_dev_change_media_cb(bs, false);
>>> }
>>> +
>>> + /*throttling disk I/O limits*/
>>> + if (bs->io_limits_enabled) {
>>> + bdrv_io_limits_disable(bs);
>>> + }
>>> }
>>>
>>> void bdrv_close_all(void)
>>> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>>> return -EIO;
>>> }
>>>
>>> + /* throttling disk read I/O */
>>> + if (bs->io_limits_enabled) {
>>> + bdrv_io_limits_intercept(bs, false, nb_sectors);
>>> + }
>>> +
>>> return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>>> }
>>>
>>> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>>> return -EIO;
>>> }
>>>
>>> + /* throttling disk write I/O */
>>> + if (bs->io_limits_enabled) {
>>> + bdrv_io_limits_intercept(bs, true, nb_sectors);
>>> + }
>>> +
>>> ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>>
>>> if (bs->dirty_bitmap) {
>>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>>> acb->pool->cancel(acb);
>>> }
>>>
>>> +/* block I/O throttling */
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> + bool is_write, double elapsed_time, uint64_t *wait) {
>>> + uint64_t bps_limit = 0;
>>> + double bytes_limit, bytes_base, bytes_res;
>>> + double slice_time, wait_time;
>>> +
>>> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>> + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>>> + } else if (bs->io_limits.bps[is_write]) {
>>> + bps_limit = bs->io_limits.bps[is_write];
>>> + } else {
>>> + if (wait) {
>>> + *wait = 0;
>>> + }
>>> +
>>> + return false;
>>> + }
>>> +
>>> + slice_time = bs->slice_end - bs->slice_start;
>>> + slice_time /= (NANOSECONDS_PER_SECOND);
>>> + bytes_limit = bps_limit * slice_time;
>>> + bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>>> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>> + bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
>>> + }
>>> +
>>> + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>>> +
>>> + if (bytes_base + bytes_res <= bytes_limit) {
>>> + if (wait) {
>>> + *wait = 0;
>>> + }
>>> +
>>> + return false;
>>> + }
>>> +
>>> + /* Calc approx time to dispatch */
>>> + wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>> +
>>> + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>> + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>>> + if (wait) {
>>> + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>> + }
>>
>> I'm not quire sure what bs->slice_end really is and what these
>> calculations do exactly. Looks like magic. Can you add some comments
>> that explain why slice_end is increased?
> As you'ver known, when the I/O rate at runtime exceeds the limits,
> bs->slice_end need to be extended in order that the current statistic
> info can be kept until the timer fire, so it is increased and tuned
> based on the result of experimet.
>
>> and how you estimate *wait?
> The wait time is calcuated based on the history info of bps and iops.
>
> bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>
> 1.) bytes_base is the bytes of data which have been read/written; and
> it is obtained from the history statistic info.
> 2.) bytes_res is the remaining bytes of data which need to be read/written.
> 3.) (bytes_base + bytes_res) / bps_limit, this expression will be used
> to calcuated the total time for completing reading/writting all data.
>
> I don't make sure if you understand this.
Yes, that makes sense to me.
However, I don't understand why things like 10 * BLOCK_IO_SLICE_TIME or
3 * BLOCK_IO_SLICE_TIME appear in the code. These numbers are magic to
me. Are they more or less arbitrary values that happen to work well?
Kevin
* Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
2011-11-08 8:41 ` Kevin Wolf
@ 2011-11-08 8:57 ` Zhi Yong Wu
0 siblings, 0 replies; 6+ messages in thread
From: Zhi Yong Wu @ 2011-11-08 8:57 UTC (permalink / raw)
To: Kevin Wolf; +Cc: ryanh, Zhi Yong Wu, qemu-devel, stefanha
On Tue, Nov 8, 2011 at 4:41 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 08.11.2011 05:34, schrieb Zhi Yong Wu:
>> On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>>> Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
>>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
>>>> ---
>>>> block.c | 220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>> block.h | 1 +
>>>> block_int.h | 1 +
>>>> 3 files changed, 222 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/block.c b/block.c
>>>> index 79e7f09..b2af48f 100644
>>>> --- a/block.c
>>>> +++ b/block.c
>>>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>>>> bool is_write);
>>>> static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>>>
>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>> + bool is_write, double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>>> + double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>>> + bool is_write, int64_t *wait);
>>>> +
>>>> static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>> QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>>
>>>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>>>> #endif
>>>>
>>>> /* throttling disk I/O limits */
>>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>>> +{
>>>> + bs->io_limits_enabled = false;
>>>> +
>>>> + while (qemu_co_queue_next(&bs->throttled_reqs));
>>>> +
>>>> + if (bs->block_timer) {
>>>> + qemu_del_timer(bs->block_timer);
>>>> + qemu_free_timer(bs->block_timer);
>>>> + bs->block_timer = NULL;
>>>> + }
>>>> +
>>>> + bs->slice_start = 0;
>>>> + bs->slice_end = 0;
>>>> + bs->slice_time = 0;
>>>> + memset(&bs->io_base, 0, sizeof(bs->io_base));
>>>> +}
>>>> +
>>>> static void bdrv_block_timer(void *opaque)
>>>> {
>>>> BlockDriverState *bs = opaque;
>>>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>>> || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>>>> }
>>>>
>>>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>>>> + bool is_write, int nb_sectors)
>>>> +{
>>>> + int64_t wait_time = -1;
>>>> +
>>>> + if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>>>> + qemu_co_queue_wait(&bs->throttled_reqs);
>>>> + }
>>>> +
>>>> + /* In fact, we hope to keep each request's timing, in FIFO mode. The next
>>>> + * throttled requests will not be dequeued until the current request is
>>>> + * allowed to be serviced. So if the current request still exceeds the
>>>> + * limits, it will be inserted to the head. All requests followed it will
>>>> + * be still in throttled_reqs queue.
>>>> + */
>>>> +
>>>> + while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>>>> + qemu_mod_timer(bs->block_timer,
>>>> + wait_time + qemu_get_clock_ns(vm_clock));
>>>> + qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>>>> + }
>>>> +
>>>> + qemu_co_queue_next(&bs->throttled_reqs);
>>>> +}
>>>> +
>>>> /* check if the path starts with "<protocol>:" */
>>>> static int path_has_protocol(const char *path)
>>>> {
>>>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>> bdrv_dev_change_media_cb(bs, true);
>>>> }
>>>>
>>>> + /* throttling disk I/O limits */
>>>> + if (bs->io_limits_enabled) {
>>>> + bdrv_io_limits_enable(bs);
>>>> + }
>>>> +
>>>> return 0;
>>>>
>>>> unlink_and_fail:
>>>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>>>
>>>> bdrv_dev_change_media_cb(bs, false);
>>>> }
>>>> +
>>>> + /*throttling disk I/O limits*/
>>>> + if (bs->io_limits_enabled) {
>>>> + bdrv_io_limits_disable(bs);
>>>> + }
>>>> }
>>>>
>>>> void bdrv_close_all(void)
>>>> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>>>> return -EIO;
>>>> }
>>>>
>>>> + /* throttling disk read I/O */
>>>> + if (bs->io_limits_enabled) {
>>>> + bdrv_io_limits_intercept(bs, false, nb_sectors);
>>>> + }
>>>> +
>>>> return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>>>> }
>>>>
>>>> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>>>> return -EIO;
>>>> }
>>>>
>>>> + /* throttling disk write I/O */
>>>> + if (bs->io_limits_enabled) {
>>>> + bdrv_io_limits_intercept(bs, true, nb_sectors);
>>>> + }
>>>> +
>>>> ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>>>
>>>> if (bs->dirty_bitmap) {
>>>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>>>> acb->pool->cancel(acb);
>>>> }
>>>>
>>>> +/* block I/O throttling */
>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>> + bool is_write, double elapsed_time, uint64_t *wait) {
>>>> + uint64_t bps_limit = 0;
>>>> + double bytes_limit, bytes_base, bytes_res;
>>>> + double slice_time, wait_time;
>>>> +
>>>> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>>> + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>>>> + } else if (bs->io_limits.bps[is_write]) {
>>>> + bps_limit = bs->io_limits.bps[is_write];
>>>> + } else {
>>>> + if (wait) {
>>>> + *wait = 0;
>>>> + }
>>>> +
>>>> + return false;
>>>> + }
>>>> +
>>>> + slice_time = bs->slice_end - bs->slice_start;
>>>> + slice_time /= (NANOSECONDS_PER_SECOND);
>>>> + bytes_limit = bps_limit * slice_time;
>>>> + bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>>>> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>>> + bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
>>>> + }
>>>> +
>>>> + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>>>> +
>>>> + if (bytes_base + bytes_res <= bytes_limit) {
>>>> + if (wait) {
>>>> + *wait = 0;
>>>> + }
>>>> +
>>>> + return false;
>>>> + }
>>>> +
>>>> + /* Calc approx time to dispatch */
>>>> + wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>>> +
>>>> + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>>> + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>>>> + if (wait) {
>>>> + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>>> + }
>>>
>>> I'm not quire sure what bs->slice_end really is and what these
>>> calculations do exactly. Looks like magic. Can you add some comments
>>> that explain why slice_end is increased?
>> As you'ver known, when the I/O rate at runtime exceeds the limits,
>> bs->slice_end need to be extended in order that the current statistic
>> info can be kept until the timer fire, so it is increased and tuned
>> based on the result of experimet.
>>
>>> and how you estimate *wait?
>> The wait time is calcuated based on the history info of bps and iops.
>>
>> bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>
>> 1.) bytes_base is the bytes of data which have been read/written; and
>> it is obtained from the history statistic info.
>> 2.) bytes_res is the remaining bytes of data which need to be read/written.
>> 3.) (bytes_base + bytes_res) / bps_limit, this expression will be used
>> to calcuated the total time for completing reading/writting all data.
>>
>> I don't make sure if you understand this.
>
> Yes, I think this makes sense to me.
>
> However, I don't understand why things like 10 * BLOCK_IO_SLICE_TIME or
10 * BLOCK_IO_SLICE_TIME is used to convert a value in seconds into
nanoseconds; it is equal to one second expressed in nanoseconds.
> 3 * BLOCK_IO_SLICE_TIME appear in the code. These numbers are magic for
> me. Are they more or less arbitrary values that happen to work well?
They are used to define the window size of one slice. The slice size
determines how closely the calculated runtime rate tracks the real
runtime rate, so they are tunable values.
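As a sanity check using the constants from block_int.h: BLOCK_IO_SLICE_TIME
is 100,000,000 ns = 0.1 s, so 10 * BLOCK_IO_SLICE_TIME = 1,000,000,000 ns
= 1 s (the seconds-to-nanoseconds factor above), the default slice of
5 * BLOCK_IO_SLICE_TIME is 0.5 s, and the 3 * BLOCK_IO_SLICE_TIME subtracted
from slice_end is 0.3 s.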
>
> Kevin
>
--
Regards,
Zhi Yong Wu