* [PATCH V2 1/3] blk-stat: delete useless code
2017-10-07 0:55 [PATCH V2 0/3] block: export latency info for cgroups Shaohua Li
@ 2017-10-07 0:55 ` Shaohua Li
2017-10-10 17:57 ` Omar Sandoval
2017-10-10 18:02 ` Jens Axboe
2017-10-07 0:56 ` [PATCH V2 2/3] block: set request_list for request Shaohua Li
2017-10-07 0:56 ` [PATCH V2 3/3] blockcg: export latency info for each cgroup Shaohua Li
2 siblings, 2 replies; 10+ messages in thread
From: Shaohua Li @ 2017-10-07 0:55 UTC (permalink / raw)
To: linux-block; +Cc: vgoyal, tj, axboe, Kernel-team, Shaohua Li, Omar Sandoval
From: Shaohua Li <shli@fb.com>
Fix two issues:
- the per-cpu stat flush is unnecessary, nobody uses per-cpu stat except
sum it to global stat. We can do the calculation there. The flush just
wastes cpu time.
- some fields are signed int/s64. I don't see the point.
Cc: Omar Sandoval <osandov@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
block/blk-stat.c | 45 +++++++--------------------------------------
include/linux/blk_types.h | 5 ++---
2 files changed, 9 insertions(+), 41 deletions(-)
diff --git a/block/blk-stat.c b/block/blk-stat.c
index c52356d..3a2f3c9 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -11,8 +11,6 @@
#include "blk-mq.h"
#include "blk.h"
-#define BLK_RQ_STAT_BATCH 64
-
struct blk_queue_stats {
struct list_head callbacks;
spinlock_t lock;
@@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat)
{
stat->min = -1ULL;
stat->max = stat->nr_samples = stat->mean = 0;
- stat->batch = stat->nr_batch = 0;
-}
-
-static void blk_stat_flush_batch(struct blk_rq_stat *stat)
-{
- const s32 nr_batch = READ_ONCE(stat->nr_batch);
- const s32 nr_samples = READ_ONCE(stat->nr_samples);
-
- if (!nr_batch)
- return;
- if (!nr_samples)
- stat->mean = div64_s64(stat->batch, nr_batch);
- else {
- stat->mean = div64_s64((stat->mean * nr_samples) +
- stat->batch,
- nr_batch + nr_samples);
- }
-
- stat->nr_samples += nr_batch;
- stat->nr_batch = stat->batch = 0;
+ stat->batch = 0;
}
+/* src is a per-cpu stat, mean isn't initialized */
static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{
- blk_stat_flush_batch(src);
-
if (!src->nr_samples)
return;
dst->min = min(dst->min, src->min);
dst->max = max(dst->max, src->max);
- if (!dst->nr_samples)
- dst->mean = src->mean;
- else {
- dst->mean = div64_s64((src->mean * src->nr_samples) +
- (dst->mean * dst->nr_samples),
- dst->nr_samples + src->nr_samples);
- }
+ dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples,
+ dst->nr_samples + src->nr_samples);
+
dst->nr_samples += src->nr_samples;
}
@@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
{
stat->min = min(stat->min, value);
stat->max = max(stat->max, value);
-
- if (stat->batch + value < stat->batch ||
- stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
- blk_stat_flush_batch(stat);
-
stat->batch += value;
- stat->nr_batch++;
+ stat->nr_samples++;
}
void blk_stat_add(struct request *rq)
@@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq)
struct blk_stat_callback *cb;
struct blk_rq_stat *stat;
int bucket;
- s64 now, value;
+ u64 now, value;
now = __blk_stat_time(ktime_to_ns(ktime_get()));
if (now < blk_stat_time(&rq->issue_stat))
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a2d2aa7..3385c89 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -329,11 +329,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
}
struct blk_rq_stat {
- s64 mean;
+ u64 mean;
u64 min;
u64 max;
- s32 nr_samples;
- s32 nr_batch;
+ u32 nr_samples;
u64 batch;
};
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH V2 2/3] block: set request_list for request
2017-10-07 0:55 [PATCH V2 0/3] block: export latency info for cgroups Shaohua Li
2017-10-07 0:55 ` [PATCH V2 1/3] blk-stat: delete useless code Shaohua Li
@ 2017-10-07 0:56 ` Shaohua Li
2017-10-10 19:47 ` Jens Axboe
2017-10-07 0:56 ` [PATCH V2 3/3] blockcg: export latency info for each cgroup Shaohua Li
2 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2017-10-07 0:56 UTC (permalink / raw)
To: linux-block; +Cc: vgoyal, tj, axboe, Kernel-team, Shaohua Li
From: Shaohua Li <shli@fb.com>
Legacy queue sets request's request_list, mq doesn't. This makes mq does
the same thing, so we can find cgroup of a request. Note, we really
only use blkg field of request_list, it's pointless to allocate mempool
for request_list in mq case.
Signed-off-by: Shaohua Li <shli@fb.com>
---
block/blk-core.c | 2 +-
block/blk-mq.c | 5 +++++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index adb064a..f88afd1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -718,7 +718,7 @@ static void free_request_size(void *element, void *data)
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask)
{
- if (unlikely(rl->rq_pool))
+ if (unlikely(rl->rq_pool) || q->mq_ops)
return 0;
rl->q = q;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7f01d69..c3b5876 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -481,6 +481,9 @@ void blk_mq_free_request(struct request *rq)
wbt_done(q->rq_wb, &rq->issue_stat);
+ if (blk_rq_rl(rq))
+ blk_put_rl(blk_rq_rl(rq));
+
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
if (rq->tag != -1)
@@ -1504,6 +1507,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
blk_init_request_from_bio(rq, bio);
+ blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
+
blk_account_io_start(rq, true);
}
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH V2 3/3] blockcg: export latency info for each cgroup
2017-10-07 0:55 [PATCH V2 0/3] block: export latency info for cgroups Shaohua Li
2017-10-07 0:55 ` [PATCH V2 1/3] blk-stat: delete useless code Shaohua Li
2017-10-07 0:56 ` [PATCH V2 2/3] block: set request_list for request Shaohua Li
@ 2017-10-07 0:56 ` Shaohua Li
2017-10-10 17:35 ` weiping zhang
2 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2017-10-07 0:56 UTC (permalink / raw)
To: linux-block; +Cc: vgoyal, tj, axboe, Kernel-team, Shaohua Li
From: Shaohua Li <shli@fb.com>
Export the latency info to user. The latency is a good sign to indicate
if IO is congested or not. User can use the info to make decisions like
adjust cgroup settings.
Existing io.stat shows accumulated IO bytes and requests, but
accumulated value for latency doesn't make much sense. This patch
exports the latency info in a 100ms interval.
To reduce overhead, latency info of children is propagated to parents
every 10ms. This means the parents's latency could lost 10ms info of its
children in 100ms. This should be ok, as we don't need precise latency
info.
A micro benchmark running fio test against null_blk in a sixth level
cgroup doesn't show obvious regression. perf shows a little bit overhead
in blk_stat_add (~1%) and blkg_lookup (~1%), which is unavoidable right
now.
With this patch, the io.stat will show:
8:0 rbytes=7282688 wbytes=0 rios=83 wios=0 rlat_mean=2720 rlat_min=183 rlat_max=14880 wlat_mean=0 wlat_min=0 wlat_max=0
The new fields will display read/write average/minimum/maximum latency
within 100ms. The latency is us.
Signed-off-by: Shaohua Li <shli@fb.com>
---
block/blk-cgroup.c | 29 +++++++++-
block/blk-stat.c | 135 ++++++++++++++++++++++++++++++++++++++++++++-
block/blk.h | 5 ++
include/linux/blk-cgroup.h | 9 +++
4 files changed, 175 insertions(+), 3 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d3f56ba..89c5075 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -78,6 +78,7 @@ static void blkg_free(struct blkcg_gq *blkg)
blkg_rwstat_exit(&blkg->stat_ios);
blkg_rwstat_exit(&blkg->stat_bytes);
+ blkg_rq_stat_exit(blkg);
kfree(blkg);
}
@@ -104,6 +105,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
goto err_free;
+ if (blkg_rq_stat_init(blkg, gfp_mask))
+ goto err_free;
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
blkg->blkcg = blkcg;
@@ -952,6 +955,8 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
const char *dname;
struct blkg_rwstat rwstat;
u64 rbytes, wbytes, rios, wios;
+ u64 rmean = 0, rmin = 0, rmax = 0;
+ u64 wmean = 0, wmin = 0, wmax = 0;
dname = blkg_dev_name(blkg);
if (!dname)
@@ -969,11 +974,30 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+ if (blkg->rq_stat.stat[0].nr_samples) {
+ rmean = blkg->rq_stat.stat[0].mean;
+ do_div(rmean, 1000);
+ rmin = blkg->rq_stat.stat[0].min;
+ do_div(rmin, 1000);
+ rmax = blkg->rq_stat.stat[0].max;
+ do_div(rmax, 1000);
+ }
+ if (blkg->rq_stat.stat[1].nr_samples) {
+ wmean = blkg->rq_stat.stat[1].mean;
+ do_div(wmean, 1000);
+ wmin = blkg->rq_stat.stat[1].min;
+ do_div(wmin, 1000);
+ wmax = blkg->rq_stat.stat[1].max;
+ do_div(wmax, 1000);
+ }
spin_unlock_irq(blkg->q->queue_lock);
if (rbytes || wbytes || rios || wios)
- seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
- dname, rbytes, wbytes, rios, wios);
+ seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu "
+ "rlat_mean=%llu rlat_min=%llu rlat_max=%llu "
+ "wlat_mean=%llu wlat_min=%llu wlat_max=%llu\n",
+ dname, rbytes, wbytes, rios, wios,
+ rmean, rmin, rmax, wmean, wmin, wmax);
}
rcu_read_unlock();
@@ -1167,6 +1191,7 @@ int blkcg_init_queue(struct request_queue *q)
blkg_destroy_all(q);
spin_unlock_irq(q->queue_lock);
}
+ blk_stat_enable_accounting(q);
return ret;
}
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 3a2f3c9..12fd356 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/rculist.h>
#include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
#include "blk-stat.h"
#include "blk-mq.h"
@@ -47,6 +48,135 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
stat->nr_samples++;
}
+#ifdef CONFIG_BLK_CGROUP
+#define BLKCG_FLUSH_WINDOW (1000 * 1000 * 100)
+#define BLKCG_PROPAGATE_WINDOW (1000 * 1000 * 10)
+static void blkg_rq_stat_flush_percpu(struct blkcg_gq *blkg, u64 now)
+{
+ int cpu;
+
+ if (now < blkg->rq_stat.last_flush_time + BLKCG_FLUSH_WINDOW)
+ return;
+ blkg->rq_stat.last_flush_time = now;
+
+ blk_stat_init(&blkg->rq_stat.stat[0]);
+ blk_stat_init(&blkg->rq_stat.stat[1]);
+
+ for_each_online_cpu(cpu) {
+ struct blk_rq_stat *cpu_stat;
+
+ cpu_stat = per_cpu_ptr(blkg->rq_stat.cpu_stat, cpu);
+ blk_stat_sum(&blkg->rq_stat.stat[0], &cpu_stat[0]);
+ blk_stat_init(&cpu_stat[0]);
+ blk_stat_sum(&blkg->rq_stat.stat[1], &cpu_stat[1]);
+ blk_stat_init(&cpu_stat[1]);
+ }
+}
+
+static void blkg_rq_stat_propagate(struct blkcg_gq *blkg, int dir, u64 value,
+ u64 now)
+{
+ struct blkcg_gq *parent;
+ struct blk_rq_stat *prop_stat;
+ u64 *prop_time;
+
+ prop_stat = &this_cpu_ptr(blkg->rq_stat.cpu_propagate_stat)[dir];
+ prop_time = this_cpu_ptr(blkg->rq_stat.cpu_propagate_time);
+
+ __blk_stat_add(prop_stat, value);
+
+ if (now < *prop_time + BLKCG_PROPAGATE_WINDOW)
+ return;
+
+ prop_stat = this_cpu_ptr(blkg->rq_stat.cpu_propagate_stat);
+ parent = blkg->parent;
+ while (parent) {
+ struct blk_rq_stat *pstat;
+
+ pstat = this_cpu_ptr(parent->rq_stat.cpu_stat);
+ pstat[0].min = min(prop_stat[0].min, pstat[0].min);
+ pstat[1].min = min(prop_stat[1].min, pstat[1].min);
+ pstat[0].max = max(prop_stat[0].max, pstat[0].max);
+ pstat[1].max = max(prop_stat[1].max, pstat[1].max);
+ pstat[0].batch += prop_stat[0].batch;
+ pstat[1].batch += prop_stat[1].batch;
+ pstat[0].nr_samples += prop_stat[0].nr_samples;
+ pstat[1].nr_samples += prop_stat[1].nr_samples;
+
+ blkg_rq_stat_flush_percpu(parent, now);
+
+ parent = parent->parent;
+ }
+
+ *prop_time = now;
+ blk_stat_init(&prop_stat[0]);
+ blk_stat_init(&prop_stat[1]);
+}
+
+static void blkg_rq_stat_add(struct request *rq, u64 now, u64 value)
+{
+ struct blkcg_gq *blkg;
+ struct blk_rq_stat *stat;
+ int dir = rq_data_dir(rq);
+
+ if (!blk_rq_rl(rq))
+ return;
+ blkg = blk_rq_rl(rq)->blkg;
+
+ stat = get_cpu_ptr(blkg->rq_stat.cpu_stat);
+ __blk_stat_add(&stat[dir], value);
+ blkg_rq_stat_propagate(blkg, dir, value, now);
+ put_cpu_ptr(blkg->rq_stat.cpu_stat);
+
+ blkg_rq_stat_flush_percpu(blkg, now);
+}
+
+void blkg_rq_stat_exit(struct blkcg_gq *blkg)
+{
+ free_percpu(blkg->rq_stat.cpu_stat);
+ free_percpu(blkg->rq_stat.cpu_propagate_stat);
+ free_percpu(blkg->rq_stat.cpu_propagate_time);
+}
+
+int blkg_rq_stat_init(struct blkcg_gq *blkg, gfp_t gfp)
+{
+ int cpu;
+
+ memset(&blkg->rq_stat, 0, sizeof(blkg->rq_stat));
+
+ blkg->rq_stat.cpu_stat =
+ __alloc_percpu_gfp(2 * sizeof(struct blk_rq_stat),
+ __alignof__(struct blk_rq_stat), gfp);
+ blkg->rq_stat.cpu_propagate_stat =
+ __alloc_percpu_gfp(2 * sizeof(struct blk_rq_stat),
+ __alignof__(struct blk_rq_stat), gfp);
+ blkg->rq_stat.cpu_propagate_time = alloc_percpu_gfp(u64, gfp);
+ if (!blkg->rq_stat.cpu_stat || !blkg->rq_stat.cpu_propagate_stat ||
+ !blkg->rq_stat.cpu_propagate_time) {
+ blkg_rq_stat_exit(blkg);
+ return -ENOMEM;
+ }
+ blk_stat_init(&blkg->rq_stat.stat[0]);
+ blk_stat_init(&blkg->rq_stat.stat[1]);
+ for_each_online_cpu(cpu) {
+ struct blk_rq_stat *cpu_stat;
+
+ cpu_stat = per_cpu_ptr(blkg->rq_stat.cpu_stat, cpu);
+ blk_stat_init(&cpu_stat[0]);
+ blk_stat_init(&cpu_stat[1]);
+ cpu_stat = per_cpu_ptr(blkg->rq_stat.cpu_propagate_stat, cpu);
+ blk_stat_init(&cpu_stat[0]);
+ blk_stat_init(&cpu_stat[1]);
+ }
+ return 0;
+}
+
+#else
+static void blkg_rq_stat_add(struct request *rq, u64 now, u64 value)
+{
+}
+#endif
+
void blk_stat_add(struct request *rq)
{
struct request_queue *q = rq->q;
@@ -54,8 +184,10 @@ void blk_stat_add(struct request *rq)
struct blk_rq_stat *stat;
int bucket;
u64 now, value;
+ u64 time;
- now = __blk_stat_time(ktime_to_ns(ktime_get()));
+ time = ktime_get_ns();
+ now = __blk_stat_time(time);
if (now < blk_stat_time(&rq->issue_stat))
return;
@@ -64,6 +196,7 @@ void blk_stat_add(struct request *rq)
blk_throtl_stat_add(rq, value);
rcu_read_lock();
+ blkg_rq_stat_add(rq, time, value);
list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
if (!blk_stat_is_active(cb))
continue;
diff --git a/block/blk.h b/block/blk.h
index fda5a46..4d76a971 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -309,6 +309,11 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { }
static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
#endif
+#ifdef CONFIG_BLK_CGROUP
+extern int blkg_rq_stat_init(struct blkcg_gq *blkg, gfp_t gfp);
+extern void blkg_rq_stat_exit(struct blkcg_gq *blkg);
+#endif
+
#ifdef CONFIG_BOUNCE
extern int init_emergency_isa_pool(void);
extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index f57e54d..58f3d25 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -102,6 +102,14 @@ struct blkcg_policy_data {
int plid;
};
+struct blkcg_gq_rq_stat {
+ u64 last_flush_time;
+ struct blk_rq_stat stat[2];
+ struct blk_rq_stat __percpu *cpu_stat;
+ struct blk_rq_stat __percpu *cpu_propagate_stat;
+ u64 __percpu *cpu_propagate_time;
+};
+
/* association between a blk cgroup and a request queue */
struct blkcg_gq {
/* Pointer to the associated request_queue */
@@ -130,6 +138,7 @@ struct blkcg_gq {
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
+ struct blkcg_gq_rq_stat rq_stat;
struct blkg_policy_data *pd[BLKCG_MAX_POLS];
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread