All of lore.kernel.org
 help / color / mirror / Atom feed
From: Omar Sandoval <osandov@osandov.com>
To: Jianchao Wang <jianchao.w.wang@oracle.com>
Cc: axboe@kernel.dk, linux-block@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH V3 2/2] block: kyber: make kyber more friendly with merging
Date: Wed, 30 May 2018 09:40:55 -0700	[thread overview]
Message-ID: <20180530164055.GA25342@vader> (raw)
In-Reply-To: <1527665168-1965-3-git-send-email-jianchao.w.wang@oracle.com>

On Wed, May 30, 2018 at 03:26:08PM +0800, Jianchao Wang wrote:
> Currently, kyber is very unfriendly with merging. kyber depends
> on ctx rq_list to do merging, however, most of time, it will not
> leave any requests in ctx rq_list. This is because even if tokens
> of one domain is used up, kyber will try to dispatch requests
> from other domain and flush the rq_list there.
> 
> To improve this, we setup kyber_ctx_queue (kcq) which is similar
> with ctx, but it has rq_lists for different domain and build same
> mapping between kcq and khd as the ctx & hctx. Then we could merge,
> insert and dispatch for different domains separately. At the same
> time, only flush the rq_list of kcq when get domain token successfully.
> Then if one domain token is used up, the requests could be left in
> the rq_list of that domain and maybe merged with following io.
> 
> Following is my test result on machine with 8 cores and NVMe card
> INTEL SSDPEKKR128G7
> 
> fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
> seq/random
> +------+---------------------------------------------------------------+
> |patch?| bw(MB/s) |   iops    | slat(usec) |    clat(usec)   |  merge  |
> +----------------------------------------------------------------------+
> | w/o  |  606/612 | 151k/153k |  6.89/7.03 | 3349.21/3305.40 |   0/0   |
> +----------------------------------------------------------------------+
> | w/   | 1083/616 | 277k/154k |  4.93/6.95 | 1830.62/3279.95 | 223k/3k |
> +----------------------------------------------------------------------+
> When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
> on my platform.

Looks good, and it survived blktests plus a few other stress tests.
Thanks!

Reviewed-by: Omar Sandoval <osandov@fb.com>

> Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
> Tested-by: Holger Hoffst�tte <holger@applied-asynchrony.com>
> ---
>  block/kyber-iosched.c | 190 +++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 158 insertions(+), 32 deletions(-)
> 
> diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
> index 0d6d25e3..a1613655 100644
> --- a/block/kyber-iosched.c
> +++ b/block/kyber-iosched.c
> @@ -72,6 +72,19 @@ static const unsigned int kyber_batch_size[] = {
>  	[KYBER_OTHER] = 8,
>  };
>  
> +/*
> + * There is a same mapping between ctx & hctx and kcq & khd,
> + * we use request->mq_ctx->index_hw to index the kcq in khd.
> + */
> +struct kyber_ctx_queue {
> +	/*
> +	 * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
> +	 * Also protect the rqs on rq_list when merge.
> +	 */
> +	spinlock_t lock;
> +	struct list_head rq_list[KYBER_NUM_DOMAINS];
> +} ____cacheline_aligned_in_smp;
> +
>  struct kyber_queue_data {
>  	struct request_queue *q;
>  
> @@ -99,6 +112,8 @@ struct kyber_hctx_data {
>  	struct list_head rqs[KYBER_NUM_DOMAINS];
>  	unsigned int cur_domain;
>  	unsigned int batching;
> +	struct kyber_ctx_queue *kcqs;
> +	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
>  	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
>  	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
>  	atomic_t wait_index[KYBER_NUM_DOMAINS];
> @@ -107,10 +122,8 @@ struct kyber_hctx_data {
>  static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
>  			     void *key);
>  
> -static int rq_sched_domain(const struct request *rq)
> +static unsigned int kyber_sched_domain(unsigned int op)
>  {
> -	unsigned int op = rq->cmd_flags;
> -
>  	if ((op & REQ_OP_MASK) == REQ_OP_READ)
>  		return KYBER_READ;
>  	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
> @@ -284,6 +297,11 @@ static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
>  	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
>  }
>  
> +static int kyber_bucket_fn(const struct request *rq)
> +{
> +	return kyber_sched_domain(rq->cmd_flags);
> +}
> +
>  static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
>  {
>  	struct kyber_queue_data *kqd;
> @@ -297,7 +315,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
>  		goto err;
>  	kqd->q = q;
>  
> -	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
> +	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
>  					  KYBER_NUM_DOMAINS, kqd);
>  	if (!kqd->cb)
>  		goto err_kqd;
> @@ -376,6 +394,15 @@ static void kyber_exit_sched(struct elevator_queue *e)
>  	kfree(kqd);
>  }
>  
> +static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
> +{
> +	unsigned int i;
> +
> +	spin_lock_init(&kcq->lock);
> +	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
> +		INIT_LIST_HEAD(&kcq->rq_list[i]);
> +}
> +
>  static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
>  {
>  	struct kyber_hctx_data *khd;
> @@ -385,6 +412,24 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
>  	if (!khd)
>  		return -ENOMEM;
>  
> +	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
> +				       sizeof(struct kyber_ctx_queue),
> +				       GFP_KERNEL, hctx->numa_node);
> +	if (!khd->kcqs)
> +		goto err_khd;
> +
> +	for (i = 0; i < hctx->nr_ctx; i++)
> +		kyber_ctx_queue_init(&khd->kcqs[i]);
> +
> +	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
> +		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
> +				      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
> +			while (--i >= 0)
> +				sbitmap_free(&khd->kcq_map[i]);
> +			goto err_kcqs;
> +		}
> +	}
> +
>  	spin_lock_init(&khd->lock);
>  
>  	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
> @@ -402,10 +447,22 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
>  	hctx->sched_data = khd;
>  
>  	return 0;
> +
> +err_kcqs:
> +	kfree(khd->kcqs);
> +err_khd:
> +	kfree(khd);
> +	return -ENOMEM;
>  }
>  
>  static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
>  {
> +	struct kyber_hctx_data *khd = hctx->sched_data;
> +	int i;
> +
> +	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
> +		sbitmap_free(&khd->kcq_map[i]);
> +	kfree(khd->kcqs);
>  	kfree(hctx->sched_data);
>  }
>  
> @@ -427,7 +484,7 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
>  
>  	nr = rq_get_domain_token(rq);
>  	if (nr != -1) {
> -		sched_domain = rq_sched_domain(rq);
> +		sched_domain = kyber_sched_domain(rq->cmd_flags);
>  		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
>  				    rq->mq_ctx->cpu);
>  	}
> @@ -446,11 +503,51 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
>  	}
>  }
>  
> +static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
> +{
> +	struct kyber_hctx_data *khd = hctx->sched_data;
> +	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
> +	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
> +	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
> +	struct list_head *rq_list = &kcq->rq_list[sched_domain];
> +	bool merged;
> +
> +	spin_lock(&kcq->lock);
> +	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
> +	spin_unlock(&kcq->lock);
> +	blk_mq_put_ctx(ctx);
> +
> +	return merged;
> +}
> +
>  static void kyber_prepare_request(struct request *rq, struct bio *bio)
>  {
>  	rq_set_domain_token(rq, -1);
>  }
>  
> +static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
> +				  struct list_head *rq_list, bool at_head)
> +{
> +	struct kyber_hctx_data *khd = hctx->sched_data;
> +	struct request *rq, *next;
> +
> +	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
> +		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
> +		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
> +		struct list_head *head = &kcq->rq_list[sched_domain];
> +
> +		spin_lock(&kcq->lock);
> +		if (at_head)
> +			list_move(&rq->queuelist, head);
> +		else
> +			list_move_tail(&rq->queuelist, head);
> +		sbitmap_set_bit(&khd->kcq_map[sched_domain],
> +				rq->mq_ctx->index_hw);
> +		blk_mq_sched_request_inserted(rq);
> +		spin_unlock(&kcq->lock);
> +	}
> +}
> +
>  static void kyber_finish_request(struct request *rq)
>  {
>  	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
> @@ -469,7 +566,7 @@ static void kyber_completed_request(struct request *rq)
>  	 * Check if this request met our latency goal. If not, quickly gather
>  	 * some statistics and start throttling.
>  	 */
> -	sched_domain = rq_sched_domain(rq);
> +	sched_domain = kyber_sched_domain(rq->cmd_flags);
>  	switch (sched_domain) {
>  	case KYBER_READ:
>  		target = kqd->read_lat_nsec;
> @@ -495,19 +592,38 @@ static void kyber_completed_request(struct request *rq)
>  		blk_stat_activate_msecs(kqd->cb, 10);
>  }
>  
> -static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
> -				  struct blk_mq_hw_ctx *hctx)
> +struct flush_kcq_data {
> +	struct kyber_hctx_data *khd;
> +	unsigned int sched_domain;
> +	struct list_head *list;
> +};
> +
> +static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
>  {
> -	LIST_HEAD(rq_list);
> -	struct request *rq, *next;
> +	struct flush_kcq_data *flush_data = data;
> +	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
>  
> -	blk_mq_flush_busy_ctxs(hctx, &rq_list);
> -	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
> -		unsigned int sched_domain;
> +	spin_lock(&kcq->lock);
> +	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
> +			      flush_data->list);
> +	sbitmap_clear_bit(sb, bitnr);
> +	spin_unlock(&kcq->lock);
>  
> -		sched_domain = rq_sched_domain(rq);
> -		list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
> -	}
> +	return true;
> +}
> +
> +static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
> +				  unsigned int sched_domain,
> +				  struct list_head *list)
> +{
> +	struct flush_kcq_data data = {
> +		.khd = khd,
> +		.sched_domain = sched_domain,
> +		.list = list,
> +	};
> +
> +	sbitmap_for_each_set(&khd->kcq_map[sched_domain],
> +			     flush_busy_kcq, &data);
>  }
>  
>  static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
> @@ -570,26 +686,23 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
>  static struct request *
>  kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
>  			  struct kyber_hctx_data *khd,
> -			  struct blk_mq_hw_ctx *hctx,
> -			  bool *flushed)
> +			  struct blk_mq_hw_ctx *hctx)
>  {
>  	struct list_head *rqs;
>  	struct request *rq;
>  	int nr;
>  
>  	rqs = &khd->rqs[khd->cur_domain];
> -	rq = list_first_entry_or_null(rqs, struct request, queuelist);
>  
>  	/*
> -	 * If there wasn't already a pending request and we haven't flushed the
> -	 * software queues yet, flush the software queues and check again.
> +	 * If we already have a flushed request, then we just need to get a
> +	 * token for it. Otherwise, if there are pending requests in the kcqs,
> +	 * flush the kcqs, but only if we can get a token. If not, we should
> +	 * leave the requests in the kcqs so that they can be merged. Note that
> +	 * khd->lock serializes the flushes, so if we observed any bit set in
> +	 * the kcq_map, we will always get a request.
>  	 */
> -	if (!rq && !*flushed) {
> -		kyber_flush_busy_ctxs(khd, hctx);
> -		*flushed = true;
> -		rq = list_first_entry_or_null(rqs, struct request, queuelist);
> -	}
> -
> +	rq = list_first_entry_or_null(rqs, struct request, queuelist);
>  	if (rq) {
>  		nr = kyber_get_domain_token(kqd, khd, hctx);
>  		if (nr >= 0) {
> @@ -598,6 +711,16 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
>  			list_del_init(&rq->queuelist);
>  			return rq;
>  		}
> +	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
> +		nr = kyber_get_domain_token(kqd, khd, hctx);
> +		if (nr >= 0) {
> +			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
> +			rq = list_first_entry(rqs, struct request, queuelist);
> +			khd->batching++;
> +			rq_set_domain_token(rq, nr);
> +			list_del_init(&rq->queuelist);
> +			return rq;
> +		}
>  	}
>  
>  	/* There were either no pending requests or no tokens. */
> @@ -608,7 +731,6 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
>  {
>  	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
>  	struct kyber_hctx_data *khd = hctx->sched_data;
> -	bool flushed = false;
>  	struct request *rq;
>  	int i;
>  
> @@ -619,7 +741,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
>  	 * from the batch.
>  	 */
>  	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
> -		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
> +		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
>  		if (rq)
>  			goto out;
>  	}
> @@ -640,7 +762,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
>  		else
>  			khd->cur_domain++;
>  
> -		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
> +		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
>  		if (rq)
>  			goto out;
>  	}
> @@ -657,10 +779,12 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
>  	int i;
>  
>  	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
> -		if (!list_empty_careful(&khd->rqs[i]))
> +		if (!list_empty_careful(&khd->rqs[i]) ||
> +		    sbitmap_any_bit_set(&khd->kcq_map[i]))
>  			return true;
>  	}
> -	return sbitmap_any_bit_set(&hctx->ctx_map);
> +
> +	return false;
>  }
>  
>  #define KYBER_LAT_SHOW_STORE(op)					\
> @@ -831,7 +955,9 @@ static struct elevator_type kyber_sched = {
>  		.init_hctx = kyber_init_hctx,
>  		.exit_hctx = kyber_exit_hctx,
>  		.limit_depth = kyber_limit_depth,
> +		.bio_merge = kyber_bio_merge,
>  		.prepare_request = kyber_prepare_request,
> +		.insert_requests = kyber_insert_requests,
>  		.finish_request = kyber_finish_request,
>  		.requeue_request = kyber_finish_request,
>  		.completed_request = kyber_completed_request,
> -- 
> 2.7.4
> 

WARNING: multiple messages have this Message-ID (diff)
From: Omar Sandoval <osandov@osandov.com>
To: Jianchao Wang <jianchao.w.wang@oracle.com>
Cc: axboe@kernel.dk, linux-block@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH V3 2/2] block: kyber: make kyber more friendly with merging
Date: Wed, 30 May 2018 09:40:55 -0700	[thread overview]
Message-ID: <20180530164055.GA25342@vader> (raw)
In-Reply-To: <1527665168-1965-3-git-send-email-jianchao.w.wang@oracle.com>

On Wed, May 30, 2018 at 03:26:08PM +0800, Jianchao Wang wrote:
> Currently, kyber is very unfriendly with merging. kyber depends
> on ctx rq_list to do merging, however, most of time, it will not
> leave any requests in ctx rq_list. This is because even if tokens
> of one domain is used up, kyber will try to dispatch requests
> from other domain and flush the rq_list there.
> 
> To improve this, we setup kyber_ctx_queue (kcq) which is similar
> with ctx, but it has rq_lists for different domain and build same
> mapping between kcq and khd as the ctx & hctx. Then we could merge,
> insert and dispatch for different domains separately. At the same
> time, only flush the rq_list of kcq when get domain token successfully.
> Then if one domain token is used up, the requests could be left in
> the rq_list of that domain and maybe merged with following io.
> 
> Following is my test result on machine with 8 cores and NVMe card
> INTEL SSDPEKKR128G7
> 
> fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8
> seq/random
> +------+---------------------------------------------------------------+
> |patch?| bw(MB/s) |   iops    | slat(usec) |    clat(usec)   |  merge  |
> +----------------------------------------------------------------------+
> | w/o  |  606/612 | 151k/153k |  6.89/7.03 | 3349.21/3305.40 |   0/0   |
> +----------------------------------------------------------------------+
> | w/   | 1083/616 | 277k/154k |  4.93/6.95 | 1830.62/3279.95 | 223k/3k |
> +----------------------------------------------------------------------+
> When set numjobs to 16, the bw and iops could reach 1662MB/s and 425k
> on my platform.

Looks good, and it survived blktests plus a few other stress tests.
Thanks!

Reviewed-by: Omar Sandoval <osandov@fb.com>

> Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
> Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
> ---
>  block/kyber-iosched.c | 190 +++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 158 insertions(+), 32 deletions(-)
> 
> diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
> index 0d6d25e3..a1613655 100644
> --- a/block/kyber-iosched.c
> +++ b/block/kyber-iosched.c
> @@ -72,6 +72,19 @@ static const unsigned int kyber_batch_size[] = {
>  	[KYBER_OTHER] = 8,
>  };
>  
> +/*
> + * There is a same mapping between ctx & hctx and kcq & khd,
> + * we use request->mq_ctx->index_hw to index the kcq in khd.
> + */
> +struct kyber_ctx_queue {
> +	/*
> +	 * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
> +	 * Also protect the rqs on rq_list when merge.
> +	 */
> +	spinlock_t lock;
> +	struct list_head rq_list[KYBER_NUM_DOMAINS];
> +} ____cacheline_aligned_in_smp;
> +
>  struct kyber_queue_data {
>  	struct request_queue *q;
>  
> @@ -99,6 +112,8 @@ struct kyber_hctx_data {
>  	struct list_head rqs[KYBER_NUM_DOMAINS];
>  	unsigned int cur_domain;
>  	unsigned int batching;
> +	struct kyber_ctx_queue *kcqs;
> +	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
>  	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
>  	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
>  	atomic_t wait_index[KYBER_NUM_DOMAINS];
> @@ -107,10 +122,8 @@ struct kyber_hctx_data {
>  static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
>  			     void *key);
>  
> -static int rq_sched_domain(const struct request *rq)
> +static unsigned int kyber_sched_domain(unsigned int op)
>  {
> -	unsigned int op = rq->cmd_flags;
> -
>  	if ((op & REQ_OP_MASK) == REQ_OP_READ)
>  		return KYBER_READ;
>  	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
> @@ -284,6 +297,11 @@ static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
>  	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
>  }
>  
> +static int kyber_bucket_fn(const struct request *rq)
> +{
> +	return kyber_sched_domain(rq->cmd_flags);
> +}
> +
>  static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
>  {
>  	struct kyber_queue_data *kqd;
> @@ -297,7 +315,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
>  		goto err;
>  	kqd->q = q;
>  
> -	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
> +	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
>  					  KYBER_NUM_DOMAINS, kqd);
>  	if (!kqd->cb)
>  		goto err_kqd;
> @@ -376,6 +394,15 @@ static void kyber_exit_sched(struct elevator_queue *e)
>  	kfree(kqd);
>  }
>  
> +static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
> +{
> +	unsigned int i;
> +
> +	spin_lock_init(&kcq->lock);
> +	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
> +		INIT_LIST_HEAD(&kcq->rq_list[i]);
> +}
> +
>  static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
>  {
>  	struct kyber_hctx_data *khd;
> @@ -385,6 +412,24 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
>  	if (!khd)
>  		return -ENOMEM;
>  
> +	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
> +				       sizeof(struct kyber_ctx_queue),
> +				       GFP_KERNEL, hctx->numa_node);
> +	if (!khd->kcqs)
> +		goto err_khd;
> +
> +	for (i = 0; i < hctx->nr_ctx; i++)
> +		kyber_ctx_queue_init(&khd->kcqs[i]);
> +
> +	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
> +		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
> +				      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
> +			while (--i >= 0)
> +				sbitmap_free(&khd->kcq_map[i]);
> +			goto err_kcqs;
> +		}
> +	}
> +
>  	spin_lock_init(&khd->lock);
>  
>  	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
> @@ -402,10 +447,22 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
>  	hctx->sched_data = khd;
>  
>  	return 0;
> +
> +err_kcqs:
> +	kfree(khd->kcqs);
> +err_khd:
> +	kfree(khd);
> +	return -ENOMEM;
>  }
>  
>  static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
>  {
> +	struct kyber_hctx_data *khd = hctx->sched_data;
> +	int i;
> +
> +	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
> +		sbitmap_free(&khd->kcq_map[i]);
> +	kfree(khd->kcqs);
>  	kfree(hctx->sched_data);
>  }
>  
> @@ -427,7 +484,7 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
>  
>  	nr = rq_get_domain_token(rq);
>  	if (nr != -1) {
> -		sched_domain = rq_sched_domain(rq);
> +		sched_domain = kyber_sched_domain(rq->cmd_flags);
>  		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
>  				    rq->mq_ctx->cpu);
>  	}
> @@ -446,11 +503,51 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
>  	}
>  }
>  
> +static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
> +{
> +	struct kyber_hctx_data *khd = hctx->sched_data;
> +	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
> +	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
> +	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
> +	struct list_head *rq_list = &kcq->rq_list[sched_domain];
> +	bool merged;
> +
> +	spin_lock(&kcq->lock);
> +	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
> +	spin_unlock(&kcq->lock);
> +	blk_mq_put_ctx(ctx);
> +
> +	return merged;
> +}
> +
>  static void kyber_prepare_request(struct request *rq, struct bio *bio)
>  {
>  	rq_set_domain_token(rq, -1);
>  }
>  
> +static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
> +				  struct list_head *rq_list, bool at_head)
> +{
> +	struct kyber_hctx_data *khd = hctx->sched_data;
> +	struct request *rq, *next;
> +
> +	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
> +		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
> +		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
> +		struct list_head *head = &kcq->rq_list[sched_domain];
> +
> +		spin_lock(&kcq->lock);
> +		if (at_head)
> +			list_move(&rq->queuelist, head);
> +		else
> +			list_move_tail(&rq->queuelist, head);
> +		sbitmap_set_bit(&khd->kcq_map[sched_domain],
> +				rq->mq_ctx->index_hw);
> +		blk_mq_sched_request_inserted(rq);
> +		spin_unlock(&kcq->lock);
> +	}
> +}
> +
>  static void kyber_finish_request(struct request *rq)
>  {
>  	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
> @@ -469,7 +566,7 @@ static void kyber_completed_request(struct request *rq)
>  	 * Check if this request met our latency goal. If not, quickly gather
>  	 * some statistics and start throttling.
>  	 */
> -	sched_domain = rq_sched_domain(rq);
> +	sched_domain = kyber_sched_domain(rq->cmd_flags);
>  	switch (sched_domain) {
>  	case KYBER_READ:
>  		target = kqd->read_lat_nsec;
> @@ -495,19 +592,38 @@ static void kyber_completed_request(struct request *rq)
>  		blk_stat_activate_msecs(kqd->cb, 10);
>  }
>  
> -static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
> -				  struct blk_mq_hw_ctx *hctx)
> +struct flush_kcq_data {
> +	struct kyber_hctx_data *khd;
> +	unsigned int sched_domain;
> +	struct list_head *list;
> +};
> +
> +static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
>  {
> -	LIST_HEAD(rq_list);
> -	struct request *rq, *next;
> +	struct flush_kcq_data *flush_data = data;
> +	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
>  
> -	blk_mq_flush_busy_ctxs(hctx, &rq_list);
> -	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
> -		unsigned int sched_domain;
> +	spin_lock(&kcq->lock);
> +	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
> +			      flush_data->list);
> +	sbitmap_clear_bit(sb, bitnr);
> +	spin_unlock(&kcq->lock);
>  
> -		sched_domain = rq_sched_domain(rq);
> -		list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
> -	}
> +	return true;
> +}
> +
> +static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
> +				  unsigned int sched_domain,
> +				  struct list_head *list)
> +{
> +	struct flush_kcq_data data = {
> +		.khd = khd,
> +		.sched_domain = sched_domain,
> +		.list = list,
> +	};
> +
> +	sbitmap_for_each_set(&khd->kcq_map[sched_domain],
> +			     flush_busy_kcq, &data);
>  }
>  
>  static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
> @@ -570,26 +686,23 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
>  static struct request *
>  kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
>  			  struct kyber_hctx_data *khd,
> -			  struct blk_mq_hw_ctx *hctx,
> -			  bool *flushed)
> +			  struct blk_mq_hw_ctx *hctx)
>  {
>  	struct list_head *rqs;
>  	struct request *rq;
>  	int nr;
>  
>  	rqs = &khd->rqs[khd->cur_domain];
> -	rq = list_first_entry_or_null(rqs, struct request, queuelist);
>  
>  	/*
> -	 * If there wasn't already a pending request and we haven't flushed the
> -	 * software queues yet, flush the software queues and check again.
> +	 * If we already have a flushed request, then we just need to get a
> +	 * token for it. Otherwise, if there are pending requests in the kcqs,
> +	 * flush the kcqs, but only if we can get a token. If not, we should
> +	 * leave the requests in the kcqs so that they can be merged. Note that
> +	 * khd->lock serializes the flushes, so if we observed any bit set in
> +	 * the kcq_map, we will always get a request.
>  	 */
> -	if (!rq && !*flushed) {
> -		kyber_flush_busy_ctxs(khd, hctx);
> -		*flushed = true;
> -		rq = list_first_entry_or_null(rqs, struct request, queuelist);
> -	}
> -
> +	rq = list_first_entry_or_null(rqs, struct request, queuelist);
>  	if (rq) {
>  		nr = kyber_get_domain_token(kqd, khd, hctx);
>  		if (nr >= 0) {
> @@ -598,6 +711,16 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
>  			list_del_init(&rq->queuelist);
>  			return rq;
>  		}
> +	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
> +		nr = kyber_get_domain_token(kqd, khd, hctx);
> +		if (nr >= 0) {
> +			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
> +			rq = list_first_entry(rqs, struct request, queuelist);
> +			khd->batching++;
> +			rq_set_domain_token(rq, nr);
> +			list_del_init(&rq->queuelist);
> +			return rq;
> +		}
>  	}
>  
>  	/* There were either no pending requests or no tokens. */
> @@ -608,7 +731,6 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
>  {
>  	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
>  	struct kyber_hctx_data *khd = hctx->sched_data;
> -	bool flushed = false;
>  	struct request *rq;
>  	int i;
>  
> @@ -619,7 +741,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
>  	 * from the batch.
>  	 */
>  	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
> -		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
> +		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
>  		if (rq)
>  			goto out;
>  	}
> @@ -640,7 +762,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
>  		else
>  			khd->cur_domain++;
>  
> -		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
> +		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
>  		if (rq)
>  			goto out;
>  	}
> @@ -657,10 +779,12 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
>  	int i;
>  
>  	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
> -		if (!list_empty_careful(&khd->rqs[i]))
> +		if (!list_empty_careful(&khd->rqs[i]) ||
> +		    sbitmap_any_bit_set(&khd->kcq_map[i]))
>  			return true;
>  	}
> -	return sbitmap_any_bit_set(&hctx->ctx_map);
> +
> +	return false;
>  }
>  
>  #define KYBER_LAT_SHOW_STORE(op)					\
> @@ -831,7 +955,9 @@ static struct elevator_type kyber_sched = {
>  		.init_hctx = kyber_init_hctx,
>  		.exit_hctx = kyber_exit_hctx,
>  		.limit_depth = kyber_limit_depth,
> +		.bio_merge = kyber_bio_merge,
>  		.prepare_request = kyber_prepare_request,
> +		.insert_requests = kyber_insert_requests,
>  		.finish_request = kyber_finish_request,
>  		.requeue_request = kyber_finish_request,
>  		.completed_request = kyber_completed_request,
> -- 
> 2.7.4
> 

  reply	other threads:[~2018-05-30 16:40 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-05-30  7:26 [PATCH V3 0/2] block: kyber: make kyber more friendly with merging Jianchao Wang
2018-05-30  7:26 ` [PATCH V3 1/2] blk-mq: abstract out blk-mq-sched rq list iteration bio merge helper Jianchao Wang
2018-05-30  7:26 ` [PATCH V3 2/2] block: kyber: make kyber more friendly with merging Jianchao Wang
2018-05-30 16:40   ` Omar Sandoval [this message]
2018-05-30 16:40     ` Omar Sandoval
2018-05-30 16:48 ` [PATCH V3 0/2] " Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180530164055.GA25342@vader \
    --to=osandov@osandov.com \
    --cc=axboe@kernel.dk \
    --cc=jianchao.w.wang@oracle.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.