From: NeilBrown <neilb@suse.de>
To: Shaohua Li <shli@kernel.org>
Cc: linux-raid@vger.kernel.org, axboe@kernel.dk,
	dan.j.williams@intel.com, shli@fusionio.com
Subject: Re: [patch 10/10 v3] raid5: create multiple threads to handle stripes
Date: Mon, 2 Jul 2012 12:39:57 +1000
Message-ID: <20120702123957.474578c7@notabene.brown>
In-Reply-To: <20120625072726.364743203@kernel.org>

On Mon, 25 Jun 2012 15:24:57 +0800 Shaohua Li <shli@kernel.org> wrote:

> Like raid1/10, raid5 uses a single thread to handle stripes. On fast storage, that
> thread becomes a bottleneck. raid5 can offload calculations like checksumming to
> async threads, but if the storage is fast, scheduling and running the async work
> introduces heavy workqueue lock contention, which makes that optimization useless.
> And calculation isn't the only bottleneck: in my test, the raid5 thread must handle
> more than 450k requests per second, and just doing dispatch and completion is more
> than one thread can keep up with. The only way to scale is to use several threads
> to handle stripes.
> 
> With this patch, the user can create several extra threads to handle stripes. The
> best thread count depends on the number of disks, so the count can be changed from
> userspace. By default the thread number is 0, which means no extra threads.
> 
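
For anyone wanting to try it: the knob would live in the standard md sysfs
tree, e.g. /sys/block/md0/md/auxthread_number (md0 being a placeholder for
the actual array device). A minimal usage sketch, not part of the patch:

/* Sketch: request two auxiliary stripe-handling threads through the
 * new sysfs attribute. The md0 name is a placeholder. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/block/md0/md/auxthread_number", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "2", 1) != 1)	/* ask for 2 extra threads */
		perror("write");
	close(fd);
	return 0;
}
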
> In a 3-disk raid5 setup, 2 extra threads give a 130% throughput improvement (with
> stripe_cache_size doubled), and the throughput is quite close to the theoretical
> value. With >= 4 disks the improvement is even bigger, for example 200% for a
> 4-disk setup, but the throughput is far below the theoretical value. The gap is
> caused by several factors, such as request queue lock contention, cache effects,
> and the latency introduced by how a stripe is handled across the different disks.
> Those factors need further investigation.
> 
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
>  drivers/md/raid5.c |  137 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/md/raid5.h |    3 +
>  2 files changed, 139 insertions(+), 1 deletion(-)
> 
> Index: linux/drivers/md/raid5.c
> ===================================================================
> --- linux.orig/drivers/md/raid5.c	2012-06-25 14:58:06.420138526 +0800
> +++ linux/drivers/md/raid5.c	2012-06-25 14:58:06.428138426 +0800
> @@ -211,6 +211,7 @@ static void handle_release_stripe(struct
>  			clear_bit(STRIPE_DELAYED, &sh->state);
>  			clear_bit(STRIPE_BIT_DELAY, &sh->state);
>  			list_add_tail(&sh->lru, &conf->handle_list);
> +			conf->pending_stripes++;
>  		}
>  		md_wakeup_thread(conf->mddev->thread);
>  	} else {
> @@ -489,6 +490,10 @@ get_active_stripe(struct r5conf *conf, s
>  			} else {
>  				if (!test_bit(STRIPE_HANDLE, &sh->state))
>  					atomic_inc(&conf->active_stripes);
> +				else if (!list_empty(&sh->lru)
> +					 && !test_bit(STRIPE_DELAYED, &sh->state)
> +					 && !test_bit(STRIPE_BIT_DELAY, &sh->state))
> +					conf->pending_stripes--;
>  				if (list_empty(&sh->lru) &&
>  				    !test_bit(STRIPE_EXPANDING, &sh->state))
>  					BUG();
> @@ -3670,6 +3675,7 @@ static void raid5_activate_delayed(struc
>  			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
>  				atomic_inc(&conf->preread_active_stripes);
>  			list_add_tail(&sh->lru, &conf->hold_list);
> +			conf->pending_stripes++;
>  		}
>  	}
>  }
> @@ -3979,6 +3985,7 @@ static struct stripe_head *__get_priorit
>  	} else
>  		return NULL;
>  
> +	conf->pending_stripes--;
>  	list_del_init(&sh->lru);
>  	atomic_inc(&sh->count);
>  	BUG_ON(atomic_read(&sh->count) != 1);
> @@ -4593,6 +4600,33 @@ static int handle_active_stripes(struct
>  	return batch_size;
>  }
>  
> +static void raid5auxd(struct mddev *mddev)
> +{
> +	struct r5conf *conf = mddev->private;
> +	struct blk_plug plug;
> +	int handled;
> +
> +	pr_debug("+++ raid5auxd active\n");
> +
> +	blk_start_plug(&plug);
> +	handled = 0;
> +	spin_lock_irq(&conf->device_lock);
> +	while (1) {
> +		int batch_size;
> +
> +		batch_size = handle_active_stripes(conf);
> +		if (!batch_size)
> +			break;
> +		handled += batch_size;
> +	}
> +	pr_debug("%d stripes handled\n", handled);
> +
> +	spin_unlock_irq(&conf->device_lock);
> +	blk_finish_plug(&plug);
> +
> +	pr_debug("--- raid5auxd inactive\n");
> +}
> +
>  /*
>   * This is our raid5 kernel thread.
>   *
> @@ -4615,7 +4649,7 @@ static void raid5d(struct mddev *mddev)
>  	spin_lock_irq(&conf->device_lock);
>  	while (1) {
>  		struct bio *bio;
> -		int batch_size;
> +		int batch_size, i;
>  
>  		if (atomic_read(&mddev->plug_cnt) == 0 &&
>  		    !list_empty(&conf->bitmap_list)) {
> @@ -4645,6 +4679,10 @@ static void raid5d(struct mddev *mddev)
>  			break;
>  		handled += batch_size;
>  
> +		for (i = 0; i < conf->aux_thread_num
> +		     && i < conf->pending_stripes/MAX_STRIPE_BATCH + 1; i++)
> +			md_wakeup_thread(conf->aux_threads[i]);
> +
>  		if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
>  			spin_unlock_irq(&conf->device_lock);
>  			md_check_recovery(mddev);
> @@ -4769,10 +4807,85 @@ stripe_cache_active_show(struct mddev *m
>  static struct md_sysfs_entry
>  raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
>  
> +static ssize_t
> +raid5_show_auxthread_number(struct mddev *mddev, char *page)
> +{
> +	struct r5conf *conf = mddev->private;
> +	if (conf)
> +		return sprintf(page, "%d\n", conf->aux_thread_num);
> +	else
> +		return 0;
> +}
> +
> +static ssize_t
> +raid5_store_auxthread_number(struct mddev *mddev, const char *page, size_t len)
> +{
> +	struct r5conf *conf = mddev->private;
> +	unsigned long new;
> +	int i;
> +	struct md_thread **threads;
> +
> +	if (len >= PAGE_SIZE)
> +		return -EINVAL;
> +	if (!conf)
> +		return -ENODEV;
> +
> +	if (strict_strtoul(page, 10, &new))
> +		return -EINVAL;
> +
> +	if (new == conf->aux_thread_num)
> +		return len;
> +
> +	if (new > conf->aux_thread_num) {
> +		threads = kmalloc(sizeof(struct md_thread *) * new, GFP_KERNEL);
> +		if (!threads)
> +			return -EFAULT;
> +
> +		i = conf->aux_thread_num;
> +		while (i < new) {
> +			char name[10];
> +
> +			sprintf(name, "aux%d", i);
> +			threads[i] = md_register_thread(raid5auxd, mddev, name);
> +			if (!threads[i])
> +				goto error;
> +			i++;
> +		}
> +		memcpy(threads, conf->aux_threads,
> +			sizeof(struct md_thread *) * conf->aux_thread_num);
> +		spin_lock_irq(&conf->device_lock);
> +		kfree(conf->aux_threads);
> +		conf->aux_threads = threads;
> +		conf->aux_thread_num = new;
> +		spin_unlock_irq(&conf->device_lock);
> +	} else {
> +		int old = conf->aux_thread_num;
> +
> +		spin_lock_irq(&conf->device_lock);
> +		conf->aux_thread_num = new;
> +		spin_unlock_irq(&conf->device_lock);
> +		for (i = new; i < old; i++)
> +			md_unregister_thread(&conf->aux_threads[i]);
> +	}
> +
> +	return len;
> +error:
> +	while (--i >= conf->aux_thread_num)
> +		md_unregister_thread(&threads[i]);
> +	kfree(threads);
> +	return -EFAULT;
> +}
> +
> +static struct md_sysfs_entry
> +raid5_auxthread_number = __ATTR(auxthread_number, S_IRUGO|S_IWUSR,
> +				raid5_show_auxthread_number,
> +				raid5_store_auxthread_number);
> +
>  static struct attribute *raid5_attrs[] =  {
>  	&raid5_stripecache_size.attr,
>  	&raid5_stripecache_active.attr,
>  	&raid5_preread_bypass_threshold.attr,
> +	&raid5_auxthread_number.attr,
>  	NULL,
>  };
>  static struct attribute_group raid5_attrs_group = {
> @@ -4820,6 +4933,7 @@ static void raid5_free_percpu(struct r5c
>  
>  static void free_conf(struct r5conf *conf)
>  {
> +	kfree(conf->aux_threads);
>  	shrink_stripes(conf);
>  	raid5_free_percpu(conf);
>  	kfree(conf->disks);
> @@ -4914,6 +5028,7 @@ static struct r5conf *setup_conf(struct
>  	int raid_disk, memory, max_disks;
>  	struct md_rdev *rdev;
>  	struct disk_info *disk;
> +	int i;
>  
>  	if (mddev->new_level != 5
>  	    && mddev->new_level != 4
> @@ -5037,6 +5152,22 @@ static struct r5conf *setup_conf(struct
>  		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
>  		       mdname(mddev), memory);
>  
> +	/* By default, auxthread number equals to disk number */
> +	conf->aux_threads = kmalloc(sizeof(struct md_thread *) * max_disks,
> +				    GFP_KERNEL);
> +	if (!conf->aux_threads)
> +		goto abort;
> +	for (i = 0; i < max_disks; i++) {
> +		char name[10];
> +
> +		sprintf(name, "aux%d", i);
> +		conf->aux_threads[i] = md_register_thread(raid5auxd, mddev, name);
> +		if (!conf->aux_threads[i])
> +			break;
> +	}
> +
> +	conf->aux_thread_num = i;
> +
>  	conf->thread = md_register_thread(raid5d, mddev, NULL);
>  	if (!conf->thread) {
>  		printk(KERN_ERR
> @@ -5376,6 +5507,10 @@ abort:
>  static int stop(struct mddev *mddev)
>  {
>  	struct r5conf *conf = mddev->private;
> +	int i;
> +
> +	for (i = 0; i < conf->aux_thread_num; i++)
> +		md_unregister_thread(&conf->aux_threads[i]);
>  
>  	md_unregister_thread(&mddev->thread);
>  	if (mddev->queue)
> Index: linux/drivers/md/raid5.h
> ===================================================================
> --- linux.orig/drivers/md/raid5.h	2012-06-25 14:58:06.408138677 +0800
> +++ linux/drivers/md/raid5.h	2012-06-25 14:58:06.432138376 +0800
> @@ -450,6 +450,7 @@ struct r5conf {
>  	int			inactive_blocked;	/* release of inactive stripes blocked,
>  							 * waiting for 25% to be free
>  							 */
> +	int			pending_stripes;
>  	int			pool_size; /* number of disks in stripeheads in pool */
>  	spinlock_t		device_lock;
>  	struct disk_info	*disks;
> @@ -458,6 +459,8 @@ struct r5conf {
>  	 * the new thread here until we fully activate the array.
>  	 */
>  	struct md_thread	*thread;
> +	int			aux_thread_num;
> +	struct md_thread	**aux_threads;
>  };
>  
>  /*


Hi,
 I'm certainly interested in this patch, but I'm not going to apply it yet,
 partly because I want all the other bits to settle and be well tested first.
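
 As I read it, the wake-up loop in raid5d wakes roughly one auxiliary thread
 per MAX_STRIPE_BATCH pending stripes, plus one, capped at the number of
 registered threads. A standalone sketch of that arithmetic, assuming
 MAX_STRIPE_BATCH is 8 (the macro itself is defined elsewhere in this series):

#define MAX_STRIPE_BATCH 8	/* assumed value; defined elsewhere in the series */

static int aux_threads_to_wake(int pending_stripes, int aux_thread_num)
{
	int wanted = pending_stripes / MAX_STRIPE_BATCH + 1;

	/* Never wake more threads than are registered. */
	return wanted < aux_thread_num ? wanted : aux_thread_num;
}

/* e.g. 20 pending stripes -> 20/8 + 1 = 3 threads woken (if registered);
 * even 0 pending stripes computes 1, matching the "+ 1" in the loop bound. */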

 I'm still uncomfortable about setting an explicit number of threads...

 I wonder if a different approach might be useful, i.e. add an ioctl (or
 similar) which allows a normal user thread to start handling raid5 requests.
 Then instead of telling the kernel how many threads to start, we just start
 the right number of processes, bind them to CPUs or whatever might be
 wanted, then call the ioctl.
 Possibly the ioctl would return whenever it runs out of work to do, and this
 could be used somehow to dynamically adjust the number of threads.
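
 To make that concrete, a purely hypothetical sketch of the userspace side
 (the MD_HANDLE_STRIPES request code and the device name are invented for
 illustration; no such ioctl exists):

#include <fcntl.h>
#include <linux/ioctl.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Hypothetical ioctl: the calling thread donates itself to stripe
 * handling, returning once no more work is queued. */
#define MD_HANDLE_STRIPES	_IO(0x9, 0x40)	/* invented request code */

int main(void)
{
	int fd = open("/dev/md0", O_RDWR);	/* placeholder device */

	if (fd < 0)
		return 1;
	/* Each return is a chance for userspace to decide whether to keep
	 * this worker running, sleep, or exit - the dynamic adjustment
	 * mentioned above. */
	while (ioctl(fd, MD_HANDLE_STRIPES) == 0)
		;
	close(fd);
	return 0;
}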

 I haven't really thought this through fully yet, so it might not work, but
 I'd like to explore the possibility of having the number of threads
 adjusted automatically, and that probably means allowing user-space a fair
 bit of control and providing it with a fair bit of information.

Thanks,
NeilBrown

Thread overview: 32+ messages
2012-06-25  7:24 [patch 00/10 v3] raid5: improve write performance for fast storage Shaohua Li
2012-06-25  7:24 ` [patch 01/10 v3] raid5: use wake_up_all for overlap waking Shaohua Li
2012-06-28  7:26   ` NeilBrown
2012-06-28  8:53     ` Shaohua Li
2012-06-25  7:24 ` [patch 02/10 v3] raid5: delayed stripe fix Shaohua Li
2012-07-02  0:46   ` NeilBrown
2012-07-02  0:49     ` Shaohua Li
2012-07-02  0:55       ` NeilBrown
2012-06-25  7:24 ` [patch 03/10 v3] raid5: add a per-stripe lock Shaohua Li
2012-07-02  0:50   ` NeilBrown
2012-07-02  3:16     ` Shaohua Li
2012-07-02  7:39       ` NeilBrown
2012-07-03  1:27         ` Shaohua Li
2012-07-03 12:16         ` majianpeng
2012-07-03 23:56           ` NeilBrown
2012-07-04  1:09             ` majianpeng
2012-06-25  7:24 ` [patch 04/10 v3] raid5: lockless access raid5 overrided bi_phys_segments Shaohua Li
2012-06-25  7:24 ` [patch 05/10 v3] raid5: remove some device_lock locking places Shaohua Li
2012-06-25  7:24 ` [patch 06/10 v3] raid5: reduce chance release_stripe() taking device_lock Shaohua Li
2012-07-02  0:57   ` NeilBrown
2012-06-25  7:24 ` [patch 07/10 v3] md: personality can provide unplug private data Shaohua Li
2012-07-02  1:06   ` NeilBrown
2012-06-25  7:24 ` [patch 08/10 v3] raid5: make_request use batch stripe release Shaohua Li
2012-07-02  2:31   ` NeilBrown
2012-07-02  2:59     ` Shaohua Li
2012-07-02  5:07       ` NeilBrown
2012-06-25  7:24 ` [patch 09/10 v3] raid5: raid5d handle stripe in batch way Shaohua Li
2012-07-02  2:32   ` NeilBrown
2012-06-25  7:24 ` [patch 10/10 v3] raid5: create multiple threads to handle stripes Shaohua Li
2012-07-02  2:39   ` NeilBrown [this message]
2012-07-02 20:03   ` Dan Williams
2012-07-03  8:04     ` Shaohua Li
