linux-raid.vger.kernel.org archive mirror
* [patch 2/2 v3]raid5: create multiple threads to handle stripes
@ 2012-08-09  8:58 Shaohua Li
  2012-08-11  8:45 ` Jianpeng Ma
                   ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Shaohua Li @ 2012-08-09  8:58 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, dan.j.williams

This is a new attempt to make raid5 handle stripes in multiple threads, as
suggested by Neil, to allow maximum flexibility and better NUMA binding. It is
basically a combination of my first- and second-generation patches. By
default, no extra threads are enabled (all stripes are handled by raid5d).

An example to enable multiple threads:
#echo 3 > /sys/block/md0/md/auxthread_number
This will create 3 auxiliary threads to handle stripes. The threads can run
on any CPU and handle stripes produced by any CPU.

#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
This will bind auxiliary thread 0 to CPUs 1-3, and this thread will only handle
stripes produced by CPUs 1-3. A user-space tool can further change the thread's
affinity, but the thread will only handle stripes produced by CPUs 1-3 until the
sysfs entry is changed again.

If stripes produced by a CPU aren't handled by any auxiliary thread, such
stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
stripes.
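
An illustrative example for a 4-CPU machine (the CPU numbers are arbitrary):
give two auxiliary threads disjoint CPU ranges covering all CPUs, so that
raid5d handles no stripes at all:
#echo 2 > /sys/block/md0/md/auxthread_number
#echo 0-1 > /sys/block/md0/md/auxth0/cpulist
#echo 2-3 > /sys/block/md0/md/auxth1/cpulist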

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 drivers/md/md.c    |    8 -
 drivers/md/md.h    |    8 +
 drivers/md/raid5.c |  406 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 drivers/md/raid5.h |   19 ++
 4 files changed, 418 insertions(+), 23 deletions(-)

Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c	2012-08-09 10:43:04.800022626 +0800
+++ linux/drivers/md/raid5.c	2012-08-09 16:44:39.663278511 +0800
@@ -196,6 +196,21 @@ static int stripe_operations_active(stru
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct raid5_percpu *percpu;
+	int i, orphaned = 1;
+
+	percpu = per_cpu_ptr(conf->percpu, sh->cpu);
+	for_each_cpu(i, &percpu->handle_threads) {
+		md_wakeup_thread(conf->aux_threads[i]->thread);
+		orphaned = 0;
+	}
+	if (orphaned)
+		md_wakeup_thread(conf->mddev->thread);
+}
+
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
 	BUG_ON(!list_empty(&sh->lru));
@@ -208,9 +223,19 @@ static void do_release_stripe(struct r5c
 			   sh->bm_seq - conf->seq_write > 0)
 			list_add_tail(&sh->lru, &conf->bitmap_list);
 		else {
+			int cpu = sh->cpu;
+			struct raid5_percpu *percpu;
+			if (!cpu_online(cpu)) {
+				cpu = cpumask_any(cpu_online_mask);
+				sh->cpu = cpu;
+			}
+			percpu = per_cpu_ptr(conf->percpu, cpu);
+
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			clear_bit(STRIPE_BIT_DELAY, &sh->state);
-			list_add_tail(&sh->lru, &conf->handle_list);
+			list_add_tail(&sh->lru, &percpu->handle_list);
+			raid5_wakeup_stripe_thread(sh);
+			return;
 		}
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
@@ -355,6 +380,7 @@ static void init_stripe(struct stripe_he
 		raid5_build_block(sh, i, previous);
 	}
 	insert_hash(conf, sh);
+	sh->cpu = smp_processor_id();
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -3689,12 +3715,19 @@ static void raid5_activate_delayed(struc
 		while (!list_empty(&conf->delayed_list)) {
 			struct list_head *l = conf->delayed_list.next;
 			struct stripe_head *sh;
+			int cpu;
 			sh = list_entry(l, struct stripe_head, lru);
 			list_del_init(l);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			list_add_tail(&sh->lru, &conf->hold_list);
+			cpu = sh->cpu;
+			if (!cpu_online(cpu)) {
+				cpu = cpumask_any(cpu_online_mask);
+				sh->cpu = cpu;
+			}
+			raid5_wakeup_stripe_thread(sh);
 		}
 	}
 }
@@ -3968,18 +4001,29 @@ static int chunk_aligned_read(struct mdd
  * head of the hold_list has changed, i.e. the head was promoted to the
  * handle_list.
  */
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf,
+	cpumask_t *mask)
 {
-	struct stripe_head *sh;
-
+	struct stripe_head *sh = NULL, *tmp;
+	struct list_head *handle_list = NULL;
+	int cpu;
+
+	/* Should we take action to avoid starvation of latter CPUs ? */
+	for_each_cpu(cpu, mask) {
+		struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+		if (!list_empty(&percpu->handle_list)) {
+			handle_list = &percpu->handle_list;
+			break;
+		}
+	}
 	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
 		  __func__,
-		  list_empty(&conf->handle_list) ? "empty" : "busy",
+		  !handle_list ? "empty" : "busy",
 		  list_empty(&conf->hold_list) ? "empty" : "busy",
 		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
 
-	if (!list_empty(&conf->handle_list)) {
-		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+	if (handle_list) {
+		sh = list_entry(handle_list->next, typeof(*sh), lru);
 
 		if (list_empty(&conf->hold_list))
 			conf->bypass_count = 0;
@@ -3997,12 +4041,23 @@ static struct stripe_head *__get_priorit
 		   ((conf->bypass_threshold &&
 		     conf->bypass_count > conf->bypass_threshold) ||
 		    atomic_read(&conf->pending_full_writes) == 0)) {
-		sh = list_entry(conf->hold_list.next,
-				typeof(*sh), lru);
-		conf->bypass_count -= conf->bypass_threshold;
-		if (conf->bypass_count < 0)
-			conf->bypass_count = 0;
-	} else
+
+		list_for_each_entry(tmp, &conf->hold_list,  lru) {
+			if (cpumask_test_cpu(tmp->cpu, mask) ||
+			    !cpu_online(tmp->cpu)) {
+				sh = tmp;
+				break;
+			}
+		}
+
+		if (sh) {
+			conf->bypass_count -= conf->bypass_threshold;
+			if (conf->bypass_count < 0)
+				conf->bypass_count = 0;
+		}
+	}
+
+	if (!sh)
 		return NULL;
 
 	list_del_init(&sh->lru);
@@ -4594,13 +4649,13 @@ static int  retry_aligned_read(struct r5
 }
 
 #define MAX_STRIPE_BATCH 8
-static int handle_active_stripes(struct r5conf *conf)
+static int handle_active_stripes(struct r5conf *conf, cpumask_t *mask)
 {
 	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
 	int i, batch_size = 0;
 
 	while (batch_size < MAX_STRIPE_BATCH &&
-			(sh = __get_priority_stripe(conf)) != NULL)
+			(sh = __get_priority_stripe(conf, mask)) != NULL)
 		batch[batch_size++] = sh;
 
 	if (batch_size == 0)
@@ -4618,6 +4673,35 @@ static int handle_active_stripes(struct
 	return batch_size;
 }
 
+static void raid5auxd(struct md_thread *thread)
+{
+	struct mddev *mddev = thread->mddev;
+	struct r5conf *conf = mddev->private;
+	struct blk_plug plug;
+	int handled;
+	struct raid5_auxth *auxth = thread->private;
+
+	pr_debug("+++ raid5auxd active\n");
+
+	blk_start_plug(&plug);
+	handled = 0;
+	spin_lock_irq(&conf->device_lock);
+	while (1) {
+		int batch_size;
+
+		batch_size = handle_active_stripes(conf, &auxth->work_mask);
+		if (!batch_size)
+			break;
+		handled += batch_size;
+	}
+	pr_debug("%d stripes handled\n", handled);
+
+	spin_unlock_irq(&conf->device_lock);
+	blk_finish_plug(&plug);
+
+	pr_debug("--- raid5auxd inactive\n");
+}
+
 /*
  * This is our raid5 kernel thread.
  *
@@ -4665,7 +4749,7 @@ static void raid5d(struct md_thread *thr
 			handled++;
 		}
 
-		batch_size = handle_active_stripes(conf);
+		batch_size = handle_active_stripes(conf, &conf->work_mask);
 		if (!batch_size)
 			break;
 		handled += batch_size;
@@ -4794,10 +4878,270 @@ stripe_cache_active_show(struct mddev *m
 static struct md_sysfs_entry
 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
 
+static void raid5_update_threads_handle_mask(struct mddev *mddev)
+{
+	int cpu, i;
+	struct raid5_percpu *percpu;
+	struct r5conf *conf = mddev->private;
+
+	for_each_online_cpu(cpu) {
+		percpu = per_cpu_ptr(conf->percpu, cpu);
+		cpumask_clear(&percpu->handle_threads);
+	}
+	cpumask_copy(&conf->work_mask, cpu_online_mask);
+
+	for (i = 0; i < conf->aux_thread_num; i++) {
+		cpumask_t *work_mask = &conf->aux_threads[i]->work_mask;
+		for_each_cpu(cpu, work_mask) {
+			percpu = per_cpu_ptr(conf->percpu, cpu);
+			cpumask_set_cpu(i, &percpu->handle_threads);
+		}
+		cpumask_andnot(&conf->work_mask, &conf->work_mask,
+				work_mask);
+	}
+}
+
+struct raid5_auxth_sysfs {
+	struct attribute attr;
+	ssize_t (*show)(struct mddev *, struct raid5_auxth *, char *);
+	ssize_t (*store)(struct mddev *, struct raid5_auxth *,
+		const char *, size_t);
+};
+
+static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
+	struct raid5_auxth *thread, char *page)
+{
+	if (!mddev->private)
+		return 0;
+	return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
+}
+
+static ssize_t
+raid5_store_thread_cpulist(struct mddev *mddev, struct raid5_auxth *thread,
+	const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	cpumask_var_t mask;
+
+	if (!conf)
+		return -ENODEV;
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (cpulist_parse(page, mask)) {
+		free_cpumask_var(mask);
+		return -EINVAL;
+	}
+
+	get_online_cpus();
+	spin_lock_irq(&conf->device_lock);
+	cpumask_copy(&thread->work_mask, mask);
+	raid5_update_threads_handle_mask(mddev);
+	spin_unlock_irq(&conf->device_lock);
+	put_online_cpus();
+	set_cpus_allowed_ptr(thread->thread->tsk, mask);
+
+	free_cpumask_var(mask);
+	return len;
+}
+
+static struct raid5_auxth_sysfs thread_cpulist =
+__ATTR(cpulist, S_IRUGO|S_IWUSR, raid5_show_thread_cpulist,
+	raid5_store_thread_cpulist);
+
+static struct attribute *auxth_attrs[] = {
+	&thread_cpulist.attr,
+	NULL,
+};
+
+static ssize_t
+raid5_auxth_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct raid5_auxth_sysfs *entry = container_of(attr,
+		struct raid5_auxth_sysfs, attr);
+	struct raid5_auxth *thread = container_of(kobj,
+		struct raid5_auxth, kobj);
+	struct mddev *mddev = thread->thread->mddev;
+	ssize_t ret;
+
+	if (!entry->show)
+		return -EIO;
+	mddev_lock(mddev);
+	ret = entry->show(mddev, thread, page);
+	mddev_unlock(mddev);
+	return ret;
+}
+
+static ssize_t
+raid5_auxth_attr_store(struct kobject *kobj, struct attribute *attr,
+	      const char *page, size_t length)
+{
+	struct raid5_auxth_sysfs *entry = container_of(attr,
+		struct raid5_auxth_sysfs, attr);
+	struct raid5_auxth *thread = container_of(kobj,
+		struct raid5_auxth, kobj);
+	struct mddev *mddev = thread->thread->mddev;
+	ssize_t ret;
+
+	if (!entry->store)
+		return -EIO;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	mddev_lock(mddev);
+	ret = entry->store(mddev, thread, page, length);
+	mddev_unlock(mddev);
+	return ret;
+}
+
+static void raid5_auxth_release(struct kobject *kobj)
+{
+	struct raid5_auxth *thread = container_of(kobj,
+		struct raid5_auxth, kobj);
+	kfree(thread);
+}
+
+static const struct sysfs_ops raid5_auxth_sysfsops = {
+	.show = raid5_auxth_attr_show,
+	.store = raid5_auxth_attr_store,
+};
+static struct kobj_type raid5_auxth_ktype = {
+	.release = raid5_auxth_release,
+	.sysfs_ops = &raid5_auxth_sysfsops,
+	.default_attrs = auxth_attrs,
+};
+
+static ssize_t
+raid5_show_auxthread_number(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	if (conf)
+		return sprintf(page, "%d\n", conf->aux_thread_num);
+	else
+		return 0;
+}
+
+static void raid5_auxth_delete(struct work_struct *ws)
+{
+	struct raid5_auxth *thread = container_of(ws, struct raid5_auxth,
+		del_work);
+
+	kobject_del(&thread->kobj);
+	kobject_put(&thread->kobj);
+}
+
+static void __free_aux_thread(struct mddev *mddev, struct raid5_auxth *thread)
+{
+	md_unregister_thread(&thread->thread);
+	INIT_WORK(&thread->del_work, raid5_auxth_delete);
+	kobject_get(&thread->kobj);
+	md_queue_misc_work(&thread->del_work);
+}
+
+static struct raid5_auxth *__create_aux_thread(struct mddev *mddev, int i)
+{
+	struct raid5_auxth *thread;
+	char name[10];
+
+	thread = kzalloc(sizeof(*thread), GFP_KERNEL);
+	if (!thread)
+		return NULL;
+	snprintf(name, 10, "aux%d", i);
+	thread->thread = md_register_thread(raid5auxd, mddev, name);
+	if (!thread->thread) {
+		kfree(thread);
+		return NULL;
+	}
+	thread->thread->private = thread;
+
+	cpumask_copy(&thread->work_mask, cpu_online_mask);
+
+	if (kobject_init_and_add(&thread->kobj, &raid5_auxth_ktype,
+	    &mddev->kobj, "auxth%d", i)) {
+		md_unregister_thread(&thread->thread);
+		kfree(thread);
+		return NULL;
+	}
+	return thread;
+}
+
+static ssize_t
+raid5_store_auxthread_number(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+	int i;
+	struct raid5_auxth **threads;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+
+	if (new == conf->aux_thread_num)
+		return len;
+
+	/* There is no point creating more threads than cpu number */
+	if (new > num_online_cpus())
+		return -EINVAL;
+
+	if (new > conf->aux_thread_num) {
+		threads = kzalloc(sizeof(struct raid5_auxth *) * new,
+				GFP_KERNEL);
+		if (!threads)
+			return -ENOMEM;
+
+		i = conf->aux_thread_num;
+		while (i < new) {
+			threads[i] = __create_aux_thread(mddev, i);
+			if (!threads[i])
+				goto error;
+
+			i++;
+		}
+		memcpy(threads, conf->aux_threads,
+			sizeof(struct raid5_auxth *) * conf->aux_thread_num);
+		get_online_cpus();
+		spin_lock_irq(&conf->device_lock);
+		kfree(conf->aux_threads);
+		conf->aux_threads = threads;
+		conf->aux_thread_num = new;
+		raid5_update_threads_handle_mask(mddev);
+		spin_unlock_irq(&conf->device_lock);
+		put_online_cpus();
+	} else {
+		int old = conf->aux_thread_num;
+
+		get_online_cpus();
+		spin_lock_irq(&conf->device_lock);
+		conf->aux_thread_num = new;
+		raid5_update_threads_handle_mask(mddev);
+		spin_unlock_irq(&conf->device_lock);
+		put_online_cpus();
+		for (i = new; i < old; i++)
+			__free_aux_thread(mddev, conf->aux_threads[i]);
+	}
+
+	return len;
+error:
+	while (--i >= conf->aux_thread_num)
+		__free_aux_thread(mddev, threads[i]);
+	kfree(threads);
+	return -ENOMEM;
+}
+
+static struct md_sysfs_entry
+raid5_auxthread_number = __ATTR(auxthread_number, S_IRUGO|S_IWUSR,
+				raid5_show_auxthread_number,
+				raid5_store_auxthread_number);
+
 static struct attribute *raid5_attrs[] =  {
 	&raid5_stripecache_size.attr,
 	&raid5_stripecache_active.attr,
 	&raid5_preread_bypass_threshold.attr,
+	&raid5_auxthread_number.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -4845,6 +5189,7 @@ static void raid5_free_percpu(struct r5c
 
 static void free_conf(struct r5conf *conf)
 {
+	kfree(conf->aux_threads);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
 	kfree(conf->disks);
@@ -4857,7 +5202,7 @@ static int raid456_cpu_notify(struct not
 			      void *hcpu)
 {
 	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
-	long cpu = (long)hcpu;
+	long cpu = (long)hcpu, anycpu;
 	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
 
 	switch (action) {
@@ -4876,9 +5221,17 @@ static int raid456_cpu_notify(struct not
 			       __func__, cpu);
 			return notifier_from_errno(-ENOMEM);
 		}
+		INIT_LIST_HEAD(&(percpu->handle_list));
+		cpumask_clear(&percpu->handle_threads);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
+		spin_lock_irq(&conf->device_lock);
+		anycpu = cpumask_any(cpu_online_mask);
+		list_splice_tail_init(&percpu->handle_list,
+			&per_cpu_ptr(conf->percpu, anycpu)->handle_list);
+		spin_unlock_irq(&conf->device_lock);
+
 		safe_put_page(percpu->spare_page);
 		kfree(percpu->scribble);
 		percpu->spare_page = NULL;
@@ -4887,6 +5240,10 @@ static int raid456_cpu_notify(struct not
 	default:
 		break;
 	}
+
+	spin_lock_irq(&conf->device_lock);
+	raid5_update_threads_handle_mask(conf->mddev);
+	spin_unlock_irq(&conf->device_lock);
 	return NOTIFY_OK;
 }
 #endif
@@ -4907,20 +5264,24 @@ static int raid5_alloc_percpu(struct r5c
 	get_online_cpus();
 	err = 0;
 	for_each_present_cpu(cpu) {
+		struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+
 		if (conf->level == 6) {
 			spare_page = alloc_page(GFP_KERNEL);
 			if (!spare_page) {
 				err = -ENOMEM;
 				break;
 			}
-			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+			percpu->spare_page = spare_page;
 		}
 		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
 		if (!scribble) {
 			err = -ENOMEM;
 			break;
 		}
-		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
+		percpu->scribble = scribble;
+		INIT_LIST_HEAD(&percpu->handle_list);
+		cpumask_clear(&percpu->handle_threads);
 	}
 #ifdef CONFIG_HOTPLUG_CPU
 	conf->cpu_notify.notifier_call = raid456_cpu_notify;
@@ -4976,7 +5337,6 @@ static struct r5conf *setup_conf(struct
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
-	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
@@ -4987,6 +5347,8 @@ static struct r5conf *setup_conf(struct
 	conf->bypass_threshold = BYPASS_THRESHOLD;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	cpumask_copy(&conf->work_mask, cpu_online_mask);
+
 	conf->raid_disks = mddev->raid_disks;
 	if (mddev->reshape_position == MaxSector)
 		conf->previous_raid_disks = mddev->raid_disks;
@@ -5403,6 +5765,10 @@ abort:
 static int stop(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
+	int i;
+
+	for (i = 0; i < conf->aux_thread_num; i++)
+		__free_aux_thread(mddev, conf->aux_threads[i]);
 
 	md_unregister_thread(&mddev->thread);
 	if (mddev->queue)
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h	2012-08-09 09:57:23.826481331 +0800
+++ linux/drivers/md/raid5.h	2012-08-09 16:17:51.279501447 +0800
@@ -211,6 +211,7 @@ struct stripe_head {
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
 	spinlock_t		stripe_lock;
+	int			cpu;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -364,6 +365,14 @@ struct disk_info {
 	struct md_rdev	*rdev, *replacement;
 };
 
+struct raid5_auxth {
+	struct md_thread	*thread;
+	/* which CPUs should the auxiliary thread handle stripes from */
+	cpumask_t		work_mask;
+	struct kobject		kobj;
+	struct work_struct	del_work;
+};
+
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
 	struct mddev		*mddev;
@@ -432,6 +441,12 @@ struct r5conf {
 					      * lists and performing address
 					      * conversions
 					      */
+		struct list_head handle_list;
+		cpumask_t	handle_threads; /* Which threads can the CPU's
+						 * stripes be handled. It really
+						 * is a bitmap to aux_threads[],
+						 * but has max bits NR_CPUS
+						 */
 	} __percpu *percpu;
 	size_t			scribble_len; /* size of scribble region must be
 					       * associated with conf to handle
@@ -459,6 +474,10 @@ struct r5conf {
 	 * the new thread here until we fully activate the array.
 	 */
 	struct md_thread	*thread;
+	int			aux_thread_num;
+	struct raid5_auxth	**aux_threads;
+	/* which CPUs should raid5d thread handle stripes from */
+	cpumask_t		work_mask;
 };
 
 /*
Index: linux/drivers/md/md.c
===================================================================
--- linux.orig/drivers/md/md.c	2012-08-09 10:43:04.796022675 +0800
+++ linux/drivers/md/md.c	2012-08-09 16:17:19.367902517 +0800
@@ -640,9 +640,9 @@ static struct mddev * mddev_find(dev_t u
 	goto retry;
 }
 
-static inline int mddev_lock(struct mddev * mddev)
+int md_queue_misc_work(struct work_struct *work)
 {
-	return mutex_lock_interruptible(&mddev->reconfig_mutex);
+	return queue_work(md_misc_wq, work);
 }
 
 static inline int mddev_is_locked(struct mddev *mddev)
@@ -657,7 +657,7 @@ static inline int mddev_trylock(struct m
 
 static struct attribute_group md_redundancy_group;
 
-static void mddev_unlock(struct mddev * mddev)
+void mddev_unlock(struct mddev * mddev)
 {
 	if (mddev->to_remove) {
 		/* These cannot be removed under reconfig_mutex as
@@ -8569,6 +8569,8 @@ EXPORT_SYMBOL(md_register_thread);
 EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_wakeup_thread);
 EXPORT_SYMBOL(md_check_recovery);
+EXPORT_SYMBOL(mddev_unlock);
+EXPORT_SYMBOL(md_queue_misc_work);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("MD RAID framework");
 MODULE_ALIAS("md");
Index: linux/drivers/md/md.h
===================================================================
--- linux.orig/drivers/md/md.h	2012-08-09 10:43:04.800022626 +0800
+++ linux/drivers/md/md.h	2012-08-09 16:14:26.318078815 +0800
@@ -636,4 +636,12 @@ static inline int mddev_check_plugged(st
 	return !!blk_check_plugged(md_unplug, mddev,
 				   sizeof(struct blk_plug_cb));
 }
+
+static inline int mddev_lock(struct mddev * mddev)
+{
+	return mutex_lock_interruptible(&mddev->reconfig_mutex);
+}
+extern void mddev_unlock(struct mddev *mddev);
+extern int md_queue_misc_work(struct work_struct *work);
+
 #endif /* _MD_MD_H */

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-09  8:58 [patch 2/2 v3]raid5: create multiple threads to handle stripes Shaohua Li
@ 2012-08-11  8:45 ` Jianpeng Ma
  2012-08-13  0:21   ` Shaohua Li
  2012-08-13  4:29 ` NeilBrown
  2013-03-07  7:31 ` Shaohua Li
  2 siblings, 1 reply; 29+ messages in thread
From: Jianpeng Ma @ 2012-08-11  8:45 UTC (permalink / raw)
  To: shli, linux-raid; +Cc: Neil Brown, Dan Williams

On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>This is a new tempt to make raid5 handle stripes in multiple threads, as
>suggested by Neil to have maxium flexibility and better numa binding. It
>basically is a combination of my first and second generation patches. By
>default, no multiple thread is enabled (all stripes are handled by raid5d).
>
>An example to enable multiple threads:
>#echo 3 > /sys/block/md0/md/auxthread_number
>This will create 3 auxiliary threads to handle stripes. The threads can run
>on any cpus and handle stripes produced by any cpus.
>
>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>stripes produced by cpu 1-3. User tool can further change the thread's
>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>sysfs entry is changed again.
>
>If stripes produced by a CPU aren't handled by any auxiliary thread, such
>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
>stripes.
>
I tested this and found two problems (maybe not real problems).

1: When printing the cpulist of an aux thread, you may have forgotten to print the trailing '\n'.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7c8151a..3700cdc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
 static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
        struct raid5_auxth *thread, char *page)
 {
+       int n;
        if (!mddev->private)
                return 0;
-       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
+       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
+       page[n++] = '\n';
+       page[n] = 0;
+       return n;
 }
 
 static ssize_t

2: With 'dd if=/dev/zero of=/dev/md0 bs=2M', the performance regresses remarkably:
auxthread_number=0, 200MB/s;
auxthread_number=4, 95MB/s.


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-11  8:45 ` Jianpeng Ma
@ 2012-08-13  0:21   ` Shaohua Li
  2012-08-13  1:06     ` Jianpeng Ma
  2012-08-13  9:11     ` Jianpeng Ma
  0 siblings, 2 replies; 29+ messages in thread
From: Shaohua Li @ 2012-08-13  0:21 UTC (permalink / raw)
  To: Jianpeng Ma; +Cc: linux-raid, Neil Brown, Dan Williams

2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>>This is a new tempt to make raid5 handle stripes in multiple threads, as
>>suggested by Neil to have maxium flexibility and better numa binding. It
>>basically is a combination of my first and second generation patches. By
>>default, no multiple thread is enabled (all stripes are handled by raid5d).
>>
>>An example to enable multiple threads:
>>#echo 3 > /sys/block/md0/md/auxthread_number
>>This will create 3 auxiliary threads to handle stripes. The threads can run
>>on any cpus and handle stripes produced by any cpus.
>>
>>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>>stripes produced by cpu 1-3. User tool can further change the thread's
>>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>>sysfs entry is changed again.
>>
>>If stripes produced by a CPU aren't handled by any auxiliary thread, such
>>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
>>stripes.
>>
> I tested and found two problem(maybe not).
>
> 1:print cpulist of auxth, you maybe lost print the '\n'.
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 7c8151a..3700cdc 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
>         struct raid5_auxth *thread, char *page)
>  {
> +       int n;
>         if (!mddev->private)
>                 return 0;
> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
> +       page[n++] = '\n';
> +       page[n] = 0;
> +       return n;
>  }
>
>  static ssize_t

Some sysfs entries print a trailing '\n' and some don't; I don't mind adding it.

> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
> auxthread_number=0, 200MB/s;
> auxthread_number=4, 95MB/s.

So multiple threads handling stripes reduces request merging. In your
workload, raid5d isn't a bottleneck at all. In practice, I think only
arrays that can drive high IOPS need multiple threads enabled. And
if you create multiple threads, it's better to let the threads handle different
CPUs.
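For example (just an illustration, assuming a 4-CPU box), one aux thread per CPU:
#echo 4 > /sys/block/md0/md/auxthread_number
#echo 0 > /sys/block/md0/md/auxth0/cpulist
#echo 1 > /sys/block/md0/md/auxth1/cpulist
#echo 2 > /sys/block/md0/md/auxth2/cpulist
#echo 3 > /sys/block/md0/md/auxth3/cpulist
so each thread only handles stripes produced by its own CPU.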

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-13  0:21   ` Shaohua Li
@ 2012-08-13  1:06     ` Jianpeng Ma
  2012-08-13  2:13       ` Shaohua Li
  2012-08-13  9:11     ` Jianpeng Ma
  1 sibling, 1 reply; 29+ messages in thread
From: Jianpeng Ma @ 2012-08-13  1:06 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Neil Brown, Dan Williams

On 2012-08-13 08:21 Shaohua Li <shli@kernel.org> Wrote:
>2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
>> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>>>This is a new tempt to make raid5 handle stripes in multiple threads, as
>>>suggested by Neil to have maxium flexibility and better numa binding. It
>>>basically is a combination of my first and second generation patches. By
>>>default, no multiple thread is enabled (all stripes are handled by raid5d).
>>>
>>>An example to enable multiple threads:
>>>#echo 3 > /sys/block/md0/md/auxthread_number
>>>This will create 3 auxiliary threads to handle stripes. The threads can run
>>>on any cpus and handle stripes produced by any cpus.
>>>
>>>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>>>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>>>stripes produced by cpu 1-3. User tool can further change the thread's
>>>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>>>sysfs entry is changed again.
>>>
>>>If stripes produced by a CPU aren't handled by any auxiliary thread, such
>>>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
>>>stripes.
>>>
>> I tested and found two problem(maybe not).
>>
>> 1:print cpulist of auxth, you maybe lost print the '\n'.
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 7c8151a..3700cdc 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
>>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
>>         struct raid5_auxth *thread, char *page)
>>  {
>> +       int n;
>>         if (!mddev->private)
>>                 return 0;
>> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
>> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
>> +       page[n++] = '\n';
>> +       page[n] = 0;
>> +       return n;
>>  }
>>
>>  static ssize_t
>
>some sysfs entries print out '\n', some not, I don't mind add it
I searched the kernel code and found places which print out '\n' like this.
Can you tell me the rule for when to use it or not?
Thanks!
>
>> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
>> auxthread_number=0, 200MB/s;
>> auxthread_number=4, 95MB/s.
>
>So multiple threads handle stripes reduce request merge. In your
>workload, raid5d isn't a bottleneck at all. In practice, I thought only
>array which can drive high IOPS needs enable multi thread. And
>if you create multiple threads, better let the threads handle different
>cpus.
I will test with multiple threads.
>
>Thanks,
>Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-13  1:06     ` Jianpeng Ma
@ 2012-08-13  2:13       ` Shaohua Li
  2012-08-13  2:20         ` Shaohua Li
  0 siblings, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2012-08-13  2:13 UTC (permalink / raw)
  To: Jianpeng Ma; +Cc: linux-raid, Neil Brown, Dan Williams

On Mon, Aug 13, 2012 at 09:06:45AM +0800, Jianpeng Ma wrote:
> On 2012-08-13 08:21 Shaohua Li <shli@kernel.org> Wrote:
> >2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
> >> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
> >>>This is a new tempt to make raid5 handle stripes in multiple threads, as
> >>>suggested by Neil to have maxium flexibility and better numa binding. It
> >>>basically is a combination of my first and second generation patches. By
> >>>default, no multiple thread is enabled (all stripes are handled by raid5d).
> >>>
> >>>An example to enable multiple threads:
> >>>#echo 3 > /sys/block/md0/md/auxthread_number
> >>>This will create 3 auxiliary threads to handle stripes. The threads can run
> >>>on any cpus and handle stripes produced by any cpus.
> >>>
> >>>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
> >>>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
> >>>stripes produced by cpu 1-3. User tool can further change the thread's
> >>>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
> >>>sysfs entry is changed again.
> >>>
> >>>If stripes produced by a CPU aren't handled by any auxiliary thread, such
> >>>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
> >>>stripes.
> >>>
> >> I tested and found two problem(maybe not).
> >>
> >> 1:print cpulist of auxth, you maybe lost print the '\n'.
> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> >> index 7c8151a..3700cdc 100644
> >> --- a/drivers/md/raid5.c
> >> +++ b/drivers/md/raid5.c
> >> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
> >>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
> >>         struct raid5_auxth *thread, char *page)
> >>  {
> >> +       int n;
> >>         if (!mddev->private)
> >>                 return 0;
> >> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
> >> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
> >> +       page[n++] = '\n';
> >> +       page[n] = 0;
> >> +       return n;
> >>  }
> >>
> >>  static ssize_t
> >
> >some sysfs entries print out '\n', some not, I don't mind add it
> I search kernel code found places which like this print out '\n';
> Can you tell rule which use or not?
> Thanks!

I'm not aware of any rule about this.

> >> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
> >> auxthread_number=0, 200MB/s;
> >> auxthread_number=4, 95MB/s.
> >
> >So multiple threads handle stripes reduce request merge. In your
> >workload, raid5d isn't a bottleneck at all. In practice, I thought only
> >array which can drive high IOPS needs enable multi thread. And
> >if you create multiple threads, better let the threads handle different
> >cpus.
> I will test for multiple threads.
Thanks

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-13  2:13       ` Shaohua Li
@ 2012-08-13  2:20         ` Shaohua Li
  2012-08-13  2:25           ` Jianpeng Ma
                             ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Shaohua Li @ 2012-08-13  2:20 UTC (permalink / raw)
  To: Jianpeng Ma; +Cc: linux-raid, Neil Brown, Dan Williams

2012/8/13 Shaohua Li <shli@kernel.org>:
> On Mon, Aug 13, 2012 at 09:06:45AM +0800, Jianpeng Ma wrote:
>> On 2012-08-13 08:21 Shaohua Li <shli@kernel.org> Wrote:
>> >2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
>> >> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>> >>>This is a new tempt to make raid5 handle stripes in multiple threads, as
>> >>>suggested by Neil to have maxium flexibility and better numa binding. It
>> >>>basically is a combination of my first and second generation patches. By
>> >>>default, no multiple thread is enabled (all stripes are handled by raid5d).
>> >>>
>> >>>An example to enable multiple threads:
>> >>>#echo 3 > /sys/block/md0/md/auxthread_number
>> >>>This will create 3 auxiliary threads to handle stripes. The threads can run
>> >>>on any cpus and handle stripes produced by any cpus.
>> >>>
>> >>>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>> >>>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>> >>>stripes produced by cpu 1-3. User tool can further change the thread's
>> >>>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>> >>>sysfs entry is changed again.
>> >>>
>> >>>If stripes produced by a CPU aren't handled by any auxiliary thread, such
>> >>>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
>> >>>stripes.
>> >>>
>> >> I tested and found two problem(maybe not).
>> >>
>> >> 1:print cpulist of auxth, you maybe lost print the '\n'.
>> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> >> index 7c8151a..3700cdc 100644
>> >> --- a/drivers/md/raid5.c
>> >> +++ b/drivers/md/raid5.c
>> >> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
>> >>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
>> >>         struct raid5_auxth *thread, char *page)
>> >>  {
>> >> +       int n;
>> >>         if (!mddev->private)
>> >>                 return 0;
>> >> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
>> >> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
>> >> +       page[n++] = '\n';
>> >> +       page[n] = 0;
>> >> +       return n;
>> >>  }
>> >>
>> >>  static ssize_t
>> >
>> >some sysfs entries print out '\n', some not, I don't mind add it
>> I search kernel code found places which like this print out '\n';
>> Can you tell rule which use or not?
>> Thanks!
>
> I'm not aware any rule about this
>
>> >> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
>> >> auxthread_number=0, 200MB/s;
>> >> auxthread_number=4, 95MB/s.
>> >
>> >So multiple threads handle stripes reduce request merge. In your
>> >workload, raid5d isn't a bottleneck at all. In practice, I thought only
>> >array which can drive high IOPS needs enable multi thread. And
>> >if you create multiple threads, better let the threads handle different
>> >cpus.
>> I will test for multiple threads.
> Thanks

BTW, can you try the patch below for the above dd workload?
http://git.kernel.dk/?p=linux-block.git;a=commitdiff;h=274193224cdabd687d804a26e0150bb20f2dd52c
That one was reverted upstream, but eventually we should get it merged
again after some CFQ issues are fixed.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-13  2:20         ` Shaohua Li
@ 2012-08-13  2:25           ` Jianpeng Ma
  2012-08-13  4:21           ` NeilBrown
  2012-08-14 10:39           ` Jianpeng Ma
  2 siblings, 0 replies; 29+ messages in thread
From: Jianpeng Ma @ 2012-08-13  2:25 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Neil Brown, Dan Williams

On 2012-08-13 10:20 Shaohua Li <shli@kernel.org> Wrote:
>2012/8/13 Shaohua Li <shli@kernel.org>:
>> On Mon, Aug 13, 2012 at 09:06:45AM +0800, Jianpeng Ma wrote:
>>> On 2012-08-13 08:21 Shaohua Li <shli@kernel.org> Wrote:
>>> >2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
>>> >> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>>> >>>This is a new tempt to make raid5 handle stripes in multiple threads, as
>>> >>>suggested by Neil to have maxium flexibility and better numa binding. It
>>> >>>basically is a combination of my first and second generation patches. By
>>> >>>default, no multiple thread is enabled (all stripes are handled by raid5d).
>>> >>>
>>> >>>An example to enable multiple threads:
>>> >>>#echo 3 > /sys/block/md0/md/auxthread_number
>>> >>>This will create 3 auxiliary threads to handle stripes. The threads can run
>>> >>>on any cpus and handle stripes produced by any cpus.
>>> >>>
>>> >>>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>>> >>>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>>> >>>stripes produced by cpu 1-3. User tool can further change the thread's
>>> >>>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>>> >>>sysfs entry is changed again.
>>> >>>
>>> >>>If stripes produced by a CPU aren't handled by any auxiliary thread, such
>>> >>>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
>>> >>>stripes.
>>> >>>
>>> >> I tested and found two problem(maybe not).
>>> >>
>>> >> 1:print cpulist of auxth, you maybe lost print the '\n'.
>>> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>> >> index 7c8151a..3700cdc 100644
>>> >> --- a/drivers/md/raid5.c
>>> >> +++ b/drivers/md/raid5.c
>>> >> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
>>> >>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
>>> >>         struct raid5_auxth *thread, char *page)
>>> >>  {
>>> >> +       int n;
>>> >>         if (!mddev->private)
>>> >>                 return 0;
>>> >> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
>>> >> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
>>> >> +       page[n++] = '\n';
>>> >> +       page[n] = 0;
>>> >> +       return n;
>>> >>  }
>>> >>
>>> >>  static ssize_t
>>> >
>>> >some sysfs entries print out '\n', some not, I don't mind add it
>>> I search kernel code found places which like this print out '\n';
>>> Can you tell rule which use or not?
>>> Thanks!
>>
>> I'm not aware any rule about this
>>
>>> >> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
>>> >> auxthread_number=0, 200MB/s;
>>> >> auxthread_number=4, 95MB/s.
>>> >
>>> >So multiple threads handle stripes reduce request merge. In your
>>> >workload, raid5d isn't a bottleneck at all. In practice, I thought only
>>> >array which can drive high IOPS needs enable multi thread. And
>>> >if you create multiple threads, better let the threads handle different
>>> >cpus.
>>> I will test for multiple threads.
>> Thanks
>
>BTW, can you try below patch for the above dd workload?
>http://git.kernel.dk/?p=linux-block.git;a=commitdiff;h=274193224cdabd687d804a26e0150bb20f2dd52c
>That one is reverted in upstream, but eventually we should make it
>enter again after some CFQ issues are fixed.
OK, I will do that and send the result.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-13  2:20         ` Shaohua Li
  2012-08-13  2:25           ` Jianpeng Ma
@ 2012-08-13  4:21           ` NeilBrown
  2012-08-14 10:39           ` Jianpeng Ma
  2 siblings, 0 replies; 29+ messages in thread
From: NeilBrown @ 2012-08-13  4:21 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Jianpeng Ma, linux-raid, Dan Williams

[-- Attachment #1: Type: text/plain, Size: 1275 bytes --]

On Mon, 13 Aug 2012 10:20:01 +0800 Shaohua Li <shli@kernel.org> wrote:


> >> >> 1:print cpulist of auxth, you maybe lost print the '\n'.
> >> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> >> >> index 7c8151a..3700cdc 100644
> >> >> --- a/drivers/md/raid5.c
> >> >> +++ b/drivers/md/raid5.c
> >> >> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
> >> >>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
> >> >>         struct raid5_auxth *thread, char *page)
> >> >>  {
> >> >> +       int n;
> >> >>         if (!mddev->private)
> >> >>                 return 0;
> >> >> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
> >> >> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
> >> >> +       page[n++] = '\n';
> >> >> +       page[n] = 0;
> >> >> +       return n;
> >> >>  }
> >> >>
> >> >>  static ssize_t
> >> >
> >> >some sysfs entries print out '\n', some not, I don't mind add it
> >> I search kernel code found places which like this print out '\n';
> >> Can you tell rule which use or not?
> >> Thanks!
> >
> > I'm not aware any rule about this

I can provide a rule if it helps:
  sysfs files must *always* end with '\n' unless they are empty.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-09  8:58 [patch 2/2 v3]raid5: create multiple threads to handle stripes Shaohua Li
  2012-08-11  8:45 ` Jianpeng Ma
@ 2012-08-13  4:29 ` NeilBrown
  2012-08-13  6:22   ` Shaohua Li
  2013-03-07  7:31 ` Shaohua Li
  2 siblings, 1 reply; 29+ messages in thread
From: NeilBrown @ 2012-08-13  4:29 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, dan.j.williams

[-- Attachment #1: Type: text/plain, Size: 4547 bytes --]

On Thu, 9 Aug 2012 16:58:08 +0800 Shaohua Li <shli@kernel.org> wrote:

> This is a new tempt to make raid5 handle stripes in multiple threads, as
> suggested by Neil to have maxium flexibility and better numa binding. It
> basically is a combination of my first and second generation patches. By
> default, no multiple thread is enabled (all stripes are handled by raid5d).
> 
> An example to enable multiple threads:
> #echo 3 > /sys/block/md0/md/auxthread_number
> This will create 3 auxiliary threads to handle stripes. The threads can run
> on any cpus and handle stripes produced by any cpus.
> 
> #echo 1-3 > /sys/block/md0/md/auxth0/cpulist
> This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
> stripes produced by cpu 1-3. User tool can further change the thread's
> affinity, but the thread can only handle stripes produced by cpu 1-3 till the
> sysfs entry is changed again.
> 
> If stripes produced by a CPU aren't handled by any auxiliary thread, such
> stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
> stripes.
> 
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
>  drivers/md/md.c    |    8 -
>  drivers/md/md.h    |    8 +
>  drivers/md/raid5.c |  406 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  drivers/md/raid5.h |   19 ++
>  4 files changed, 418 insertions(+), 23 deletions(-)
> 
> Index: linux/drivers/md/raid5.c
> ===================================================================
> --- linux.orig/drivers/md/raid5.c	2012-08-09 10:43:04.800022626 +0800
> +++ linux/drivers/md/raid5.c	2012-08-09 16:44:39.663278511 +0800
> @@ -196,6 +196,21 @@ static int stripe_operations_active(stru
>  	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
>  }
>  
> +static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
> +{
> +	struct r5conf *conf = sh->raid_conf;
> +	struct raid5_percpu *percpu;
> +	int i, orphaned = 1;
> +
> +	percpu = per_cpu_ptr(conf->percpu, sh->cpu);
> +	for_each_cpu(i, &percpu->handle_threads) {
> +		md_wakeup_thread(conf->aux_threads[i]->thread);
> +		orphaned = 0;
> +	}
> +	if (orphaned)
> +		md_wakeup_thread(conf->mddev->thread);
> +}
> +
>  static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
>  {
>  	BUG_ON(!list_empty(&sh->lru));
> @@ -208,9 +223,19 @@ static void do_release_stripe(struct r5c
>  			   sh->bm_seq - conf->seq_write > 0)
>  			list_add_tail(&sh->lru, &conf->bitmap_list);
>  		else {
> +			int cpu = sh->cpu;
> +			struct raid5_percpu *percpu;
> +			if (!cpu_online(cpu)) {
> +				cpu = cpumask_any(cpu_online_mask);
> +				sh->cpu = cpu;
> +			}
> +			percpu = per_cpu_ptr(conf->percpu, cpu);
> +
>  			clear_bit(STRIPE_DELAYED, &sh->state);
>  			clear_bit(STRIPE_BIT_DELAY, &sh->state);
> -			list_add_tail(&sh->lru, &conf->handle_list);
> +			list_add_tail(&sh->lru, &percpu->handle_list);
> +			raid5_wakeup_stripe_thread(sh);
> +			return;

I confess that I don't know a lot about cpu hotplug, but this looks like it
should have some locking.  In particular,  "get_online_cpus()" before we
check "cpu_online()", and "put_online_cpus()" after we have added to the
per_cpu->handle_list().

Maybe that isn't needed, but if it isn't  I'd like to understand why.


>  		}
>  		md_wakeup_thread(conf->mddev->thread);
>  	} else {
> @@ -355,6 +380,7 @@ static void init_stripe(struct stripe_he
>  		raid5_build_block(sh, i, previous);
>  	}
>  	insert_hash(conf, sh);
> +	sh->cpu = smp_processor_id();
>  }
>  
>  static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
> @@ -3689,12 +3715,19 @@ static void raid5_activate_delayed(struc
>  		while (!list_empty(&conf->delayed_list)) {
>  			struct list_head *l = conf->delayed_list.next;
>  			struct stripe_head *sh;
> +			int cpu;
>  			sh = list_entry(l, struct stripe_head, lru);
>  			list_del_init(l);
>  			clear_bit(STRIPE_DELAYED, &sh->state);
>  			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
>  				atomic_inc(&conf->preread_active_stripes);
>  			list_add_tail(&sh->lru, &conf->hold_list);
> +			cpu = sh->cpu;
> +			if (!cpu_online(cpu)) {
> +				cpu = cpumask_any(cpu_online_mask);
> +				sh->cpu = cpu;
> +			}
> +			raid5_wakeup_stripe_thread(sh);

Similarly here??
And anywhere  that 'cpu_online_mask' or 'cpu_online' are used.

I'll apply this to my for-next branch so it is easier to test but I won't
promise to submit for 3.6 just yet.


Thanks,
NeilBrown


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-13  4:29 ` NeilBrown
@ 2012-08-13  6:22   ` Shaohua Li
  0 siblings, 0 replies; 29+ messages in thread
From: Shaohua Li @ 2012-08-13  6:22 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, dan.j.williams

On Mon, Aug 13, 2012 at 02:29:47PM +1000, NeilBrown wrote:
> On Thu, 9 Aug 2012 16:58:08 +0800 Shaohua Li <shli@kernel.org> wrote:
> 
> > This is a new tempt to make raid5 handle stripes in multiple threads, as
> > suggested by Neil to have maxium flexibility and better numa binding. It
> > basically is a combination of my first and second generation patches. By
> > default, no multiple thread is enabled (all stripes are handled by raid5d).
> > 
> > An example to enable multiple threads:
> > #echo 3 > /sys/block/md0/md/auxthread_number
> > This will create 3 auxiliary threads to handle stripes. The threads can run
> > on any cpus and handle stripes produced by any cpus.
> > 
> > #echo 1-3 > /sys/block/md0/md/auxth0/cpulist
> > This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
> > stripes produced by cpu 1-3. User tool can further change the thread's
> > affinity, but the thread can only handle stripes produced by cpu 1-3 till the
> > sysfs entry is changed again.
> > 
> > If stripes produced by a CPU aren't handled by any auxiliary thread, such
> > stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
> > stripes.
> > 
> > Signed-off-by: Shaohua Li <shli@fusionio.com>
> > ---
> >  drivers/md/md.c    |    8 -
> >  drivers/md/md.h    |    8 +
> >  drivers/md/raid5.c |  406 ++++++++++++++++++++++++++++++++++++++++++++++++++---
> >  drivers/md/raid5.h |   19 ++
> >  4 files changed, 418 insertions(+), 23 deletions(-)
> > 
> > Index: linux/drivers/md/raid5.c
> > ===================================================================
> > --- linux.orig/drivers/md/raid5.c	2012-08-09 10:43:04.800022626 +0800
> > +++ linux/drivers/md/raid5.c	2012-08-09 16:44:39.663278511 +0800
> > @@ -196,6 +196,21 @@ static int stripe_operations_active(stru
> >  	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
> >  }
> >  
> > +static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
> > +{
> > +	struct r5conf *conf = sh->raid_conf;
> > +	struct raid5_percpu *percpu;
> > +	int i, orphaned = 1;
> > +
> > +	percpu = per_cpu_ptr(conf->percpu, sh->cpu);
> > +	for_each_cpu(i, &percpu->handle_threads) {
> > +		md_wakeup_thread(conf->aux_threads[i]->thread);
> > +		orphaned = 0;
> > +	}
> > +	if (orphaned)
> > +		md_wakeup_thread(conf->mddev->thread);
> > +}
> > +
> >  static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
> >  {
> >  	BUG_ON(!list_empty(&sh->lru));
> > @@ -208,9 +223,19 @@ static void do_release_stripe(struct r5c
> >  			   sh->bm_seq - conf->seq_write > 0)
> >  			list_add_tail(&sh->lru, &conf->bitmap_list);
> >  		else {
> > +			int cpu = sh->cpu;
> > +			struct raid5_percpu *percpu;
> > +			if (!cpu_online(cpu)) {
> > +				cpu = cpumask_any(cpu_online_mask);
> > +				sh->cpu = cpu;
> > +			}
> > +			percpu = per_cpu_ptr(conf->percpu, cpu);
> > +
> >  			clear_bit(STRIPE_DELAYED, &sh->state);
> >  			clear_bit(STRIPE_BIT_DELAY, &sh->state);
> > -			list_add_tail(&sh->lru, &conf->handle_list);
> > +			list_add_tail(&sh->lru, &percpu->handle_list);
> > +			raid5_wakeup_stripe_thread(sh);
> > +			return;
> 
> I confess that I don't know a lot about cpu hotplug, but this looks like it
> should have some locking.  In particular,  "get_online_cpus()" before we
> check "cpu_online()", and "put_online_cpus()" after we have added to the
> per_cpu->handle_list().
> 
> Maybe that isn't needed, but if it isn't  I'd like to understand why.

We already hold conf->device_lock. The same is true in the second place you
mentioned. What I learned is that CPU hot removal can't happen while a spinlock
is held or preemption is disabled, which is how stop_machine() is designed.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-13  0:21   ` Shaohua Li
  2012-08-13  1:06     ` Jianpeng Ma
@ 2012-08-13  9:11     ` Jianpeng Ma
  1 sibling, 0 replies; 29+ messages in thread
From: Jianpeng Ma @ 2012-08-13  9:11 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Neil Brown, Dan Williams

On 2012-08-13 08:21 Shaohua Li <shli@kernel.org> Wrote:
>2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
>> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>>>This is a new tempt to make raid5 handle stripes in multiple threads, as
>>>suggested by Neil to have maxium flexibility and better numa binding. It
[snip]
>> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
>> auxthread_number=0, 200MB/s;
>> auxthread_number=4, 95MB/s.
>
>So multiple threads handle stripes reduce request merge. In your
>workload, raid5d isn't a bottleneck at all. In practice, I thought only
>array which can drive high IOPS needs enable multi thread. And
>if you create multiple threads, better let the threads handle different
>cpus.
If dd writes in buffered mode, the make_request is executed by the kernel
thread "flush-n:0". So we can only use multiple threads effectively when
writing with O_DIRECT or sync to achieve the result we want.
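For example (an illustrative command only), rerunning the same test with direct
I/O submits from the writing CPUs directly and should make any effect of the
aux threads visible:
#dd if=/dev/zero of=/dev/md0 bs=2M oflag=direct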
>
>Thanks,
>Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-13  2:20         ` Shaohua Li
  2012-08-13  2:25           ` Jianpeng Ma
  2012-08-13  4:21           ` NeilBrown
@ 2012-08-14 10:39           ` Jianpeng Ma
  2012-08-15  3:51             ` Shaohua Li
  2 siblings, 1 reply; 29+ messages in thread
From: Jianpeng Ma @ 2012-08-14 10:39 UTC (permalink / raw)
  To: shli; +Cc: linux-raid

On 2012-08-13 10:20 Shaohua Li <shli@kernel.org> Wrote:
>2012/8/13 Shaohua Li <shli@kernel.org>:
>> On Mon, Aug 13, 2012 at 09:06:45AM +0800, Jianpeng Ma wrote:
>>> On 2012-08-13 08:21 Shaohua Li <shli@kernel.org> Wrote:
>>> >2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
>>> >> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>>> >>>This is a new tempt to make raid5 handle stripes in multiple threads, as
>>> >>>suggested by Neil to have maxium flexibility and better numa binding. It
>>> >>>basically is a combination of my first and second generation patches. By
>>> >>>default, no multiple thread is enabled (all stripes are handled by raid5d).
>>> >>>
>>> >>>An example to enable multiple threads:
>>> >>>#echo 3 > /sys/block/md0/md/auxthread_number
>>> >>>This will create 3 auxiliary threads to handle stripes. The threads can run
>>> >>>on any cpus and handle stripes produced by any cpus.
>>> >>>
>>> >>>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>>> >>>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>>> >>>stripes produced by cpu 1-3. User tool can further change the thread's
>>> >>>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>>> >>>sysfs entry is changed again.
>>> >>>
>>> >>>If stripes produced by a CPU aren't handled by any auxiliary thread, such
>>> >>>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
>>> >>>stripes.
>>> >>>
>>> >> I tested and found two problem(maybe not).
>>> >>
>>> >> 1:print cpulist of auxth, you maybe lost print the '\n'.
>>> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>> >> index 7c8151a..3700cdc 100644
>>> >> --- a/drivers/md/raid5.c
>>> >> +++ b/drivers/md/raid5.c
>>> >> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
>>> >>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
>>> >>         struct raid5_auxth *thread, char *page)
>>> >>  {
>>> >> +       int n;
>>> >>         if (!mddev->private)
>>> >>                 return 0;
>>> >> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
>>> >> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
>>> >> +       page[n++] = '\n';
>>> >> +       page[n] = 0;
>>> >> +       return n;
>>> >>  }
>>> >>
>>> >>  static ssize_t
>>> >
>>> >some sysfs entries print out '\n', some not, I don't mind add it
>>> I search kernel code found places which like this print out '\n';
>>> Can you tell rule which use or not?
>>> Thanks!
>>
>> I'm not aware any rule about this
>>
>>> >> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
>>> >> auxthread_number=0, 200MB/s;
>>> >> auxthread_number=4, 95MB/s.
>>> >
>>> >So multiple threads handle stripes reduce request merge. In your
>>> >workload, raid5d isn't a bottleneck at all. In practice, I thought only
>>> >array which can drive high IOPS needs enable multi thread. And
>>> >if you create multiple threads, better let the threads handle different
>>> >cpus.
>>> I will test for multiple threads.
>> Thanks
I used fio for a randwrite test with four threads running on different CPUs.
The bs was 4k/8k/16k.
The result doesn't improve regardless of whether aux threads are used (four aux
threads running on different CPUs) or not.
Maybe my test config has a problem?
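Roughly, the test looked like this (an approximation only; the ioengine and
iodepth values here are illustrative, not the exact job file):
#fio --name=randw --filename=/dev/md0 --direct=1 --rw=randwrite --bs=4k \
    --ioengine=libaio --iodepth=32 --numjobs=4 --cpus_allowed=0-3 --group_reporting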
>
>BTW, can you try below patch for the above dd workload?
>http://git.kernel.dk/?p=linux-block.git;a=commitdiff;h=274193224cdabd687d804a26e0150bb20f2dd52c
>That one is reverted in upstream, but eventually we should make it
>enter again after some CFQ issues are fixed.
I tested this patch and found no problems, but the performance did not increase.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-14 10:39           ` Jianpeng Ma
@ 2012-08-15  3:51             ` Shaohua Li
  2012-08-15  6:21               ` Jianpeng Ma
  0 siblings, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2012-08-15  3:51 UTC (permalink / raw)
  To: Jianpeng Ma, Neil Brown; +Cc: linux-raid

2012/8/14 Jianpeng Ma <majianpeng@gmail.com>:
> On 2012-08-13 10:20 Shaohua Li <shli@kernel.org> Wrote:
>>2012/8/13 Shaohua Li <shli@kernel.org>:
>>> On Mon, Aug 13, 2012 at 09:06:45AM +0800, Jianpeng Ma wrote:
>>>> On 2012-08-13 08:21 Shaohua Li <shli@kernel.org> Wrote:
>>>> >2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
>>>> >> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>>>> >>>This is a new tempt to make raid5 handle stripes in multiple threads, as
>>>> >>>suggested by Neil to have maxium flexibility and better numa binding. It
>>>> >>>basically is a combination of my first and second generation patches. By
>>>> >>>default, no multiple thread is enabled (all stripes are handled by raid5d).
>>>> >>>
>>>> >>>An example to enable multiple threads:
>>>> >>>#echo 3 > /sys/block/md0/md/auxthread_number
>>>> >>>This will create 3 auxiliary threads to handle stripes. The threads can run
>>>> >>>on any cpus and handle stripes produced by any cpus.
>>>> >>>
>>>> >>>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>>>> >>>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>>>> >>>stripes produced by cpu 1-3. User tool can further change the thread's
>>>> >>>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>>>> >>>sysfs entry is changed again.
>>>> >>>
>>>> >>>If stripes produced by a CPU aren't handled by any auxiliary thread, such
>>>> >>>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
>>>> >>>stripes.
>>>> >>>
>>>> >> I tested and found two problem(maybe not).
>>>> >>
>>>> >> 1:print cpulist of auxth, you maybe lost print the '\n'.
>>>> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>> >> index 7c8151a..3700cdc 100644
>>>> >> --- a/drivers/md/raid5.c
>>>> >> +++ b/drivers/md/raid5.c
>>>> >> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
>>>> >>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
>>>> >>         struct raid5_auxth *thread, char *page)
>>>> >>  {
>>>> >> +       int n;
>>>> >>         if (!mddev->private)
>>>> >>                 return 0;
>>>> >> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
>>>> >> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
>>>> >> +       page[n++] = '\n';
>>>> >> +       page[n] = 0;
>>>> >> +       return n;
>>>> >>  }
>>>> >>
>>>> >>  static ssize_t
>>>> >
>>>> >some sysfs entries print out '\n', some not, I don't mind add it
>>>> I search kernel code found places which like this print out '\n';
>>>> Can you tell rule which use or not?
>>>> Thanks!
>>>
>>> I'm not aware any rule about this
>>>
>>>> >> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
>>>> >> auxthread_number=0, 200MB/s;
>>>> >> auxthread_number=4, 95MB/s.
>>>> >
>>>> >So multiple threads handle stripes reduce request merge. In your
>>>> >workload, raid5d isn't a bottleneck at all. In practice, I thought only
>>>> >array which can drive high IOPS needs enable multi thread. And
>>>> >if you create multiple threads, better let the threads handle different
>>>> >cpus.
>>>> I will test for multiple threads.
>>> Thanks
> I used fio for randwrite test using four thread which run different cpus.
> The bs is 4k/8k/16k.
> The result isn't increase regardless of whether using authread(four authread which run different cpu) or not?
> Maybe my test config had problem?

How fast is your raid? If your raid can't drive high IOPS, it's
not surprising that multithreading makes no difference.

>>BTW, can you try below patch for the above dd workload?
>>http://git.kernel.dk/?p=linux-block.git;a=commitdiff;h=274193224cdabd687d804a26e0150bb20f2dd52c
>>That one is reverted in upstream, but eventually we should make it
>>enter again after some CFQ issues are fixed.
> I tested this patch.And not found problem.And the performance did not increase.

OK, each thread delivers requests at random times, so merging doesn't
work even with that patch. I didn't worry too much about large requests,
since setting the correct affinity for the aux threads should make the
issue go away. And multithreading is meant for fast storage; I suppose it
has no advantage for hard-disk raid. On the other hand, maybe we can
make MAX_STRIPE_BATCH bigger. Currently it's 8, so an aux thread
will dispatch an 8*4k request for this workload. Changing it to 16
(16*4k = 64k) should be good enough even for hard-disk raid.
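
As a rough illustration (untested, and the exact value would want benchmarking),
that would just be a one-line change to the existing define in drivers/md/raid5.c:

#define MAX_STRIPE_BATCH	16	/* was 8; 16 * 4k = 64k per dispatch for this workload */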

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-15  3:51             ` Shaohua Li
@ 2012-08-15  6:21               ` Jianpeng Ma
  2012-08-15  8:04                 ` Shaohua Li
  0 siblings, 1 reply; 29+ messages in thread
From: Jianpeng Ma @ 2012-08-15  6:21 UTC (permalink / raw)
  To: shli, Neil Brown; +Cc: linux-raid

On 2012-08-15 11:51 Shaohua Li <shli@kernel.org> Wrote:
>2012/8/14 Jianpeng Ma <majianpeng@gmail.com>:
>> On 2012-08-13 10:20 Shaohua Li <shli@kernel.org> Wrote:
>>>2012/8/13 Shaohua Li <shli@kernel.org>:
>>>> On Mon, Aug 13, 2012 at 09:06:45AM +0800, Jianpeng Ma wrote:
>>>>> On 2012-08-13 08:21 Shaohua Li <shli@kernel.org> Wrote:
>>>>> >2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
>>>>> >> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>>>>> >>>This is a new tempt to make raid5 handle stripes in multiple threads, as
>>>>> >>>suggested by Neil to have maxium flexibility and better numa binding. It
>>>>> >>>basically is a combination of my first and second generation patches. By
>>>>> >>>default, no multiple thread is enabled (all stripes are handled by raid5d).
>>>>> >>>
>>>>> >>>An example to enable multiple threads:
>>>>> >>>#echo 3 > /sys/block/md0/md/auxthread_number
>>>>> >>>This will create 3 auxiliary threads to handle stripes. The threads can run
>>>>> >>>on any cpus and handle stripes produced by any cpus.
>>>>> >>>
>>>>> >>>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>>>>> >>>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>>>>> >>>stripes produced by cpu 1-3. User tool can further change the thread's
>>>>> >>>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>>>>> >>>sysfs entry is changed again.
>>>>> >>>
>>>>> >>>If stripes produced by a CPU aren't handled by any auxiliary thread, such
>>>>> >>>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
>>>>> >>>stripes.
>>>>> >>>
>>>>> >> I tested and found two problem(maybe not).
>>>>> >>
>>>>> >> 1:print cpulist of auxth, you maybe lost print the '\n'.
>>>>> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>>> >> index 7c8151a..3700cdc 100644
>>>>> >> --- a/drivers/md/raid5.c
>>>>> >> +++ b/drivers/md/raid5.c
>>>>> >> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
>>>>> >>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
>>>>> >>         struct raid5_auxth *thread, char *page)
>>>>> >>  {
>>>>> >> +       int n;
>>>>> >>         if (!mddev->private)
>>>>> >>                 return 0;
>>>>> >> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
>>>>> >> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
>>>>> >> +       page[n++] = '\n';
>>>>> >> +       page[n] = 0;
>>>>> >> +       return n;
>>>>> >>  }
>>>>> >>
>>>>> >>  static ssize_t
>>>>> >
>>>>> >some sysfs entries print out '\n', some not, I don't mind add it
>>>>> I search kernel code found places which like this print out '\n';
>>>>> Can you tell rule which use or not?
>>>>> Thanks!
>>>>
>>>> I'm not aware any rule about this
>>>>
>>>>> >> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
>>>>> >> auxthread_number=0, 200MB/s;
>>>>> >> auxthread_number=4, 95MB/s.
>>>>> >
>>>>> >So multiple threads handle stripes reduce request merge. In your
>>>>> >workload, raid5d isn't a bottleneck at all. In practice, I thought only
>>>>> >array which can drive high IOPS needs enable multi thread. And
>>>>> >if you create multiple threads, better let the threads handle different
>>>>> >cpus.
>>>>> I will test for multiple threads.
>>>> Thanks
>> I used fio for randwrite test using four thread which run different cpus.
>> The bs is 4k/8k/16k.
>> The result isn't increase regardless of whether using authread(four authread which run different cpu) or not?
>> Maybe my test config had problem?
>
>how fast is your raid? If your raid can't drive high IOPS, it's
>not strange multithread makes no difference.
>
Only 175 IOPS for 4K. I think your patch has no effect for hard-disk raid.
Maybe it's only useful for SSDs.
>>>BTW, can you try below patch for the above dd workload?
>>>http://git.kernel.dk/?p=linux-block.git;a=commitdiff;h=274193224cdabd687d804a26e0150bb20f2dd52c
>>>That one is reverted in upstream, but eventually we should make it
>>>enter again after some CFQ issues are fixed.
>> I tested this patch.And not found problem.And the performance did not increase.
>
>Ok, each thread delivers request in random time, so merge doesn't
>work even with that patch. I didn't worry about big size request too
>much, since if you set correct affinity for the auxthread, the issue
>should go away. And mulithread is for fast storage, I suppose it has
>no advantages for harddisk raid. On the other hand, maybe we can
>make MAX_STRIPE_BATCH bigger. Currently it's 8, so the auxthread
>will dispatch 8*4k request for the workload. Changing it to 16
>(16*4=64k) should be good enough even for hard disk raid.
>
I reviewed your code and have a question about how the aux threads are woken up:
>static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
>{
>       struct r5conf *conf = sh->raid_conf;
>       struct raid5_percpu *percpu;
>       int i, orphaned = 1;
>
>       percpu = per_cpu_ptr(conf->percpu, sh->cpu);
>       for_each_cpu(i, &percpu->handle_threads) {
>               md_wakeup_thread(conf->aux_threads[i]->thread);
>               orphaned = 0;
>       }
Suppose there are only a few stripes queued for cpu0, and both auxthread0 and auxthread1 are allowed to run on cpu0.
It isn't necessary to wake up every thread: auxthread0 may handle all the stripes, while auxthread1 merely wakes up and goes back to sleep, yet it still takes spin_lock_irq(&conf->device_lock).
I think you should add some limit here.

BTW, in my workload I hit a merge problem similar to what this patch addresses. At first I wanted to add front-merging (why is there only back-merging?).
But I read your patch and it's a better idea than mine.
Later, I read the mailing-list discussion about reverting your patch.
If I use the existing merge path in blk_queue_bio():
>if (el_ret == ELEVATOR_BACK_MERGE) {
>		if (bio_attempt_back_merge(q, req, bio)) {
>			elv_bio_merged(q, req, bio);
>			if (!attempt_back_merge(q, req))
>				elv_merged_request(q, req, el_ret);
>			goto out_unlock;
>		}
>	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
>		if (bio_attempt_front_merge(q, req, bio)) {
>			elv_bio_merged(q, req, bio);
>			if (!attempt_front_merge(q, req))
>				elv_merged_request(q, req, el_ret);
>			goto out_unlock;
>		}
The result is not as good as with your patch, but it is correct.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-15  6:21               ` Jianpeng Ma
@ 2012-08-15  8:04                 ` Shaohua Li
  2012-08-15  8:19                   ` Jianpeng Ma
  2012-09-24 11:15                   ` Jianpeng Ma
  0 siblings, 2 replies; 29+ messages in thread
From: Shaohua Li @ 2012-08-15  8:04 UTC (permalink / raw)
  To: Jianpeng Ma; +Cc: Neil Brown, linux-raid

2012/8/15 Jianpeng Ma <majianpeng@gmail.com>:
> On 2012-08-15 11:51 Shaohua Li <shli@kernel.org> Wrote:
>>2012/8/14 Jianpeng Ma <majianpeng@gmail.com>:
>>> On 2012-08-13 10:20 Shaohua Li <shli@kernel.org> Wrote:
>>>>2012/8/13 Shaohua Li <shli@kernel.org>:
>>>>> On Mon, Aug 13, 2012 at 09:06:45AM +0800, Jianpeng Ma wrote:
>>>>>> On 2012-08-13 08:21 Shaohua Li <shli@kernel.org> Wrote:
>>>>>> >2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
>>>>>> >> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>>>>>> >>>This is a new tempt to make raid5 handle stripes in multiple threads, as
>>>>>> >>>suggested by Neil to have maxium flexibility and better numa binding. It
>>>>>> >>>basically is a combination of my first and second generation patches. By
>>>>>> >>>default, no multiple thread is enabled (all stripes are handled by raid5d).
>>>>>> >>>
>>>>>> >>>An example to enable multiple threads:
>>>>>> >>>#echo 3 > /sys/block/md0/md/auxthread_number
>>>>>> >>>This will create 3 auxiliary threads to handle stripes. The threads can run
>>>>>> >>>on any cpus and handle stripes produced by any cpus.
>>>>>> >>>
>>>>>> >>>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>>>>>> >>>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>>>>>> >>>stripes produced by cpu 1-3. User tool can further change the thread's
>>>>>> >>>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>>>>>> >>>sysfs entry is changed again.
>>>>>> >>>
>>>>>> >>>If stripes produced by a CPU aren't handled by any auxiliary thread, such
>>>>>> >>>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
>>>>>> >>>stripes.
>>>>>> >>>
>>>>>> >> I tested and found two problem(maybe not).
>>>>>> >>
>>>>>> >> 1:print cpulist of auxth, you maybe lost print the '\n'.
>>>>>> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>>>> >> index 7c8151a..3700cdc 100644
>>>>>> >> --- a/drivers/md/raid5.c
>>>>>> >> +++ b/drivers/md/raid5.c
>>>>>> >> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
>>>>>> >>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
>>>>>> >>         struct raid5_auxth *thread, char *page)
>>>>>> >>  {
>>>>>> >> +       int n;
>>>>>> >>         if (!mddev->private)
>>>>>> >>                 return 0;
>>>>>> >> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
>>>>>> >> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
>>>>>> >> +       page[n++] = '\n';
>>>>>> >> +       page[n] = 0;
>>>>>> >> +       return n;
>>>>>> >>  }
>>>>>> >>
>>>>>> >>  static ssize_t
>>>>>> >
>>>>>> >some sysfs entries print out '\n', some not, I don't mind add it
>>>>>> I search kernel code found places which like this print out '\n';
>>>>>> Can you tell rule which use or not?
>>>>>> Thanks!
>>>>>
>>>>> I'm not aware any rule about this
>>>>>
>>>>>> >> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
>>>>>> >> auxthread_number=0, 200MB/s;
>>>>>> >> auxthread_number=4, 95MB/s.
>>>>>> >
>>>>>> >So multiple threads handle stripes reduce request merge. In your
>>>>>> >workload, raid5d isn't a bottleneck at all. In practice, I thought only
>>>>>> >array which can drive high IOPS needs enable multi thread. And
>>>>>> >if you create multiple threads, better let the threads handle different
>>>>>> >cpus.
>>>>>> I will test for multiple threads.
>>>>> Thanks
>>> I used fio for randwrite test using four thread which run different cpus.
>>> The bs is 4k/8k/16k.
>>> The result isn't increase regardless of whether using authread(four authread which run different cpu) or not?
>>> Maybe my test config had problem?
>>
>>how fast is your raid? If your raid can't drive high IOPS, it's
>>not strange multithread makes no difference.
>>
> Only 175 for 4K. I think your patch for harddisk dose not effect.
> Maybe it's only for ssd.
>>>>BTW, can you try below patch for the above dd workload?
>>>>http://git.kernel.dk/?p=linux-block.git;a=commitdiff;h=274193224cdabd687d804a26e0150bb20f2dd52c
>>>>That one is reverted in upstream, but eventually we should make it
>>>>enter again after some CFQ issues are fixed.
>>> I tested this patch.And not found problem.And the performance did not increase.
>>
>>Ok, each thread delivers request in random time, so merge doesn't
>>work even with that patch. I didn't worry about big size request too
>>much, since if you set correct affinity for the auxthread, the issue
>>should go away. And mulithread is for fast storage, I suppose it has
>>no advantages for harddisk raid. On the other hand, maybe we can
>>make MAX_STRIPE_BATCH bigger. Currently it's 8, so the auxthread
>>will dispatch 8*4k request for the workload. Changing it to 16
>>(16*4=64k) should be good enough even for hard disk raid.
>>
> I review your code and have a question about wakeup authread:
>>static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
>>{
>>       struct r5conf *conf = sh->raid_conf;
>>       struct raid5_percpu *percpu;
>>       int i, orphaned = 1;
>>
>>       percpu = per_cpu_ptr(conf->percpu, sh->cpu);
>>       for_each_cpu(i, &percpu->handle_threads) {
>>               md_wakeup_thread(conf->aux_threads[i]->thread);
>>               orphaned = 0;
>>       }
> If there are small stripes in handle_threads of cpu0.But the authread0/1 can run cpu0.
> It's no necessary to wakup all thread.authread0 may exec all stripe,but the authread1 only wakeup and sleep,but it will spin_lock_irq(&conf->device_lock).
> I think you should add some limited to do .

I used to have a counter of queued stripes; with it, we could determine
how many threads should be woken up. That is what I did when each
thread could handle stripes from any CPU. I don't think the problem
is severe now, since each thread only handles stripes from one or
a few CPUs. If this really turns out to be a problem, we can fix it later,
but my tests don't show that it is.
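
For reference, a rough sketch of that counter idea (percpu->queued_stripes here
is hypothetical and not part of the posted patch; the accounting would have to be
added wherever stripes are queued to and taken off the per-cpu handle_list):

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, sh->cpu);
	int i, to_wake;

	if (cpumask_empty(&percpu->handle_threads)) {
		/* no aux thread covers this cpu, fall back to raid5d */
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	/* wake roughly one aux thread per batch of queued stripes */
	to_wake = DIV_ROUND_UP(atomic_read(&percpu->queued_stripes),
			       MAX_STRIPE_BATCH);
	for_each_cpu(i, &percpu->handle_threads) {
		if (to_wake-- <= 0)
			break;
		md_wakeup_thread(conf->aux_threads[i]->thread);
	}
}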

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-15  8:04                 ` Shaohua Li
@ 2012-08-15  8:19                   ` Jianpeng Ma
  2012-09-24 11:15                   ` Jianpeng Ma
  1 sibling, 0 replies; 29+ messages in thread
From: Jianpeng Ma @ 2012-08-15  8:19 UTC (permalink / raw)
  To: shli; +Cc: Neil Brown, linux-raid

On 2012-08-15 16:04 Shaohua Li <shli@kernel.org> Wrote:
>2012/8/15 Jianpeng Ma <majianpeng@gmail.com>:
>> On 2012-08-15 11:51 Shaohua Li <shli@kernel.org> Wrote:
>>>2012/8/14 Jianpeng Ma <majianpeng@gmail.com>:
>>>> On 2012-08-13 10:20 Shaohua Li <shli@kernel.org> Wrote:
>>>>>2012/8/13 Shaohua Li <shli@kernel.org>:
>>>>>> On Mon, Aug 13, 2012 at 09:06:45AM +0800, Jianpeng Ma wrote:
>>>>>>> On 2012-08-13 08:21 Shaohua Li <shli@kernel.org> Wrote:
>>>>>>> >2012/8/11 Jianpeng Ma <majianpeng@gmail.com>:
>>>>>>> >> On 2012-08-09 16:58 Shaohua Li <shli@kernel.org> Wrote:
>>>>>>> >>>This is a new tempt to make raid5 handle stripes in multiple threads, as
>>>>>>> >>>suggested by Neil to have maxium flexibility and better numa binding. It
>>>>>>> >>>basically is a combination of my first and second generation patches. By
>>>>>>> >>>default, no multiple thread is enabled (all stripes are handled by raid5d).
>>>>>>> >>>
>>>>>>> >>>An example to enable multiple threads:
>>>>>>> >>>#echo 3 > /sys/block/md0/md/auxthread_number
>>>>>>> >>>This will create 3 auxiliary threads to handle stripes. The threads can run
>>>>>>> >>>on any cpus and handle stripes produced by any cpus.
>>>>>>> >>>
>>>>>>> >>>#echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>>>>>>> >>>This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>>>>>>> >>>stripes produced by cpu 1-3. User tool can further change the thread's
>>>>>>> >>>affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>>>>>>> >>>sysfs entry is changed again.
>>>>>>> >>>
>>>>>>> >>>If stripes produced by a CPU aren't handled by any auxiliary thread, such
>>>>>>> >>>stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
>>>>>>> >>>stripes.
>>>>>>> >>>
>>>>>>> >> I tested and found two problem(maybe not).
>>>>>>> >>
>>>>>>> >> 1:print cpulist of auxth, you maybe lost print the '\n'.
>>>>>>> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>>>>> >> index 7c8151a..3700cdc 100644
>>>>>>> >> --- a/drivers/md/raid5.c
>>>>>>> >> +++ b/drivers/md/raid5.c
>>>>>>> >> @@ -4911,9 +4911,13 @@ struct raid5_auxth_sysfs {
>>>>>>> >>  static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
>>>>>>> >>         struct raid5_auxth *thread, char *page)
>>>>>>> >>  {
>>>>>>> >> +       int n;
>>>>>>> >>         if (!mddev->private)
>>>>>>> >>                 return 0;
>>>>>>> >> -       return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
>>>>>>> >> +       n = cpulist_scnprintf(page, PAGE_SIZE - 2, &thread->work_mask);
>>>>>>> >> +       page[n++] = '\n';
>>>>>>> >> +       page[n] = 0;
>>>>>>> >> +       return n;
>>>>>>> >>  }
>>>>>>> >>
>>>>>>> >>  static ssize_t
>>>>>>> >
>>>>>>> >some sysfs entries print out '\n', some not, I don't mind add it
>>>>>>> I search kernel code found places which like this print out '\n';
>>>>>>> Can you tell rule which use or not?
>>>>>>> Thanks!
>>>>>>
>>>>>> I'm not aware any rule about this
>>>>>>
>>>>>>> >> 2: Test 'dd if=/dev/zero of=/dev/md0 bs=2M ', the performance regress remarkable.
>>>>>>> >> auxthread_number=0, 200MB/s;
>>>>>>> >> auxthread_number=4, 95MB/s.
>>>>>>> >
>>>>>>> >So multiple threads handle stripes reduce request merge. In your
>>>>>>> >workload, raid5d isn't a bottleneck at all. In practice, I thought only
>>>>>>> >array which can drive high IOPS needs enable multi thread. And
>>>>>>> >if you create multiple threads, better let the threads handle different
>>>>>>> >cpus.
>>>>>>> I will test for multiple threads.
>>>>>> Thanks
>>>> I used fio for randwrite test using four thread which run different cpus.
>>>> The bs is 4k/8k/16k.
>>>> The result isn't increase regardless of whether using authread(four authread which run different cpu) or not?
>>>> Maybe my test config had problem?
>>>
>>>how fast is your raid? If your raid can't drive high IOPS, it's
>>>not strange multithread makes no difference.
>>>
>> Only 175 for 4K. I think your patch for harddisk dose not effect.
>> Maybe it's only for ssd.
>>>>>BTW, can you try below patch for the above dd workload?
>>>>>http://git.kernel.dk/?p=linux-block.git;a=commitdiff;h=274193224cdabd687d804a26e0150bb20f2dd52c
>>>>>That one is reverted in upstream, but eventually we should make it
>>>>>enter again after some CFQ issues are fixed.
>>>> I tested this patch.And not found problem.And the performance did not increase.
>>>
>>>Ok, each thread delivers request in random time, so merge doesn't
>>>work even with that patch. I didn't worry about big size request too
>>>much, since if you set correct affinity for the auxthread, the issue
>>>should go away. And mulithread is for fast storage, I suppose it has
>>>no advantages for harddisk raid. On the other hand, maybe we can
>>>make MAX_STRIPE_BATCH bigger. Currently it's 8, so the auxthread
>>>will dispatch 8*4k request for the workload. Changing it to 16
>>>(16*4=64k) should be good enough even for hard disk raid.
>>>
>> I review your code and have a question about wakeup authread:
>>>static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
>>>{
>>>       struct r5conf *conf = sh->raid_conf;
>>>       struct raid5_percpu *percpu;
>>>       int i, orphaned = 1;
>>>
>>>       percpu = per_cpu_ptr(conf->percpu, sh->cpu);
>>>       for_each_cpu(i, &percpu->handle_threads) {
>>>               md_wakeup_thread(conf->aux_threads[i]->thread);
>>>               orphaned = 0;
>>>       }
>> If there are small stripes in handle_threads of cpu0.But the authread0/1 can run cpu0.
>> It's no necessary to wakup all thread.authread0 may exec all stripe,but the authread1 only wakeup and sleep,but it will spin_lock_irq(&conf->device_lock).
>> I think you should add some limited to do .
>
>I used to have a counter for stripes queued, with it, we can determine
>how many threads should be waken up. That is what I did when each
>each thread can handle stripes from any CPU. I thought this problem
>isn't sever now since each thread can just handle strips from one or
>several CPUs. If this really is a problem, we can fix it later, but my test
>doesn't show this is a problem.
Yes, you provide the interface for the user.
Is there any debug info the user can look at to decide how to tune this?
Because I don't have an SSD device, I can't test this patch any further.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-15  8:04                 ` Shaohua Li
  2012-08-15  8:19                   ` Jianpeng Ma
@ 2012-09-24 11:15                   ` Jianpeng Ma
  2012-09-26  1:26                     ` NeilBrown
  1 sibling, 1 reply; 29+ messages in thread
From: Jianpeng Ma @ 2012-09-24 11:15 UTC (permalink / raw)
  To: shli; +Cc: Neil Brown, linux-raid

Sorry for using this title to send this.
This code may be racy, so it should be corrected (it can also be wrong without the multi-thread feature enabled, just not as obviously).
In handle_parity_checks5():
>>conf->mddev->resync_mismatches += STRIPE_SECTORS;
If multiple threads handle stripes on different cpus, the updates to resync_mismatches race with each other: the += is a non-atomic read-modify-write, so concurrent increments can be lost.
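
As a minimal illustration of the race (not taken from the driver, just what the +=
expands to), two threads can interleave the read-modify-write and lose an increment:

	sector_t tmp = conf->mddev->resync_mismatches;	/* both threads read the old value */
	tmp += STRIPE_SECTORS;				/* each adds in its private copy */
	conf->mddev->resync_mismatches = tmp;		/* both store; one increment is lost */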

So the patch is:

Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid races

Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
---
 drivers/md/md.c     |    7 ++++---
 drivers/md/md.h     |    2 +-
 drivers/md/raid1.c  |    2 +-
 drivers/md/raid10.c |    2 +-
 drivers/md/raid5.c  |    4 ++--
 5 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 308e87b..b0b9474 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4278,7 +4278,8 @@ static ssize_t
 mismatch_cnt_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%llu\n",
-		       (unsigned long long) mddev->resync_mismatches);
+		       (unsigned long long)
+		       atomic64_read(&mddev->resync_mismatches));
 }
 
 static struct md_sysfs_entry md_scan_mode =
@@ -5245,7 +5246,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->new_layout = 0;
 	mddev->new_chunk_sectors = 0;
 	mddev->curr_resync = 0;
-	mddev->resync_mismatches = 0;
+	atomic64_set(&mddev->resync_mismatches, 0);
 	mddev->suspend_lo = mddev->suspend_hi = 0;
 	mddev->sync_speed_min = mddev->sync_speed_max = 0;
 	mddev->recovery = 0;
@@ -7349,7 +7350,7 @@ void md_do_sync(struct mddev *mddev)
 		 * which defaults to physical size, but can be virtual size
 		 */
 		max_sectors = mddev->resync_max_sectors;
-		mddev->resync_mismatches = 0;
+		atomic64_set(&mddev->resync_mismatches, 0);
 		/* we don't use the checkpoint if there's a bitmap */
 		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
 			j = mddev->resync_min;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f385b03..8f341d8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -282,7 +282,7 @@ struct mddev {
 
 	sector_t			resync_max_sectors; /* may be set by personality */
 
-	sector_t			resync_mismatches; /* count of sectors where
+	atomic64_t			resync_mismatches; /* count of sectors where
 							    * parity/replica mismatch found
 							    */
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 611b5f7..ebceb98 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1867,7 +1867,7 @@ static int process_checks(struct r1bio *r1_bio)
 		} else
 			j = 0;
 		if (j >= 0)
-			mddev->resync_mismatches += r1_bio->sectors;
+			atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
 		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
 			      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
 			/* No need to write to this device. */
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1c2eb38..ad0a8b7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1950,7 +1950,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 					break;
 			if (j == vcnt)
 				continue;
-			mddev->resync_mismatches += r10_bio->sectors;
+			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
 			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
 				/* Don't fix anything. */
 				continue;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7031b86..0a2702f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2931,7 +2931,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
 			 */
 			set_bit(STRIPE_INSYNC, &sh->state);
 		else {
-			conf->mddev->resync_mismatches += STRIPE_SECTORS;
+			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
 			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
 				/* don't try to repair!! */
 				set_bit(STRIPE_INSYNC, &sh->state);
@@ -3083,7 +3083,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
 				 */
 			}
 		} else {
-			conf->mddev->resync_mismatches += STRIPE_SECTORS;
+			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
 			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
 				/* don't try to repair!! */
 				set_bit(STRIPE_INSYNC, &sh->state);
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-09-24 11:15                   ` Jianpeng Ma
@ 2012-09-26  1:26                     ` NeilBrown
  0 siblings, 0 replies; 29+ messages in thread
From: NeilBrown @ 2012-09-26  1:26 UTC (permalink / raw)
  To: Jianpeng Ma; +Cc: shli, linux-raid

On Mon, 24 Sep 2012 19:15:12 +0800 "Jianpeng Ma" <majianpeng@gmail.com> wrote:

> Sorry for using this title to send this.

It isn't hard to change the Subject line for the Email, is it?

> This code maybe error,so it should correct(If multiple-thread dosen't enable,it maybe also error,but no obviously).
> In func handle_parity_checks5:
> >>conf->mddev->resync_mismatches += STRIPE_SECTORS;
> If multi-thread handle stripe on different cpu, there will be raced on resync_mismatches.
> 
> So the patch is:
> 
> Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid races

I doubt this is really necessary, as resync_mismatches doesn't need to be
precise.
But it doesn't really hurt either, so I've applied it.

Thanks,
NeilBrown



> 
> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
> ---
>  drivers/md/md.c     |    7 ++++---
>  drivers/md/md.h     |    2 +-
>  drivers/md/raid1.c  |    2 +-
>  drivers/md/raid10.c |    2 +-
>  drivers/md/raid5.c  |    4 ++--
>  5 files changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 308e87b..b0b9474 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -4278,7 +4278,8 @@ static ssize_t
>  mismatch_cnt_show(struct mddev *mddev, char *page)
>  {
>  	return sprintf(page, "%llu\n",
> -		       (unsigned long long) mddev->resync_mismatches);
> +		       (unsigned long long)
> +		       atomic64_read(&mddev->resync_mismatches));
>  }
>  
>  static struct md_sysfs_entry md_scan_mode =
> @@ -5245,7 +5246,7 @@ static void md_clean(struct mddev *mddev)
>  	mddev->new_layout = 0;
>  	mddev->new_chunk_sectors = 0;
>  	mddev->curr_resync = 0;
> -	mddev->resync_mismatches = 0;
> +	atomic64_set(&mddev->resync_mismatches, 0);
>  	mddev->suspend_lo = mddev->suspend_hi = 0;
>  	mddev->sync_speed_min = mddev->sync_speed_max = 0;
>  	mddev->recovery = 0;
> @@ -7349,7 +7350,7 @@ void md_do_sync(struct mddev *mddev)
>  		 * which defaults to physical size, but can be virtual size
>  		 */
>  		max_sectors = mddev->resync_max_sectors;
> -		mddev->resync_mismatches = 0;
> +		atomic64_set(&mddev->resync_mismatches, 0);
>  		/* we don't use the checkpoint if there's a bitmap */
>  		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
>  			j = mddev->resync_min;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index f385b03..8f341d8 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -282,7 +282,7 @@ struct mddev {
>  
>  	sector_t			resync_max_sectors; /* may be set by personality */
>  
> -	sector_t			resync_mismatches; /* count of sectors where
> +	atomic64_t			resync_mismatches; /* count of sectors where
>  							    * parity/replica mismatch found
>  							    */
>  
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 611b5f7..ebceb98 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1867,7 +1867,7 @@ static int process_checks(struct r1bio *r1_bio)
>  		} else
>  			j = 0;
>  		if (j >= 0)
> -			mddev->resync_mismatches += r1_bio->sectors;
> +			atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
>  		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
>  			      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
>  			/* No need to write to this device. */
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 1c2eb38..ad0a8b7 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1950,7 +1950,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
>  					break;
>  			if (j == vcnt)
>  				continue;
> -			mddev->resync_mismatches += r10_bio->sectors;
> +			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
>  			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
>  				/* Don't fix anything. */
>  				continue;
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 7031b86..0a2702f 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -2931,7 +2931,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
>  			 */
>  			set_bit(STRIPE_INSYNC, &sh->state);
>  		else {
> -			conf->mddev->resync_mismatches += STRIPE_SECTORS;
> +			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
>  			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
>  				/* don't try to repair!! */
>  				set_bit(STRIPE_INSYNC, &sh->state);
> @@ -3083,7 +3083,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
>  				 */
>  			}
>  		} else {
> -			conf->mddev->resync_mismatches += STRIPE_SECTORS;
> +			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
>  			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
>  				/* don't try to repair!! */
>  				set_bit(STRIPE_INSYNC, &sh->state);


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2012-08-09  8:58 [patch 2/2 v3]raid5: create multiple threads to handle stripes Shaohua Li
  2012-08-11  8:45 ` Jianpeng Ma
  2012-08-13  4:29 ` NeilBrown
@ 2013-03-07  7:31 ` Shaohua Li
  2013-03-12  1:39   ` NeilBrown
  2 siblings, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2013-03-07  7:31 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, dan.j.williams

Hi Neil,

I just noticed this one was missed. It used to be in your branch, but got
dropped at some point. Is there anything I missed? I can resend it if you like.

Thanks,
Shaohua

On Thu, Aug 09, 2012 at 04:58:08PM +0800, Shaohua Li wrote:
> This is a new tempt to make raid5 handle stripes in multiple threads, as
> suggested by Neil to have maxium flexibility and better numa binding. It
> basically is a combination of my first and second generation patches. By
> default, no multiple thread is enabled (all stripes are handled by raid5d).
> 
> An example to enable multiple threads:
> #echo 3 > /sys/block/md0/md/auxthread_number
> This will create 3 auxiliary threads to handle stripes. The threads can run
> on any cpus and handle stripes produced by any cpus.
> 
> #echo 1-3 > /sys/block/md0/md/auxth0/cpulist
> This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
> stripes produced by cpu 1-3. User tool can further change the thread's
> affinity, but the thread can only handle stripes produced by cpu 1-3 till the
> sysfs entry is changed again.
> 
> If stripes produced by a CPU aren't handled by any auxiliary thread, such
> stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
> stripes.
> 
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
>  drivers/md/md.c    |    8 -
>  drivers/md/md.h    |    8 +
>  drivers/md/raid5.c |  406 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  drivers/md/raid5.h |   19 ++
>  4 files changed, 418 insertions(+), 23 deletions(-)
> 
> Index: linux/drivers/md/raid5.c
> ===================================================================
> --- linux.orig/drivers/md/raid5.c	2012-08-09 10:43:04.800022626 +0800
> +++ linux/drivers/md/raid5.c	2012-08-09 16:44:39.663278511 +0800
> @@ -196,6 +196,21 @@ static int stripe_operations_active(stru
>  	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
>  }
>  
> +static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
> +{
> +	struct r5conf *conf = sh->raid_conf;
> +	struct raid5_percpu *percpu;
> +	int i, orphaned = 1;
> +
> +	percpu = per_cpu_ptr(conf->percpu, sh->cpu);
> +	for_each_cpu(i, &percpu->handle_threads) {
> +		md_wakeup_thread(conf->aux_threads[i]->thread);
> +		orphaned = 0;
> +	}
> +	if (orphaned)
> +		md_wakeup_thread(conf->mddev->thread);
> +}
> +
>  static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
>  {
>  	BUG_ON(!list_empty(&sh->lru));
> @@ -208,9 +223,19 @@ static void do_release_stripe(struct r5c
>  			   sh->bm_seq - conf->seq_write > 0)
>  			list_add_tail(&sh->lru, &conf->bitmap_list);
>  		else {
> +			int cpu = sh->cpu;
> +			struct raid5_percpu *percpu;
> +			if (!cpu_online(cpu)) {
> +				cpu = cpumask_any(cpu_online_mask);
> +				sh->cpu = cpu;
> +			}
> +			percpu = per_cpu_ptr(conf->percpu, cpu);
> +
>  			clear_bit(STRIPE_DELAYED, &sh->state);
>  			clear_bit(STRIPE_BIT_DELAY, &sh->state);
> -			list_add_tail(&sh->lru, &conf->handle_list);
> +			list_add_tail(&sh->lru, &percpu->handle_list);
> +			raid5_wakeup_stripe_thread(sh);
> +			return;
>  		}
>  		md_wakeup_thread(conf->mddev->thread);
>  	} else {
> @@ -355,6 +380,7 @@ static void init_stripe(struct stripe_he
>  		raid5_build_block(sh, i, previous);
>  	}
>  	insert_hash(conf, sh);
> +	sh->cpu = smp_processor_id();
>  }
>  
>  static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
> @@ -3689,12 +3715,19 @@ static void raid5_activate_delayed(struc
>  		while (!list_empty(&conf->delayed_list)) {
>  			struct list_head *l = conf->delayed_list.next;
>  			struct stripe_head *sh;
> +			int cpu;
>  			sh = list_entry(l, struct stripe_head, lru);
>  			list_del_init(l);
>  			clear_bit(STRIPE_DELAYED, &sh->state);
>  			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
>  				atomic_inc(&conf->preread_active_stripes);
>  			list_add_tail(&sh->lru, &conf->hold_list);
> +			cpu = sh->cpu;
> +			if (!cpu_online(cpu)) {
> +				cpu = cpumask_any(cpu_online_mask);
> +				sh->cpu = cpu;
> +			}
> +			raid5_wakeup_stripe_thread(sh);
>  		}
>  	}
>  }
> @@ -3968,18 +4001,29 @@ static int chunk_aligned_read(struct mdd
>   * head of the hold_list has changed, i.e. the head was promoted to the
>   * handle_list.
>   */
> -static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
> +static struct stripe_head *__get_priority_stripe(struct r5conf *conf,
> +	cpumask_t *mask)
>  {
> -	struct stripe_head *sh;
> -
> +	struct stripe_head *sh = NULL, *tmp;
> +	struct list_head *handle_list = NULL;
> +	int cpu;
> +
> +	/* Should we take action to avoid starvation of latter CPUs ? */
> +	for_each_cpu(cpu, mask) {
> +		struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
> +		if (!list_empty(&percpu->handle_list)) {
> +			handle_list = &percpu->handle_list;
> +			break;
> +		}
> +	}
>  	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
>  		  __func__,
> -		  list_empty(&conf->handle_list) ? "empty" : "busy",
> +		  !handle_list ? "empty" : "busy",
>  		  list_empty(&conf->hold_list) ? "empty" : "busy",
>  		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
>  
> -	if (!list_empty(&conf->handle_list)) {
> -		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
> +	if (handle_list) {
> +		sh = list_entry(handle_list->next, typeof(*sh), lru);
>  
>  		if (list_empty(&conf->hold_list))
>  			conf->bypass_count = 0;
> @@ -3997,12 +4041,23 @@ static struct stripe_head *__get_priorit
>  		   ((conf->bypass_threshold &&
>  		     conf->bypass_count > conf->bypass_threshold) ||
>  		    atomic_read(&conf->pending_full_writes) == 0)) {
> -		sh = list_entry(conf->hold_list.next,
> -				typeof(*sh), lru);
> -		conf->bypass_count -= conf->bypass_threshold;
> -		if (conf->bypass_count < 0)
> -			conf->bypass_count = 0;
> -	} else
> +
> +		list_for_each_entry(tmp, &conf->hold_list,  lru) {
> +			if (cpumask_test_cpu(tmp->cpu, mask) ||
> +			    !cpu_online(tmp->cpu)) {
> +				sh = tmp;
> +				break;
> +			}
> +		}
> +
> +		if (sh) {
> +			conf->bypass_count -= conf->bypass_threshold;
> +			if (conf->bypass_count < 0)
> +				conf->bypass_count = 0;
> +		}
> +	}
> +
> +	if (!sh)
>  		return NULL;
>  
>  	list_del_init(&sh->lru);
> @@ -4594,13 +4649,13 @@ static int  retry_aligned_read(struct r5
>  }
>  
>  #define MAX_STRIPE_BATCH 8
> -static int handle_active_stripes(struct r5conf *conf)
> +static int handle_active_stripes(struct r5conf *conf, cpumask_t *mask)
>  {
>  	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
>  	int i, batch_size = 0;
>  
>  	while (batch_size < MAX_STRIPE_BATCH &&
> -			(sh = __get_priority_stripe(conf)) != NULL)
> +			(sh = __get_priority_stripe(conf, mask)) != NULL)
>  		batch[batch_size++] = sh;
>  
>  	if (batch_size == 0)
> @@ -4618,6 +4673,35 @@ static int handle_active_stripes(struct
>  	return batch_size;
>  }
>  
> +static void raid5auxd(struct md_thread *thread)
> +{
> +	struct mddev *mddev = thread->mddev;
> +	struct r5conf *conf = mddev->private;
> +	struct blk_plug plug;
> +	int handled;
> +	struct raid5_auxth *auxth = thread->private;
> +
> +	pr_debug("+++ raid5auxd active\n");
> +
> +	blk_start_plug(&plug);
> +	handled = 0;
> +	spin_lock_irq(&conf->device_lock);
> +	while (1) {
> +		int batch_size;
> +
> +		batch_size = handle_active_stripes(conf, &auxth->work_mask);
> +		if (!batch_size)
> +			break;
> +		handled += batch_size;
> +	}
> +	pr_debug("%d stripes handled\n", handled);
> +
> +	spin_unlock_irq(&conf->device_lock);
> +	blk_finish_plug(&plug);
> +
> +	pr_debug("--- raid5auxd inactive\n");
> +}
> +
>  /*
>   * This is our raid5 kernel thread.
>   *
> @@ -4665,7 +4749,7 @@ static void raid5d(struct md_thread *thr
>  			handled++;
>  		}
>  
> -		batch_size = handle_active_stripes(conf);
> +		batch_size = handle_active_stripes(conf, &conf->work_mask);
>  		if (!batch_size)
>  			break;
>  		handled += batch_size;
> @@ -4794,10 +4878,270 @@ stripe_cache_active_show(struct mddev *m
>  static struct md_sysfs_entry
>  raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
>  
> +static void raid5_update_threads_handle_mask(struct mddev *mddev)
> +{
> +	int cpu, i;
> +	struct raid5_percpu *percpu;
> +	struct r5conf *conf = mddev->private;
> +
> +	for_each_online_cpu(cpu) {
> +		percpu = per_cpu_ptr(conf->percpu, cpu);
> +		cpumask_clear(&percpu->handle_threads);
> +	}
> +	cpumask_copy(&conf->work_mask, cpu_online_mask);
> +
> +	for (i = 0; i < conf->aux_thread_num; i++) {
> +		cpumask_t *work_mask = &conf->aux_threads[i]->work_mask;
> +		for_each_cpu(cpu, work_mask) {
> +			percpu = per_cpu_ptr(conf->percpu, cpu);
> +			cpumask_set_cpu(i, &percpu->handle_threads);
> +		}
> +		cpumask_andnot(&conf->work_mask, &conf->work_mask,
> +				work_mask);
> +	}
> +}
> +
> +struct raid5_auxth_sysfs {
> +	struct attribute attr;
> +	ssize_t (*show)(struct mddev *, struct raid5_auxth *, char *);
> +	ssize_t (*store)(struct mddev *, struct raid5_auxth *,
> +		const char *, size_t);
> +};
> +
> +static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
> +	struct raid5_auxth *thread, char *page)
> +{
> +	if (!mddev->private)
> +		return 0;
> +	return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
> +}
> +
> +static ssize_t
> +raid5_store_thread_cpulist(struct mddev *mddev, struct raid5_auxth *thread,
> +	const char *page, size_t len)
> +{
> +	struct r5conf *conf = mddev->private;
> +	cpumask_var_t mask;
> +
> +	if (!conf)
> +		return -ENODEV;
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	if (cpulist_parse(page, mask)) {
> +		free_cpumask_var(mask);
> +		return -EINVAL;
> +	}
> +
> +	get_online_cpus();
> +	spin_lock_irq(&conf->device_lock);
> +	cpumask_copy(&thread->work_mask, mask);
> +	raid5_update_threads_handle_mask(mddev);
> +	spin_unlock_irq(&conf->device_lock);
> +	put_online_cpus();
> +	set_cpus_allowed_ptr(thread->thread->tsk, mask);
> +
> +	free_cpumask_var(mask);
> +	return len;
> +}
> +
> +static struct raid5_auxth_sysfs thread_cpulist =
> +__ATTR(cpulist, S_IRUGO|S_IWUSR, raid5_show_thread_cpulist,
> +	raid5_store_thread_cpulist);
> +
> +static struct attribute *auxth_attrs[] = {
> +	&thread_cpulist.attr,
> +	NULL,
> +};
> +
> +static ssize_t
> +raid5_auxth_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
> +{
> +	struct raid5_auxth_sysfs *entry = container_of(attr,
> +		struct raid5_auxth_sysfs, attr);
> +	struct raid5_auxth *thread = container_of(kobj,
> +		struct raid5_auxth, kobj);
> +	struct mddev *mddev = thread->thread->mddev;
> +	ssize_t ret;
> +
> +	if (!entry->show)
> +		return -EIO;
> +	mddev_lock(mddev);
> +	ret = entry->show(mddev, thread, page);
> +	mddev_unlock(mddev);
> +	return ret;
> +}
> +
> +static ssize_t
> +raid5_auxth_attr_store(struct kobject *kobj, struct attribute *attr,
> +	      const char *page, size_t length)
> +{
> +	struct raid5_auxth_sysfs *entry = container_of(attr,
> +		struct raid5_auxth_sysfs, attr);
> +	struct raid5_auxth *thread = container_of(kobj,
> +		struct raid5_auxth, kobj);
> +	struct mddev *mddev = thread->thread->mddev;
> +	ssize_t ret;
> +
> +	if (!entry->store)
> +		return -EIO;
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +	mddev_lock(mddev);
> +	ret = entry->store(mddev, thread, page, length);
> +	mddev_unlock(mddev);
> +	return ret;
> +}
> +
> +static void raid5_auxth_release(struct kobject *kobj)
> +{
> +	struct raid5_auxth *thread = container_of(kobj,
> +		struct raid5_auxth, kobj);
> +	kfree(thread);
> +}
> +
> +static const struct sysfs_ops raid5_auxth_sysfsops = {
> +	.show = raid5_auxth_attr_show,
> +	.store = raid5_auxth_attr_store,
> +};
> +static struct kobj_type raid5_auxth_ktype = {
> +	.release = raid5_auxth_release,
> +	.sysfs_ops = &raid5_auxth_sysfsops,
> +	.default_attrs = auxth_attrs,
> +};
> +
> +static ssize_t
> +raid5_show_auxthread_number(struct mddev *mddev, char *page)
> +{
> +	struct r5conf *conf = mddev->private;
> +	if (conf)
> +		return sprintf(page, "%d\n", conf->aux_thread_num);
> +	else
> +		return 0;
> +}
> +
> +static void raid5_auxth_delete(struct work_struct *ws)
> +{
> +	struct raid5_auxth *thread = container_of(ws, struct raid5_auxth,
> +		del_work);
> +
> +	kobject_del(&thread->kobj);
> +	kobject_put(&thread->kobj);
> +}
> +
> +static void __free_aux_thread(struct mddev *mddev, struct raid5_auxth *thread)
> +{
> +	md_unregister_thread(&thread->thread);
> +	INIT_WORK(&thread->del_work, raid5_auxth_delete);
> +	kobject_get(&thread->kobj);
> +	md_queue_misc_work(&thread->del_work);
> +}
> +
> +static struct raid5_auxth *__create_aux_thread(struct mddev *mddev, int i)
> +{
> +	struct raid5_auxth *thread;
> +	char name[10];
> +
> +	thread = kzalloc(sizeof(*thread), GFP_KERNEL);
> +	if (!thread)
> +		return NULL;
> +	snprintf(name, 10, "aux%d", i);
> +	thread->thread = md_register_thread(raid5auxd, mddev, name);
> +	if (!thread->thread) {
> +		kfree(thread);
> +		return NULL;
> +	}
> +	thread->thread->private = thread;
> +
> +	cpumask_copy(&thread->work_mask, cpu_online_mask);
> +
> +	if (kobject_init_and_add(&thread->kobj, &raid5_auxth_ktype,
> +	    &mddev->kobj, "auxth%d", i)) {
> +		md_unregister_thread(&thread->thread);
> +		kfree(thread);
> +		return NULL;
> +	}
> +	return thread;
> +}
> +
> +static ssize_t
> +raid5_store_auxthread_number(struct mddev *mddev, const char *page, size_t len)
> +{
> +	struct r5conf *conf = mddev->private;
> +	unsigned long new;
> +	int i;
> +	struct raid5_auxth **threads;
> +
> +	if (len >= PAGE_SIZE)
> +		return -EINVAL;
> +	if (!conf)
> +		return -ENODEV;
> +
> +	if (kstrtoul(page, 10, &new))
> +		return -EINVAL;
> +
> +	if (new == conf->aux_thread_num)
> +		return len;
> +
> +	/* There is no point creating more threads than cpu number */
> +	if (new > num_online_cpus())
> +		return -EINVAL;
> +
> +	if (new > conf->aux_thread_num) {
> +		threads = kzalloc(sizeof(struct raid5_auxth *) * new,
> +				GFP_KERNEL);
> +		if (!threads)
> +			return -ENOMEM;
> +
> +		i = conf->aux_thread_num;
> +		while (i < new) {
> +			threads[i] = __create_aux_thread(mddev, i);
> +			if (!threads[i])
> +				goto error;
> +
> +			i++;
> +		}
> +		memcpy(threads, conf->aux_threads,
> +			sizeof(struct raid5_auxth *) * conf->aux_thread_num);
> +		get_online_cpus();
> +		spin_lock_irq(&conf->device_lock);
> +		kfree(conf->aux_threads);
> +		conf->aux_threads = threads;
> +		conf->aux_thread_num = new;
> +		raid5_update_threads_handle_mask(mddev);
> +		spin_unlock_irq(&conf->device_lock);
> +		put_online_cpus();
> +	} else {
> +		int old = conf->aux_thread_num;
> +
> +		get_online_cpus();
> +		spin_lock_irq(&conf->device_lock);
> +		conf->aux_thread_num = new;
> +		raid5_update_threads_handle_mask(mddev);
> +		spin_unlock_irq(&conf->device_lock);
> +		put_online_cpus();
> +		for (i = new; i < old; i++)
> +			__free_aux_thread(mddev, conf->aux_threads[i]);
> +	}
> +
> +	return len;
> +error:
> +	while (--i >= conf->aux_thread_num)
> +		__free_aux_thread(mddev, threads[i]);
> +	kfree(threads);
> +	return -ENOMEM;
> +}
> +
> +static struct md_sysfs_entry
> +raid5_auxthread_number = __ATTR(auxthread_number, S_IRUGO|S_IWUSR,
> +				raid5_show_auxthread_number,
> +				raid5_store_auxthread_number);
> +
>  static struct attribute *raid5_attrs[] =  {
>  	&raid5_stripecache_size.attr,
>  	&raid5_stripecache_active.attr,
>  	&raid5_preread_bypass_threshold.attr,
> +	&raid5_auxthread_number.attr,
>  	NULL,
>  };
>  static struct attribute_group raid5_attrs_group = {
> @@ -4845,6 +5189,7 @@ static void raid5_free_percpu(struct r5c
>  
>  static void free_conf(struct r5conf *conf)
>  {
> +	kfree(conf->aux_threads);
>  	shrink_stripes(conf);
>  	raid5_free_percpu(conf);
>  	kfree(conf->disks);
> @@ -4857,7 +5202,7 @@ static int raid456_cpu_notify(struct not
>  			      void *hcpu)
>  {
>  	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
> -	long cpu = (long)hcpu;
> +	long cpu = (long)hcpu, anycpu;
>  	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
>  
>  	switch (action) {
> @@ -4876,9 +5221,17 @@ static int raid456_cpu_notify(struct not
>  			       __func__, cpu);
>  			return notifier_from_errno(-ENOMEM);
>  		}
> +		INIT_LIST_HEAD(&(percpu->handle_list));
> +		cpumask_clear(&percpu->handle_threads);
>  		break;
>  	case CPU_DEAD:
>  	case CPU_DEAD_FROZEN:
> +		spin_lock_irq(&conf->device_lock);
> +		anycpu = cpumask_any(cpu_online_mask);
> +		list_splice_tail_init(&percpu->handle_list,
> +			&per_cpu_ptr(conf->percpu, anycpu)->handle_list);
> +		spin_unlock_irq(&conf->device_lock);
> +
>  		safe_put_page(percpu->spare_page);
>  		kfree(percpu->scribble);
>  		percpu->spare_page = NULL;
> @@ -4887,6 +5240,10 @@ static int raid456_cpu_notify(struct not
>  	default:
>  		break;
>  	}
> +
> +	spin_lock_irq(&conf->device_lock);
> +	raid5_update_threads_handle_mask(conf->mddev);
> +	spin_unlock_irq(&conf->device_lock);
>  	return NOTIFY_OK;
>  }
>  #endif
> @@ -4907,20 +5264,24 @@ static int raid5_alloc_percpu(struct r5c
>  	get_online_cpus();
>  	err = 0;
>  	for_each_present_cpu(cpu) {
> +		struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
> +
>  		if (conf->level == 6) {
>  			spare_page = alloc_page(GFP_KERNEL);
>  			if (!spare_page) {
>  				err = -ENOMEM;
>  				break;
>  			}
> -			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
> +			percpu->spare_page = spare_page;
>  		}
>  		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
>  		if (!scribble) {
>  			err = -ENOMEM;
>  			break;
>  		}
> -		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
> +		percpu->scribble = scribble;
> +		INIT_LIST_HEAD(&percpu->handle_list);
> +		cpumask_clear(&percpu->handle_threads);
>  	}
>  #ifdef CONFIG_HOTPLUG_CPU
>  	conf->cpu_notify.notifier_call = raid456_cpu_notify;
> @@ -4976,7 +5337,6 @@ static struct r5conf *setup_conf(struct
>  	spin_lock_init(&conf->device_lock);
>  	init_waitqueue_head(&conf->wait_for_stripe);
>  	init_waitqueue_head(&conf->wait_for_overlap);
> -	INIT_LIST_HEAD(&conf->handle_list);
>  	INIT_LIST_HEAD(&conf->hold_list);
>  	INIT_LIST_HEAD(&conf->delayed_list);
>  	INIT_LIST_HEAD(&conf->bitmap_list);
> @@ -4987,6 +5347,8 @@ static struct r5conf *setup_conf(struct
>  	conf->bypass_threshold = BYPASS_THRESHOLD;
>  	conf->recovery_disabled = mddev->recovery_disabled - 1;
>  
> +	cpumask_copy(&conf->work_mask, cpu_online_mask);
> +
>  	conf->raid_disks = mddev->raid_disks;
>  	if (mddev->reshape_position == MaxSector)
>  		conf->previous_raid_disks = mddev->raid_disks;
> @@ -5403,6 +5765,10 @@ abort:
>  static int stop(struct mddev *mddev)
>  {
>  	struct r5conf *conf = mddev->private;
> +	int i;
> +
> +	for (i = 0; i < conf->aux_thread_num; i++)
> +		__free_aux_thread(mddev, conf->aux_threads[i]);
>  
>  	md_unregister_thread(&mddev->thread);
>  	if (mddev->queue)
> Index: linux/drivers/md/raid5.h
> ===================================================================
> --- linux.orig/drivers/md/raid5.h	2012-08-09 09:57:23.826481331 +0800
> +++ linux/drivers/md/raid5.h	2012-08-09 16:17:51.279501447 +0800
> @@ -211,6 +211,7 @@ struct stripe_head {
>  	enum check_states	check_state;
>  	enum reconstruct_states reconstruct_state;
>  	spinlock_t		stripe_lock;
> +	int			cpu;
>  	/**
>  	 * struct stripe_operations
>  	 * @target - STRIPE_OP_COMPUTE_BLK target
> @@ -364,6 +365,14 @@ struct disk_info {
>  	struct md_rdev	*rdev, *replacement;
>  };
>  
> +struct raid5_auxth {
> +	struct md_thread	*thread;
> +	/* which CPUs should the auxiliary thread handle stripes from */
> +	cpumask_t		work_mask;
> +	struct kobject		kobj;
> +	struct work_struct	del_work;
> +};
> +
>  struct r5conf {
>  	struct hlist_head	*stripe_hashtbl;
>  	struct mddev		*mddev;
> @@ -432,6 +441,12 @@ struct r5conf {
>  					      * lists and performing address
>  					      * conversions
>  					      */
> +		struct list_head handle_list;
> +		cpumask_t	handle_threads; /* Which threads can the CPU's
> +						 * stripes be handled. It really
> +						 * is a bitmap to aux_threads[],
> +						 * but has max bits NR_CPUS
> +						 */
>  	} __percpu *percpu;
>  	size_t			scribble_len; /* size of scribble region must be
>  					       * associated with conf to handle
> @@ -459,6 +474,10 @@ struct r5conf {
>  	 * the new thread here until we fully activate the array.
>  	 */
>  	struct md_thread	*thread;
> +	int			aux_thread_num;
> +	struct raid5_auxth	**aux_threads;
> +	/* which CPUs should raid5d thread handle stripes from */
> +	cpumask_t		work_mask;
>  };
>  
>  /*
> Index: linux/drivers/md/md.c
> ===================================================================
> --- linux.orig/drivers/md/md.c	2012-08-09 10:43:04.796022675 +0800
> +++ linux/drivers/md/md.c	2012-08-09 16:17:19.367902517 +0800
> @@ -640,9 +640,9 @@ static struct mddev * mddev_find(dev_t u
>  	goto retry;
>  }
>  
> -static inline int mddev_lock(struct mddev * mddev)
> +int md_queue_misc_work(struct work_struct *work)
>  {
> -	return mutex_lock_interruptible(&mddev->reconfig_mutex);
> +	return queue_work(md_misc_wq, work);
>  }
>  
>  static inline int mddev_is_locked(struct mddev *mddev)
> @@ -657,7 +657,7 @@ static inline int mddev_trylock(struct m
>  
>  static struct attribute_group md_redundancy_group;
>  
> -static void mddev_unlock(struct mddev * mddev)
> +void mddev_unlock(struct mddev * mddev)
>  {
>  	if (mddev->to_remove) {
>  		/* These cannot be removed under reconfig_mutex as
> @@ -8569,6 +8569,8 @@ EXPORT_SYMBOL(md_register_thread);
>  EXPORT_SYMBOL(md_unregister_thread);
>  EXPORT_SYMBOL(md_wakeup_thread);
>  EXPORT_SYMBOL(md_check_recovery);
> +EXPORT_SYMBOL(mddev_unlock);
> +EXPORT_SYMBOL(md_queue_misc_work);
>  MODULE_LICENSE("GPL");
>  MODULE_DESCRIPTION("MD RAID framework");
>  MODULE_ALIAS("md");
> Index: linux/drivers/md/md.h
> ===================================================================
> --- linux.orig/drivers/md/md.h	2012-08-09 10:43:04.800022626 +0800
> +++ linux/drivers/md/md.h	2012-08-09 16:14:26.318078815 +0800
> @@ -636,4 +636,12 @@ static inline int mddev_check_plugged(st
>  	return !!blk_check_plugged(md_unplug, mddev,
>  				   sizeof(struct blk_plug_cb));
>  }
> +
> +static inline int mddev_lock(struct mddev * mddev)
> +{
> +	return mutex_lock_interruptible(&mddev->reconfig_mutex);
> +}
> +extern void mddev_unlock(struct mddev *mddev);
> +extern int md_queue_misc_work(struct work_struct *work);
> +
>  #endif /* _MD_MD_H */

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2013-03-07  7:31 ` Shaohua Li
@ 2013-03-12  1:39   ` NeilBrown
  2013-03-13  0:44     ` Stan Hoeppner
  0 siblings, 1 reply; 29+ messages in thread
From: NeilBrown @ 2013-03-12  1:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, dan.j.williams

[-- Attachment #1: Type: text/plain, Size: 1575 bytes --]

On Thu, 7 Mar 2013 15:31:23 +0800 Shaohua Li <shli@kernel.org> wrote:

> Hi Neil,
> 
> I just noticed this one was missed. It used to be in your branch, but got
> dropped at some point. Is there anything I missed? I can resend it if you like.
> 
> Thanks,
> Shaohua
> 
> On Thu, Aug 09, 2012 at 04:58:08PM +0800, Shaohua Li wrote:
> > This is a new tempt to make raid5 handle stripes in multiple threads, as
> > suggested by Neil to have maxium flexibility and better numa binding. It
> > basically is a combination of my first and second generation patches. By
> > default, no multiple thread is enabled (all stripes are handled by raid5d).
> > 
> > An example to enable multiple threads:
> > #echo 3 > /sys/block/md0/md/auxthread_number
> > This will create 3 auxiliary threads to handle stripes. The threads can run
> > on any cpus and handle stripes produced by any cpus.
> > 
> > #echo 1-3 > /sys/block/md0/md/auxth0/cpulist
> > This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
> > stripes produced by cpu 1-3. User tool can further change the thread's
> > affinity, but the thread can only handle stripes produced by cpu 1-3 till the
> > sysfs entry is changed again.
> > 
> > If stripes produced by a CPU aren't handled by any auxiliary thread, such
> > stripes will be handled by raid5d. Otherwise, raid5d doesn't handle any
> > stripes.


Hi Shaohua,
 I still have this sitting in my queue, but I haven't had a chance to look at
 it properly yet - I'm sorry about that.  I'll try to get to it soon.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2013-03-12  1:39   ` NeilBrown
@ 2013-03-13  0:44     ` Stan Hoeppner
  2013-03-28  6:47       ` NeilBrown
  0 siblings, 1 reply; 29+ messages in thread
From: Stan Hoeppner @ 2013-03-13  0:44 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, linux-raid, dan.j.williams

On 3/11/2013 8:39 PM, NeilBrown wrote:
> On Thu, 7 Mar 2013 15:31:23 +0800 Shaohua Li <shli@kernel.org> wrote:
...
>>> #echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>>> This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>>> stripes produced by cpu 1-3. User tool can further change the thread's
>>> affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>>> sysfs entry is changed again.

Would it not be better to use the existing cpusets infrastructure for
this, instead of manually binding threads to specific cores or sets of
cores?

Also, I understand the hot cache issue driving the desire to have a raid
thread only process stripes created by its CPU.  But what of the
scenario where an HPC user pins application threads to cores and needs
all the L1/L2 cache?  Say this user has a dual socket 24 core NUMA
system with 2 NUMA nodes per socket, 4 nodes total.  Each NUMA node has
6 cores and shared L3 cache.  The user pins 5 processes to 5 cores in
each node, and wants to pin a raid thread to the remaining core in each
node to handle the write IO generated by the 5 user threads on the node.

Does your patch series allow this?  Using the above example, if the user
creates 4 cpusets, can he assign a raid thread to each set such that the
thread will execute on any core in that set, and only that set, on any
stripes created by any CPU in that set, and only that set?

The infrastructure for this already exists, and has since 2004.  It
seems more flexible than what you've implemented here.  I suggest we
make use of it, as it is the kernel standard for doing such things.

See:  http://man7.org/linux/man-pages/man7/cpuset.7.html
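
For what it's worth, a minimal sketch of what "just use cpusets" could look
like from a small helper, assuming the cpuset filesystem is mounted at
/dev/cpuset as described in cpuset(7); the worker thread TID and the cpuset
name below are made-up placeholders:

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/stat.h>
	#include <sys/types.h>

	static int cpuset_write(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);
		ssize_t n;

		if (fd < 0)
			return -1;
		n = write(fd, val, strlen(val));
		close(fd);
		return n < 0 ? -1 : 0;
	}

	int main(void)
	{
		pid_t raid_tid = 1234;	/* placeholder TID of a raid worker thread */
		char buf[32];

		/* create a cpuset covering CPUs 1-3 on memory node 0 */
		mkdir("/dev/cpuset/storage", 0755);
		cpuset_write("/dev/cpuset/storage/cpus", "1-3");
		cpuset_write("/dev/cpuset/storage/mems", "0");

		/* move the worker thread into it */
		snprintf(buf, sizeof(buf), "%d", raid_tid);
		return cpuset_write("/dev/cpuset/storage/tasks", buf);
	}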

> Hi Shaohua,
>  I still have this sitting in my queue, but I haven't had a chance to look at
>  is properly yet - I'm sorry about that.  I'll try to get to it soon.

-- 
Stan


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2013-03-13  0:44     ` Stan Hoeppner
@ 2013-03-28  6:47       ` NeilBrown
  2013-03-28 16:53         ` Stan Hoeppner
  2013-03-29  2:34         ` Shaohua Li
  0 siblings, 2 replies; 29+ messages in thread
From: NeilBrown @ 2013-03-28  6:47 UTC (permalink / raw)
  To: stan; +Cc: Shaohua Li, linux-raid, dan.j.williams

[-- Attachment #1: Type: text/plain, Size: 3951 bytes --]

On Tue, 12 Mar 2013 19:44:19 -0500 Stan Hoeppner <stan@hardwarefreak.com>
wrote:

> On 3/11/2013 8:39 PM, NeilBrown wrote:
> > On Thu, 7 Mar 2013 15:31:23 +0800 Shaohua Li <shli@kernel.org> wrote:
> ...
> >>> #echo 1-3 > /sys/block/md0/md/auxth0/cpulist
> >>> This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
> >>> stripes produced by cpu 1-3. User tool can further change the thread's
> >>> affinity, but the thread can only handle stripes produced by cpu 1-3 till the
> >>> sysfs entry is changed again.
> 
> Would it not be better to use the existing cpusets infrastructure for
> this, instead of manually binding threads to specific cores or sets of
> cores?
> 
> Also, I understand the hot cache issue driving the desire to have a raid
> thread only process stripes created by its CPU.  But what of the
> scenario where an HPC user pins application threads to cores and needs
> all the L1/L2 cache?  Say this user has a dual socket 24 core NUMA
> system with 2 NUMA nodes per socket, 4 nodes total.  Each NUMA node has
> 6 cores and shared L3 cache.  The user pins 5 processes to 5 cores in
> each node, and wants to pin a raid thread to the remaining core in each
> node to handle the write IO generated by the 5 user threads on the node.
> 
> Does your patch series allow this?  Using the above example, if the user
> creates 4 cpusets, can he assign a raid thread to that set and the
> thread will execute on any core in the set, and only that set, on any
> stripes created by any CPU in that set, and only that set?
> 
> The infrastructure for this already exists, has since 2004.  And it
> seems is more flexible than what you've implemented here.  I suggest we
> make use of it, as it is the kernel standard for doing such things.
> 
> See:  http://man7.org/linux/man-pages/man7/cpuset.7.html
> 
> > Hi Shaohua,
> >  I still have this sitting in my queue, but I haven't had a chance to look at
> >  is properly yet - I'm sorry about that.  I'll try to get to it soon.
> 

Thanks for this feedback.  The interface is the thing I am most concerned
about getting right at this stage, and is exactly what you are commenting on.

The current code allows you to request N separate raid threads, and to tie
each one to a subset of processors.  This tying is in two senses.  The
thread can only run on cpus in the subset, and the requests queued by any
given processor will preferentially be processed by threads tied to that
processor.

It does sound a lot like cpusets could be used instead of lists of CPUs.
However it does merge the two different cpuset concepts which you seem to
suggest might not be ideal, and maybe it isn't.

A completely general solution might be to allow each thread to handle
requests from one cpuset, and run on any processor in another cpuset.
Would that be too much flexibility?

cpusets are a config option, so we would need to only enable multithreads if
CONFIG_CPUSETS were set.  Is this unnecessarily restrictive?  Are there any
other cases of kernel threads binding to cpusets?  If there aren't, I'd be a
bit cautious of being the first, as I have very little familiarity with this
stuff.


I still like the idea of an 'ioctl' which a process can call and will cause
it to start handling requests.
The process could bind itself to whatever cpu or cpuset it wanted to, then
could call the ioctl on the relevant md array, and pass in a bitmap of cpus
which indicate which requests it wants to be responsible for.  The current
kernel thread will then only handle requests that no-one else has put their
hand up for.  This leaves all the details of configuration in user-space
(where I think it belongs).
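
Roughly, the userspace side might look like the sketch below.  Nothing here
exists today - the MD_HANDLE_REQUESTS ioctl and its number are made up purely
to make the idea concrete:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/ioctl.h>

	/* hypothetical ioctl: "I handle stripes queued by the CPUs in this mask" */
	#define MD_HANDLE_REQUESTS	_IOW('m', 0x42, uint64_t)

	int main(void)
	{
		cpu_set_t mask;
		uint64_t handle_cpus = 0xe;	/* CPUs 1-3, as in the cpulist example */
		int fd;

		/* the process first binds itself wherever it likes ... */
		CPU_ZERO(&mask);
		CPU_SET(1, &mask);
		CPU_SET(2, &mask);
		CPU_SET(3, &mask);
		sched_setaffinity(0, sizeof(mask), &mask);

		/* ... then volunteers to handle stripes from those CPUs */
		fd = open("/dev/md0", O_RDONLY);
		if (fd < 0)
			return 1;

		/* would sleep in the kernel and process stripes until interrupted */
		return ioctl(fd, MD_HANDLE_REQUESTS, &handle_cpus);
	}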

Shaohua - have you given any thought to that approach?

If anyone else has any familiarity with multithreading and numa and cpu
affinity and would like to share some thoughts, I am all ears.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2013-03-28  6:47       ` NeilBrown
@ 2013-03-28 16:53         ` Stan Hoeppner
  2013-03-29  2:34         ` Shaohua Li
  1 sibling, 0 replies; 29+ messages in thread
From: Stan Hoeppner @ 2013-03-28 16:53 UTC (permalink / raw)
  To: NeilBrown
  Cc: Shaohua Li, linux-raid, dan.j.williams, Christoph Hellwig,
	Dave Chinner

I'm CC'ing Christoph and Dave hoping they may have some insights or
recommendations WRT this md RAID kernel write thread scheduling, and whether
using cpusets might be handy or worth considering.

On 3/28/2013 1:47 AM, NeilBrown wrote:
> On Tue, 12 Mar 2013 19:44:19 -0500 Stan Hoeppner <stan@hardwarefreak.com>
> wrote:
> 
>> On 3/11/2013 8:39 PM, NeilBrown wrote:
>>> On Thu, 7 Mar 2013 15:31:23 +0800 Shaohua Li <shli@kernel.org> wrote:
>> ...
>>>>> #echo 1-3 > /sys/block/md0/md/auxth0/cpulist
>>>>> This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
>>>>> stripes produced by cpu 1-3. User tool can further change the thread's
>>>>> affinity, but the thread can only handle stripes produced by cpu 1-3 till the
>>>>> sysfs entry is changed again.
>>
>> Would it not be better to use the existing cpusets infrastructure for
>> this, instead of manually binding threads to specific cores or sets of
>> cores?
>>
>> Also, I understand the hot cache issue driving the desire to have a raid
>> thread only process stripes created by its CPU.  But what of the
>> scenario where an HPC user pins application threads to cores and needs
>> all the L1/L2 cache?  Say this user has a dual socket 24 core NUMA
>> system with 2 NUMA nodes per socket, 4 nodes total.  Each NUMA node has
>> 6 cores and shared L3 cache.  The user pins 5 processes to 5 cores in
>> each node, and wants to pin a raid thread to the remaining core in each
>> node to handle the write IO generated by the 5 user threads on the node.
>>
>> Does your patch series allow this?  Using the above example, if the user
>> creates 4 cpusets, can he assign a raid thread to that set and the
>> thread will execute on any core in the set, and only that set, on any
>> stripes created by any CPU in that set, and only that set?
>>
>> The infrastructure for this already exists, has since 2004.  And it
>> seems is more flexible than what you've implemented here.  I suggest we
>> make use of it, as it is the kernel standard for doing such things.
>>
>> See:  http://man7.org/linux/man-pages/man7/cpuset.7.html
>>
>>> Hi Shaohua,
>>>  I still have this sitting in my queue, but I haven't had a chance to look at
>>>  is properly yet - I'm sorry about that.  I'll try to get to it soon.
>>
> 
> Thanks for this feedback.  The interface is the thing I am most concerned
> about getting right at this stage, and is exactly what you are commenting on.

Yeah, that's why I wanted to get these thoughts out there.

> The current code allows you to request N separate raid threads, and to tie
> each one to a subset of processors.  This tying is in two senses.  The
> thread can only run on cpus in the subset, and the requests queued by any
> given processor will preferentially be processed by threads tied to that
> processor.
> 
> It does sound a lot like cpusets could be used instead of lists of CPUs.
> However it does merge the two different cpuset concepts which you seem to
> suggest might not be ideal, and maybe it isn't.

I don't know which method would be best.  The Linux Scalability Effort in
the early 2000s put a tremendous amount of work into creating the
cpusets infrastructure to facilitate this type of thing.  I figured it
should be used if applicable/beneficial.  As it's been a standard
interface for quite some time, it should be familiar to many and fairly
well understood.  When I saw Shaohua's description it seemed a little
like reinventing the wheel.

> A completely general solution might be to allow each thread to handle
> requests from one cpuset, and run on any processor in another cpuset.
> Would that be too much flexibility?

I think any<->any would work fine as a default as it would tend to avoid
idle cores.  It may put extra traffic on the NUMA interconnect, but this
is where you would allow the user to tune it via cpusets.

> cpusets are a config option, so we would need to only enable multithreads if
> CONFIG_CPUSETS were set.  Is this unnecessarily restrictive?  

Yes, too restrictive.  I don't think I'd make multithreads dependent on
CONFIG_CPUSETS.  Not every kernel in the wild has CONFIG_CPUSETS enabled
but these machines may indeed have multiple cores.  I have a few such
machines/kernels.

> Are there any
> other cases of kernel threads binding to cpusets?  If there aren't I'd be a
> but cautious of being the first, I as have very little familiarity with this
> stuff.

I do not know if others do kernel thread binding/masking via cpusets or
the underlying kernel interface.  You need feedback from other devs
working with NUMA stuff and kernel thread scheduling.

> I still like the idea of an 'ioctl' which a process can call and will cause
> it to start handling requests.

Agreed.

> The process could bind itself to whatever cpu or cpuset it wanted to, then
> could call the ioctl on the relevant md array, and pass in a bitmap of cpus
> which indicate which requests it wants to be responsible for.  The current
> kernel thread will then only handle requests that no-one else has put their
> hand up for.  This leave all the details of configuration in user-space
> (where I think it belongs).

Agreed.  The cpusets customization will be used by very few people.  But
as a single one of these threads may eat an entire core with some
workloads, I think the option should exist for those who may need it.
This is also pretty much true of cpusets itself, as I'd guess far less
than 1% take advantage of it.  Most servers out there, and
desktops/laptops, are single or dual socket boxen, and most folks
managing such systems have never heard of cpusets.  They simply load up
their apps and let the default kernel config do its thing.

> Shaohua - have you given any thought to that approach.
> 
> If anyone else has any familiarity with multithreading and numa and cpu
> affinity and would like to share some thoughts, I am all ears.

As I said I think you need to get this to a wider audience.  I think
Christoph and Dave might have some valuable insight here, or know others
who do.  CC'ing them.

-- 
Stan


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2013-03-28  6:47       ` NeilBrown
  2013-03-28 16:53         ` Stan Hoeppner
@ 2013-03-29  2:34         ` Shaohua Li
  2013-03-29  9:36           ` Stan Hoeppner
  1 sibling, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2013-03-29  2:34 UTC (permalink / raw)
  To: NeilBrown; +Cc: stan, linux-raid, dan.j.williams

On Thu, Mar 28, 2013 at 05:47:44PM +1100, NeilBrown wrote:
> On Tue, 12 Mar 2013 19:44:19 -0500 Stan Hoeppner <stan@hardwarefreak.com>
> wrote:
> 
> > On 3/11/2013 8:39 PM, NeilBrown wrote:
> > > On Thu, 7 Mar 2013 15:31:23 +0800 Shaohua Li <shli@kernel.org> wrote:
> > ...
> > >>> #echo 1-3 > /sys/block/md0/md/auxth0/cpulist
> > >>> This will bind auxiliary thread 0 to cpu 1-3, and this thread will only handle
> > >>> stripes produced by cpu 1-3. User tool can further change the thread's
> > >>> affinity, but the thread can only handle stripes produced by cpu 1-3 till the
> > >>> sysfs entry is changed again.
> > 
> > Would it not be better to use the existing cpusets infrastructure for
> > this, instead of manually binding threads to specific cores or sets of
> > cores?
> > 
> > Also, I understand the hot cache issue driving the desire to have a raid
> > thread only process stripes created by its CPU.  But what of the
> > scenario where an HPC user pins application threads to cores and needs
> > all the L1/L2 cache?  Say this user has a dual socket 24 core NUMA
> > system with 2 NUMA nodes per socket, 4 nodes total.  Each NUMA node has
> > 6 cores and shared L3 cache.  The user pins 5 processes to 5 cores in
> > each node, and wants to pin a raid thread to the remaining core in each
> > node to handle the write IO generated by the 5 user threads on the node.
> > 
> > Does your patch series allow this?  Using the above example, if the user
> > creates 4 cpusets, can he assign a raid thread to that set and the
> > thread will execute on any core in the set, and only that set, on any
> > stripes created by any CPU in that set, and only that set?
> > 
> > The infrastructure for this already exists, has since 2004.  And it
> > seems is more flexible than what you've implemented here.  I suggest we
> > make use of it, as it is the kernel standard for doing such things.
> > 
> > See:  http://man7.org/linux/man-pages/man7/cpuset.7.html
> > 
> > > Hi Shaohua,
> > >  I still have this sitting in my queue, but I haven't had a chance to look at
> > >  is properly yet - I'm sorry about that.  I'll try to get to it soon.
> > 
> 
> Thanks for this feedback.  The interface is the thing I am most concerned
> about getting right at this stage, and is exactly what you are commenting on.
> 
> The current code allows you to request N separate raid threads, and to tie
> each one to a subset of processors.  This tying is in two senses.  The
> thread can only run on cpus in the subset, and the requests queued by any
> given processor will preferentially be processed by threads tied to that
> processor.
> 
> It does sound a lot like cpusets could be used instead of lists of CPUs.
> However it does merge the two different cpuset concepts which you seem to
> suggest might not be ideal, and maybe it isn't.
> 
> A completely general solution might be to allow each thread to handle
> requests from one cpuset, and run on any processor in another cpuset.
> Would that be too much flexibility?
> 
> cpusets are a config option, so we would need to only enable multithreads if
> CONFIG_CPUSETS were set.  Is this unnecessarily restrictive?  Are there any
> other cases of kernel threads binding to cpusets?  If there aren't I'd be a
> but cautious of being the first, I as have very little familiarity with this
> stuff.

Frankly I don't like the cpuset way. It might just work, but it's just another
API to control process affinity and has no essential difference from my
approach (which directly sets process affinity). Generally we use cpusets
instead of plain process affinity because of things like inherited affinity,
and the raid5 threads don't involve those.
 
> I still like the idea of an 'ioctl' which a process can call and will cause
> it to start handling requests.
> The process could bind itself to whatever cpu or cpuset it wanted to, then
> could call the ioctl on the relevant md array, and pass in a bitmap of cpus
> which indicate which requests it wants to be responsible for.  The current
> kernel thread will then only handle requests that no-one else has put their
> hand up for.  This leave all the details of configuration in user-space
> (where I think it belongs).

The 'ioctl' way is interesting. But there are some things we need to answer:

1. How does the kernel know whether there will be a process to handle one CPU's
requests before the 'ioctl' is called? I suppose you want 2 ioctls. One ioctl
tells the kernel that the process handles requests from the CPUs of a cpumask.
The other ioctl does the request handling. The process must sleep in that ioctl
to wait for requests.

2. If a process is killed in the middle, how does the kernel know? Do you want
to hook something into the task management code? For a normal process exit, we
need yet another ioctl to tell the kernel the process is exiting.

The only difference between this way and my way is whether the request handling
task is in userspace or kernel space. In both ways, you need to set affinity and
use an ioctl/sysfs to control the request sources for the process.
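
To make the questions above concrete, the two-ioctl flavour would look roughly
like this from userspace - both ioctls are invented for illustration, and
question 2 is exactly the gap this split leaves open:

	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/ioctl.h>

	/* both made up for illustration only */
	#define MD_SET_HANDLE_CPUS	_IOW('m', 0x43, uint64_t)
	#define MD_HANDLE_ONE		_IO('m', 0x44)

	int main(void)
	{
		uint64_t cpus = 0xe;	/* handle stripes queued by CPUs 1-3 */
		int fd = open("/dev/md0", O_RDONLY);

		if (fd < 0)
			return 1;

		/* ioctl #1: tell md which CPUs' requests this process will take */
		if (ioctl(fd, MD_SET_HANDLE_CPUS, &cpus))
			return 1;

		/*
		 * ioctl #2: sleep until there are stripes to handle, process
		 * them, return.  If the process is killed here, md has to
		 * notice somehow (e.g. when the fd is released) and fall back
		 * to raid5d.
		 */
		for (;;)
			ioctl(fd, MD_HANDLE_ONE);
	}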

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2013-03-29  2:34         ` Shaohua Li
@ 2013-03-29  9:36           ` Stan Hoeppner
  2013-04-01  1:57             ` Shaohua Li
  0 siblings, 1 reply; 29+ messages in thread
From: Stan Hoeppner @ 2013-03-29  9:36 UTC (permalink / raw)
  To: Shaohua Li
  Cc: NeilBrown, linux-raid, dan.j.williams, Christoph Hellwig,
	Dave Chinner, Joe Landman

I'm CC'ing Joe Landman as he's already building systems of the caliber
that would benefit from this write threading and may need configurable
CPU scheduling.  Joe I've not seen a post from you on linux-raid in a
while so I don't know if you've been following this topic.  Shaohua has
created patch sets to eliminate, or dramatically mitigate, the horrible
single threaded write performance of md/RAID 1, 10, 5, 6 on SSD.
Throughput no longer hits a wall due to saturating a single core, as with the
currently shipping kernel code.  Your thoughts?

On 3/28/2013 9:34 PM, Shaohua Li wrote:
...
> Frankly I don't like the cpuset way. It might just work, but it's just another
> API to control process affinity and has no essential difference against my
> approach (which directly sets process affinity). Generally we use cpuset
> instead of process affinity is because of something like inherit affinity.
> While the raid5 process doesn't involve those.

First I should again state I'm not a developer, but a sysadmin, and this
is the viewpoint from which I speak.

The essential difference I see is the user interface the sysadmin will
employ to tweak thread placement/behavior.  Hypothetically, say I have a
64 socket Altix UV machine w/8 core CPUs, 512 cores.  Each node board
has two sockets, two distinct NUMA nodes, 64 total, but these share a
NUMALink hub interface chip connection to the rest of the machine, and
share a PCIe mezzanine interface.

We obviously want to keep md/RAID housekeeping bandwidth (stripe cache,
RMW reads, etc) isolated to the node where it is attached so it doesn't
needlessly traverse NUMALink eating precious, limited, 'high' latency
NUMAlink system interconnect bandwidth.  We need to keep that free for
our parallel application which is eating 100% of the other 504 cores and
saturating NUMAlink with MPI and file IO traffic.

So let's say I have one NUMA node out of 64 dedicated to block device IO.
 It has a PCIe x8 v2 IB 4x QDR HBA (4GB/s) connection to a SAN box with
18 SSDs (and 128 SAS rust).  The SAN RAID ASIC can't keep up with SSD
RAID5 IO rates while also doing RAID for the rust.  So we export the
SSDs individually and we make 2x 9 drive md/RAID5 arrays.  I've already
created a cpuset with this NUMA node for strictly storage related
processes including but not limited to XFS utils, backup processes,
snapshots, etc, so that the only block IO traversing NUMAlink is user
application data.  Now I add another 18 SSDs to the SAN chassis, and
another IB HBA to this node board.

Ideally, my md/RAID write threads should already be bound to this
cpuset.  So all I should need to do is add this 2nd node to the cpuset
and I'm done.  No need to monkey with additional md/RAID specific
interfaces.

Now, that's the simple scenario.  On this particular machine's
architecture, you have two NUMA nodes per physical node, so expanding
storage hardware on the same node board should be straightforward, as above.
 However, most Altix UV machines will have storage HBAs plugged into
many node boards.  If we create one cpuset and put all the md/RAID write
threads in it, then we get housekeeping RAID IO traversing the NUMAlink
interconnect.  So in this case we'd want to pin the threads to the
physical node board where the PCIe cards, and thus disks, are attached.

The 'easy' way to do this is simply create multiple cpusets, one for
each storage node.  But then you have the downside of administration
headaches, because you may need to pin your FS utils, backup, etc to a
different storage cpuset depending on which HBAs the filesystem resides,
and do this each and every time, which is a nightmare with scheduled
jobs.  Thus in this case it's probably best to retain the single storage
cpuset and simply make sure the node boards share the same upstream
switch hop, keeping the traffic as local as possible.  The kernel
scheduler might already have some NUMA scheduling intelligence here that
works automagically even within a cpuset, to minimize this.  I simply
lack knowledge in this area.

>> I still like the idea of an 'ioctl' which a process can call and will cause
>> it to start handling requests.
>> The process could bind itself to whatever cpu or cpuset it wanted to, then
>> could call the ioctl on the relevant md array, and pass in a bitmap of cpus
>> which indicate which requests it wants to be responsible for.  The current
>> kernel thread will then only handle requests that no-one else has put their
>> hand up for.  This leave all the details of configuration in user-space
>> (where I think it belongs).
> 
> The 'ioctl' way is interesting. But there are something we need answer:
> 
> 1. How kernel knows if there will be process to handle one cpu's requests
> before an 'ioctl' is called? I suppose you want 2 ioctls. One ioctl telles
> kernel the process handles request from cpus of a cpumask. The other ioctl does
> request handling. The process must sleep in the ioctl to wait requests.
> 
> 2. If a process is killed in the middle, how kernel knows? Do you want to hook
> something in task management code? For normal process exit, we need another
> ioctl to tell kernel the process is exiting.
> 
> The only difference between this way and my way is if the request handling task
> is userspace or kernel space. In both ways, you need set affinity and uses
> ioctl/sysfs to control requests source for the process.

Being a non-dev I lack the requisite knowledge to comment on ioctls.  I'll
simply reiterate that whatever you go with should make use of an
existing familiar user interface where this same scheduling is already
handled, which is cpusets.  The only difference being kernel vs user
space.  Which may turn out to be a problem, I dunno.

-- 
Stan


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2013-03-29  9:36           ` Stan Hoeppner
@ 2013-04-01  1:57             ` Shaohua Li
  2013-04-01 19:31               ` Stan Hoeppner
  0 siblings, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2013-04-01  1:57 UTC (permalink / raw)
  To: Stan Hoeppner
  Cc: NeilBrown, linux-raid, dan.j.williams, Christoph Hellwig,
	Dave Chinner, Joe Landman

On Fri, Mar 29, 2013 at 04:36:14AM -0500, Stan Hoeppner wrote:
> I'm CC'ing Joe Landman as he's already building systems of the caliber
> that would benefit from this write threading and may need configurable
> CPU scheduling.  Joe I've not seen a post from you on linux-raid in a
> while so I don't know if you've been following this topic.  Shaohua has
> created patch sets to eliminate, or dramatically mitigate, the horrible
> single threaded write performance of md/RAID 1, 10, 5, 6 on SSD.
> Throughput no longer hits a wall due to peaking one core, as with the
> currently shipping kernel code.  Your thoughts?
> 
> On 3/28/2013 9:34 PM, Shaohua Li wrote:
> ...
> > Frankly I don't like the cpuset way. It might just work, but it's just another
> > API to control process affinity and has no essential difference against my
> > approach (which directly sets process affinity). Generally we use cpuset
> > instead of process affinity is because of something like inherit affinity.
> > While the raid5 process doesn't involve those.
> 
> First I should again state I'm not a developer, but a sysadmin, and this
> is the viewpoint from which I speak.
> 
> The essential difference I see is the user interface the sysadmin will
> employ to tweak thread placement/behavior.  Hypothetically, say I have a
> 64 socket Altix UV machine w/8 core CPUs, 512 cores.  Each node board
> has two sockets, two distinct NUMA nodes, 64 total, but these share a
> NUMALink hub interface chip connection to the rest of the machine, and
> share a PCIe mezzanine interface.
> 
> We obviously want to keep md/RAID housekeeping bandwidth (stripe cache,
> RMW reads, etc) isolated to the node where it is attached so it doesn't
> needlessly traverse NUMALink eating precious, limited, 'high' latency
> NUMAlink system interconnect bandwidth.  We need to keep that free for
> our parallel application which is eating 100% of the other 504 cores and
> saturating NUMAlink with MPI and file IO traffic.
> 
> So lets say I have one NUMA node out of 64 dedicated to block device IO.
>  It has a PCIe x8 v2 IB 4x QDR HBA (4GB/s) connection to a SAN box with
> 18 SSDs (and 128 SAS rust).  The SAN RAID ASIC can't keep up with SSD
> RAID5 IO rates while also doing RAID for the rust.  So we export the
> SSDs individually and we make 2x 9 drive md/RAID5 arrays.  I've already
> created a cpuset with this NUMA node for strictly storage related
> processes including but not limited to XFS utils, backup processes,
> snapshots, etc, so that the only block IO traversing NUMAlink is user
> application data.  Now I add another 18 SSDs to the SAN chassis, and
> another IB HBA to this node board.
> 
> Ideally, my md/RAID write threads should already be bound to this
> cpuset.  So all I should need to do is add this 2nd node to the cpuset
> and I'm done.  No need to monkey with additional md/RAID specific
> interfaces.
> 
> Now, that's the simple scenario.  On this particular machine's
> architecture, you have two NUMA nodes per physical node, so expanding
> storage hardware on the same node board should be straightforward above.
>  However, most Altix UV machines will have storage HBAs plugged into
> many node boards.  If we create one cpuset and put all the md/RAID write
> theads in it, then we get housekeeping RAID IO traversing the NUMAlink
> interconnect.  So in this case we'd want to pin the threads to the
> physical node board where the PCIe cards, and thus disks, are attached.
> 
> The 'easy' way to do this is simply create multiple cpusets, one for
> each storage node.  But then you have the downside of administration
> headaches, because you may need to pin your FS utils, backup, etc to a
> different storage cpuset depending on which HBAs the filesystem resides,
> and do this each and every time, which is a nightmare with scheduled
> jobs.  Thus in this case its probably best to retain the single storage
> cpuset and simply make sure the node boards share the same upstream
> switch hop, keeping the traffic as local as possible.  The kernel
> scheduler might already have some NUMA scheduling intelligence here that
> works automagically even within a cpuset, to minimize this.  I simply
> lack knowledge in this area.
> 
> >> I still like the idea of an 'ioctl' which a process can call and will cause
> >> it to start handling requests.
> >> The process could bind itself to whatever cpu or cpuset it wanted to, then
> >> could call the ioctl on the relevant md array, and pass in a bitmap of cpus
> >> which indicate which requests it wants to be responsible for.  The current
> >> kernel thread will then only handle requests that no-one else has put their
> >> hand up for.  This leave all the details of configuration in user-space
> >> (where I think it belongs).
> > 
> > The 'ioctl' way is interesting. But there are something we need answer:
> > 
> > 1. How kernel knows if there will be process to handle one cpu's requests
> > before an 'ioctl' is called? I suppose you want 2 ioctls. One ioctl telles
> > kernel the process handles request from cpus of a cpumask. The other ioctl does
> > request handling. The process must sleep in the ioctl to wait requests.
> > 
> > 2. If a process is killed in the middle, how kernel knows? Do you want to hook
> > something in task management code? For normal process exit, we need another
> > ioctl to tell kernel the process is exiting.
> > 
> > The only difference between this way and my way is if the request handling task
> > is userspace or kernel space. In both ways, you need set affinity and uses
> > ioctl/sysfs to control requests source for the process.
> 
> Being a non dev I lack requisite knowledge to comment on ioctls.  I'll
> simply reiterate that whatever you go with should make use of an
> existing familiar user interface where this same scheduling is already
> handled, which is cpusets.  The only difference being kernel vs user
> space.  Which may turn out to be a problem, I dunno.

Hmm, there might be a misunderstanding here. In my way:

#echo 3 > /sys/block/md0/md/auxthread_number. Create several kernel threads to
handle requests. You can use any approach to set SMP affinity for the threads.
You can use cpusets to bind the threads too.

#echo 1-3 > /sys/block/md0/md/auxth0/cpulist. This doesn't set the above threads'
affinity. It sets which CPUs' requests the thread should handle. Regardless of
whether we use my way, cpusets or an ioctl, we need a similar way to notify the
worker thread which CPUs' requests it should handle (unless we have a hook in the
scheduler so that when a thread's affinity is changed, we get a notification).

In summary, my approach doesn't prevent you from using cpusets. Did I miss something?

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2013-04-01  1:57             ` Shaohua Li
@ 2013-04-01 19:31               ` Stan Hoeppner
  2013-04-02  0:39                 ` Shaohua Li
  0 siblings, 1 reply; 29+ messages in thread
From: Stan Hoeppner @ 2013-04-01 19:31 UTC (permalink / raw)
  To: Shaohua Li
  Cc: NeilBrown, linux-raid, dan.j.williams, Christoph Hellwig,
	Dave Chinner, Joe Landman

On 3/31/2013 8:57 PM, Shaohua Li wrote:
> On Fri, Mar 29, 2013 at 04:36:14AM -0500, Stan Hoeppner wrote:
>> I'm CC'ing Joe Landman as he's already building systems of the caliber
>> that would benefit from this write threading and may need configurable
>> CPU scheduling.  Joe I've not seen a post from you on linux-raid in a
>> while so I don't know if you've been following this topic.  Shaohua has
>> created patch sets to eliminate, or dramatically mitigate, the horrible
>> single threaded write performance of md/RAID 1, 10, 5, 6 on SSD.
>> Throughput no longer hits a wall due to peaking one core, as with the
>> currently shipping kernel code.  Your thoughts?
>>
>> On 3/28/2013 9:34 PM, Shaohua Li wrote:
>> ...
>>> Frankly I don't like the cpuset way. It might just work, but it's just another
>>> API to control process affinity and has no essential difference against my
>>> approach (which directly sets process affinity). Generally we use cpuset
>>> instead of process affinity is because of something like inherit affinity.
>>> While the raid5 process doesn't involve those.
>>
>> First I should again state I'm not a developer, but a sysadmin, and this
>> is the viewpoint from which I speak.
>>
>> The essential difference I see is the user interface the sysadmin will
>> employ to tweak thread placement/behavior.  Hypothetically, say I have a
>> 64 socket Altix UV machine w/8 core CPUs, 512 cores.  Each node board
>> has two sockets, two distinct NUMA nodes, 64 total, but these share a
>> NUMALink hub interface chip connection to the rest of the machine, and
>> share a PCIe mezzanine interface.
>>
>> We obviously want to keep md/RAID housekeeping bandwidth (stripe cache,
>> RMW reads, etc) isolated to the node where it is attached so it doesn't
>> needlessly traverse NUMALink eating precious, limited, 'high' latency
>> NUMAlink system interconnect bandwidth.  We need to keep that free for
>> our parallel application which is eating 100% of the other 504 cores and
>> saturating NUMAlink with MPI and file IO traffic.
>>
>> So lets say I have one NUMA node out of 64 dedicated to block device IO.
>>  It has a PCIe x8 v2 IB 4x QDR HBA (4GB/s) connection to a SAN box with
>> 18 SSDs (and 128 SAS rust).  The SAN RAID ASIC can't keep up with SSD
>> RAID5 IO rates while also doing RAID for the rust.  So we export the
>> SSDs individually and we make 2x 9 drive md/RAID5 arrays.  I've already
>> created a cpuset with this NUMA node for strictly storage related
>> processes including but not limited to XFS utils, backup processes,
>> snapshots, etc, so that the only block IO traversing NUMAlink is user
>> application data.  Now I add another 18 SSDs to the SAN chassis, and
>> another IB HBA to this node board.
>>
>> Ideally, my md/RAID write threads should already be bound to this
>> cpuset.  So all I should need to do is add this 2nd node to the cpuset
>> and I'm done.  No need to monkey with additional md/RAID specific
>> interfaces.
>>
>> Now, that's the simple scenario.  On this particular machine's
>> architecture, you have two NUMA nodes per physical node, so expanding
>> storage hardware on the same node board should be straightforward above.
>>  However, most Altix UV machines will have storage HBAs plugged into
>> many node boards.  If we create one cpuset and put all the md/RAID write
>> theads in it, then we get housekeeping RAID IO traversing the NUMAlink
>> interconnect.  So in this case we'd want to pin the threads to the
>> physical node board where the PCIe cards, and thus disks, are attached.
>>
>> The 'easy' way to do this is simply create multiple cpusets, one for
>> each storage node.  But then you have the downside of administration
>> headaches, because you may need to pin your FS utils, backup, etc to a
>> different storage cpuset depending on which HBAs the filesystem resides,
>> and do this each and every time, which is a nightmare with scheduled
>> jobs.  Thus in this case its probably best to retain the single storage
>> cpuset and simply make sure the node boards share the same upstream
>> switch hop, keeping the traffic as local as possible.  The kernel
>> scheduler might already have some NUMA scheduling intelligence here that
>> works automagically even within a cpuset, to minimize this.  I simply
>> lack knowledge in this area.
>>
>>>> I still like the idea of an 'ioctl' which a process can call and will cause
>>>> it to start handling requests.
>>>> The process could bind itself to whatever cpu or cpuset it wanted to, then
>>>> could call the ioctl on the relevant md array, and pass in a bitmap of cpus
>>>> which indicate which requests it wants to be responsible for.  The current
>>>> kernel thread will then only handle requests that no-one else has put their
>>>> hand up for.  This leave all the details of configuration in user-space
>>>> (where I think it belongs).
>>>
>>> The 'ioctl' way is interesting. But there are something we need answer:
>>>
>>> 1. How kernel knows if there will be process to handle one cpu's requests
>>> before an 'ioctl' is called? I suppose you want 2 ioctls. One ioctl telles
>>> kernel the process handles request from cpus of a cpumask. The other ioctl does
>>> request handling. The process must sleep in the ioctl to wait requests.
>>>
>>> 2. If a process is killed in the middle, how kernel knows? Do you want to hook
>>> something in task management code? For normal process exit, we need another
>>> ioctl to tell kernel the process is exiting.
>>>
>>> The only difference between this way and my way is if the request handling task
>>> is userspace or kernel space. In both ways, you need set affinity and uses
>>> ioctl/sysfs to control requests source for the process.
>>
>> Being a non dev I lack requisite knowledge to comment on ioctls.  I'll
>> simply reiterate that whatever you go with should make use of an
>> existing familiar user interface where this same scheduling is already
>> handled, which is cpusets.  The only difference being kernel vs user
>> space.  Which may turn out to be a problem, I dunno.
> 
> Hmm, there might be misunderstanding here. In my way:

Very likely.

> #echo 3 > /sys/block/md0/md/auxthread_number. Create several kernel threads to
> handle requests. You can use any approach to set smp affinity for the threads.
> You can use cpuset to bind the threads too.

So you have verified that these kernel threads can be placed by the
cpuset calls and shell commands?  Cool, then we're over one hurdle, so
to speak.  So say I create 8 threads with a boot script.  I want to
place 4 each in 2 different cpusets.  Will this work be left for every
sysadmin to figure out and create him/herself, or will you include
scripts/docs/etc to facilitate this integration?

> #echo 1-3 > /sys/block/md0/md/auxth0/cpulist. This doesn't set above threads'
> affinity. It sets which CPU's requests the thread should handle. Regardless
> using my way, cpuset or ioctl, we need the similar way to notify worker thread
> which CPU's requests it should handle (unless we have a hook in scheduler, when
> a thread's affinity is changed, we get a notification)

I don't even know if this is necessary.  From a NUMA perspective, and
all systems are now NUMA, it's far more critical to make sure a RAID
thread is executing on a core/socket to which the HBA is attached via
the PCIe bridge.  You should make it a priority to write code to
identify this path and automatically set RAID thread affinity to that
set of cores.  This keeps the extra mirror and parity write data, RMW
read data, and stripe cache accesses off the NUMA interconnect, as I
stated in a previous email.  This is critical to system performance, no
matter how large or small the system.
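
A rough sketch of that idea, assuming a hypothetical helper that already knows
the HBA's PCI address; local_cpulist is a standard PCI sysfs attribute, and
auxth0/cpulist is the knob from this patch set (this only covers which CPUs'
stripes the thread handles - its actual affinity would still be set separately):

	#include <stdio.h>

	int main(void)
	{
		char cpus[256];
		FILE *in, *out;

		/* example HBA address - it would be discovered, not hard-coded */
		in = fopen("/sys/bus/pci/devices/0000:05:00.0/local_cpulist", "r");
		if (!in)
			return 1;
		if (!fgets(cpus, sizeof(cpus), in)) {
			fclose(in);
			return 1;
		}
		fclose(in);

		/* have the raid5 aux thread handle stripes from HBA-local CPUs */
		out = fopen("/sys/block/md0/md/auxth0/cpulist", "w");
		if (!out)
			return 1;
		fputs(cpus, out);
		fclose(out);
		return 0;
	}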

Once this is accomplished, I see zero downside, from a NUMA standpoint,
to having every RAID thread be able to service every core.  Obviously
this would require some kind of hashing so we don't generate hot spots.
 Does your code already prevent this?  Anyway, I think you can simply
eliminate this tunable parm altogether.

On that note, it would make sense to modify every md/RAID driver to
participate in this hashing.  Users run multiple RAID levels on a given
box, and we want the bandwidth and CPU load spread as evenly as possible,
I would think.

> In the sumary, my approach doesn't prevent you to use CPUSET. Did I miss something?

IMO, it's not enough to simply make it work with cpusets; we should get
some seamless integration.  Now that I think more about this, it should
be possible to get optimal affinity automatically by identifying the
attachment point of the HBA(s), and sticking all RAID threads to cores
on that socket.  If the optimal number of threads to create could be
calculated for any system, you could eliminate all of these tunables,
and everything would be fully automatic.  No need for user-defined parms,
and no need for cpusets.

-- 
Stan


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2013-04-01 19:31               ` Stan Hoeppner
@ 2013-04-02  0:39                 ` Shaohua Li
  2013-04-02  3:12                   ` Stan Hoeppner
  0 siblings, 1 reply; 29+ messages in thread
From: Shaohua Li @ 2013-04-02  0:39 UTC (permalink / raw)
  To: Stan Hoeppner
  Cc: NeilBrown, linux-raid, dan.j.williams, Christoph Hellwig,
	Dave Chinner, Joe Landman

On Mon, Apr 01, 2013 at 02:31:22PM -0500, Stan Hoeppner wrote:
> On 3/31/2013 8:57 PM, Shaohua Li wrote:
> > On Fri, Mar 29, 2013 at 04:36:14AM -0500, Stan Hoeppner wrote:
> >> I'm CC'ing Joe Landman as he's already building systems of the caliber
> >> that would benefit from this write threading and may need configurable
> >> CPU scheduling.  Joe I've not seen a post from you on linux-raid in a
> >> while so I don't know if you've been following this topic.  Shaohua has
> >> created patch sets to eliminate, or dramatically mitigate, the horrible
> >> single threaded write performance of md/RAID 1, 10, 5, 6 on SSD.
> >> Throughput no longer hits a wall due to peaking one core, as with the
> >> currently shipping kernel code.  Your thoughts?
> >>
> >> On 3/28/2013 9:34 PM, Shaohua Li wrote:
> >> ...
> >>> Frankly I don't like the cpuset way. It might just work, but it's just another
> >>> API to control process affinity and has no essential difference against my
> >>> approach (which directly sets process affinity). Generally we use cpuset
> >>> instead of process affinity is because of something like inherit affinity.
> >>> While the raid5 process doesn't involve those.
> >>
> >> First I should again state I'm not a developer, but a sysadmin, and this
> >> is the viewpoint from which I speak.
> >>
> >> The essential difference I see is the user interface the sysadmin will
> >> employ to tweak thread placement/behavior.  Hypothetically, say I have a
> >> 64 socket Altix UV machine w/8 core CPUs, 512 cores.  Each node board
> >> has two sockets, two distinct NUMA nodes, 64 total, but these share a
> >> NUMALink hub interface chip connection to the rest of the machine, and
> >> share a PCIe mezzanine interface.
> >>
> >> We obviously want to keep md/RAID housekeeping bandwidth (stripe cache,
> >> RMW reads, etc) isolated to the node where it is attached so it doesn't
> >> needlessly traverse NUMALink eating precious, limited, 'high' latency
> >> NUMAlink system interconnect bandwidth.  We need to keep that free for
> >> our parallel application which is eating 100% of the other 504 cores and
> >> saturating NUMAlink with MPI and file IO traffic.
> >>
> >> So lets say I have one NUMA node out of 64 dedicated to block device IO.
> >>  It has a PCIe x8 v2 IB 4x QDR HBA (4GB/s) connection to a SAN box with
> >> 18 SSDs (and 128 SAS rust).  The SAN RAID ASIC can't keep up with SSD
> >> RAID5 IO rates while also doing RAID for the rust.  So we export the
> >> SSDs individually and we make 2x 9 drive md/RAID5 arrays.  I've already
> >> created a cpuset with this NUMA node for strictly storage related
> >> processes including but not limited to XFS utils, backup processes,
> >> snapshots, etc, so that the only block IO traversing NUMAlink is user
> >> application data.  Now I add another 18 SSDs to the SAN chassis, and
> >> another IB HBA to this node board.
> >>
> >> Ideally, my md/RAID write threads should already be bound to this
> >> cpuset.  So all I should need to do is add this 2nd node to the cpuset
> >> and I'm done.  No need to monkey with additional md/RAID specific
> >> interfaces.
> >>
> >> Now, that's the simple scenario.  On this particular machine's
> >> architecture, you have two NUMA nodes per physical node, so expanding
> >> storage hardware on the same node board should be straightforward above.
> >>  However, most Altix UV machines will have storage HBAs plugged into
> >> many node boards.  If we create one cpuset and put all the md/RAID write
> >> theads in it, then we get housekeeping RAID IO traversing the NUMAlink
> >> interconnect.  So in this case we'd want to pin the threads to the
> >> physical node board where the PCIe cards, and thus disks, are attached.
> >>
> >> The 'easy' way to do this is simply create multiple cpusets, one for
> >> each storage node.  But then you have the downside of administration
> >> headaches, because you may need to pin your FS utils, backup, etc to a
> >> different storage cpuset depending on which HBAs the filesystem resides,
> >> and do this each and every time, which is a nightmare with scheduled
> >> jobs.  Thus in this case its probably best to retain the single storage
> >> cpuset and simply make sure the node boards share the same upstream
> >> switch hop, keeping the traffic as local as possible.  The kernel
> >> scheduler might already have some NUMA scheduling intelligence here that
> >> works automagically even within a cpuset, to minimize this.  I simply
> >> lack knowledge in this area.
> >>
> >>>> I still like the idea of an 'ioctl' which a process can call and will cause
> >>>> it to start handling requests.
> >>>> The process could bind itself to whatever cpu or cpuset it wanted to, then
> >>>> could call the ioctl on the relevant md array, and pass in a bitmap of cpus
> >>>> which indicate which requests it wants to be responsible for.  The current
> >>>> kernel thread will then only handle requests that no-one else has put their
> >>>> hand up for.  This leave all the details of configuration in user-space
> >>>> (where I think it belongs).
> >>>
> >>> The 'ioctl' way is interesting. But there are something we need answer:
> >>>
> >>> 1. How kernel knows if there will be process to handle one cpu's requests
> >>> before an 'ioctl' is called? I suppose you want 2 ioctls. One ioctl telles
> >>> kernel the process handles request from cpus of a cpumask. The other ioctl does
> >>> request handling. The process must sleep in the ioctl to wait requests.
> >>>
> >>> 2. If a process is killed in the middle, how kernel knows? Do you want to hook
> >>> something in task management code? For normal process exit, we need another
> >>> ioctl to tell kernel the process is exiting.
> >>>
> >>> The only difference between this way and my way is if the request handling task
> >>> is userspace or kernel space. In both ways, you need set affinity and uses
> >>> ioctl/sysfs to control requests source for the process.
> >>
> >> Being a non dev I lack requisite knowledge to comment on ioctls.  I'll
> >> simply reiterate that whatever you go with should make use of an
> >> existing familiar user interface where this same scheduling is already
> >> handled, which is cpusets.  The only difference being kernel vs user
> >> space.  Which may turn out to be a problem, I dunno.
> > 
> > Hmm, there might be misunderstanding here. In my way:
> 
> Very likely.
> 
> > #echo 3 > /sys/block/md0/md/auxthread_number. Create several kernel threads to
> > handle requests. You can use any approach to set smp affinity for the threads.
> > You can use cpuset to bind the threads too.
> 
> So you have verified that these kernel threads can be placed by the
> cpuset calls and shell commands?  Cool, then we're over one hurdle, so
> to speak.  So say I create 8 threads with a boot script.  I want to
> place 4 each in 2 different cpusets.  Will this work be left for every
> sysadmin to figure out and create him/herself, or will you include
> scripts/docs/etc to facilitate this integration?

Sure, I verified that cpusets can be applied to kernel threads. No, I don't have scripts.
 
> > #echo 1-3 > /sys/block/md0/md/auxth0/cpulist. This doesn't set above threads'
> > affinity. It sets which CPU's requests the thread should handle. Regardless
> > using my way, cpuset or ioctl, we need the similar way to notify worker thread
> > which CPU's requests it should handle (unless we have a hook in scheduler, when
> > a thread's affinity is changed, we get a notification)
> 
> I don't even know if this is necessary.  From a NUMA perspective, and
> all systems are now NUMA, it's far more critical to make sure a RAID
> thread is executing on a core/socket to which the HBA is attached via
> the PCIe bridge.  You should make it a priority to write code to
> identify this path and automatically set RAID thread affinity to that
> set of cores.  This keeps the extra mirror and parity write data, RMW
> read data, and stripe cache accesses off the NUMA interconnect, as I
> stated in a previous email.  This is critical to system performance, no
> matter how large or small the system.
> 
> Once this is accomplished, I see zero downside, from a NUMA standpoint,
> to having every RAID thread be able to service every core.  Obviously
> this would require some kind of hashing so we don't generate hot spots.
>  Does your code already prevent this?  Anyway, I think you can simply
> eliminate this tunable parm altogether.
> 
> On that note, it would make sense to modify every md/RAID driver to
> participate in this hashing.  Users run multiple RAID levels on a given
> box, and we want the bandwidth and CPU load spread as evenly as possible
> I would think.
> 
> > In the sumary, my approach doesn't prevent you to use CPUSET. Did I miss something?
> 
> IMO, it's not enough to simply make it work with cpusets, but to get
> some seamless integration.  Now that I think more about this, it should
> be possible to get optimal affinity automatically by identifying the
> attachment point of the HBA(s), and sticking all RAID threads to cores
> on that socket.  If the optimal number of threads to create could be
> calculated for any system, you could eliminate all of these tunables,
> and everything be be fully automatic.  No need for user defined parms,
> and no need for cpusets.

I understand. It's always preferable for everything to be set automatically with
the best performance. But last time I checked, a different optimal thread number
applies to different setups/workloads. After some discussions, we decided to add
some tunables. This isn't convenient from the user's point of view, but it's hard
to determine the optimal tunable value.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch 2/2 v3]raid5: create multiple threads to handle stripes
  2013-04-02  0:39                 ` Shaohua Li
@ 2013-04-02  3:12                   ` Stan Hoeppner
  0 siblings, 0 replies; 29+ messages in thread
From: Stan Hoeppner @ 2013-04-02  3:12 UTC (permalink / raw)
  To: Shaohua Li
  Cc: NeilBrown, linux-raid, dan.j.williams, Christoph Hellwig,
	Dave Chinner, Joe Landman

On 4/1/2013 7:39 PM, Shaohua Li wrote:
> On Mon, Apr 01, 2013 at 02:31:22PM -0500, Stan Hoeppner wrote:
>> On 3/31/2013 8:57 PM, Shaohua Li wrote:
>>> On Fri, Mar 29, 2013 at 04:36:14AM -0500, Stan Hoeppner wrote:
>>>> I'm CC'ing Joe Landman as he's already building systems of the caliber
>>>> that would benefit from this write threading and may need configurable
>>>> CPU scheduling.  Joe I've not seen a post from you on linux-raid in a
>>>> while so I don't know if you've been following this topic.  Shaohua has
>>>> created patch sets to eliminate, or dramatically mitigate, the horrible
>>>> single threaded write performance of md/RAID 1, 10, 5, 6 on SSD.
>>>> Throughput no longer hits a wall due to peaking one core, as with the
>>>> currently shipping kernel code.  Your thoughts?
>>>>
>>>> On 3/28/2013 9:34 PM, Shaohua Li wrote:
>>>> ...
>>>>> Frankly I don't like the cpuset way. It might just work, but it's just another
>>>>> API to control process affinity and has no essential difference against my
>>>>> approach (which directly sets process affinity). Generally we use cpuset
>>>>> instead of process affinity is because of something like inherit affinity.
>>>>> While the raid5 process doesn't involve those.
>>>>
>>>> First I should again state I'm not a developer, but a sysadmin, and this
>>>> is the viewpoint from which I speak.
>>>>
>>>> The essential difference I see is the user interface the sysadmin will
>>>> employ to tweak thread placement/behavior.  Hypothetically, say I have a
>>>> 64 socket Altix UV machine w/8 core CPUs, 512 cores.  Each node board
>>>> has two sockets, two distinct NUMA nodes, 64 total, but these share a
>>>> NUMALink hub interface chip connection to the rest of the machine, and
>>>> share a PCIe mezzanine interface.
>>>>
>>>> We obviously want to keep md/RAID housekeeping bandwidth (stripe cache,
>>>> RMW reads, etc) isolated to the node where it is attached so it doesn't
>>>> needlessly traverse NUMALink eating precious, limited, 'high' latency
>>>> NUMAlink system interconnect bandwidth.  We need to keep that free for
>>>> our parallel application which is eating 100% of the other 504 cores and
>>>> saturating NUMAlink with MPI and file IO traffic.
>>>>
>>>> So lets say I have one NUMA node out of 64 dedicated to block device IO.
>>>>  It has a PCIe x8 v2 IB 4x QDR HBA (4GB/s) connection to a SAN box with
>>>> 18 SSDs (and 128 SAS rust).  The SAN RAID ASIC can't keep up with SSD
>>>> RAID5 IO rates while also doing RAID for the rust.  So we export the
>>>> SSDs individually and we make 2x 9 drive md/RAID5 arrays.  I've already
>>>> created a cpuset with this NUMA node for strictly storage related
>>>> processes including but not limited to XFS utils, backup processes,
>>>> snapshots, etc, so that the only block IO traversing NUMAlink is user
>>>> application data.  Now I add another 18 SSDs to the SAN chassis, and
>>>> another IB HBA to this node board.
>>>>
>>>> Ideally, my md/RAID write threads should already be bound to this
>>>> cpuset.  So all I should need to do is add this 2nd node to the cpuset
>>>> and I'm done.  No need to monkey with additional md/RAID specific
>>>> interfaces.
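(In concrete terms that step could be as small as the following sketch, assuming
the storage cpuset already exists under /sys/fs/cgroup/cpuset, that node 1 held
CPUs 8-15, and that the new node 2 adds CPUs 16-23; the paths and CPU numbers
are illustrative only.)

  echo 8-23 > /sys/fs/cgroup/cpuset/storage/cpuset.cpus  # was 8-15; add node 2's cores
  echo 1-2  > /sys/fs/cgroup/cpuset/storage/cpuset.mems  # allow memory from both nodes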
>>>>
>>>> Now, that's the simple scenario.  On this particular machine's
>>>> architecture, you have two NUMA nodes per physical node, so expanding
>>>> storage hardware on the same node board should be straightforward, as above.
>>>>  However, most Altix UV machines will have storage HBAs plugged into
>>>> many node boards.  If we create one cpuset and put all the md/RAID write
>>>> threads in it, then we get housekeeping RAID IO traversing the NUMAlink
>>>> interconnect.  So in this case we'd want to pin the threads to the
>>>> physical node board where the PCIe cards, and thus disks, are attached.
>>>>
>>>> The 'easy' way to do this is simply create multiple cpusets, one for
>>>> each storage node.  But then you have the downside of administration
>>>> headaches, because you may need to pin your FS utils, backup, etc to a
>>>> different storage cpuset depending on which HBAs the filesystem resides
>>>> on, and do this each and every time, which is a nightmare with scheduled
>>>> jobs.  Thus in this case it's probably best to retain the single storage
>>>> cpuset and simply make sure the node boards share the same upstream
>>>> switch hop, keeping the traffic as local as possible.  The kernel
>>>> scheduler might already have some NUMA scheduling intelligence here that
>>>> works automagically even within a cpuset, to minimize this.  I simply
>>>> lack knowledge in this area.
>>>>
>>>>>> I still like the idea of an 'ioctl' which a process can call and will cause
>>>>>> it to start handling requests.
>>>>>> The process could bind itself to whatever cpu or cpuset it wanted to, then
>>>>>> could call the ioctl on the relevant md array, and pass in a bitmap of cpus
>>>>>> which indicate which requests it wants to be responsible for.  The current
>>>>>> kernel thread will then only handle requests that no-one else has put their
>>>>>> hand up for.  This leaves all the details of configuration in user-space
>>>>>> (where I think it belongs).
>>>>>
>>>>> The 'ioctl' way is interesting. But there are some things we need to answer:
>>>>>
>>>>> 1. How does the kernel know whether there will be a process to handle a given
>>>>> CPU's requests before an 'ioctl' is called? I suppose you want two ioctls: one
>>>>> tells the kernel that the process will handle requests from the CPUs in a
>>>>> cpumask; the other does the request handling. The process must sleep in the
>>>>> ioctl to wait for requests.
>>>>>
>>>>> 2. If the process is killed in the middle, how does the kernel know? Do you
>>>>> want to hook something into the task management code? For a normal process
>>>>> exit, we need yet another ioctl to tell the kernel the process is exiting.
>>>>>
>>>>> The only difference between this approach and mine is whether the request
>>>>> handling task is in user space or kernel space. In both cases, you need to set
>>>>> affinity and use an ioctl/sysfs interface to control which CPUs' requests the
>>>>> process handles.
>>>>
>>>> Being a non-dev, I lack the requisite knowledge to comment on ioctls.  I'll
>>>> simply reiterate that whatever you go with should make use of an
>>>> existing familiar user interface where this same scheduling is already
>>>> handled, which is cpusets.  The only difference being kernel vs user
>>>> space.  Which may turn out to be a problem, I dunno.
>>>
>>> Hmm, there might be a misunderstanding here. In my approach:
>>
>> Very likely.
>>
>>> #echo 3 > /sys/block/md0/md/auxthread_number. This creates several kernel
>>> threads to handle requests. You can use any approach to set SMP affinity for
>>> the threads; you can use a cpuset to bind them too.
>>
>> So you have verified that these kernel threads can be placed by the
>> cpuset calls and shell commands?  Cool, then we're over one hurdle, so
>> to speak.  So say I create 8 threads with a boot script.  I want to
>> place 4 each in 2 different cpusets.  Will this work be left for every
>> sysadmin to figure out and create him/herself, or will you include
>> scripts/docs/etc to facilitate this integration?
> 
> Sure, I verified that cpusets can be applied to kernel threads.

> No, I don't have scripts.

So your position is that you have no desire to integrate your features
with standard Linux interfaces that would normally be used with such
features.  Nor provide documentation on how to do so.  Is this correct?

Shaohua, Neil doesn't care much for me on most occasions because I'm
fond of pointing out faults with md/RAID, and I often tease him by
pointing out that md/RAID is used mostly by hobbyists and rarely in the
enterprise.

The increased write performance afforded by your multithread patches has
the potential to be a game changer here, and drive adoption of md/RAID
much higher up the food chain.  If you'd like this to occur, I think it
would be well worth your time and effort to identify other enterprise
level features, such as cpusets, that will integrate with this, and make
using these as easy as possible.
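As a rough illustration of the kind of integration being asked for, a sketch
using the cpuset cgroup interface, assuming it is mounted at
/sys/fs/cgroup/cpuset, that the HBA sits on NUMA node 1 with CPUs 8-15, and that
the auxiliary threads can be found by name (the md0_aux* naming is an assumption,
not something the patch guarantees):

  mkdir -p /sys/fs/cgroup/cpuset/storage
  echo 8-15 > /sys/fs/cgroup/cpuset/storage/cpuset.cpus  # cores local to the HBA's node
  echo 1    > /sys/fs/cgroup/cpuset/storage/cpuset.mems  # memory from that node only
  echo 2 > /sys/block/md0/md/auxthread_number            # create two auxiliary threads
  for pid in $(pgrep md0_aux); do                        # thread names assumed
      echo $pid > /sys/fs/cgroup/cpuset/storage/tasks    # move them into the cpuset
  done
  echo 8-15 > /sys/block/md0/md/auxth0/cpulist           # ...and have each thread handle
  echo 8-15 > /sys/block/md0/md/auxth1/cpulist           # stripes produced on those CPUs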

>>> #echo 1-3 > /sys/block/md0/md/auxth0/cpulist. This doesn't set the thread's
>>> affinity. It sets which CPUs' requests the thread should handle. Whether we
>>> use my way, cpusets, or an ioctl, we need a similar way to tell a worker
>>> thread which CPUs' requests it should handle (unless we add a scheduler hook
>>> so we get a notification when a thread's affinity changes).
>>
>> I don't even know if this is necessary.  From a NUMA perspective, and
>> all systems are now NUMA, it's far more critical to make sure a RAID
>> thread is executing on a core/socket to which the HBA is attached via
>> the PCIe bridge.  You should make it a priority to write code to
>> identify this path and automatically set RAID thread affinity to that
>> set of cores.  This keeps the extra mirror and parity write data, RMW
>> read data, and stripe cache accesses off the NUMA interconnect, as I
>> stated in a previous email.  This is critical to system performance, no
>> matter how large or small the system.
>>
>> Once this is accomplished, I see zero downside, from a NUMA standpoint,
>> to having every RAID thread be able to service every core.  Obviously
>> this would require some kind of hashing so we don't generate hot spots.
>>  Does your code already prevent this?  Anyway, I think you can simply
>> eliminate this tunable parm altogether.
>>
>> On that note, it would make sense to modify every md/RAID driver to
>> participate in this hashing.  Users run multiple RAID levels on a given
>> box, and we want the bandwidth and CPU load spread as evenly as possible,
>> I would think.
>>
>>> In summary, my approach doesn't prevent you from using cpusets. Did I miss something?
>>
>> IMO, it's not enough to simply make it work with cpusets; the goal should
>> be seamless integration.  Now that I think more about this, it should
>> be possible to get optimal affinity automatically by identifying the
>> attachment point of the HBA(s), and sticking all RAID threads to cores
>> on that socket.  If the optimal number of threads to create could be
>> calculated for any system, you could eliminate all of these tunables,
>> and everything would be fully automatic.  No need for user defined parms,
>> and no need for cpusets.
> 
> I understand. It's always preferable for everything to be set automatically with
> the best performance. But last time I checked, the optimal thread count differs
> across setups and workloads. After some discussion, we decided to add some
> tunables. This isn't convenient from the user's point of view, but it's hard to
> determine the optimal tunable value automatically.

Actually I don't see how it is hard to determine.  You can identify
which HBA(s) the disks of a RAID set are attached to.  You can identify
which NUMA node the HBA(s) are attached to.  If you simply spawn a
thread for each RAID set on each core in this NUMA node then you're
fully optimized for the simple case on 1-4 socket machines, and nobody
should ever need to turn the knobs on this class of machines.  I've not
thought through all the possible configurations on a big NUMA machine such
as Altix, but my gut instinct says this method would work well as the
default setup there as well.
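
Both lookups can in fact be done from sysfs; a rough sketch, assuming the array
is md0 and sda is one of its component devices (the loop simply climbs from the
SCSI device toward the first ancestor, normally the HBA's PCI device, that
exposes a numa_node attribute):

  ls /sys/block/md0/slaves                    # component devices of the array
  dev=$(readlink -f /sys/block/sda/device)    # start at the component's SCSI device
  while [ "$dev" != "/" ] && [ ! -e "$dev/numa_node" ]; do
      dev=$(dirname "$dev")                   # walk up toward the HBA
  done
  cat "$dev/numa_node"                        # NUMA node the HBA is attached to
  cat "$dev/local_cpulist"                    # CPUs local to that node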

You certainly don't want md/RAID threads executing on any NUMA node to
which the HBAs are not attached.

-- 
Stan




^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2013-04-02  3:12 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-08-09  8:58 [patch 2/2 v3]raid5: create multiple threads to handle stripes Shaohua Li
2012-08-11  8:45 ` Jianpeng Ma
2012-08-13  0:21   ` Shaohua Li
2012-08-13  1:06     ` Jianpeng Ma
2012-08-13  2:13       ` Shaohua Li
2012-08-13  2:20         ` Shaohua Li
2012-08-13  2:25           ` Jianpeng Ma
2012-08-13  4:21           ` NeilBrown
2012-08-14 10:39           ` Jianpeng Ma
2012-08-15  3:51             ` Shaohua Li
2012-08-15  6:21               ` Jianpeng Ma
2012-08-15  8:04                 ` Shaohua Li
2012-08-15  8:19                   ` Jianpeng Ma
2012-09-24 11:15                   ` Jianpeng Ma
2012-09-26  1:26                     ` NeilBrown
2012-08-13  9:11     ` Jianpeng Ma
2012-08-13  4:29 ` NeilBrown
2012-08-13  6:22   ` Shaohua Li
2013-03-07  7:31 ` Shaohua Li
2013-03-12  1:39   ` NeilBrown
2013-03-13  0:44     ` Stan Hoeppner
2013-03-28  6:47       ` NeilBrown
2013-03-28 16:53         ` Stan Hoeppner
2013-03-29  2:34         ` Shaohua Li
2013-03-29  9:36           ` Stan Hoeppner
2013-04-01  1:57             ` Shaohua Li
2013-04-01 19:31               ` Stan Hoeppner
2013-04-02  0:39                 ` Shaohua Li
2013-04-02  3:12                   ` Stan Hoeppner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).