Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH] r5cache: allow adding journal to array without journal
From: Song Liu @ 2017-03-14  0:09 UTC (permalink / raw)
  To: linux-raid
  Cc: shli, neilb, kernel-team, dan.j.williams, hch, jsorensen,
	Song Liu

This seems just work.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 Manage.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Manage.c b/Manage.c
index 5c3d2b9..9ab999d 100644
--- a/Manage.c
+++ b/Manage.c
@@ -963,11 +963,6 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 
 		sysfs_free(mdp);
 
-		tst->ss->getinfo_super(tst, &mdi, NULL);
-		if (mdi.journal_device_required == 0) {
-			pr_err("%s does not support journal device.\n", devname);
-			return -1;
-		}
 		disc.raid_disk = 0;
 	}
 
-- 
2.9.3


^ permalink raw reply related

* [PATCH v2] r5cache: allow adding journal to array without journal
From: Song Liu @ 2017-03-14  0:11 UTC (permalink / raw)
  To: linux-raid
  Cc: shli, neilb, kernel-team, dan.j.williams, hch, jsorensen,
	Song Liu

This seems just work.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 Manage.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/Manage.c b/Manage.c
index 5c3d2b9..5ff5de3 100644
--- a/Manage.c
+++ b/Manage.c
@@ -946,7 +946,6 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 
 	/* only add journal to array that supports journaling */
 	if (dv->disposition == 'j') {
-		struct mdinfo mdi;
 		struct mdinfo *mdp;
 
 		mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
@@ -963,11 +962,6 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 
 		sysfs_free(mdp);
 
-		tst->ss->getinfo_super(tst, &mdi, NULL);
-		if (mdi.journal_device_required == 0) {
-			pr_err("%s does not support journal device.\n", devname);
-			return -1;
-		}
 		disc.raid_disk = 0;
 	}
 
-- 
2.9.3


^ permalink raw reply related

* [PATCH V4] md: move bitmap_destroy to the beginning of __md_stop
From: Guoqing Jiang @ 2017-03-14  1:40 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, shli, Guoqing Jiang

Since we have switched to sync way to handle METADATA_UPDATED
msg for md-cluster, then process_metadata_update is depended
on mddev->thread->wqueue.

With the new change, clustered raid could possible hang if
array received a METADATA_UPDATED msg after array unregistered
mddev->thread, so we need to stop clustered raid (bitmap_destroy
-> bitmap_free -> md_cluster_stop) earlier than unregister
thread (mddev_detach -> md_unregister_thread).

And this change should be safe for non-clustered raid since
all writes are stopped before the destroy. Also in md_run,
we activate the personality (pers->run()) before activating
the bitmap (bitmap_create()). So it is pleasingly symmetric
to stop the bitmap (bitmap_destroy()) before stopping the
personality (__md_stop() calls pers->free()), we achieve this
by move bitmap_destroy to the beginning of __md_stop.

But we don't want to break the codes for waiting behind IO as
Shaohua mentioned, so introduce bitmap_wait_behind_writes to
call the codes, and call the new fun in both mddev_detach and
bitmap_destroy, then we will not break original behind IO code
and also fit the new condition well.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
Changes from v3:
1. move bitmap_destroy to __md_stop
2. add bitmap_wait_behind_writes to handle behind IO

 drivers/md/bitmap.c | 17 +++++++++++++++++
 drivers/md/bitmap.h |  1 +
 drivers/md/md.c     | 13 ++-----------
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b6fa55a3cff8..20cad80d6e34 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1764,6 +1764,21 @@ void bitmap_free(struct bitmap *bitmap)
 }
 EXPORT_SYMBOL(bitmap_free);
 
+void bitmap_wait_behind_writes(struct mddev *mddev)
+{
+	struct bitmap *bitmap = mddev->bitmap;
+
+	/* wait for behind writes to complete */
+	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
+			 mdname(mddev));
+		/* need to kick something here to make sure I/O goes? */
+		wait_event(bitmap->behind_wait,
+			   atomic_read(&bitmap->behind_writes) == 0);
+	}
+}
+EXPORT_SYMBOL(bitmap_wait_behind_writes);
+
 void bitmap_destroy(struct mddev *mddev)
 {
 	struct bitmap *bitmap = mddev->bitmap;
@@ -1771,6 +1786,8 @@ void bitmap_destroy(struct mddev *mddev)
 	if (!bitmap) /* there was no bitmap */
 		return;
 
+	bitmap_wait_behind_writes(mddev);
+
 	mutex_lock(&mddev->bitmap_info.mutex);
 	spin_lock(&mddev->lock);
 	mddev->bitmap = NULL; /* disconnect from the md device */
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 9f761097aab2..d15721ac07a6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -271,6 +271,7 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
 int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 				sector_t *lo, sector_t *hi, bool clear_bits);
 void bitmap_free(struct bitmap *bitmap);
+void bitmap_wait_behind_writes(struct mddev *mddev);
 #endif
 
 #endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 79a99a1c9ce7..dc131fabfc7c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5534,15 +5534,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
 
 static void mddev_detach(struct mddev *mddev)
 {
-	struct bitmap *bitmap = mddev->bitmap;
-	/* wait for behind writes to complete */
-	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
-		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
-			 mdname(mddev));
-		/* need to kick something here to make sure I/O goes? */
-		wait_event(bitmap->behind_wait,
-			   atomic_read(&bitmap->behind_writes) == 0);
-	}
+	bitmap_wait_behind_writes(mddev);
 	if (mddev->pers && mddev->pers->quiesce) {
 		mddev->pers->quiesce(mddev, 1);
 		mddev->pers->quiesce(mddev, 0);
@@ -5555,6 +5547,7 @@ static void mddev_detach(struct mddev *mddev)
 static void __md_stop(struct mddev *mddev)
 {
 	struct md_personality *pers = mddev->pers;
+	bitmap_destroy(mddev);
 	mddev_detach(mddev);
 	/* Ensure ->event_work is done */
 	flush_workqueue(md_misc_wq);
@@ -5575,7 +5568,6 @@ void md_stop(struct mddev *mddev)
 	 * This is called from dm-raid
 	 */
 	__md_stop(mddev);
-	bitmap_destroy(mddev);
 	if (mddev->bio_set)
 		bioset_free(mddev->bio_set);
 }
@@ -5713,7 +5705,6 @@ static int do_md_stop(struct mddev *mddev, int mode,
 	if (mode == 0) {
 		pr_info("md: %s stopped.\n", mdname(mddev));
 
-		bitmap_destroy(mddev);
 		if (mddev->bitmap_info.file) {
 			struct file *f = mddev->bitmap_info.file;
 			spin_lock(&mddev->lock);
-- 
2.6.2


^ permalink raw reply related

* [PATCH v1] md:fix a trivial typo in comments
From: Zhilong Liu @ 2017-03-14  7:52 UTC (permalink / raw)
  To: shli
  Cc: linux-raid, Zhilong Liu, Jack Wang, Guoqing Jiang, John Stoffel,
	Coly Li
In-Reply-To: <1489384407-12672-1-git-send-email-zlliu@suse.com>

raid1.c: fix a trivial typo in comments of freeze_array().

Cc: Jack Wang <jack.wang.usish@gmail.com>
Cc: Guoqing Jiang <gqjiang@suse.com>
Cc: John Stoffel <john@stoffel.org>
Cc: Coly Li <colyli@suse.de>
Signed-off-by: Zhilong Liu <zlliu@suse.com>
---
 drivers/md/raid1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fbc2d78..92ca6de 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1027,7 +1027,7 @@ static int get_unqueued_pending(struct r1conf *conf)
 static void freeze_array(struct r1conf *conf, int extra)
 {
 	/* Stop sync I/O and normal I/O and wait for everything to
-	 * go quite.
+	 * go quiet.
 	 * This is called in two situations:
 	 * 1) management command handlers (reshape, remove disk, quiesce).
 	 * 2) one normal I/O request failed.
-- 
2.6.6


^ permalink raw reply related

* Re: [PATCH v1] md:fix a trivial typo in comments
From: Coly Li @ 2017-03-14  8:23 UTC (permalink / raw)
  To: Zhilong Liu, shli; +Cc: linux-raid, Jack Wang, Guoqing Jiang, John Stoffel
In-Reply-To: <1489477946-4202-1-git-send-email-zlliu@suse.com>

On 2017/3/14 下午3:52, Zhilong Liu wrote:
> raid1.c: fix a trivial typo in comments of freeze_array().
> 
> Cc: Jack Wang <jack.wang.usish@gmail.com>
> Cc: Guoqing Jiang <gqjiang@suse.com>
> Cc: John Stoffel <john@stoffel.org>
> Cc: Coly Li <colyli@suse.de>
> Signed-off-by: Zhilong Liu <zlliu@suse.com>

Acked-by: Coly Li <colyli@suse.de>

> ---
>  drivers/md/raid1.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index fbc2d78..92ca6de 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1027,7 +1027,7 @@ static int get_unqueued_pending(struct r1conf *conf)
>  static void freeze_array(struct r1conf *conf, int extra)
>  {
>  	/* Stop sync I/O and normal I/O and wait for everything to
> -	 * go quite.
> +	 * go quiet.
>  	 * This is called in two situations:
>  	 * 1) management command handlers (reshape, remove disk, quiesce).
>  	 * 2) one normal I/O request failed.
> 


^ permalink raw reply

* Re: [PATCH 08/29] drivers, md: convert mddev.active from atomic_t to refcount_t
From: Michael Ellerman @ 2017-03-14 12:11 UTC (permalink / raw)
  To: gregkh
  Cc: linux-kernel, xen-devel, netdev, linux1394-devel, linux-bcache,
	linux-raid, linux-media, devel, linux-pci, linux-s390, fcoe-devel,
	linux-scsi, open-iscsi, devel, target-devel, linux-serial,
	linux-usb, peterz, Elena Reshetova, Hans Liljestrand, Kees Cook,
	David Windsor, linuxppc-dev
In-Reply-To: <1488810076-3754-9-git-send-email-elena.reshetova@intel.com>

Elena Reshetova <elena.reshetova@intel.com> writes:

> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
>
> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
> Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
> Signed-off-by: Kees Cook <keescook@chromium.org>
> Signed-off-by: David Windsor <dwindsor@gmail.com>
> ---
>  drivers/md/md.c | 6 +++---
>  drivers/md/md.h | 3 ++-
>  2 files changed, 5 insertions(+), 4 deletions(-)

When booting linux-next (specifically 5be4921c9958ec) I'm seeing the
backtrace below. I suspect this patch is just exposing an existing
issue?

cheers


[    0.230738] md: Waiting for all devices to be available before autodetect
[    0.230742] md: If you don't use raid, use raid=noautodetect
[    0.230962] refcount_t: increment on 0; use-after-free.
[    0.230988] ------------[ cut here ]------------
[    0.230996] WARNING: CPU: 0 PID: 1 at lib/refcount.c:114 .refcount_inc+0x5c/0x70
[    0.231001] Modules linked in:
[    0.231006] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.11.0-rc1-gccN-next-20170310-g5be4921 #1
[    0.231012] task: c000000049400000 task.stack: c000000049440000
[    0.231016] NIP: c0000000005ac6bc LR: c0000000005ac6b8 CTR: c000000000743390
[    0.231021] REGS: c000000049443160 TRAP: 0700   Not tainted  (4.11.0-rc1-gccN-next-20170310-g5be4921)
[    0.231026] MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>
[    0.231033]   CR: 24024422  XER: 0000000c
[    0.231038] CFAR: c000000000a5356c SOFTE: 1 
[    0.231038] GPR00: c0000000005ac6b8 c0000000494433e0 c000000001079d00 000000000000002b 
[    0.231038] GPR04: 0000000000000000 00000000000000ef 0000000000000000 c0000000010418a0 
[    0.231038] GPR08: 000000004af80000 c000000000ecc9a8 c000000000ecc9a8 0000000000000000 
[    0.231038] GPR12: 0000000028024824 c000000006bb0000 0000000000000000 c000000049443a00 
[    0.231038] GPR16: 0000000000000000 c000000049443a10 0000000000000000 0000000000000000 
[    0.231038] GPR20: 0000000000000000 0000000000000000 c000000000f7dd20 0000000000000000 
[    0.231038] GPR24: 00000000014080c0 c0000000012060b8 c000000001206080 0000000000000009 
[    0.231038] GPR28: c000000000f7dde0 0000000000900000 0000000000000000 c0000000461ae800 
[    0.231100] NIP [c0000000005ac6bc] .refcount_inc+0x5c/0x70
[    0.231104] LR [c0000000005ac6b8] .refcount_inc+0x58/0x70
[    0.231108] Call Trace:
[    0.231112] [c0000000494433e0] [c0000000005ac6b8] .refcount_inc+0x58/0x70 (unreliable)
[    0.231120] [c000000049443450] [c00000000086c008] .mddev_find+0x1e8/0x430
[    0.231125] [c000000049443530] [c000000000872b6c] .md_open+0x2c/0x140
[    0.231132] [c0000000494435c0] [c0000000003962a4] .__blkdev_get+0xd4/0x520
[    0.231138] [c000000049443690] [c000000000396cc0] .blkdev_get+0x1c0/0x4f0
[    0.231145] [c000000049443790] [c000000000336d64] .do_dentry_open.isra.1+0x2a4/0x410
[    0.231152] [c000000049443830] [c0000000003523f4] .path_openat+0x624/0x1580
[    0.231157] [c000000049443990] [c000000000354ce4] .do_filp_open+0x84/0x120
[    0.231163] [c000000049443b10] [c000000000338d74] .do_sys_open+0x214/0x300
[    0.231170] [c000000049443be0] [c000000000da69ac] .md_run_setup+0xa0/0xec
[    0.231176] [c000000049443c60] [c000000000da4fbc] .prepare_namespace+0x60/0x240
[    0.231182] [c000000049443ce0] [c000000000da47a8] .kernel_init_freeable+0x330/0x36c
[    0.231190] [c000000049443db0] [c00000000000dc44] .kernel_init+0x24/0x160
[    0.231197] [c000000049443e30] [c00000000000badc] .ret_from_kernel_thread+0x58/0x7c
[    0.231202] Instruction dump:
[    0.231206] 60000000 3d22ffee 89296bfb 2f890000 409effdc 3c62ffc6 39200001 3d42ffee 
[    0.231216] 38630928 992a6bfb 484a6e79 60000000 <0fe00000> 4bffffb8 60000000 60000000 
[    0.231226] ---[ end trace 8c51f269ad91ffc2 ]---
[    0.231233] md: Autodetecting RAID arrays.
[    0.231236] md: autorun ...
[    0.231239] md: ... autorun DONE.
[    0.234188] EXT4-fs (sda4): mounting ext3 file system using the ext4 subsystem
[    0.250506] refcount_t: underflow; use-after-free.
[    0.250531] ------------[ cut here ]------------
[    0.250537] WARNING: CPU: 0 PID: 3 at lib/refcount.c:207 .refcount_dec_not_one+0x104/0x120
[    0.250542] Modules linked in:
[    0.250546] CPU: 0 PID: 3 Comm: kworker/0:0 Tainted: G        W       4.11.0-rc1-gccN-next-20170310-g5be4921 #1
[    0.250553] Workqueue: events .delayed_fput
[    0.250557] task: c000000049404900 task.stack: c000000049448000
[    0.250562] NIP: c0000000005ac964 LR: c0000000005ac960 CTR: c000000000743390
[    0.250567] REGS: c00000004944b530 TRAP: 0700   Tainted: G        W        (4.11.0-rc1-gccN-next-20170310-g5be4921)
[    0.250572] MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>
[    0.250578]   CR: 24002422  XER: 00000007
[    0.250584] CFAR: c000000000a5356c SOFTE: 1 
[    0.250584] GPR00: c0000000005ac960 c00000004944b7b0 c000000001079d00 0000000000000026 
[    0.250584] GPR04: 0000000000000000 0000000000000113 0000000000000000 c0000000010418a0 
[    0.250584] GPR08: 000000004af80000 c000000000ecc9a8 c000000000ecc9a8 0000000000000000 
[    0.250584] GPR12: 0000000022002824 c000000006bb0000 c0000000001116d0 c000000049050200 
[    0.250584] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 
[    0.250584] GPR20: 0000000000000001 0000000000000000 c000000048030a98 0000000000000001 
[    0.250584] GPR24: 000000000002001d 0000000000000000 0000000000000000 c0000000461af000 
[    0.250584] GPR28: 0000000000000000 c000000048030bd8 c0000000461aea08 c0000000012060b8 
[    0.250645] NIP [c0000000005ac964] .refcount_dec_not_one+0x104/0x120
[    0.250650] LR [c0000000005ac960] .refcount_dec_not_one+0x100/0x120
[    0.250654] Call Trace:
[    0.250658] [c00000004944b7b0] [c0000000005ac960] .refcount_dec_not_one+0x100/0x120 (unreliable)
[    0.250665] [c00000004944b820] [c0000000005ac9a0] .refcount_dec_and_lock+0x20/0xc0
[    0.250671] [c00000004944b8a0] [c000000000870fa4] .mddev_put+0x34/0x180
[    0.250677] [c00000004944b930] [c000000000396108] .__blkdev_put+0x288/0x350
[    0.250683] [c00000004944ba30] [c0000000003968f0] .blkdev_close+0x30/0x50
[    0.250689] [c00000004944bab0] [c00000000033e7d8] .__fput+0xc8/0x2a0
[    0.250695] [c00000004944bb60] [c00000000033ea08] .delayed_fput+0x58/0x80
[    0.250701] [c00000004944bbe0] [c000000000107ea0] .process_one_work+0x2a0/0x630
[    0.250707] [c00000004944bc80] [c0000000001082c8] .worker_thread+0x98/0x6a0
[    0.250713] [c00000004944bd70] [c000000000111868] .kthread+0x198/0x1a0
[    0.250719] [c00000004944be30] [c00000000000badc] .ret_from_kernel_thread+0x58/0x7c
[    0.250724] Instruction dump:
[    0.250728] 419e000c 38210070 4e800020 7c0802a6 3c62ffc6 39200001 3d42ffee 38630958 
[    0.250738] 992a6bfe f8010080 484a6bd1 60000000 <0fe00000> e8010080 38600001 7c0803a6 
[    0.250748] ---[ end trace 8c51f269ad91ffc3 ]---
[    0.262454] EXT4-fs (sda4): mounted filesystem with ordered data mode. Opts: (null)

^ permalink raw reply

* RE: [PATCH 08/29] drivers, md: convert mddev.active from atomic_t to refcount_t
From: Reshetova, Elena @ 2017-03-14 12:29 UTC (permalink / raw)
  To: Michael Ellerman, gregkh@linuxfoundation.org
  Cc: peterz@infradead.org, linux-pci@vger.kernel.org,
	target-devel@vger.kernel.org,
	linux1394-devel@lists.sourceforge.net, devel@driverdev.osuosl.org,
	linux-s390@vger.kernel.org, linux-scsi@vger.kernel.org,
	linux-serial@vger.kernel.org, fcoe-devel@open-fcoe.org,
	xen-devel@lists.xenproject.org, open-iscsi@googlegroups.com,
	linux-media@vger.kernel.org, Kees Cook,
	linux-raid@vger.kernel.org, linux-bcache@vger.kernel.org
In-Reply-To: <87lgs8ukfq.fsf@concordia.ellerman.id.au>

> Elena Reshetova <elena.reshetova@intel.com> writes:
> 
> > refcount_t type and corresponding API should be
> > used instead of atomic_t when the variable is used as
> > a reference counter. This allows to avoid accidental
> > refcounter overflows that might lead to use-after-free
> > situations.
> >
> > Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
> > Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
> > Signed-off-by: Kees Cook <keescook@chromium.org>
> > Signed-off-by: David Windsor <dwindsor@gmail.com>
> > ---
> >  drivers/md/md.c | 6 +++---
> >  drivers/md/md.h | 3 ++-
> >  2 files changed, 5 insertions(+), 4 deletions(-)
> 
> When booting linux-next (specifically 5be4921c9958ec) I'm seeing the
> backtrace below. I suspect this patch is just exposing an existing
> issue?

Yes, we have actually been following this issue in the another thread. 
It looks like the object is re-used somehow, but I can't quite understand how just by reading the code. 
This was what I put into the previous thread:

"The log below indicates that you are using your refcounter in a bit weird way in mddev_find(). 
However, I can't find the place (just by reading the code) where you would increment refcounter from zero (vs. setting it to one).
It looks like you either iterate over existing nodes (and increment their counters, which should be >= 1 at the time of increment) or create a new node, but then mddev_init() sets the counter to 1. "

If you can help to understand what is going on with the object creation/destruction, would be appreciated!

Also Shaohua Li stopped this patch coming from his tree since the issue was caught at that time, so we are not going to merge this until we figure it out. 

Best Regards,
Elena.

> 
> cheers
> 
> 
> [    0.230738] md: Waiting for all devices to be available before autodetect
> [    0.230742] md: If you don't use raid, use raid=noautodetect
> [    0.230962] refcount_t: increment on 0; use-after-free.
> [    0.230988] ------------[ cut here ]------------
> [    0.230996] WARNING: CPU: 0 PID: 1 at lib/refcount.c:114
> .refcount_inc+0x5c/0x70
> [    0.231001] Modules linked in:
> [    0.231006] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.11.0-rc1-gccN-next-
> 20170310-g5be4921 #1
> [    0.231012] task: c000000049400000 task.stack: c000000049440000
> [    0.231016] NIP: c0000000005ac6bc LR: c0000000005ac6b8 CTR:
> c000000000743390
> [    0.231021] REGS: c000000049443160 TRAP: 0700   Not tainted  (4.11.0-rc1-
> gccN-next-20170310-g5be4921)
> [    0.231026] MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>
> [    0.231033]   CR: 24024422  XER: 0000000c
> [    0.231038] CFAR: c000000000a5356c SOFTE: 1
> [    0.231038] GPR00: c0000000005ac6b8 c0000000494433e0 c000000001079d00
> 000000000000002b
> [    0.231038] GPR04: 0000000000000000 00000000000000ef 0000000000000000
> c0000000010418a0
> [    0.231038] GPR08: 000000004af80000 c000000000ecc9a8 c000000000ecc9a8
> 0000000000000000
> [    0.231038] GPR12: 0000000028024824 c000000006bb0000 0000000000000000
> c000000049443a00
> [    0.231038] GPR16: 0000000000000000 c000000049443a10 0000000000000000
> 0000000000000000
> [    0.231038] GPR20: 0000000000000000 0000000000000000 c000000000f7dd20
> 0000000000000000
> [    0.231038] GPR24: 00000000014080c0 c0000000012060b8 c000000001206080
> 0000000000000009
> [    0.231038] GPR28: c000000000f7dde0 0000000000900000 0000000000000000
> c0000000461ae800
> [    0.231100] NIP [c0000000005ac6bc] .refcount_inc+0x5c/0x70
> [    0.231104] LR [c0000000005ac6b8] .refcount_inc+0x58/0x70
> [    0.231108] Call Trace:
> [    0.231112] [c0000000494433e0] [c0000000005ac6b8] .refcount_inc+0x58/0x70
> (unreliable)
> [    0.231120] [c000000049443450] [c00000000086c008]
> .mddev_find+0x1e8/0x430
> [    0.231125] [c000000049443530] [c000000000872b6c] .md_open+0x2c/0x140
> [    0.231132] [c0000000494435c0] [c0000000003962a4]
> .__blkdev_get+0xd4/0x520
> [    0.231138] [c000000049443690] [c000000000396cc0] .blkdev_get+0x1c0/0x4f0
> [    0.231145] [c000000049443790] [c000000000336d64]
> .do_dentry_open.isra.1+0x2a4/0x410
> [    0.231152] [c000000049443830] [c0000000003523f4]
> .path_openat+0x624/0x1580
> [    0.231157] [c000000049443990] [c000000000354ce4]
> .do_filp_open+0x84/0x120
> [    0.231163] [c000000049443b10] [c000000000338d74]
> .do_sys_open+0x214/0x300
> [    0.231170] [c000000049443be0] [c000000000da69ac]
> .md_run_setup+0xa0/0xec
> [    0.231176] [c000000049443c60] [c000000000da4fbc]
> .prepare_namespace+0x60/0x240
> [    0.231182] [c000000049443ce0] [c000000000da47a8]
> .kernel_init_freeable+0x330/0x36c
> [    0.231190] [c000000049443db0] [c00000000000dc44] .kernel_init+0x24/0x160
> [    0.231197] [c000000049443e30] [c00000000000badc]
> .ret_from_kernel_thread+0x58/0x7c
> [    0.231202] Instruction dump:
> [    0.231206] 60000000 3d22ffee 89296bfb 2f890000 409effdc 3c62ffc6 39200001
> 3d42ffee
> [    0.231216] 38630928 992a6bfb 484a6e79 60000000 <0fe00000> 4bffffb8
> 60000000 60000000
> [    0.231226] ---[ end trace 8c51f269ad91ffc2 ]---
> [    0.231233] md: Autodetecting RAID arrays.
> [    0.231236] md: autorun ...
> [    0.231239] md: ... autorun DONE.
> [    0.234188] EXT4-fs (sda4): mounting ext3 file system using the ext4 subsystem
> [    0.250506] refcount_t: underflow; use-after-free.
> [    0.250531] ------------[ cut here ]------------
> [    0.250537] WARNING: CPU: 0 PID: 3 at lib/refcount.c:207
> .refcount_dec_not_one+0x104/0x120
> [    0.250542] Modules linked in:
> [    0.250546] CPU: 0 PID: 3 Comm: kworker/0:0 Tainted: G        W       4.11.0-rc1-
> gccN-next-20170310-g5be4921 #1
> [    0.250553] Workqueue: events .delayed_fput
> [    0.250557] task: c000000049404900 task.stack: c000000049448000
> [    0.250562] NIP: c0000000005ac964 LR: c0000000005ac960 CTR:
> c000000000743390
> [    0.250567] REGS: c00000004944b530 TRAP: 0700   Tainted: G        W        (4.11.0-
> rc1-gccN-next-20170310-g5be4921)
> [    0.250572] MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>
> [    0.250578]   CR: 24002422  XER: 00000007
> [    0.250584] CFAR: c000000000a5356c SOFTE: 1
> [    0.250584] GPR00: c0000000005ac960 c00000004944b7b0 c000000001079d00
> 0000000000000026
> [    0.250584] GPR04: 0000000000000000 0000000000000113 0000000000000000
> c0000000010418a0
> [    0.250584] GPR08: 000000004af80000 c000000000ecc9a8 c000000000ecc9a8
> 0000000000000000
> [    0.250584] GPR12: 0000000022002824 c000000006bb0000 c0000000001116d0
> c000000049050200
> [    0.250584] GPR16: 0000000000000000 0000000000000000 0000000000000000
> 0000000000000000
> [    0.250584] GPR20: 0000000000000001 0000000000000000 c000000048030a98
> 0000000000000001
> [    0.250584] GPR24: 000000000002001d 0000000000000000 0000000000000000
> c0000000461af000
> [    0.250584] GPR28: 0000000000000000 c000000048030bd8 c0000000461aea08
> c0000000012060b8
> [    0.250645] NIP [c0000000005ac964] .refcount_dec_not_one+0x104/0x120
> [    0.250650] LR [c0000000005ac960] .refcount_dec_not_one+0x100/0x120
> [    0.250654] Call Trace:
> [    0.250658] [c00000004944b7b0] [c0000000005ac960]
> .refcount_dec_not_one+0x100/0x120 (unreliable)
> [    0.250665] [c00000004944b820] [c0000000005ac9a0]
> .refcount_dec_and_lock+0x20/0xc0
> [    0.250671] [c00000004944b8a0] [c000000000870fa4] .mddev_put+0x34/0x180
> [    0.250677] [c00000004944b930] [c000000000396108]
> .__blkdev_put+0x288/0x350
> [    0.250683] [c00000004944ba30] [c0000000003968f0] .blkdev_close+0x30/0x50
> [    0.250689] [c00000004944bab0] [c00000000033e7d8] .__fput+0xc8/0x2a0
> [    0.250695] [c00000004944bb60] [c00000000033ea08]
> .delayed_fput+0x58/0x80
> [    0.250701] [c00000004944bbe0] [c000000000107ea0]
> .process_one_work+0x2a0/0x630
> [    0.250707] [c00000004944bc80] [c0000000001082c8]
> .worker_thread+0x98/0x6a0
> [    0.250713] [c00000004944bd70] [c000000000111868] .kthread+0x198/0x1a0
> [    0.250719] [c00000004944be30] [c00000000000badc]
> .ret_from_kernel_thread+0x58/0x7c
> [    0.250724] Instruction dump:
> [    0.250728] 419e000c 38210070 4e800020 7c0802a6 3c62ffc6 39200001
> 3d42ffee 38630958
> [    0.250738] 992a6bfe f8010080 484a6bd1 60000000 <0fe00000> e8010080
> 38600001 7c0803a6
> [    0.250748] ---[ end trace 8c51f269ad91ffc3 ]---
> [    0.262454] EXT4-fs (sda4): mounted filesystem with ordered data mode. Opts:
> (null)

^ permalink raw reply

* Re: [PATCH 08/29] drivers, md: convert mddev.active from atomic_t to refcount_t
From: James Bottomley @ 2017-03-14 14:58 UTC (permalink / raw)
  To: Reshetova, Elena, Michael Ellerman, gregkh@linuxfoundation.org
  Cc: peterz@infradead.org, linux-pci@vger.kernel.org,
	target-devel@vger.kernel.org,
	linux1394-devel@lists.sourceforge.net, devel@driverdev.osuosl.org,
	linux-s390@vger.kernel.org, linux-scsi@vger.kernel.org,
	linux-serial@vger.kernel.org, fcoe-devel@open-fcoe.org,
	xen-devel@lists.xenproject.org, open-iscsi@googlegroups.com,
	linux-media@vger.kernel.org, Kees Cook,
	linux-raid@vger.kernel.org, linux-bcache@vger.kernel.org
In-Reply-To: <2236FBA76BA1254E88B949DDB74E612B41C588E8@IRSMSX102.ger.corp.intel.com>

On Tue, 2017-03-14 at 12:29 +0000, Reshetova, Elena wrote:
> > Elena Reshetova <elena.reshetova@intel.com> writes:
> > 
> > > refcount_t type and corresponding API should be
> > > used instead of atomic_t when the variable is used as
> > > a reference counter. This allows to avoid accidental
> > > refcounter overflows that might lead to use-after-free
> > > situations.
> > > 
> > > Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
> > > Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
> > > Signed-off-by: Kees Cook <keescook@chromium.org>
> > > Signed-off-by: David Windsor <dwindsor@gmail.com>
> > > ---
> > >  drivers/md/md.c | 6 +++---
> > >  drivers/md/md.h | 3 ++-
> > >  2 files changed, 5 insertions(+), 4 deletions(-)
> > 
> > When booting linux-next (specifically 5be4921c9958ec) I'm seeing
> > the
> > backtrace below. I suspect this patch is just exposing an existing
> > issue?
> 
> Yes, we have actually been following this issue in the another
> thread. 
> It looks like the object is re-used somehow, but I can't quite
> understand how just by reading the code. 
> This was what I put into the previous thread:
> 
> "The log below indicates that you are using your refcounter in a bit
> weird way in mddev_find(). 
> However, I can't find the place (just by reading the code) where you
> would increment refcounter from zero (vs. setting it to one).
> It looks like you either iterate over existing nodes (and increment
> their counters, which should be >= 1 at the time of increment) or
> create a new node, but then mddev_init() sets the counter to 1. "
> 
> If you can help to understand what is going on with the object
> creation/destruction, would be appreciated!
> 
> Also Shaohua Li stopped this patch coming from his tree since the 
> issue was caught at that time, so we are not going to merge this 
> until we figure it out. 

Asking on the correct list (dm-devel) would have got you the easy
answer:  The refcount behind mddev->active is a genuine atomic.  It has
refcount properties but only if the array fails to initialise (in that
case, final put kills it).  Once it's added to the system as a gendisk,
it cannot be freed until md_free().  Thus its ->active count can go to
zero (when it becomes inactive; usually because of an unmount). On a
simple allocation regardless of outcome, the last executed statement in
md_alloc is mddev_put(): that destroys the device if we didn't manage
to create it or returns 0 and adds an inactive device to the system
which the user can get with mddev_find().

James

^ permalink raw reply

* Re: linux-next: WARNING: CPU: 0 PID: 1 at lib/refcount.c:114 refcount_inc+0x37/0x40
From: Shaohua Li @ 2017-03-14 16:31 UTC (permalink / raw)
  To: Reshetova, Elena; +Cc: Andrei Vagin, linux-raid@vger.kernel.org
In-Reply-To: <2236FBA76BA1254E88B949DDB74E612B41C5816D@IRSMSX102.ger.corp.intel.com>

On Mon, Mar 13, 2017 at 10:04:32AM +0000, Reshetova, Elena wrote:
> > On Fri, Mar 10, 2017 at 12:01:06PM -0800, Andrei Vagin wrote:
> > > Hello,
> > >
> > > We run CRIU tests for linux-next kernels and here is a new issue:
> > >
> > > All logs are here: https://api.travis-ci.org/jobs/209680974/log.txt?deansi=true
> > > The kernel version is 4.11.0-rc1-next-20170310
> > 
> > Thanks for the reporting. It caused by 731d126(drivers, md: convert
> > mddev.active from atomic_t to refcount_t). It turns out the count doesn't match
> > the refcount usage. I'll drop the patch temporarily.
> 
> The log below indicates that you are using your refcounter in a bit weird way in mddev_find(). 
> However, I can't find the place (just by reading the code) where you would increment refcounter from zero (vs. setting it to one).
> It looks like you either iterate over existing nodes (and increment their counters, which should be >= 1 at the time of increment) or create a new node, but then mddev_init() sets the counter to 1. 
> 
> Do you somehow reuse the objects or?

Yes, we reuse the objects, so they are not typical refcounter. The other patch
for stripe->count probably has the same issue, as we will reuse the stripe even
its count equals to 0, I guess that doesn't fit into refcount too.

Thanks,
Shaohua

^ permalink raw reply

* Re: [RFC PATCH] md/raid10: refactor some codes from raid10_write_request
From: Shaohua Li @ 2017-03-14 16:48 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1489397039-3353-1-git-send-email-gqjiang@suse.com>

On Mon, Mar 13, 2017 at 05:23:59PM +0800, Guoqing Jiang wrote:
> Previously, we clone both bio and repl_bio in raid10_write_request,
> then add the cloned bio to plug->pending or conf->pending_bio_list
> based on plug or not, and most of the logics are same for the two
> conditions.
> 
> So introduce handle_clonebio (a better name is welcome) for it, and
> use replacement parameter to distinguish the difference. No functional
> changes in the patch.
> 
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
> Another reason for it is to improve the readability of code, but
> I didn't touch raid10 before so this is labeled as RFC.
> 
>  drivers/md/raid10.c | 172 ++++++++++++++++++++++------------------------------
>  1 file changed, 72 insertions(+), 100 deletions(-)
> 
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index b1b1f982a722..02d8eff8d26e 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1188,18 +1188,81 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
>  	return;
>  }
>  
> -static void raid10_write_request(struct mddev *mddev, struct bio *bio,
> -				 struct r10bio *r10_bio)
> +static void handle_clonebio(struct mddev *mddev, struct r10bio *r10_bio,
> +			    struct bio *bio, int i, int replacement,
> +			    int max_sectors)

Maybe raid10_write_one_disk? Please replace 'i' with a meaningful name and
change to 'boo' for replacement.

>  {
> -	struct r10conf *conf = mddev->private;
> -	int i;
>  	const int op = bio_op(bio);
>  	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
>  	const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
>  	unsigned long flags;
> -	struct md_rdev *blocked_rdev;
>  	struct blk_plug_cb *cb;
>  	struct raid10_plug_cb *plug = NULL;
> +	struct r10conf *conf = mddev->private;
> +	struct md_rdev *rdev;
> +	int devnum = r10_bio->devs[i].devnum;
> +	struct bio *mbio;
> +
> +	if (replacement) {
> +		rdev = conf->mirrors[devnum].replacement;
> +		if (rdev == NULL) {
> +			/* Replacement just got moved to main 'rdev' */
> +			smp_mb();
> +			rdev = conf->mirrors[devnum].rdev;
> +		}
> +	} else
> +		rdev = conf->mirrors[devnum].rdev;
> +
> +	mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
> +	bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
> +	if (replacement)
> +		r10_bio->devs[i].repl_bio = mbio;
> +	else
> +		r10_bio->devs[i].bio = mbio;
> +
> +	mbio->bi_iter.bi_sector	= (r10_bio->devs[i].addr +
> +				   choose_data_offset(r10_bio, rdev));
> +	mbio->bi_bdev = rdev->bdev;
> +	mbio->bi_end_io	= raid10_end_write_request;
> +	bio_set_op_attrs(mbio, op, do_sync | do_fua);
> +	if (!replacement && test_bit(FailFast, &conf->mirrors[devnum].rdev->flags)
> +			 && enough(conf, devnum))
> +		mbio->bi_opf |= MD_FAILFAST;
> +	mbio->bi_private = r10_bio;
> +
> +	if (conf->mddev->gendisk)
> +		trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
> +				      mbio, disk_devt(conf->mddev->gendisk),
> +				      r10_bio->sector);
> +	/* flush_pending_writes() needs access to the rdev so...*/
> +	mbio->bi_bdev = (void *)rdev;
> +
> +	atomic_inc(&r10_bio->remaining);
> +
> +	cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
> +	if (cb)
> +		plug = container_of(cb, struct raid10_plug_cb, cb);
> +	else
> +		plug = NULL;
> +	spin_lock_irqsave(&conf->device_lock, flags);
> +	if (plug) {
> +		bio_list_add(&plug->pending, mbio);
> +		plug->pending_cnt++;
> +	} else {
> +		bio_list_add(&conf->pending_bio_list, mbio);
> +		conf->pending_count++;
> +	}
> +	spin_unlock_irqrestore(&conf->device_lock, flags);
> +	if (!plug)
> +		md_wakeup_thread(mddev->thread);
> +}
> +
> +static void raid10_write_request(struct mddev *mddev, struct bio *bio,
> +				 struct r10bio *r10_bio)
> +{
> +	struct r10conf *conf = mddev->private;
> +	int i;
> +	struct md_rdev *blocked_rdev;
>  	sector_t sectors;
>  	int sectors_handled;
>  	int max_sectors;
> @@ -1402,101 +1465,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>  	bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
>  
>  	for (i = 0; i < conf->copies; i++) {
> -		struct bio *mbio;
> -		int d = r10_bio->devs[i].devnum;
> -		if (r10_bio->devs[i].bio) {
> -			struct md_rdev *rdev = conf->mirrors[d].rdev;
> -			mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
> -			bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
> -				 max_sectors);
> -			r10_bio->devs[i].bio = mbio;
> -
> -			mbio->bi_iter.bi_sector	= (r10_bio->devs[i].addr+
> -					   choose_data_offset(r10_bio, rdev));
> -			mbio->bi_bdev = rdev->bdev;
> -			mbio->bi_end_io	= raid10_end_write_request;
> -			bio_set_op_attrs(mbio, op, do_sync | do_fua);
> -			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) &&
> -			    enough(conf, d))
> -				mbio->bi_opf |= MD_FAILFAST;
> -			mbio->bi_private = r10_bio;
> -
> -			if (conf->mddev->gendisk)
> -				trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
> -						      mbio, disk_devt(conf->mddev->gendisk),
> -						      r10_bio->sector);
> -			/* flush_pending_writes() needs access to the rdev so...*/
> -			mbio->bi_bdev = (void*)rdev;
> -
> -			atomic_inc(&r10_bio->remaining);
> -
> -			cb = blk_check_plugged(raid10_unplug, mddev,
> -					       sizeof(*plug));
> -			if (cb)
> -				plug = container_of(cb, struct raid10_plug_cb,
> -						    cb);
> -			else
> -				plug = NULL;
> -			spin_lock_irqsave(&conf->device_lock, flags);
> -			if (plug) {
> -				bio_list_add(&plug->pending, mbio);
> -				plug->pending_cnt++;
> -			} else {
> -				bio_list_add(&conf->pending_bio_list, mbio);
> -				conf->pending_count++;
> -			}
> -			spin_unlock_irqrestore(&conf->device_lock, flags);
> -			if (!plug)
> -				md_wakeup_thread(mddev->thread);
> -		}
> -
> -		if (r10_bio->devs[i].repl_bio) {
> -			struct md_rdev *rdev = conf->mirrors[d].replacement;
> -			if (rdev == NULL) {
> -				/* Replacement just got moved to main 'rdev' */
> -				smp_mb();
> -				rdev = conf->mirrors[d].rdev;
> -			}
> -			mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
> -			bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
> -				 max_sectors);
> -			r10_bio->devs[i].repl_bio = mbio;
> -
> -			mbio->bi_iter.bi_sector	= (r10_bio->devs[i].addr +
> -					   choose_data_offset(r10_bio, rdev));
> -			mbio->bi_bdev = rdev->bdev;
> -			mbio->bi_end_io	= raid10_end_write_request;
> -			bio_set_op_attrs(mbio, op, do_sync | do_fua);
> -			mbio->bi_private = r10_bio;
> -
> -			if (conf->mddev->gendisk)
> -				trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
> -						      mbio, disk_devt(conf->mddev->gendisk),
> -						      r10_bio->sector);
> -			/* flush_pending_writes() needs access to the rdev so...*/
> -			mbio->bi_bdev = (void*)rdev;
> -
> -			atomic_inc(&r10_bio->remaining);
> -
> -			cb = blk_check_plugged(raid10_unplug, mddev,
> -					       sizeof(*plug));
> -			if (cb)
> -				plug = container_of(cb, struct raid10_plug_cb,
> -						    cb);
> -			else
> -				plug = NULL;
> -			spin_lock_irqsave(&conf->device_lock, flags);
> -			if (plug) {
> -				bio_list_add(&plug->pending, mbio);
> -				plug->pending_cnt++;
> -			} else {
> -				bio_list_add(&conf->pending_bio_list, mbio);
> -				conf->pending_count++;
> -			}
> -			spin_unlock_irqrestore(&conf->device_lock, flags);
> -			if (!plug)
> -				md_wakeup_thread(mddev->thread);
> -		}
> +		if (r10_bio->devs[i].bio)
> +			handle_clonebio(mddev, r10_bio, bio, i, 0, max_sectors);
> +		if (r10_bio->devs[i].repl_bio)
> +			handle_clonebio(mddev, r10_bio, bio, i, 1, max_sectors);
>  	}
>  
>  	/* Don't remove the bias on 'remaining' (one_write_done) until
> -- 
> 2.6.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] md/r5cache: fix set_syndrome_sources()
From: Shaohua Li @ 2017-03-14 16:52 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, shli, neilb, kernel-team, dan.j.williams, hch
In-Reply-To: <20170313204435.1732089-1-songliubraving@fb.com>

On Mon, Mar 13, 2017 at 01:44:35PM -0700, Song Liu wrote:
> With srctype == SYNDROME_SRC_WRITTEN, we need include both
> dev with non-null ->written and dev with R5_InJournal.

Thanks, applied. I'll add this to stable
 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  drivers/md/raid5.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 1c554a8..88cc898 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -1499,7 +1499,8 @@ static int set_syndrome_sources(struct page **srcs,
>  		     (test_bit(R5_Wantdrain, &dev->flags) ||
>  		      test_bit(R5_InJournal, &dev->flags))) ||
>  		    (srctype == SYNDROME_SRC_WRITTEN &&
> -		     dev->written)) {
> +		     (dev->written ||
> +		      test_bit(R5_InJournal, &dev->flags)))) {
>  			if (test_bit(R5_InJournal, &dev->flags))
>  				srcs[slot] = sh->dev[i].orig_page;
>  			else
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2] r5cache: allow adding journal to array without journal
From: Shaohua Li @ 2017-03-14 16:56 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, shli, neilb, kernel-team, dan.j.williams, hch,
	jsorensen
In-Reply-To: <20170314001138.2194394-1-songliubraving@fb.com>

On Mon, Mar 13, 2017 at 05:11:38PM -0700, Song Liu wrote:
> This seems just work.

this is by no means a changelog :)
 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  Manage.c | 6 ------
>  1 file changed, 6 deletions(-)
> 
> diff --git a/Manage.c b/Manage.c
> index 5c3d2b9..5ff5de3 100644
> --- a/Manage.c
> +++ b/Manage.c
> @@ -946,7 +946,6 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
>  
>  	/* only add journal to array that supports journaling */
>  	if (dv->disposition == 'j') {
> -		struct mdinfo mdi;
>  		struct mdinfo *mdp;
>  
>  		mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
> @@ -963,11 +962,6 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
>  
>  		sysfs_free(mdp);
>  
> -		tst->ss->getinfo_super(tst, &mdi, NULL);
> -		if (mdi.journal_device_required == 0) {
> -			pr_err("%s does not support journal device.\n", devname);
> -			return -1;
> -		}
>  		disc.raid_disk = 0;
>  	}
>  
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v6 0/4] Broadcom SBA RAID support
From: Dan Williams @ 2017-03-14 16:56 UTC (permalink / raw)
  To: Anup Patel
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine@vger.kernel.org,
	Device Tree, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-crypto, linux-raid,
	Shaohua Li
In-Reply-To: <1488793408-25592-1-git-send-email-anup.patel@broadcom.com>

On Mon, Mar 6, 2017 at 1:43 AM, Anup Patel <anup.patel@broadcom.com> wrote:
> The Broadcom SBA RAID is a stream-based device which provides
> RAID5/6 offload.
>
> It requires a SoC specific ring manager (such as Broadcom FlexRM
> ring manager) to provide ring-based programming interface. Due to
> this, the Broadcom SBA RAID driver (mailbox client) implements
> DMA device having one DMA channel using a set of mailbox channels
> provided by Broadcom SoC specific ring manager driver (mailbox
> controller).
>
> The Broadcom SBA RAID hardware requires PQ disk position instead
> of PQ disk coefficient. To address this, we have added raid_gflog
> table which will help driver to convert PQ disk coefficient to PQ
> disk position.
>
> This patchset is based on Linux-4.11-rc1 and depends on patchset
> "[PATCH v5 0/2] Broadcom FlexRM ring manager support"
>
> It is also available at sba-raid-v6 branch of
> https://github.com/Broadcom/arm64-linux.git
>
[..]
>
> Anup Patel (4):
>   lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position
>   async_tx: Fix DMA_PREP_FENCE usage in do_async_gen_syndrome()
>   dmaengine: Add Broadcom SBA RAID driver
>   dt-bindings: Add DT bindings document for Broadcom SBA RAID driver

For the dmaengine and async_tx changes:

Acked-by: Dan Williams <dan.j.williams@intel.com>

The raid change should get an ack from Shaohua.

^ permalink raw reply

* Re: on assembly and recovery of a hardware RAID
From: Alfred Matthews @ 2017-03-14 17:27 UTC (permalink / raw)
  To: NeilBrown, linux-raid
In-Reply-To: <87tw6weubu.fsf@notabene.neil.brown.name>

> Does
>   dmraid -b /dev/sda /dev/sdb /dev/sdc
> tell you anything useful?
>

# dmraid -b /dev/sdb /dev/sdc
/dev/sdc:   5860533168 total, "WD-WCAWZ2927144"
/dev/sdb:   5860533168 total, "WD-WCAWZ2939730"

>
> You will probably want a command like:
>
>  mdadm --build /dev/md0 -l0 -n2 --chunk=SOMETHING /dev/sdXX /dev/sdYY
>
> where SOMETHING is the chunk size. e.g. 64K or 512K or something.
> Doing this is non-destructive so you can try several different times,
> using "mdadm --stop /dev/md0" to reset before trying again.
>

# mdadm --build /dev/md0 -l0 -n2 --chunk=512K /dev/sdb1 /dev/sdc1
mdadm: array /dev/md0 built and started.

> After building the array, try "cfdisk /dev/md0" or maybe "fdisk
> /dev/md0" to look at the partition table.
>

#fdisk /dev/md0

Command (m for help): p
Disk /dev/md0: 400 MiB, 419430400 bytes, 819200 sectors
Units: sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 4096 bytes
I/O size (minimum/optimal): 524288 bytes / 1048576 bytes
Disklabel type: dos
Disk identifier: 0x00000000

> What filesystem(s) did you have on the device?

In fact, I'm insufficiently familiar with fsck to know why it's
reporting fsck-fat, except to suppose that that's the bootsector
format. The data partitions themselves are Apple RAID and presumed HFS
something.

Maybe "fsck -n /dev/md0p1"
> might tell you if the filesystem looks OK.
>

# fsck -n /dev/md0
fsck from util-linux 2.28.2
fsck.fat 4.0 (2016-05-06)
FATs differ - using first FAT.
Cluster 126974 out of range (43014379 > 403267). Setting to EOF.
Cluster 126975 out of range (2114643 > 403267). Setting to EOF.
Cluster 126976 out of range (3419700 > 403267). Setting to EOF.
Cluster 126977 out of range (2097410 > 403267). Setting to EOF.
Cluster 126980 out of range (1048608 > 403267). Setting to EOF.
Cluster 126982 out of range (409600 > 403267). Setting to EOF.
Cluster 126990 out of range (19464192 > 403267). Setting to EOF.
Cluster 126991 out of range (91280919 > 403267). Setting to EOF.
Cluster 126992 out of range (2115910 > 403267). Setting to EOF.
Cluster 126993 out of range (2105376 > 403267). Setting to EOF.
Cluster 126994 out of range (21372960 > 403267). Setting to EOF.
Cluster 126995 out of range (3289940 > 403267). Setting to EOF.
Cluster 126996 out of range (33169440 > 403267). Setting to EOF.
Cluster 126997 out of range (214994624 > 403267). Setting to EOF.
Cluster 126998 out of range (251362304 > 403267). Setting to EOF.
Cluster 127000 out of range (164004702 > 403267). Setting to EOF.
Cluster 127001 out of range (201328571 > 403267). Setting to EOF.
Cluster 127002 out of range (79725740 > 403267). Setting to EOF.
Cluster 127003 out of range (219067398 > 403267). Setting to EOF.
Cluster 127004 out of range (16116496 > 403267). Setting to EOF.
Cluster 127005 out of range (219598308 > 403267). Setting to EOF.
Cluster 127006 out of range (235539737 > 403267). Setting to EOF.
Cluster 127007 out of range (53309039 > 403267). Setting to EOF.
Cluster 127008 out of range (91517817 > 403267). Setting to EOF.
Cluster 127009 out of range (157556845 > 403267). Setting to EOF.
Cluster 127010 out of range (168651635 > 403267). Setting to EOF.
Cluster 127011 out of range (56980048 > 403267). Setting to EOF.
Cluster 127012 out of range (241246323 > 403267). Setting to EOF.
Cluster 127013 out of range (90906745 > 403267). Setting to EOF.
Cluster 127014 out of range (259268729 > 403267). Setting to EOF.
Cluster 127015 out of range (40202784 > 403267). Setting to EOF.
Cluster 127016 out of range (225734511 > 403267). Setting to EOF.
Cluster 127101 out of range (173342720 > 403267). Setting to EOF.
Cluster 127102 out of range (23155282 > 403267). Setting to EOF.
Cluster 127223 out of range (21066354 > 403267). Setting to EOF.
Cluster 127229 out of range (173342720 > 403267). Setting to EOF.
Cluster 127742 out of range (43014379 > 403267). Setting to EOF.
Cluster 127743 out of range (2114643 > 403267). Setting to EOF.
Cluster 127744 out of range (3419700 > 403267). Setting to EOF.
Cluster 127745 out of range (2097410 > 403267). Setting to EOF.
Cluster 127748 out of range (1048608 > 403267). Setting to EOF.
Cluster 127750 out of range (409600 > 403267). Setting to EOF.
Cluster 127758 out of range (19464192 > 403267). Setting to EOF.
Cluster 127759 out of range (91280919 > 403267). Setting to EOF.
Cluster 127760 out of range (2115910 > 403267). Setting to EOF.
Cluster 127761 out of range (2105376 > 403267). Setting to EOF.
Cluster 127762 out of range (21372960 > 403267). Setting to EOF.
Cluster 127763 out of range (3289940 > 403267). Setting to EOF.
Cluster 127764 out of range (33169440 > 403267). Setting to EOF.
Cluster 127765 out of range (214994624 > 403267). Setting to EOF.
Cluster 127766 out of range (251362304 > 403267). Setting to EOF.
Cluster 127768 out of range (164004702 > 403267). Setting to EOF.
Cluster 127769 out of range (201328571 > 403267). Setting to EOF.
Cluster 127770 out of range (79725740 > 403267). Setting to EOF.
Cluster 127771 out of range (219067398 > 403267). Setting to EOF.
Cluster 127772 out of range (16116496 > 403267). Setting to EOF.
Cluster 127773 out of range (219598308 > 403267). Setting to EOF.
Cluster 127774 out of range (235539737 > 403267). Setting to EOF.
Cluster 127775 out of range (53309039 > 403267). Setting to EOF.
Cluster 127776 out of range (91517817 > 403267). Setting to EOF.
Cluster 127777 out of range (157556845 > 403267). Setting to EOF.
Cluster 127778 out of range (168651635 > 403267). Setting to EOF.
Cluster 127779 out of range (56980048 > 403267). Setting to EOF.
Cluster 127780 out of range (241246323 > 403267). Setting to EOF.
Cluster 127781 out of range (90906745 > 403267). Setting to EOF.
Cluster 127782 out of range (259268729 > 403267). Setting to EOF.
Cluster 127783 out of range (40202784 > 403267). Setting to EOF.
Cluster 127784 out of range (225734511 > 403267). Setting to EOF.
Cluster 127869 out of range (173342720 > 403267). Setting to EOF.
Cluster 127870 out of range (23155282 > 403267). Setting to EOF.
Cluster 127991 out of range (21066354 > 403267). Setting to EOF.
Cluster 127997 out of range (173342720 > 403267). Setting to EOF.
Cluster 131070 out of range (268435440 > 403267). Setting to EOF.
Reclaimed 93 unused clusters (47616 bytes).
Leaving filesystem unchanged.
/dev/md0: 0 files, 1/403266 clusters

Thanks for your comments so far.

Al.

> NeilBrown

^ permalink raw reply

* Re: [PATCH] md/r5cache: flush data in memory during journal device failure
From: Shaohua Li @ 2017-03-14 17:50 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, shli, neilb, kernel-team, dan.j.williams, hch
In-Reply-To: <20170313233626.2109293-1-songliubraving@fb.com>

On Mon, Mar 13, 2017 at 04:36:26PM -0700, Song Liu wrote:
> For the raid456 with writeback cache, when journal device failed during
> normal operation, it is still possible to persist all data, as all
> pending data is still in stripe cache. However, the stripe will be
> marked as fail with s.log_failed. Thus, the write out from stripe cache
> cannot make progress.
> 
> To unblock the write out in journal failures, this patch allows stripes
> with data injournal to make progress.

what about the parity part? if log failed, we should skip journaling the parity.

Thanks,
Shaohua
 
> The array should be read-only in journal failures. Therefore, pending
> writes (in dev->towrite) are excluded in this write (in delay_towrite).
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  drivers/md/raid5.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 3233975..447d9dd 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -3069,6 +3069,10 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
>   *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
>   *      no_space_stripes list.
>   *
> + *   3. during journal failure
> + *      In journal failure, we try to flush all cached data to raid disks
> + *      based on data in stripe cache. The array is read-only to upper
> + *      layers, so we would skip all pending writes.
>   */
>  static inline bool delay_towrite(struct r5conf *conf,
>  				 struct r5dev *dev,
> @@ -3082,6 +3086,9 @@ static inline bool delay_towrite(struct r5conf *conf,
>  	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
>  	    s->injournal > 0)
>  		return true;
> +	/* case 3 above */
> +	if (s->log_failed && s->injournal)
> +		return true;
>  	return false;
>  }
>  
> @@ -4721,7 +4728,8 @@ static void handle_stripe(struct stripe_head *sh)
>  	/* check if the array has lost more than max_degraded devices and,
>  	 * if so, some requests might need to be failed.
>  	 */
> -	if (s.failed > conf->max_degraded || s.log_failed) {
> +	if (s.failed > conf->max_degraded ||
> +	    (s.log_failed && s.injournal == 0)) {
>  		sh->check_state = 0;
>  		sh->reconstruct_state = 0;
>  		break_stripe_batch_list(sh, 0);
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH V4] md: move bitmap_destroy to the beginning of __md_stop
From: Shaohua Li @ 2017-03-14 18:05 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, neilb, shli
In-Reply-To: <1489455620-16306-1-git-send-email-gqjiang@suse.com>

On Tue, Mar 14, 2017 at 09:40:20AM +0800, Guoqing Jiang wrote:
> Since we have switched to sync way to handle METADATA_UPDATED
> msg for md-cluster, then process_metadata_update is depended
> on mddev->thread->wqueue.
> 
> With the new change, clustered raid could possible hang if
> array received a METADATA_UPDATED msg after array unregistered
> mddev->thread, so we need to stop clustered raid (bitmap_destroy
> -> bitmap_free -> md_cluster_stop) earlier than unregister
> thread (mddev_detach -> md_unregister_thread).
> 
> And this change should be safe for non-clustered raid since
> all writes are stopped before the destroy. Also in md_run,
> we activate the personality (pers->run()) before activating
> the bitmap (bitmap_create()). So it is pleasingly symmetric
> to stop the bitmap (bitmap_destroy()) before stopping the
> personality (__md_stop() calls pers->free()), we achieve this
> by move bitmap_destroy to the beginning of __md_stop.
> 
> But we don't want to break the codes for waiting behind IO as
> Shaohua mentioned, so introduce bitmap_wait_behind_writes to
> call the codes, and call the new fun in both mddev_detach and
> bitmap_destroy, then we will not break original behind IO code
> and also fit the new condition well.
> 
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
> Changes from v3:
> 1. move bitmap_destroy to __md_stop
> 2. add bitmap_wait_behind_writes to handle behind IO
> 
>  drivers/md/bitmap.c | 17 +++++++++++++++++
>  drivers/md/bitmap.h |  1 +
>  drivers/md/md.c     | 13 ++-----------
>  3 files changed, 20 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
> index b6fa55a3cff8..20cad80d6e34 100644
> --- a/drivers/md/bitmap.c
> +++ b/drivers/md/bitmap.c
> @@ -1764,6 +1764,21 @@ void bitmap_free(struct bitmap *bitmap)
>  }
>  EXPORT_SYMBOL(bitmap_free);
>  
> +void bitmap_wait_behind_writes(struct mddev *mddev)
> +{
> +	struct bitmap *bitmap = mddev->bitmap;
> +
> +	/* wait for behind writes to complete */
> +	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
> +		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
> +			 mdname(mddev));
> +		/* need to kick something here to make sure I/O goes? */
> +		wait_event(bitmap->behind_wait,
> +			   atomic_read(&bitmap->behind_writes) == 0);
> +	}
> +}
> +EXPORT_SYMBOL(bitmap_wait_behind_writes);

Applied, thanks! I deleted the EXPORT_SYMBOL here, it's not used by a module.

>  void bitmap_destroy(struct mddev *mddev)
>  {
>  	struct bitmap *bitmap = mddev->bitmap;
> @@ -1771,6 +1786,8 @@ void bitmap_destroy(struct mddev *mddev)
>  	if (!bitmap) /* there was no bitmap */
>  		return;
>  
> +	bitmap_wait_behind_writes(mddev);
> +
>  	mutex_lock(&mddev->bitmap_info.mutex);
>  	spin_lock(&mddev->lock);
>  	mddev->bitmap = NULL; /* disconnect from the md device */
> diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
> index 9f761097aab2..d15721ac07a6 100644
> --- a/drivers/md/bitmap.h
> +++ b/drivers/md/bitmap.h
> @@ -271,6 +271,7 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
>  int bitmap_copy_from_slot(struct mddev *mddev, int slot,
>  				sector_t *lo, sector_t *hi, bool clear_bits);
>  void bitmap_free(struct bitmap *bitmap);
> +void bitmap_wait_behind_writes(struct mddev *mddev);
>  #endif
>  
>  #endif
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 79a99a1c9ce7..dc131fabfc7c 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5534,15 +5534,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
>  
>  static void mddev_detach(struct mddev *mddev)
>  {
> -	struct bitmap *bitmap = mddev->bitmap;
> -	/* wait for behind writes to complete */
> -	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
> -		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
> -			 mdname(mddev));
> -		/* need to kick something here to make sure I/O goes? */
> -		wait_event(bitmap->behind_wait,
> -			   atomic_read(&bitmap->behind_writes) == 0);
> -	}
> +	bitmap_wait_behind_writes(mddev);
>  	if (mddev->pers && mddev->pers->quiesce) {
>  		mddev->pers->quiesce(mddev, 1);
>  		mddev->pers->quiesce(mddev, 0);
> @@ -5555,6 +5547,7 @@ static void mddev_detach(struct mddev *mddev)
>  static void __md_stop(struct mddev *mddev)
>  {
>  	struct md_personality *pers = mddev->pers;
> +	bitmap_destroy(mddev);
>  	mddev_detach(mddev);
>  	/* Ensure ->event_work is done */
>  	flush_workqueue(md_misc_wq);
> @@ -5575,7 +5568,6 @@ void md_stop(struct mddev *mddev)
>  	 * This is called from dm-raid
>  	 */
>  	__md_stop(mddev);
> -	bitmap_destroy(mddev);
>  	if (mddev->bio_set)
>  		bioset_free(mddev->bio_set);
>  }
> @@ -5713,7 +5705,6 @@ static int do_md_stop(struct mddev *mddev, int mode,
>  	if (mode == 0) {
>  		pr_info("md: %s stopped.\n", mdname(mddev));
>  
> -		bitmap_destroy(mddev);
>  		if (mddev->bitmap_info.file) {
>  			struct file *f = mddev->bitmap_info.file;
>  			spin_lock(&mddev->lock);
> -- 
> 2.6.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v1] md:fix a trivial typo in comments
From: Shaohua Li @ 2017-03-14 18:09 UTC (permalink / raw)
  To: Zhilong Liu
  Cc: shli, linux-raid, Jack Wang, Guoqing Jiang, John Stoffel, Coly Li
In-Reply-To: <1489477946-4202-1-git-send-email-zlliu@suse.com>

On Tue, Mar 14, 2017 at 03:52:26PM +0800, Zhilong Liu wrote:
> raid1.c: fix a trivial typo in comments of freeze_array().
> 
> Cc: Jack Wang <jack.wang.usish@gmail.com>
> Cc: Guoqing Jiang <gqjiang@suse.com>
> Cc: John Stoffel <john@stoffel.org>
> Cc: Coly Li <colyli@suse.de>
> Signed-off-by: Zhilong Liu <zlliu@suse.com>

applied, thanks

> ---
>  drivers/md/raid1.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index fbc2d78..92ca6de 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1027,7 +1027,7 @@ static int get_unqueued_pending(struct r1conf *conf)
>  static void freeze_array(struct r1conf *conf, int extra)
>  {
>  	/* Stop sync I/O and normal I/O and wait for everything to
> -	 * go quite.
> +	 * go quiet.
>  	 * This is called in two situations:
>  	 * 1) management command handlers (reshape, remove disk, quiesce).
>  	 * 2) one normal I/O request failed.
> -- 
> 2.6.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v6 0/4] Broadcom SBA RAID support
From: Shaohua Li @ 2017-03-14 18:48 UTC (permalink / raw)
  To: Dan Williams
  Cc: Anup Patel, Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback,
	dmaengine-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Device Tree,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <CAPcyv4hGGbhF-g8B__9=rsP+QwR6RBi=WqALLgABNb1M4D3rBw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Tue, Mar 14, 2017 at 09:56:35AM -0700, Dan Williams wrote:
> On Mon, Mar 6, 2017 at 1:43 AM, Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org> wrote:
> > The Broadcom SBA RAID is a stream-based device which provides
> > RAID5/6 offload.
> >
> > It requires a SoC specific ring manager (such as Broadcom FlexRM
> > ring manager) to provide ring-based programming interface. Due to
> > this, the Broadcom SBA RAID driver (mailbox client) implements
> > DMA device having one DMA channel using a set of mailbox channels
> > provided by Broadcom SoC specific ring manager driver (mailbox
> > controller).
> >
> > The Broadcom SBA RAID hardware requires PQ disk position instead
> > of PQ disk coefficient. To address this, we have added raid_gflog
> > table which will help driver to convert PQ disk coefficient to PQ
> > disk position.
> >
> > This patchset is based on Linux-4.11-rc1 and depends on patchset
> > "[PATCH v5 0/2] Broadcom FlexRM ring manager support"
> >
> > It is also available at sba-raid-v6 branch of
> > https://github.com/Broadcom/arm64-linux.git
> >
> [..]
> >
> > Anup Patel (4):
> >   lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position
> >   async_tx: Fix DMA_PREP_FENCE usage in do_async_gen_syndrome()
> >   dmaengine: Add Broadcom SBA RAID driver
> >   dt-bindings: Add DT bindings document for Broadcom SBA RAID driver
> 
> For the dmaengine and async_tx changes:
> 
> Acked-by: Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> 
> The raid change should get an ack from Shaohua.

For the raid6 part:

Acked-by: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] md/r5cache: flush data in memory during journal device failure
From: Song Liu @ 2017-03-14 22:40 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-raid, Shaohua Li, NeilBrown, Kernel Team,
	dan.j.williams@intel.com, hch@infradead.org
In-Reply-To: <20170314175048.rjyaufwgaclsmhdz@kernel.org>


> On Mar 14, 2017, at 10:50 AM, Shaohua Li <shli@kernel.org> wrote:
> 
> On Mon, Mar 13, 2017 at 04:36:26PM -0700, Song Liu wrote:
>> For the raid456 with writeback cache, when journal device failed during
>> normal operation, it is still possible to persist all data, as all
>> pending data is still in stripe cache. However, the stripe will be
>> marked as fail with s.log_failed. Thus, the write out from stripe cache
>> cannot make progress.
>> 
>> To unblock the write out in journal failures, this patch allows stripes
>> with data injournal to make progress.
> 
> what about the parity part? if log failed, we should skip journaling the parity.
> 
> Thanks,
> Shaohua
> 

For stripes with data in journal (not flushed yet), the state machine 
can flush them out. The behavior is just like when there are no journal 
at all. 

On the other hand, other writes will be gated by the log_failed flags, 
so the array appears to be read-only to upper layers. 

Thanks,
Song

>> The array should be read-only in journal failures. Therefore, pending
>> writes (in dev->towrite) are excluded in this write (in delay_towrite).
>> 
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>> drivers/md/raid5.c | 10 +++++++++-
>> 1 file changed, 9 insertions(+), 1 deletion(-)
>> 
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 3233975..447d9dd 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -3069,6 +3069,10 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
>>  *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
>>  *      no_space_stripes list.
>>  *
>> + *   3. during journal failure
>> + *      In journal failure, we try to flush all cached data to raid disks
>> + *      based on data in stripe cache. The array is read-only to upper
>> + *      layers, so we would skip all pending writes.
>>  */
>> static inline bool delay_towrite(struct r5conf *conf,
>> 				 struct r5dev *dev,
>> @@ -3082,6 +3086,9 @@ static inline bool delay_towrite(struct r5conf *conf,
>> 	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
>> 	    s->injournal > 0)
>> 		return true;
>> +	/* case 3 above */
>> +	if (s->log_failed && s->injournal)
>> +		return true;
>> 	return false;
>> }
>> 
>> @@ -4721,7 +4728,8 @@ static void handle_stripe(struct stripe_head *sh)
>> 	/* check if the array has lost more than max_degraded devices and,
>> 	 * if so, some requests might need to be failed.
>> 	 */
>> -	if (s.failed > conf->max_degraded || s.log_failed) {
>> +	if (s.failed > conf->max_degraded ||
>> +	    (s.log_failed && s.injournal == 0)) {
>> 		sh->check_state = 0;
>> 		sh->reconstruct_state = 0;
>> 		break_stripe_batch_list(sh, 0);
>> -- 
>> 2.9.3
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: on assembly and recovery of a hardware RAID
From: NeilBrown @ 2017-03-15  2:56 UTC (permalink / raw)
  To: Alfred Matthews, linux-raid
In-Reply-To: <CAAZLhTcEhn2GezDSDJHOLv2VNSynzT-he0FRrVNbz1xtakLsMQ@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 927 bytes --]

On Tue, Mar 14 2017, Alfred Matthews wrote:

>> Does
>>   dmraid -b /dev/sda /dev/sdb /dev/sdc
>> tell you anything useful?
>>
>
> # dmraid -b /dev/sdb /dev/sdc
> /dev/sdc:   5860533168 total, "WD-WCAWZ2927144"
> /dev/sdb:   5860533168 total, "WD-WCAWZ2939730"

No, not useful.

Your other output also doesn't show anything interesting.
I had another look at the lsdrv output you showed before and I'm
wondering if there really is anything "hardware RAID" here at all.

Both drives (sdb and sdc) are partitioned into a 200MB EFI partition, a
2.75TB hfsplus (Apple file system) / unknown partition, and a 128M
hfsplus boot partition.

Maybe the two 2.75TB paritions were raided together.
sdc looks like the "first" device if this were the case.
Try:

  mdadm --build /dev/md0 --level=0 -n2 --chunk=512M /dev/sdc2 /dev/sdb2

Then try fsck.hfs of hpfsck ... in the "hfsplus" package on Debian.
 hpfsck /dev/md0
maybe.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [RFC PATCH] md/raid10: refactor some codes from raid10_write_request
From: Guoqing Jiang @ 2017-03-15  3:02 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, shli, neilb
In-Reply-To: <20170314164836.mekttgqcw4smsgsw@kernel.org>



On 03/15/2017 12:48 AM, Shaohua Li wrote:
> On Mon, Mar 13, 2017 at 05:23:59PM +0800, Guoqing Jiang wrote:
>> Previously, we clone both bio and repl_bio in raid10_write_request,
>> then add the cloned bio to plug->pending or conf->pending_bio_list
>> based on plug or not, and most of the logics are same for the two
>> conditions.
>>
>> So introduce handle_clonebio (a better name is welcome) for it, and
>> use replacement parameter to distinguish the difference. No functional
>> changes in the patch.
>>
>> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
>> ---
>> Another reason for it is to improve the readability of code, but
>> I didn't touch raid10 before so this is labeled as RFC.
>>
>>   drivers/md/raid10.c | 172 ++++++++++++++++++++++------------------------------
>>   1 file changed, 72 insertions(+), 100 deletions(-)
>>
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index b1b1f982a722..02d8eff8d26e 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -1188,18 +1188,81 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
>>   	return;
>>   }
>>   
>> -static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>> -				 struct r10bio *r10_bio)
>> +static void handle_clonebio(struct mddev *mddev, struct r10bio *r10_bio,
>> +			    struct bio *bio, int i, int replacement,
>> +			    int max_sectors)
> Maybe raid10_write_one_disk?

Thanks for review, I will use it unless a better name is found.

>   Please replace 'i' with a meaningful name and
> change to 'boo' for replacement.

Ok, I would replace 'i' with 'n_copy', what is the meaning of 'boo'?
IMO, replacement fits better here, thanks.

+		if (r10_bio->devs[i].bio)
+			handle_clonebio(mddev, r10_bio, bio, i, 0, max_sectors);
+		if (r10_bio->devs[i].repl_bio)
+			handle_clonebio(mddev, r10_bio, bio, i, 1, max_sectors);



Regards,
Guoqing



^ permalink raw reply

* [md PATCH 00/15 v2] remove all abuse of bi_phys_segments
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch

This is a revised version of my series to remove all abuse of
bi_phys_segments from md/raid.

Most of the effort is for raid5 - raid1 and raid10 are
comparatively straight forward.

I've also include some optimization of md_write_start() which is now
used more heavily.  This now uses percpu-refcount, which needed a
small modification.

As noted in some of these changelogs, a couple of bugs are fixed
in raid1/raid10.  We should probably create fixes suitable for
-stable.

In this series there is a 'block' patch and a 'percpu-refcount' patch,
which should probably go in through other trees.  I might send those
off as appropriate soonish.

Thanks,
NeilBrown


---

NeilBrown (15):
      md/raid5: use md_write_start to count stripes, not bios
      md/raid5: simplfy delaying of writes while metadata is updated.
      md/raid5: call bio_endio() directly rather than queueing for later.
      block: trace completion of all bios.
      md/raid5: use bio_inc_remaining() instead of repurposing bi_phys_segments as a counter
      md/raid5: remove over-loading of ->bi_phys_segments.
      Revert "md/raid5: limit request size according to implementation limits"
      md/raid1,raid10: move rXbio accounting closer to allocation.
      md/raid10: stop using bi_phys_segments
      md/raid1: stop using bi_phys_segment
      md/raid5: don't test ->writes_pending in raid5_remove_disk
      md: factor out set_in_sync()
      md: close a race with setting mddev->in_sync
      percpu-refcount: support synchronous switch to atomic mode.
      MD: use per-cpu counter for writes_pending


 block/bio.c                     |    3 +
 drivers/md/dm.c                 |    1 
 drivers/md/md.c                 |  136 ++++++++++++++++++++-----------
 drivers/md/md.h                 |    4 +
 drivers/md/raid1.c              |   95 ++++++----------------
 drivers/md/raid10.c             |   82 ++++++-------------
 drivers/md/raid5-cache.c        |   14 +--
 drivers/md/raid5-log.h          |    2 
 drivers/md/raid5.c              |  170 +++++++++++----------------------------
 drivers/md/raid5.h              |   49 -----------
 include/linux/percpu-refcount.h |    1 
 lib/percpu-refcount.c           |   18 ++++
 12 files changed, 216 insertions(+), 359 deletions(-)

--
Signature


^ permalink raw reply

* [md PATCH 01/15] md/raid5: use md_write_start to count stripes, not bios
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

We use md_write_start() to increase the count of pending writes, and
md_write_end() to decrement the count.  We currently count bios
submitted to md/raid5.  Change it count stripe_heads that a WRITE bio
has been attached to.

So now, raid5_make_request() calls md_write_start() and then
md_write_end() to keep the count elevated during the setup of the
request.

add_stripe_bio() calls md_write_start() for each stripe_head, and the
completion routines always call md_write_end(), instead of only
calling it when raid5_dec_bi_active_stripes() returns 0.
make_discard_request also calls md_write_start/end().

The parallel between md_write_{start,end} and use of bi_phys_segments
can be seen in that:
 Whenever we set bi_phys_segments to 1, we now call md_write_start.
 Whenever we increment it on non-read requests with
   raid5_inc_bi_active_stripes(), we now call md_write_start().
 Whenever we decrement bi_phys_segments on non-read requsts with
    raid5_dec_bi_active_stripes(), we now call md_write_end().

This reduces our dependence on keeping a per-bio count of active
stripes in bi_phys_segments.

md_write_inc() is added which parallels md_write_start(), but requires
that a write has already been started, and is certain never to sleep.
This can be used inside a spinlocked region when adding to a write
request.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c          |   17 +++++++++++++++++
 drivers/md/md.h          |    1 +
 drivers/md/raid5-cache.c |    2 +-
 drivers/md/raid5.c       |   27 +++++++++++++--------------
 4 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index af9118711228..bad5771bced4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7916,6 +7916,23 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 }
 EXPORT_SYMBOL(md_write_start);
 
+/* md_write_inc can only be called when md_write_start() has
+ * already been called at least once of the current request.
+ * It increments the counter and is useful when a single request
+ * is split into several parts.  Each part causes an increment and
+ * so needs a matching md_write_end().
+ * Unlike md_write_start(), it is safe to call md_write_inc() inside
+ * a spinlocked region.
+ */
+void md_write_inc(struct mddev *mddev, struct bio *bi)
+{
+	if (bio_data_dir(bi) != WRITE)
+		return;
+	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
+	atomic_inc(&mddev->writes_pending);
+}
+EXPORT_SYMBOL(md_write_inc);
+
 void md_write_end(struct mddev *mddev)
 {
 	if (atomic_dec_and_test(&mddev->writes_pending)) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e0940064c3ec..0cd12721a536 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -648,6 +648,7 @@ extern void md_wakeup_thread(struct md_thread *thread);
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);
 extern void md_write_start(struct mddev *mddev, struct bio *bi);
+extern void md_write_inc(struct mddev *mddev, struct bio *bi);
 extern void md_write_end(struct mddev *mddev);
 extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
 extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 64493132470b..f5034ecb4e94 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -318,8 +318,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
 	while (wbi && wbi->bi_iter.bi_sector <
 	       dev->sector + STRIPE_SECTORS) {
 		wbi2 = r5_next_bio(wbi, dev->sector);
+		md_write_end(conf->mddev);
 		if (!raid5_dec_bi_active_stripes(wbi)) {
-			md_write_end(conf->mddev);
 			bio_list_add(return_bi, wbi);
 		}
 		wbi = wbi2;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1c554a811d20..cc2d039b4aae 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3273,6 +3273,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		bi->bi_next = *bip;
 	*bip = bi;
 	raid5_inc_bi_active_stripes(bi);
+	md_write_inc(conf->mddev, bi);
 
 	if (forwrite) {
 		/* check if page is covered */
@@ -3396,10 +3397,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 
 			bi->bi_error = -EIO;
-			if (!raid5_dec_bi_active_stripes(bi)) {
-				md_write_end(conf->mddev);
+			md_write_end(conf->mddev);
+			if (!raid5_dec_bi_active_stripes(bi))
 				bio_list_add(return_bi, bi);
-			}
 			bi = nextbi;
 		}
 		if (bitmap_end)
@@ -3420,10 +3420,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 
 			bi->bi_error = -EIO;
-			if (!raid5_dec_bi_active_stripes(bi)) {
-				md_write_end(conf->mddev);
+			md_write_end(conf->mddev);
+			if (!raid5_dec_bi_active_stripes(bi))
 				bio_list_add(return_bi, bi);
-			}
 			bi = bi2;
 		}
 
@@ -3780,10 +3779,9 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 				while (wbi && wbi->bi_iter.bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
-					if (!raid5_dec_bi_active_stripes(wbi)) {
-						md_write_end(conf->mddev);
+					md_write_end(conf->mddev);
+					if (!raid5_dec_bi_active_stripes(wbi))
 						bio_list_add(return_bi, wbi);
-					}
 					wbi = wbi2;
 				}
 				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -5486,6 +5484,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+	md_write_start(mddev, bi);
 
 	stripe_sectors = conf->chunk_sectors *
 		(conf->raid_disks - conf->max_degraded);
@@ -5532,6 +5531,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 			sh->dev[d].towrite = bi;
 			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
 			raid5_inc_bi_active_stripes(bi);
+			md_write_inc(mddev, bi);
 			sh->overwrite_disks++;
 		}
 		spin_unlock_irq(&sh->stripe_lock);
@@ -5554,9 +5554,9 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		release_stripe_plug(mddev, sh);
 	}
 
+	md_write_end(mddev);
 	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
-		md_write_end(mddev);
 		bio_endio(bi);
 	}
 }
@@ -5591,8 +5591,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 		do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}
 
-	md_write_start(mddev, bi);
-
 	/*
 	 * If array is degraded, better not do chunk aligned read because
 	 * later we might have to read it again in order to reconstruct
@@ -5614,6 +5612,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
+	md_write_start(mddev, bi);
 
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5748,11 +5747,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	}
 	finish_wait(&conf->wait_for_overlap, &w);
 
+	if (rw == WRITE)
+		md_write_end(mddev);
 	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
 
-		if ( rw == WRITE )
-			md_write_end(mddev);
 
 		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 					 bi, 0);



^ permalink raw reply related

* [md PATCH 02/15] md/raid5: simplfy delaying of writes while metadata is updated.
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

If a device fails during a write, we must ensure the failure is
recorded in the metadata before the completion of the write is
acknowleged.

Commit c3cce6cda162 ("md/raid5: ensure device failure recorded before
write request returns.")  added code for this, but it was
unnecessarily complicated.  We already had similar functionality for
handling updates to the bad-block-list, thanks to Commit de393cdea66c
("md: make it easier to wait for bad blocks to be acknowledged.")

So revert most of the former commit, and instead avoid collecting
completed writes if MD_CHANGE_PENDING is set.  raid5d() will then flush
the metadata and retry the stripe_head.
As this change can leave a stripe_head ready for handling immediately
after handle_active_stripes() returns, we change raid5_do_work() to
pause when MD_CHANGE_PENDING is set, so that it doesn't spin.

We check MD_CHANGE_PENDING *after* analyse_stripe() as it could be set
asynchronously.  After analyse_stripe(), we have collected stable data
about the state of devices, which will be used to make decisions.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c |   31 ++++++++-----------------------
 drivers/md/raid5.h |    3 ---
 2 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cc2d039b4aae..f990f74901d2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4690,7 +4690,8 @@ static void handle_stripe(struct stripe_head *sh)
 	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
 		goto finish;
 
-	if (s.handle_bad_blocks) {
+	if (s.handle_bad_blocks ||
+	    test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
 		set_bit(STRIPE_HANDLE, &sh->state);
 		goto finish;
 	}
@@ -5020,15 +5021,8 @@ static void handle_stripe(struct stripe_head *sh)
 			md_wakeup_thread(conf->mddev->thread);
 	}
 
-	if (!bio_list_empty(&s.return_bi)) {
-		if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
-			spin_lock_irq(&conf->device_lock);
-			bio_list_merge(&conf->return_bi, &s.return_bi);
-			spin_unlock_irq(&conf->device_lock);
-			md_wakeup_thread(conf->mddev->thread);
-		} else
-			return_io(&s.return_bi);
-	}
+	if (!bio_list_empty(&s.return_bi))
+		return_io(&s.return_bi);
 
 	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
@@ -6225,6 +6219,7 @@ static void raid5_do_work(struct work_struct *work)
 	struct r5worker *worker = container_of(work, struct r5worker, work);
 	struct r5worker_group *group = worker->group;
 	struct r5conf *conf = group->conf;
+	struct mddev *mddev = conf->mddev;
 	int group_id = group - conf->worker_groups;
 	int handled;
 	struct blk_plug plug;
@@ -6245,6 +6240,9 @@ static void raid5_do_work(struct work_struct *work)
 		if (!batch_size && !released)
 			break;
 		handled += batch_size;
+		wait_event_lock_irq(mddev->sb_wait,
+				    !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags),
+				    conf->device_lock);
 	}
 	pr_debug("%d stripes handled\n", handled);
 
@@ -6272,18 +6270,6 @@ static void raid5d(struct md_thread *thread)
 
 	md_check_recovery(mddev);
 
-	if (!bio_list_empty(&conf->return_bi) &&
-	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-		struct bio_list tmp = BIO_EMPTY_LIST;
-		spin_lock_irq(&conf->device_lock);
-		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-			bio_list_merge(&tmp, &conf->return_bi);
-			bio_list_init(&conf->return_bi);
-		}
-		spin_unlock_irq(&conf->device_lock);
-		return_io(&tmp);
-	}
-
 	blk_start_plug(&plug);
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
@@ -6935,7 +6921,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
-	bio_list_init(&conf->return_bi);
 	init_llist_head(&conf->released_stripes);
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ba5b7a3790af..13800dc9dd88 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -638,9 +638,6 @@ struct r5conf {
 	int			skip_copy; /* Don't copy data from bio to stripe cache */
 	struct list_head	*last_hold; /* detect hold_list promotions */
 
-	/* bios to have bi_end_io called after metadata is synced */
-	struct bio_list		return_bi;
-
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
 	/* unfortunately we need two cache names as we temporarily have
 	 * two caches.



^ permalink raw reply related

* [md PATCH 03/15] md/raid5: call bio_endio() directly rather than queueing for later.
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

We currently gather bios that need to be returned into a bio_list
and call bio_endio() on them all together.
The original reason for this was to avoid making the calls while
holding a spinlock.
Locking has changed a lot since then, and that reason is no longer
valid.

So discard return_io() and various return_bi lists, and just call
bio_endio() directly as needed.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5-cache.c |   13 +++++--------
 drivers/md/raid5-log.h   |    2 +-
 drivers/md/raid5.c       |   38 ++++++++++----------------------------
 drivers/md/raid5.h       |    1 -
 4 files changed, 16 insertions(+), 38 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index f5034ecb4e94..5be8dbc5d91b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -308,8 +308,7 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 }
 
 static void
-r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
-			      struct bio_list *return_bi)
+r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
 {
 	struct bio *wbi, *wbi2;
 
@@ -319,23 +318,21 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
 	       dev->sector + STRIPE_SECTORS) {
 		wbi2 = r5_next_bio(wbi, dev->sector);
 		md_write_end(conf->mddev);
-		if (!raid5_dec_bi_active_stripes(wbi)) {
-			bio_list_add(return_bi, wbi);
-		}
+		if (!raid5_dec_bi_active_stripes(wbi))
+			bio_endio(wbi);
 		wbi = wbi2;
 	}
 }
 
 void r5c_handle_cached_data_endio(struct r5conf *conf,
-	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
+				  struct stripe_head *sh, int disks)
 {
 	int i;
 
 	for (i = sh->disks; i--; ) {
 		if (sh->dev[i].written) {
 			set_bit(R5_UPTODATE, &sh->dev[i].flags);
-			r5c_return_dev_pending_writes(conf, &sh->dev[i],
-						      return_bi);
+			r5c_return_dev_pending_writes(conf, &sh->dev[i]);
 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 					STRIPE_SECTORS,
 					!test_bit(STRIPE_DEGRADED, &sh->state),
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 4f5a0f4e0b1f..738930ff5d17 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -21,7 +21,7 @@ extern void r5c_release_extra_page(struct stripe_head *sh);
 extern void r5c_use_extra_page(struct stripe_head *sh);
 extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
 extern void r5c_handle_cached_data_endio(struct r5conf *conf,
-	struct stripe_head *sh, int disks, struct bio_list *return_bi);
+	struct stripe_head *sh, int disks);
 extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
 extern void r5c_make_stripe_write_out(struct stripe_head *sh);
 extern void r5c_flush_cache(struct r5conf *conf, int num);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f990f74901d2..f07cd105b9f9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -158,17 +158,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 	return slot;
 }
 
-static void return_io(struct bio_list *return_bi)
-{
-	struct bio *bi;
-	while ((bi = bio_list_pop(return_bi)) != NULL) {
-		bi->bi_iter.bi_size = 0;
-		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-					 bi, 0);
-		bio_endio(bi);
-	}
-}
-
 static void print_raid5_conf (struct r5conf *conf);
 
 static int stripe_operations_active(struct stripe_head *sh)
@@ -1310,7 +1299,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 static void ops_complete_biofill(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
-	struct bio_list return_bi = BIO_EMPTY_LIST;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
@@ -1335,15 +1323,13 @@ static void ops_complete_biofill(void *stripe_head_ref)
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
 				if (!raid5_dec_bi_active_stripes(rbi))
-					bio_list_add(&return_bi, rbi);
+					bio_endio(rbi);
 				rbi = rbi2;
 			}
 		}
 	}
 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
-	return_io(&return_bi);
-
 	set_bit(STRIPE_HANDLE, &sh->state);
 	raid5_release_stripe(sh);
 }
@@ -3350,8 +3336,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 
 static void
 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
-				struct stripe_head_state *s, int disks,
-				struct bio_list *return_bi)
+		     struct stripe_head_state *s, int disks)
 {
 	int i;
 	BUG_ON(sh->batch_head);
@@ -3399,7 +3384,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
 			if (!raid5_dec_bi_active_stripes(bi))
-				bio_list_add(return_bi, bi);
+				bio_endio(bi);
 			bi = nextbi;
 		}
 		if (bitmap_end)
@@ -3422,7 +3407,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
 			if (!raid5_dec_bi_active_stripes(bi))
-				bio_list_add(return_bi, bi);
+				bio_endio(bi);
 			bi = bi2;
 		}
 
@@ -3448,7 +3433,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 
 				bi->bi_error = -EIO;
 				if (!raid5_dec_bi_active_stripes(bi))
-					bio_list_add(return_bi, bi);
+					bio_endio(bi);
 				bi = nextbi;
 			}
 		}
@@ -3747,7 +3732,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
  * never LOCKED, so we don't need to test 'failed' directly.
  */
 static void handle_stripe_clean_event(struct r5conf *conf,
-	struct stripe_head *sh, int disks, struct bio_list *return_bi)
+	struct stripe_head *sh, int disks)
 {
 	int i;
 	struct r5dev *dev;
@@ -3781,7 +3766,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					md_write_end(conf->mddev);
 					if (!raid5_dec_bi_active_stripes(wbi))
-						bio_list_add(return_bi, wbi);
+						bio_endio(wbi);
 					wbi = wbi2;
 				}
 				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -4724,7 +4709,7 @@ static void handle_stripe(struct stripe_head *sh)
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
 		if (s.to_read+s.to_write+s.written)
-			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+			handle_failed_stripe(conf, sh, &s, disks);
 		if (s.syncing + s.replacing)
 			handle_failed_sync(conf, sh, &s);
 	}
@@ -4790,10 +4775,10 @@ static void handle_stripe(struct stripe_head *sh)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
 			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
 				 test_bit(R5_Discard, &qdev->flags))))))
-		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+		handle_stripe_clean_event(conf, sh, disks);
 
 	if (s.just_cached)
-		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+		r5c_handle_cached_data_endio(conf, sh, disks);
 	log_stripe_write_finished(sh);
 
 	/* Now we might consider reading some blocks, either to check/generate
@@ -5021,9 +5006,6 @@ static void handle_stripe(struct stripe_head *sh)
 			md_wakeup_thread(conf->mddev->thread);
 	}
 
-	if (!bio_list_empty(&s.return_bi))
-		return_io(&s.return_bi);
-
 	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 13800dc9dd88..fd5c21cde77f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -278,7 +278,6 @@ struct stripe_head_state {
 	int dec_preread_active;
 	unsigned long ops_request;
 
-	struct bio_list return_bi;
 	struct md_rdev *blocked_rdev;
 	int handle_bad_blocks;
 	int log_failed;



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox