Linux RAID subsystem development
 help / color / mirror / Atom feed
* [PATCH 13/13] md/multipath: Replace printk() calls by the usage of higher level interfaces
From: SF Markus Elfring @ 2016-10-02 12:10 UTC (permalink / raw)
  To: linux-raid, Jens Axboe, NeilBrown, Shaohua Li
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <4a31d7a7-f70c-12f7-202f-963bd8706066@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sun, 2 Oct 2016 12:42:46 +0200

1. Add a definition for the macros "MY_LOG_PREFIX" and "pr_fmt"
   so that their information can be used for consistent message output.

2. Prefer usage of some higher level macros over calling "printk" directly
   in this software module.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/multipath.c | 69 ++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 85f6c85..045b866 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -19,6 +19,8 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define MY_LOG_PREFIX KBUILD_MODNAME ": "
+#define pr_fmt(fmt) MY_LOG_PREFIX fmt
 #include <linux/blkdev.h>
 #include <linux/module.h>
 #include <linux/raid/md_u.h>
@@ -51,8 +53,7 @@ static int multipath_map(struct mpconf *conf)
 		}
 	}
 	rcu_read_unlock();
-
-	printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
+	pr_err("map: no more operational IO paths?\n");
 	return (-1);
 }
 
@@ -97,7 +98,8 @@ static void multipath_end_request(struct bio *bio)
 		 */
 		char b[BDEVNAME_SIZE];
 		md_error(mp_bh->mddev, rdev);
-		printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
+
+		pr_err("%s: rescheduling sector %llu\n",
 		       bdevname(rdev->bdev, b),
 		       (unsigned long long)bio->bi_iter.bi_sector);
 		multipath_reschedule_retry(mp_bh);
@@ -195,8 +197,7 @@ static void multipath_error(struct mddev *mddev, struct md_rdev *rdev)
 		 * first check if this is a queued request for a device
 		 * which has just failed.
 		 */
-		printk(KERN_ALERT
-		       "multipath: only one IO path left and IO error.\n");
+		pr_alert("only one IO path left and IO error.\n");
 		/* leave it active... it's all we have */
 		return;
 	}
@@ -211,12 +212,10 @@ static void multipath_error(struct mddev *mddev, struct md_rdev *rdev)
 	}
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	printk(KERN_ALERT "multipath: IO failure on %s,"
-	       " disabling IO path.\n"
-	       "multipath: Operation continuing"
-	       " on %d IO paths.\n",
-	       bdevname(rdev->bdev, b),
-	       conf->raid_disks - mddev->degraded);
+	pr_alert("IO failure on %s, disabling IO path.\n"
+		 MY_LOG_PREFIX "Operation continuing on %d IO paths.\n",
+		 bdevname(rdev->bdev, b),
+		 conf->raid_disks - mddev->degraded);
 }
 
 static void print_multipath_conf(struct mpconf *conf)
@@ -224,21 +223,22 @@ static void print_multipath_conf(struct mpconf *conf)
 	int i;
 	struct multipath_info *tmp;
 
-	printk("MULTIPATH conf printout:\n");
+	pr_info("conf printout:\n");
 	if (!conf) {
-		printk("(conf==NULL)\n");
+		pr_info("(conf==NULL)\n");
 		return;
 	}
-	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
-			 conf->raid_disks);
+	pr_info("--- wd:%d rd:%d\n",
+		conf->raid_disks - conf->mddev->degraded,
+		conf->raid_disks);
 
 	for (i = 0; i < conf->raid_disks; i++) {
 		char b[BDEVNAME_SIZE];
 		tmp = conf->multipaths + i;
 		if (tmp->rdev)
-			printk(" disk%d, o:%d, dev:%s\n",
-			       i, !test_bit(Faulty, &tmp->rdev->flags),
-			       bdevname(tmp->rdev->bdev, b));
+			pr_info("disk%d, o:%d, dev:%s\n",
+				i, !test_bit(Faulty, &tmp->rdev->flags),
+				bdevname(tmp->rdev->bdev, b));
 	}
 }
 
@@ -295,8 +295,8 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev == p->rdev) {
 		if (test_bit(In_sync, &rdev->flags) ||
 		    atomic_read(&rdev->nr_pending)) {
-			printk(KERN_ERR "hot-remove-disk, slot %d is identified"
-			       " but is still operational!\n", number);
+			pr_err("hot-remove-disk, slot %d is identified but is still operational!\n",
+			       number);
 			err = -EBUSY;
 			goto abort;
 		}
@@ -350,16 +350,14 @@ static void multipathd(struct md_thread *thread)
 
 		mp_bh->path = multipath_map(conf);
 		if (mp_bh->path < 0) {
-			printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
-				" error for block %llu\n",
-				bdevname(bio->bi_bdev, b),
-				(unsigned long long)bio->bi_iter.bi_sector);
+			pr_alert("%s: unrecoverable IO read error for block %llu\n",
+				 bdevname(bio->bi_bdev, b),
+				 (unsigned long long)bio->bi_iter.bi_sector);
 			multipath_end_bh_io(mp_bh, -EIO);
 		} else {
-			printk(KERN_ERR "multipath: %s: redirecting sector %llu"
-				" to another IO path\n",
-				bdevname(bio->bi_bdev, b),
-				(unsigned long long)bio->bi_iter.bi_sector);
+			pr_err("%s: redirecting sector %llu to another IO path\n",
+			       bdevname(bio->bi_bdev, b),
+			       (unsigned long long)bio->bi_iter.bi_sector);
 			*bio = *(mp_bh->master_bio);
 			bio->bi_iter.bi_sector +=
 				conf->multipaths[mp_bh->path].rdev->data_offset;
@@ -393,8 +391,8 @@ static int multipath_run(struct mddev *mddev)
 		return -EINVAL;
 
 	if (mddev->level != LEVEL_MULTIPATH) {
-		printk("multipath: %s: raid level not set to multipath IO (%d)\n",
-		       mdname(mddev), mddev->level);
+		pr_notice("%s: raid level not set to multipath IO (%d)\n",
+			  mdname(mddev), mddev->level);
 		goto out;
 	}
 	/*
@@ -436,8 +434,7 @@ static int multipath_run(struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->retry_list);
 
 	if (!working_disks) {
-		printk(KERN_ERR "multipath: no operational IO paths for %s\n",
-			mdname(mddev));
+		pr_err("no operational IO paths for %s\n", mdname(mddev));
 		goto free_multipaths;
 	}
 	mddev->degraded = conf->raid_disks - working_disks;
@@ -451,10 +448,10 @@ static int multipath_run(struct mddev *mddev)
 	if (!mddev->thread)
 		goto destroy_pool;
 
-	printk(KERN_INFO
-		"multipath: array %s active with %d out of %d IO paths\n",
-		mdname(mddev), conf->raid_disks - mddev->degraded,
-	       mddev->raid_disks);
+	pr_info("array %s active with %d out of %d IO paths\n",
+		mdname(mddev),
+		conf->raid_disks - mddev->degraded,
+		mddev->raid_disks);
 	/*
 	 * Ok, everything is just fine now
 	 */
-- 
2.10.0


^ permalink raw reply related

* Re: WARNING: mismatch_cnt is not 0 on <array device>
From: Benjammin2068 @ 2016-10-02 17:33 UTC (permalink / raw)
  To: Linux-RAID
In-Reply-To: <b87f3f0d-e9a9-8ece-d708-9b8730157bfb@gmail.com>


On 09/28/2016 10:55 AM, Benjammin2068 wrote:
> On 09/27/2016 06:09 PM, Adam Goryachev wrote:
>> Just out of interest, but I'm not sure how useful your munin monitoring will be... AFAIK, the mismatch_cnt value is only updated when you run a check, which would probably take some number of hours to complete. I would guess that you are unlikely to run more than one check a week or month.... and as soon as there is any change (unless you know the explanation) then you should be looking to resolve that.
>>
>> Unless of course I'm wrong about when the count is updated?
> You would be correct - only during checks - but it's an easy way for me to just keep track of it..
>
> And a check takes about 3hrs on this array. (and it happens every weekend)
>
>

Ok... so as an update....

I ran 2 checks during the week after fixing the thermal issues I was seeing that I think were part of the problem. Both checks resulted in the mismatch_cnt remaining at 0.

The usual CRON automated check ran this weekend -- and bumped the count to 24 on my RAID6 (there's a RAID1 swap that went to 160, but I've read for something like SWAP, this can be ignored. I just thought I'd mention it since it feels a bit like Schrodinger's drive array.)

So... I've emailed the Vendor for a replacement since this SAS/SATA add-on card is new. And maybe it's just flaky.

In the meantime, is it possible to shrink the array back to RAID5 (4 members) so I can run it on the MB's controller only -- where I never seemed to have a mismatch count problem? (I kind of need the array to be up and running since I need to work. :P)

Also -- for academic reasons, is it possible to get a list of the blocks that cause the count to increase, to try to confirm it's the new add-on card that's the problem?
If there's a way to list what blocks and drive that were the issue, then I could go check those files....

Otherwise, the mdadm reports the array as clean and happy and FSCK reports the partition as clean and happy.

How disconcerting is that?

Thanks,

 -Ben


^ permalink raw reply

* Re: [PATCH] Fix bus error when accessing MBR partition records
From: NeilBrown @ 2016-10-02 22:32 UTC (permalink / raw)
  To: linux-raid; +Cc: James Clarke
In-Reply-To: <20160929122838.66975-1-jrtc27@jrtc27.com>

[-- Attachment #1: Type: text/plain, Size: 3947 bytes --]

On Thu, Sep 29 2016, James Clarke wrote:

> Since the MBR layout only has partition records as 2-byte aligned, the 32-bit
> fields in them are not aligned. Thus, they cannot be accessed on some
> architectures (such as SPARC) by using a "struct MBR_part_record *" pointer,
> as the compiler can assume that the pointer is properly aligned. Instead, the
> records must be accessed by going through the MBR struct itself every time.

Weird....

Can you see if adding "__attribute__((packed))" to struct
MBR_part_record also fixes the problem?

It seems strange that the compiler lets you take a pointer, but then
doesn't use it correctly.  Maybe it is an inconsistency in the types.

I don't necessarily disagree with your fix, but I'd like to understand
why the current code is wrong.

Thanks,
NeilBrown


>
> Signed-off-by: James Clarke <jrtc27@jrtc27.com>
> ---
>  super-mbr.c |  6 ++++++
>  util.c      | 14 +++++++-------
>  2 files changed, 13 insertions(+), 7 deletions(-)
>
> diff --git a/super-mbr.c b/super-mbr.c
> index 62b3f03..303dde4 100644
> --- a/super-mbr.c
> +++ b/super-mbr.c
> @@ -57,6 +57,9 @@ static void examine_mbr(struct supertype *st, char *homehost)
>  
>  	printf("   MBR Magic : %04x\n", sb->magic);
>  	for (i = 0; i < MBR_PARTITIONS; i++)
> +		/* Have to make every access through sb rather than using a pointer to
> +		 * the partition table (or an entry), since the entries are not
> +		 * properly aligned. */
>  		if (sb->parts[i].blocks_num)
>  			printf("Partition[%d] : %12lu sectors at %12lu (type %02x)\n",
>  			       i,
> @@ -151,6 +154,9 @@ static void getinfo_mbr(struct supertype *st, struct mdinfo *info, char *map)
>  	info->component_size = 0;
>  
>  	for (i = 0; i < MBR_PARTITIONS ; i++)
> +		/* Have to make every access through sb rather than using a pointer to
> +		 * the partition table (or an entry), since the entries are not
> +		 * properly aligned. */
>  		if (sb->parts[i].blocks_num) {
>  			unsigned long last =
>  				(unsigned long)__le32_to_cpu(sb->parts[i].blocks_num)
> diff --git a/util.c b/util.c
> index a238a21..08adbd5 100644
> --- a/util.c
> +++ b/util.c
> @@ -1412,7 +1412,6 @@ static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart)
>  static int get_last_partition_end(int fd, unsigned long long *endofpart)
>  {
>  	struct MBR boot_sect;
> -	struct MBR_part_record *part;
>  	unsigned long long curr_part_end;
>  	unsigned part_nr;
>  	int retval = 0;
> @@ -1429,21 +1428,22 @@ static int get_last_partition_end(int fd, unsigned long long *endofpart)
>  	if (boot_sect.magic == MBR_SIGNATURE_MAGIC) {
>  		retval = 1;
>  		/* found the correct signature */
> -		part = boot_sect.parts;
>  
>  		for (part_nr = 0; part_nr < MBR_PARTITIONS; part_nr++) {
> +			/* Have to make every access through boot_sect rather than using a
> +			 * pointer to the partition table (or an entry), since the entries
> +			 * are not properly aligned. */
> +
>  			/* check for GPT type */
> -			if (part->part_type == MBR_GPT_PARTITION_TYPE) {
> +			if (boot_sect.parts[part_nr].part_type == MBR_GPT_PARTITION_TYPE) {
>  				retval = get_gpt_last_partition_end(fd, endofpart);
>  				break;
>  			}
>  			/* check the last used lba for the current partition  */
> -			curr_part_end = __le32_to_cpu(part->first_sect_lba) +
> -				__le32_to_cpu(part->blocks_num);
> +			curr_part_end = __le32_to_cpu(boot_sect.parts[part_nr].first_sect_lba) +
> +				__le32_to_cpu(boot_sect.parts[part_nr].blocks_num);
>  			if (curr_part_end > *endofpart)
>  				*endofpart = curr_part_end;
> -
> -			part++;
>  		}
>  	} else {
>  		/* Unknown partition table */
> -- 
> 2.10.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

* Prefetch in /lib/raid6/avx2.c
From: Doug Dumitru @ 2016-10-02 22:40 UTC (permalink / raw)
  To: linux-raid

I have been doing some high bandwidth testing of raid-6, and the
prefetch in raid6_avx24_gen_syndrome appears to be less than optimal.

This is my patch (against 4.4.0-38 [Ubuntu 16.04 LTS])

--- cut here ---
--- lib/raid6/avx2.c0   2016-10-01 21:42:25.280347868 -0700
+++ lib/raid6/avx2.c    2016-10-02 15:35:48.168480760 -0700
@@ -189,10 +189,8 @@

                for (z = z0; z >= 0; z--) {

-                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
-                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
-                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
-                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
+                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+128]));
+                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+192]));

                        asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
--- cut here ---

In perf, the cpu cycles go from 5.3% to 3.0% for
raid6_avx24_gen_syndrome in my test and throughput increases from
about 8.2GB/sec to almost 10GB/sec.  It is a very "synthetic" test,
but the avx2 code does seem to be a factor.

I suspect other SSE and AVX "unroll variants" have similar issues, but
I have not tested those.

My test system is an E5-1650 v3 (single socket) with DDR4.  This might
help dual sockets even more.

Doug


-- 
Doug Dumitru
EasyCo LLC

^ permalink raw reply

* Re: [PATCH] Fix bus error when accessing MBR partition records
From: James Clarke @ 2016-10-02 23:00 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, debian-sparc
In-Reply-To: <87r37y5qmg.fsf@notabene.neil.brown.name>

Hi Neil,

> On 2 Oct 2016, at 23:32, NeilBrown <neilb@suse.com> wrote:
> 
>> On Thu, Sep 29 2016, James Clarke wrote:
>> 
>> Since the MBR layout only has partition records as 2-byte aligned, the 32-bit
>> fields in them are not aligned. Thus, they cannot be accessed on some
>> architectures (such as SPARC) by using a "struct MBR_part_record *" pointer,
>> as the compiler can assume that the pointer is properly aligned. Instead, the
>> records must be accessed by going through the MBR struct itself every time.
> 
> Weird....
> 
> Can you see if adding "__attribute__((packed))" to struct
> MBR_part_record also fixes the problem?

That also works. When I wrote the patch initially, I wasn’t sure if it was a
"correct" fix, but having looked into it more I *believe* it is conformant. The
alignment of a packed struct is 1-byte, so, while the compiler may know that the
32-bit fields are 8-byte aligned within the struct, the pointer to the struct
need not be aligned, and so the correct conservative code is generated.

> It seems strange that the compiler lets you take a pointer, but then
> doesn't use it correctly.  Maybe it is an inconsistency in the types.

Yes, the type doesn’t include the provenance of the pointer, so in general the
compiler can’t know it came from a packed struct (although in this case not much
static analysis would be needed). See [1] and [2].

> I don't necessarily disagree with your fix, but I'd like to understand
> why the current code is wrong.

Hopefully the links make it clearer.

Regards,
James

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51628
[2] https://llvm.org/bugs/show_bug.cgi?id=22821

^ permalink raw reply

* Corrupted FS after RAID5->6 reshape
From: Marcus Fong @ 2016-10-03  1:07 UTC (permalink / raw)
  To: linux-raid

Hi, all.

I’ve had a 5-drive RAID5 mdraid array that’s been running fine for several years, but recently had a drive begin to fail with SMART errors. I replaced the drive, then decided to add another drive and reshape the array to RAID6 for added redundancy in case any of the other drives failed in future.

I put the backup file on an external USB drive without realising that mdraid would be writing to the backup file throughout the reshape operation, and the USB drive disconnected itself midway through the reshape. The reshape continued on without any apparent critical errors, though, so I left it rather than try to stop and resume it.

The reshape appeared to complete this morning without reporting any issues, but when I rebooted the machine it started in emergency mode because it failed to mount the filesystems on the RAID array. In one case mount can’t find the ext4 superblock at all, in two other cases the ext4 filesystem can be found but appears corrupted.

I’m just wondering if anyone might have any suggestions about how best to attempt recovery. There’s nothing on the array I ultimately can’t live without, but I’d like to retrieve any data I can.

/var/log/syslog output when starting the reshape operation:

Sep 28 11:19:22 bibliotheca kernel: [  706.039584] md/raid:md0: raid level 6 active with 5 out of 6 devices, algorithm 18
Sep 28 11:19:23 bibliotheca mdadm[889]: RebuildStarted event detected on md device /dev/md0
Sep 28 11:19:23 bibliotheca kernel: [  706.952130] md: reshape of RAID array md0
Sep 28 11:19:23 bibliotheca kernel: [  706.952136] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
Sep 28 11:19:23 bibliotheca kernel: [  706.952139] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for reshape.
Sep 28 11:19:23 bibliotheca kernel: [  706.952147] md: using 128k window, over a total of 3906885120k.
Sep 28 11:19:25 bibliotheca mdadm[889]: RebuildFinished event detected on md device /dev/md0
Sep 28 11:36:05 bibliotheca mdadm[889]: RebuildStarted event detected on md device /dev/md0

/var/log/syslog output when the USB drive disconnected:

Sep 29 16:54:00 bibliotheca kernel: [107184.545094] EXT4-fs warning (device sdh1): ext4_end_bio:329: I/O error -5 writing to inode 12 (offset 0 size 0 starting
block 35329)
Sep 29 16:54:00 bibliotheca kernel: [107184.545102] Buffer I/O error on device sdh1, logical block 34817
Sep 29 16:54:00 bibliotheca kernel: [107184.545149] Buffer I/O error on device sdh1, logical block 34818
Sep 29 16:54:00 bibliotheca kernel: [107184.545183] Buffer I/O error on device sdh1, logical block 34819
Sep 29 16:54:00 bibliotheca kernel: [107184.545217] Buffer I/O error on device sdh1, logical block 34820
…
Sep 29 16:54:00 bibliotheca kernel: [107184.548901] Aborting journal on device sdh1-8.
Sep 29 16:54:00 bibliotheca kernel: [107184.549451] JBD2: Error -5 detected when updating journal superblock for sdh1-8.
Sep 29 16:54:00 bibliotheca kernel: [107184.550633] EXT4-fs error (device sdh1): ext4_journal_check_start:56: Detected aborted journal
Sep 29 16:54:00 bibliotheca kernel: [107184.551804] EXT4-fs (sdh1): Remounting filesystem read-only
Sep 29 16:54:00 bibliotheca kernel: [107184.552420] EXT4-fs (sdh1): previous I/O error to superblock detected
Sep 29 16:54:00 bibliotheca kernel: [107184.553155] EXT4-fs (sdh1): ext4_writepages: jbd2_start: 9223372036854775807 pages, ino 12; err -30

/var/log/syslog output when the reshape finished:

Oct  3 04:27:10 bibliotheca kernel: [404373.812923] md: md0: reshape done.
Oct  3 04:27:10 bibliotheca mdadm[889]: RebuildFinished event detected on md device /dev/md0
Oct  3 04:27:10 bibliotheca kernel: [404373.836909] md: recovery of RAID array md0
Oct  3 04:27:10 bibliotheca kernel: [404373.836915] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
Oct  3 04:27:10 bibliotheca kernel: [404373.836918] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery.
Oct  3 04:27:10 bibliotheca kernel: [404373.836926] md: using 128k window, over a total of 3906885120k.
Oct  3 04:27:10 bibliotheca kernel: [404373.836930] md: resuming recovery of md0 from checkpoint.
Oct  3 04:27:10 bibliotheca kernel: [404373.836940] md: md0: recovery done.
Oct  3 04:27:10 bibliotheca kernel: [404373.837219] EXT4-fs error (device sdh1): ext4_wait_block_bitmap:503: comm mdadm: Cannot read block bitmap - block_group = 1, block_bitmap = 1026
Oct  3 04:27:10 bibliotheca kernel: [404373.839207] EXT4-fs error (device sdh1): ext4_discard_preallocations:4021: comm mdadm: Error loading buddy information for 1
Oct  3 04:27:11 bibliotheca mdadm[889]: RebuildStarted event detected on md device /dev/md0
Oct  3 04:27:11 bibliotheca mdadm[889]: RebuildFinished event detected on md device /dev/md0
Oct  3 04:27:11 bibliotheca mdadm[889]: SpareActive event detected on md device /dev/md0, component device /dev/sdg

mdadm and lsdrv output:

$ sudo mdadm -D /dev/md0
/dev/md0:
        Version : 1.2
  Creation Time : Wed Jun 26 21:00:29 2013
     Raid Level : raid6
     Array Size : 15627540480 (14903.58 GiB 16002.60 GB)
  Used Dev Size : 3906885120 (3725.90 GiB 4000.65 GB)
   Raid Devices : 6
  Total Devices : 6
    Persistence : Superblock is persistent

    Update Time : Mon Oct  3 10:57:44 2016
          State : clean
 Active Devices : 6
Working Devices : 6
 Failed Devices : 0
  Spare Devices : 0

         Layout : left-symmetric-6
     Chunk Size : 512K

           Name : bibliotheca:0  (local to host bibliotheca)
           UUID : d7c7fad6:fa50ed87:1d51f480:3d6405a5
         Events : 2565608

    Number   Major   Minor   RaidDevice State
       6       8       65        0      active sync   /dev/sde1
       1       8       49        1      active sync   /dev/sdd1
       2       8        1        2      active sync   /dev/sda1
       3       8       33        3      active sync   /dev/sdc1
       5       8       17        4      active sync   /dev/sdb1
       7       8       96        5      active sync   /dev/sdg

$ sudo mdadm -E /dev/sda1
/dev/sda1:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x0
     Array UUID : d7c7fad6:fa50ed87:1d51f480:3d6405a5
           Name : bibliotheca:0  (local to host bibliotheca)
  Creation Time : Wed Jun 26 21:00:29 2013
     Raid Level : raid6
   Raid Devices : 6

 Avail Dev Size : 7813771264 (3725.90 GiB 4000.65 GB)
     Array Size : 15627540480 (14903.58 GiB 16002.60 GB)
  Used Dev Size : 7813770240 (3725.90 GiB 4000.65 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
   Unused Space : before=262064 sectors, after=1024 sectors
          State : clean
    Device UUID : bf1149d3:6289a253:e441df54:9d8a41a8

    Update Time : Mon Oct  3 10:57:44 2016
       Checksum : f578645a - correct
         Events : 2565608

         Layout : left-symmetric-6
     Chunk Size : 512K

   Device Role : Active device 2
   Array State : AAAAAA ('A' == active, '.' == missing, 'R' == replacing)

$ sudo mdadm -E /dev/sdb1
/dev/sdb1:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x0
     Array UUID : d7c7fad6:fa50ed87:1d51f480:3d6405a5
           Name : bibliotheca:0  (local to host bibliotheca)
  Creation Time : Wed Jun 26 21:00:29 2013
     Raid Level : raid6
   Raid Devices : 6

 Avail Dev Size : 7813771264 (3725.90 GiB 4000.65 GB)
     Array Size : 15627540480 (14903.58 GiB 16002.60 GB)
  Used Dev Size : 7813770240 (3725.90 GiB 4000.65 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
   Unused Space : before=262064 sectors, after=1024 sectors
          State : clean
    Device UUID : a3233875:73036f81:b64b4098:e6244f43

    Update Time : Mon Oct  3 10:57:44 2016
       Checksum : a3a2946d - correct
         Events : 2565608

         Layout : left-symmetric-6
     Chunk Size : 512K

   Device Role : Active device 4
   Array State : AAAAAA ('A' == active, '.' == missing, 'R' == replacing)

$ sudo mdadm -E /dev/sdc1
/dev/sdc1:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x0
     Array UUID : d7c7fad6:fa50ed87:1d51f480:3d6405a5
           Name : bibliotheca:0  (local to host bibliotheca)
  Creation Time : Wed Jun 26 21:00:29 2013
     Raid Level : raid6
   Raid Devices : 6

 Avail Dev Size : 7813771264 (3725.90 GiB 4000.65 GB)
     Array Size : 15627540480 (14903.58 GiB 16002.60 GB)
  Used Dev Size : 7813770240 (3725.90 GiB 4000.65 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
   Unused Space : before=262064 sectors, after=1024 sectors
          State : clean
    Device UUID : 9d0277b9:c20b549a:40b6788c:5dbb7cfa

    Update Time : Mon Oct  3 10:57:44 2016
       Checksum : ac2c7cb6 - correct
         Events : 2565608

         Layout : left-symmetric-6
     Chunk Size : 512K

   Device Role : Active device 3
   Array State : AAAAAA ('A' == active, '.' == missing, 'R' == replacing)

$ sudo mdadm -E /dev/sdd1
/dev/sdd1:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x0
     Array UUID : d7c7fad6:fa50ed87:1d51f480:3d6405a5
           Name : bibliotheca:0  (local to host bibliotheca)
  Creation Time : Wed Jun 26 21:00:29 2013
     Raid Level : raid6
   Raid Devices : 6

 Avail Dev Size : 7813771264 (3725.90 GiB 4000.65 GB)
     Array Size : 15627540480 (14903.58 GiB 16002.60 GB)
  Used Dev Size : 7813770240 (3725.90 GiB 4000.65 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
   Unused Space : before=262064 sectors, after=1024 sectors
          State : clean
    Device UUID : b7da88c1:d3d78671:76a4b284:49530d65

    Update Time : Mon Oct  3 10:57:44 2016
       Checksum : ee3ba700 - correct
         Events : 2565608

         Layout : left-symmetric-6
     Chunk Size : 512K

   Device Role : Active device 1
   Array State : AAAAAA ('A' == active, '.' == missing, 'R' == replacing)

$ sudo mdadm -E /dev/sde1
/dev/sde1:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x0
     Array UUID : d7c7fad6:fa50ed87:1d51f480:3d6405a5
           Name : bibliotheca:0  (local to host bibliotheca)
  Creation Time : Wed Jun 26 21:00:29 2013
     Raid Level : raid6
   Raid Devices : 6

 Avail Dev Size : 7813771264 (3725.90 GiB 4000.65 GB)
     Array Size : 15627540480 (14903.58 GiB 16002.60 GB)
  Used Dev Size : 7813770240 (3725.90 GiB 4000.65 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
   Unused Space : before=262056 sectors, after=1024 sectors
          State : clean
    Device UUID : 8d552590:9541d3e9:bf926301:be485b84

    Update Time : Mon Oct  3 10:57:44 2016
  Bad Block Log : 512 entries available at offset 72 sectors
       Checksum : d12b6fa3 - correct
         Events : 2565608

         Layout : left-symmetric-6
     Chunk Size : 512K

   Device Role : Active device 0
   Array State : AAAAAA ('A' == active, '.' == missing, 'R' == replacing)

$ sudo mdadm -E /dev/sdg
/dev/sdg:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x0
     Array UUID : d7c7fad6:fa50ed87:1d51f480:3d6405a5
           Name : bibliotheca:0  (local to host bibliotheca)
  Creation Time : Wed Jun 26 21:00:29 2013
     Raid Level : raid6
   Raid Devices : 6

 Avail Dev Size : 7813775024 (3725.90 GiB 4000.65 GB)
     Array Size : 15627540480 (14903.58 GiB 16002.60 GB)
  Used Dev Size : 7813770240 (3725.90 GiB 4000.65 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
   Unused Space : before=262056 sectors, after=4784 sectors
          State : clean
    Device UUID : 1163b8d4:01cecee1:070f39f0:9562f32b

    Update Time : Mon Oct  3 10:57:44 2016
  Bad Block Log : 512 entries available at offset 72 sectors
       Checksum : a427ae64 - correct
         Events : 2565608

         Layout : left-symmetric-6
     Chunk Size : 512K

   Device Role : Active device 5
   Array State : AAAAAA ('A' == active, '.' == missing, 'R' == replacing)

$ python lsdrv
PCI [ata_piix] 00:1f.2 IDE interface: Intel Corporation 82801IB (ICH9) 2 port SATA Controller [IDE mode] (rev 02)
├scsi 0:0:0:0 ATA      ST4000DM000-1F21
│└sda 3.64t [8:0] Empty/Unknown
│ └sda1 3.64t [8:1] Empty/Unknown
│  └md0 14.55t [9:0] MD v1.2 raid6 (6) clean, 512k Chunk {None}
│   │                Empty/Unknown
│   ├dm-2 10.00t [252:2] Empty/Unknown
│   ├dm-3 4.00t [252:3] Empty/Unknown
│   │└dm-5 4.00t [252:5] Empty/Unknown
│   └dm-4 567.58g [252:4] Empty/Unknown
└scsi 1:0:0:0 ATA      ST4000DM000-1F21
 └sdb 3.64t [8:16] Empty/Unknown
  └sdb1 3.64t [8:17] Empty/Unknown
   └md0 14.55t [9:0] MD v1.2 raid6 (6) clean, 512k Chunk {None}
                     Empty/Unknown
PCI [ata_piix] 00:1f.5 IDE interface: Intel Corporation 82801I (ICH9 Family) 2 port SATA Controller [IDE mode] (rev 02)
├scsi 2:0:0:0 ATA      ST4000DM000-1F21
│└sdc 3.64t [8:32] Empty/Unknown
│ └sdc1 3.64t [8:33] Empty/Unknown
│  └md0 14.55t [9:0] MD v1.2 raid6 (6) clean, 512k Chunk {None}
│                    Empty/Unknown
└scsi 3:0:0:0 ATA      ST4000DM000-1F21
 └sdd 3.64t [8:48] Empty/Unknown
  └sdd1 3.64t [8:49] Empty/Unknown
   └md0 14.55t [9:0] MD v1.2 raid6 (6) clean, 512k Chunk {None}
                     Empty/Unknown
USB [usb-storage] Bus 001 Device 002: ID 0781:5580 SanDisk Corp. SDCZ80 Flash Drive {AA010803151943020344}
└scsi 4:0:0:0 SanDisk  Extreme
 └sdf 29.22g [8:80] Empty/Unknown
  ├sdf1 243.00m [8:81] Empty/Unknown
  │└Mounted as /dev/sdf1 @ /boot
  ├sdf2 1.00k [8:82] Empty/Unknown
  └sdf5 14.60g [8:85] Empty/Unknown
   ├dm-0 12.72g [252:0] Empty/Unknown
   │└Mounted as /dev/mapper/bibliotheca-root @ /
   └dm-1 1.88g [252:1] Empty/Unknown
PCI [ahci] 03:00.0 SATA controller: ASMedia Technology Inc. ASM1062 Serial ATA Controller (rev 02)
├scsi 5:0:0:0 ATA      ST4000DM000-2AE1
│└sde 3.64t [8:64] Empty/Unknown
│ └sde1 3.64t [8:65] Empty/Unknown
│  └md0 14.55t [9:0] MD v1.2 raid6 (6) clean, 512k Chunk {None}
│                    Empty/Unknown
└scsi 6:0:0:0 ATA      ST4000DM000-2AE1
 └sdg 3.64t [8:96] Empty/Unknown
  └md0 14.55t [9:0] MD v1.2 raid6 (6) clean, 512k Chunk {None}
                    Empty/Unknown
PCI [pata_marvell] 04:00.0 IDE interface: Marvell Technology Group Ltd. 88SE6121 SATA II / PATA Controller (rev b2)
├scsi 7:0:0:0 LITE-ON  DVDRW SOHW-832S  {2004062400030158}
│└sr0 1.00g [11:0] Empty/Unknown
└scsi 8:x:x:x [Empty]
Other Block Devices
├loop0 0.00k [7:0] Empty/Unknown
├loop1 0.00k [7:1] Empty/Unknown
├loop2 0.00k [7:2] Empty/Unknown
├loop3 0.00k [7:3] Empty/Unknown
├loop4 0.00k [7:4] Empty/Unknown
├loop5 0.00k [7:5] Empty/Unknown
├loop6 0.00k [7:6] Empty/Unknown
├loop7 0.00k [7:7] Empty/Unknown
├ram0 64.00m [1:0] Empty/Unknown
├ram1 64.00m [1:1] Empty/Unknown
├ram2 64.00m [1:2] Empty/Unknown
├ram3 64.00m [1:3] Empty/Unknown
├ram4 64.00m [1:4] Empty/Unknown
├ram5 64.00m [1:5] Empty/Unknown
├ram6 64.00m [1:6] Empty/Unknown
├ram7 64.00m [1:7] Empty/Unknown
├ram8 64.00m [1:8] Empty/Unknown
├ram9 64.00m [1:9] Empty/Unknown
├ram10 64.00m [1:10] Empty/Unknown
├ram11 64.00m [1:11] Empty/Unknown
├ram12 64.00m [1:12] Empty/Unknown
├ram13 64.00m [1:13] Empty/Unknown
├ram14 64.00m [1:14] Empty/Unknown
└ram15 64.00m [1:15] Empty/Unknown

^ permalink raw reply

* kernel BUG at block/bio.c:1785 while trying to issue a discard to LVM on RAID1 md
From: Sitsofe Wheeler @ 2016-10-03 16:47 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Shaohua Li, linux-raid, linux-block, linux-kernel@vger.kernel.org

Hi,

While trying to do a discard (via blkdiscard --length 1048576
/dev/<pathtodevice>) to an LVM device atop a two disk md RAID1 the
following oops was generated:

[  103.306243] md: resync of RAID array md127
[  103.306246] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
[  103.306248] md: using maximum available idle IO bandwidth (but not
more than 200000 KB/sec) for resync.
[  103.306251] md: using 128k window, over a total of 244194432k.
[  103.308158] ------------[ cut here ]------------
[  103.308205] kernel BUG at block/bio.c:1785!
[  103.308243] invalid opcode: 0000 [#1] SMP
[  103.308279] Modules linked in: vmw_vsock_vmci_transport vsock
sb_edac raid1 edac_core intel_powerclamp coretemp crct10dif_pclmul
crc32_pclmul ghash_clmulni_intel vmw_balloon ppdev intel_rapl_perf
joydev vmxnet3 parport_pc vmw_vmci parport shpchp acpi_cpufreq fjes
tpm_tis tpm i2c_piix4 dm_multipath vmwgfx drm_kms_helper ttm drm
crc32c_intel serio_raw vmw_pvscsi ata_generic pata_acpi
[  103.308641] CPU: 0 PID: 391 Comm: md127_raid1 Not tainted
4.7.5-200.fc24.x86_64 #1
[  103.308699] Hardware name: VMware, Inc. VMware Virtual
Platform/440BX Desktop Reference Platform, BIOS 6.00 09/30/2014
[  103.308784] task: ffff88003beb0000 ti: ffff88000016c000 task.ti:
ffff88000016c000
[  103.308841] RIP: 0010:[<ffffffffa23a4312>]  [<ffffffffa23a4312>]
bio_split+0x82/0x90
[  103.308921] RSP: 0018:ffff88000016fb38  EFLAGS: 00010246
[  103.308972] RAX: 00057fffffffffff RBX: 0000000000000000 RCX: ffff88003f017a80
[  103.309038] RDX: 0000000002400000 RSI: 0000000000000000 RDI: ffff88003bc01500
[  103.309110] RBP: ffff88000016fb50 R08: 0000000000000080 R09: ffff88003bc01500
[  103.310652] R10: ffff88000016fbb0 R11: 0000000000000000 R12: 0000000000000000
[  103.312043] R13: 0000000000000000 R14: 0000000000000002 R15: ffff88003f168900
[  103.313419] FS:  0000000000000000(0000) GS:ffff88003ec00000(0000)
knlGS:0000000000000000
[  103.314815] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  103.315731] CR2: 00007fdd4daeb400 CR3: 000000003b2c5000 CR4: 00000000001406f0
[  103.316328] Stack:
[  103.316879]  0000000000000000 00000000000001fe 0000000000000000
ffff88000016fbf0
[  103.317473]  ffffffffa23b1afd 0000000000000246 0000000002011200
ffff88003f017a80
[  103.318050]  0005800000000000 000000803e8013c0 ffff88000016fc00
ffff88003bc01500
[  103.318626] Call Trace:
[  103.319196]  [<ffffffffa23b1afd>] blk_queue_split+0x2cd/0x620
[  103.319780]  [<ffffffffa23acb83>] blk_queue_bio+0x53/0x3d0
[  103.320378]  [<ffffffffa23ab022>] generic_make_request+0xf2/0x1d0
[  103.320960]  [<ffffffffa23ab176>] submit_bio+0x76/0x160
[  103.321535]  [<ffffffffa23a1693>] submit_bio_wait+0x63/0x90
[  103.322112]  [<ffffffffc058e27a>] raid1d+0x3ea/0xfb0 [raid1]
[  103.322688]  [<ffffffffa27eb3ec>] ? schedule_timeout+0x1ac/0x270
[  103.323268]  [<ffffffffa2649c59>] md_thread+0x139/0x150
[  103.323848]  [<ffffffffa20e46e0>] ? prepare_to_wait_event+0xf0/0xf0
[  103.324417]  [<ffffffffa2649b20>] ? find_pers+0x70/0x70
[  103.324988]  [<ffffffffa20c0588>] kthread+0xd8/0xf0
[  103.325562]  [<ffffffffa27ec77f>] ret_from_fork+0x1f/0x40
[  103.326108]  [<ffffffffa20c04b0>] ? kthread_worker_fn+0x180/0x180
[  103.326654] Code: 44 89 e2 4c 89 ef e8 1e 47 03 00 41 8b 75 28 48
89 df e8 92 d6 ff ff 5b 4c 89 e8 41 5c 41 5d 5d c3 e8 63 fc ff ff 49
89 c5 eb b6 <0f> 0b 0f 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00
48 8b
[  103.328410] RIP  [<ffffffffa23a4312>] bio_split+0x82/0x90
[  103.328943]  RSP <ffff88000016fb38>
[  103.329474] ---[ end trace f093e2f8fabdb9b3 ]---

The kernel is 4.7.5-200.fc24.x86_64 from Fedora 24. While md is stuck
in the PENDING state the oops seems to be reproducible. If md is in
the middle of resyncing the system locks up entirely without printing
anything instead...

The "disks" are raw disk mappings of SSDs on ESXi being passed to a
VM. Here's some initial /sys/block/ output before any discards
are issued:
 # grep . /sys/block/sdc/queue/*
/sys/block/sdc/queue/add_random:0
/sys/block/sdc/queue/discard_granularity:512
/sys/block/sdc/queue/discard_max_bytes:4294966784
/sys/block/sdc/queue/discard_max_hw_bytes:4294966784
/sys/block/sdc/queue/discard_zeroes_data:0
/sys/block/sdc/queue/hw_sector_size:512
/sys/block/sdc/queue/io_poll:0
grep: /sys/block/sdc/queue/iosched: Is a directory
/sys/block/sdc/queue/iostats:1
/sys/block/sdc/queue/logical_block_size:512
/sys/block/sdc/queue/max_hw_sectors_kb:32767
/sys/block/sdc/queue/max_integrity_segments:0
/sys/block/sdc/queue/max_sectors_kb:1280
/sys/block/sdc/queue/max_segments:128
/sys/block/sdc/queue/max_segment_size:65536
/sys/block/sdc/queue/minimum_io_size:512
/sys/block/sdc/queue/nomerges:0
/sys/block/sdc/queue/nr_requests:128
/sys/block/sdc/queue/optimal_io_size:0
/sys/block/sdc/queue/physical_block_size:512
/sys/block/sdc/queue/read_ahead_kb:128
/sys/block/sdc/queue/rotational:0
/sys/block/sdc/queue/rq_affinity:1
/sys/block/sdc/queue/scheduler:[noop] deadline cfq
/sys/block/sdc/queue/write_cache:write through
/sys/block/sdc/queue/write_same_max_bytes:0

-- 
Sitsofe | http://sucs.org/~sits/

^ permalink raw reply

* Linux raid wiki - advice again please
From: Wols Lists @ 2016-10-03 23:42 UTC (permalink / raw)
  To: linux-raid

I want to move the old hardware issues page into the archive section,
copying anything of modern relevance into a new page. Unfortunately, I
don't know enough about the issues involved. I strongly suspect,
however, that things like udev have rendered large chunks of the text
obsolete.

https://raid.wiki.kernel.org/index.php/Hardware_issues

The first bit, "drive selection", is an update I did, which I'll be
moving elsewhere.

The section "SATA Configuration" mentions port multipliers, which I know
nothing about. Is this still relevant?

And the bit about "hot swap" - sata, sas, and sca - feels to me like
it's now obsolete because udev will do all that for you - am I right?

I won't be deleting the page, it'll just be tucked away where no-one
will stumble over it and be misled.

Cheers,
Wol

^ permalink raw reply

* [PATCH] async_pq_val: fix DMA memory leak
From: Justin Maggard @ 2016-10-04 20:17 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-raid, dmaengine, Justin Maggard

Add missing dmaengine_unmap_put(), so we don't OOM during RAID6 sync.

Fixes: 1786b943dad0 ("async_pq_val: convert to dmaengine_unmap_data")
Signed-off-by: Justin Maggard <jmaggard@netgear.com>
---
 crypto/async_tx/async_pq.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index 08b3ac6..f83de99 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -368,8 +368,6 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 
 		dma_set_unmap(tx, unmap);
 		async_tx_submit(chan, tx, submit);
-
-		return tx;
 	} else {
 		struct page *p_src = P(blocks, disks);
 		struct page *q_src = Q(blocks, disks);
@@ -424,9 +422,11 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
 		submit->cb_param = cb_param_orig;
 		submit->flags = flags_orig;
 		async_tx_sync_epilog(submit);
-
-		return NULL;
+		tx = NULL;
 	}
+	dmaengine_unmap_put(unmap);
+
+	return tx;
 }
 EXPORT_SYMBOL_GPL(async_syndrome_val);
 
-- 
2.10.1


^ permalink raw reply related

* Re: [PATCH] async_pq_val: fix DMA memory leak
From: Dan Williams @ 2016-10-04 20:26 UTC (permalink / raw)
  To: Justin Maggard
  Cc: linux-raid, dmaengine@vger.kernel.org, Justin Maggard, Vinod Koul
In-Reply-To: <20161004201758.13876-1-jmaggard@netgear.com>

[ adding Vinod ]

On Tue, Oct 4, 2016 at 1:17 PM, Justin Maggard <jmaggard10@gmail.com> wrote:
> Add missing dmaengine_unmap_put(), so we don't OOM during RAID6 sync.
>
> Fixes: 1786b943dad0 ("async_pq_val: convert to dmaengine_unmap_data")
> Signed-off-by: Justin Maggard <jmaggard@netgear.com>

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@vger.kernel.org>


> ---
>  crypto/async_tx/async_pq.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
> index 08b3ac6..f83de99 100644
> --- a/crypto/async_tx/async_pq.c
> +++ b/crypto/async_tx/async_pq.c
> @@ -368,8 +368,6 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
>
>                 dma_set_unmap(tx, unmap);
>                 async_tx_submit(chan, tx, submit);
> -
> -               return tx;
>         } else {
>                 struct page *p_src = P(blocks, disks);
>                 struct page *q_src = Q(blocks, disks);
> @@ -424,9 +422,11 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
>                 submit->cb_param = cb_param_orig;
>                 submit->flags = flags_orig;
>                 async_tx_sync_epilog(submit);
> -
> -               return NULL;
> +               tx = NULL;
>         }
> +       dmaengine_unmap_put(unmap);
> +
> +       return tx;
>  }
>  EXPORT_SYMBOL_GPL(async_syndrome_val);
>
> --
> 2.10.1
>

^ permalink raw reply

* Re: [PATCH v2 3/6] r5cache: reclaim support
From: Song Liu @ 2016-10-04 21:59 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-raid@vger.kernel.org, neilb@suse.com, Shaohua Li,
	Kernel Team, dan.j.williams@intel.com, hch@infradead.org,
	liuzhengyuang521@gmail.com, liuzhengyuan@kylinos.cn
In-Reply-To: <20160928003438.GC98100@kernel.org>


> On Sep 27, 2016, at 5:34 PM, Shaohua Li <shli@kernel.org> wrote:
> 
>> 
>> +	if (!conf->log)
>> +		return;
>> +	spin_lock(&conf->device_lock);
>> +	if (r5c_total_cached_stripes(conf) > conf->max_nr_stripes * 3 / 4 ||
>> +	    atomic_read(&conf->empty_inactive_list_nr) > 0)
>> +		r5c_flush_cache(conf, R5C_RECLAIM_STRIPE_GROUP);
> 
> I still worry about the max_nr_stripes usage. It can be changed at runtime. If
> there are no enough stripes, should we just allocate more stripes or reclaim
> stripe cache? If memory system tries to shrink stripes (eg, decrease
> max_nr_stripes), will it cause deadlock for r5cache?
> 

The write cache will not deadlock due to stripe cache usage, because a cached
stripe will hold a stripe until it is flushed to the raid disks.


>> +	else if (r5c_total_cached_stripes(conf) >
>> +		 conf->max_nr_stripes * 1 / 2)
>> +		r5c_flush_cache(conf, 1);
> 
> This one is a defensive reclaim. It should always reclaim stripes with full
> data. If there are no enough such stripes, do nothing. Flushing 1 stripe would
> always be wrong unless we are in critical stripe space shortage, as reclaim
> involves disk cache flush and is slow, we should do aggretation as much as
> possible.

I will remove the defensive reclaim. 

> 
>>  *
>> @@ -198,10 +260,9 @@ void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh)
>> {
>> 	struct r5conf *conf = sh->raid_conf;
>> 
>> -	if (!conf->log)
>> +	if (!conf->log || test_bit(STRIPE_R5C_FROZEN, &sh->state))
>> 		return;
>> 
>> -	WARN_ON(test_bit(STRIPE_R5C_FROZEN, &sh->state));
>> 	set_bit(STRIPE_R5C_FROZEN, &sh->state);
> 
> This is confusing. The WARN_ON suggests the STRIPE_R5C_FROZEN isn't set for sh,
> but the change suggests it's possible the bit is set. Which one is correct?
> 

Will fix this. 

>> 
>> @@ -518,6 +583,14 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
>> 	atomic_inc(&io->pending_stripe);
>> 	sh->log_io = io;
>> 
>> +	if (sh->log_start == MaxSector) {
>> +		BUG_ON(!list_empty(&sh->r5c));
>> +		sh->log_start = io->log_start;
>> +		spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
>> +		list_add_tail(&sh->r5c,
>> +			      &log->stripe_in_cache);
>> +		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
>> +	}
> what if it's in writethrogh mode?

This does not impact write through mode. But let me add clear check just in case. 
> 
>> +		last_checkpoint = (list_first_entry(&log->stripe_in_cache,
>> +						    struct stripe_head, r5c))->log_start;
>> +		spin_unlock(&log->stripe_in_cache_lock);
>> +		if (sh->log_start != last_checkpoint) {
>> +			spin_lock(&log->no_space_stripes_lock);
>> +			list_add_tail(&sh->log_list, &log->no_space_stripes);
>> +			spin_unlock(&log->no_space_stripes_lock);
>> +			mutex_unlock(&log->io_mutex);
>> +			return -ENOSPC;
> 
> So if a stripe is in cache, we try to reclaim it. We should have some mechanism
> to guarantee there are enough space for reclaim (eg for parity). Otherwise
> there could be a deadlock because the space allocation in reclaim path is to
> free space. Could you please explain how this is done?
> 

I redefined this part in newer version, which should be clear. 

>> +		} else 	if (!r5l_has_free_space(log, reserve)) {
>> +			WARN(1, "%s: run out of journal space\n", __func__);
>> +			BUG();
> that's scaring, why it happens?
> 

I rewrite some of the code in newer version. But in case something similar happens, it
is a bug with reclaim. 

>> 
>> +		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
>> +		    sh->log_start != last_checkpoint)
>> +			continue;
> what's this check for?

With this check, the code only flushes stripes at last_checkpoint on journal. This is no 
longer needed in newer version. I will remove it. 

> 
>> 		list_del_init(&sh->log_list);
>> +
>> static sector_t r5l_reclaimable_space(struct r5l_log *log)
>> {
>> +	struct r5conf *conf = log->rdev->mddev->private;
>> +
>> 	return r5l_ring_distance(log, log->last_checkpoint,
>> -				 log->next_checkpoint);
>> +				 r5c_calculate_last_cp(conf));
> will this work for writethrouth?

r5c_calculate_last_cp() returns next_checkpoint for write through mode. 
> 
>> }
>> 
>> 
>> 	if (reclaimable == 0)
>> 		return;
>> -
>> 	/*
>> 	 * write_super will flush cache of each raid disk. We must write super
>> 	 * here, because the log area might be reused soon and we don't want to
>> @@ -877,10 +995,7 @@ static void r5l_do_reclaim(struct r5l_log *log)
>> 
>> 	mutex_lock(&log->io_mutex);
>> 	log->last_checkpoint = next_checkpoint;
>> -	log->last_cp_seq = next_cp_seq;
> why not update last_cp_seq?

Currently, we don't need last_cp_seq anywhere in the code. To keep track of
last_cp_seq, we need to add this field to stripe_head. From what I can see, this
seems not necessary.

>> 	mutex_unlock(&log->io_mutex);
>> -
>> -	r5l_run_no_space_stripes(log);
> 
> I don't understand why move r5l_run_no_space_stripes to r5c_flush_cache. It's
> natural we run this after some spaces are reclaimed.

I will move it back. 

>> }
>> 
>> static void r5l_reclaim_thread(struct md_thread *thread)
>> @@ -891,7 +1006,9 @@ static void r5l_reclaim_thread(struct md_thread *thread)
>> 
>> 	if (!log)
>> 		return;
>> +	r5c_do_reclaim(conf);
>> 	r5l_do_reclaim(log);
>> +	md_wakeup_thread(mddev->thread);
> 
> this wakeup is a bit strange. After we reclaim some spaces, we will rerun
> pending stripes, which will wakeup mddev->thread. Do miss some wakeup in other
> reclaim places?

This one is not really necessary. I will remove it. 


>> }
>> 
>> void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
>> 
>> }
>> 
>> -static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
>> +/*
>> + * r5c_flush_cache will move stripe from cached list to handle_list or
>> + * r5c_priority_list
> 
> What's the r5c_priority_list for? If you want to make sure reclaim makes
> progress, I think it's the wrong way. If there are no spaces, handling other
> normal stripes will mean moving them to no_space list and do nothing else. Then
> the reclaim stripes will get the turn to run. There is no extra list required.

I will try to remove it. 

> 
>> 
>> +	BUG_ON(test_bit(STRIPE_R5C_PRIORITY, &sh->state) &&
>> +	       !test_bit(STRIPE_HANDLE, &sh->state));
>> +
>> +	if (test_bit(STRIPE_R5C_PRIORITY, &sh->state))
>> +		return 0;
>> +	if (test_bit(STRIPE_HANDLE, &sh->state) && !priority)
>> +		return 0;
>> +
>> 	r5c_freeze_stripe_for_reclaim(sh);
>> -	atomic_inc(&conf->active_stripes);
>> +	if (!test_and_set_bit(STRIPE_HANDLE, &sh->state)) {
>> +		atomic_inc(&conf->active_stripes);
>> +	}
> 
> shouldn't the stripe is always accounted to active before it's reclaimed? Do we
> decrease the count before the stripe is reclaimed? sounds like a bug.
> 

We decrease active count when the stripe is on r5c_cached_full_stripe or 
r5c_cached_partial stripe. These two lists are variation of inactive_list. 

>> 
>> +
>> +			mutex_unlock(&log->io_mutex);
>> +			return -ENOSPC;
>> 		}
>> +		pr_debug("%s: write sh %lu to free log space\n", __func__, sh->sector);
>> +	}
>> +	if (!r5l_has_free_space(log, reserve)) {
>> +		pr_err("%s: cannot reserve space %d\n", __func__, reserve);
>> +		BUG();
> 
> same here. we should put the stripe into no_space list. If we can't allocate
> space eventually, it indicates reclaim has bug.

BUG() here already indicates bug in reclaim. I will test this thoroughly with new version. 

>> 
>> +				if (before_jiffies > 20)
>> +					pr_debug("%s: wait for sh takes %lu jiffies\n", __func__, before_jiffies);
> please remove the debug code.
> 
> Thanks,
> Shaohua


^ permalink raw reply

* Re: [PATCH] async_pq_val: fix DMA memory leak
From: Vinod Koul @ 2016-10-05  0:49 UTC (permalink / raw)
  To: Justin Maggard; +Cc: dan.j.williams, linux-raid, dmaengine, Justin Maggard
In-Reply-To: <20161004201758.13876-1-jmaggard@netgear.com>

On Tue, Oct 04, 2016 at 01:17:58PM -0700, Justin Maggard wrote:
> Add missing dmaengine_unmap_put(), so we don't OOM during RAID6 sync.

Applied, thanks

-- 
~Vinod

^ permalink raw reply

* Re: [PATCH] Fix bus error when accessing MBR partition records
From: NeilBrown @ 2016-10-05  2:21 UTC (permalink / raw)
  To: James Clarke; +Cc: linux-raid, debian-sparc, Jes Sorensen
In-Reply-To: <CB9B156B-28E1-4EDE-9738-9B03ECC60291@jrtc27.com>

[-- Attachment #1: Type: text/plain, Size: 2349 bytes --]

On Mon, Oct 03 2016, James Clarke wrote:

> Hi Neil,
>
>> On 2 Oct 2016, at 23:32, NeilBrown <neilb@suse.com> wrote:
>> 
>>> On Thu, Sep 29 2016, James Clarke wrote:
>>> 
>>> Since the MBR layout only has partition records as 2-byte aligned, the 32-bit
>>> fields in them are not aligned. Thus, they cannot be accessed on some
>>> architectures (such as SPARC) by using a "struct MBR_part_record *" pointer,
>>> as the compiler can assume that the pointer is properly aligned. Instead, the
>>> records must be accessed by going through the MBR struct itself every time.
>> 
>> Weird....
>> 
>> Can you see if adding "__attribute__((packed))" to struct
>> MBR_part_record also fixes the problem?
>
> That also works. When I wrote the patch initially, I wasn’t sure if it was a
> "correct" fix, but having looked into it more I *believe* it is conformant. The
> alignment of a packed struct is 1-byte, so, while the compiler may know that the
> 32-bit fields are 8-byte aligned within the struct, the pointer to the struct
> need not be aligned, and so the correct conservative code is generated.
>
>> It seems strange that the compiler lets you take a pointer, but then
>> doesn't use it correctly.  Maybe it is an inconsistency in the types.
>
> Yes, the type doesn’t include the provenance of the pointer, so in general the
> compiler can’t know it came from a packed struct (although in this case not much
> static analysis would be needed). See [1] and [2].
>
>> I don't necessarily disagree with your fix, but I'd like to understand
>> why the current code is wrong.
>
> Hopefully the links make it clearer.
>
> Regards,
> James
>
> [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51628
> [2] https://llvm.org/bugs/show_bug.cgi?id=22821

Thanks.

It looks as though the type change I suggested would work, but probably isn't
the best solution.
Your patch is probably safest, though adding the __attribute__((packed))
as well wouldn't hurt.

I'll leave it for Jes to decide what exactly to apply, but I can offer
a

  Reviewed-by: NeilBrown <neilb@suse.com>

for your patch.

BTW I tried compiling mdadm with clang to see if my clang was new enough
to give a warning (it isn't) but it found a few other things to give
errors about ... I should post patches.

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

* Re: RAID6 - CPU At 100% Usage After Reassembly
From: Francisco Parada @ 2016-10-05 11:12 UTC (permalink / raw)
  To: mdraid
In-Reply-To: <CAOW94uvFaiL9FO=k=gOsGzH0q1Qhw4X3n7ZFURj9_eTgGa656g@mail.gmail.com>

Hi all,

Just wanted to send one last ping on this.  I didn't hear back and
really don't know where else to turn to before abandoning hope.
Should I just start wiping the drives so I can start from scratch at
this point?

I rebuilt the kernel with the debug patch that Shaohua provided and I sent
the list the output of the trace.  Does anyone have any other
suggestions?

Thank you once again,
Cisco


On Fri, Sep 30, 2016 at 9:06 AM, Francisco Parada
<advanceandconquer@gmail.com> wrote:
> Hello Shaohua and all,
>
> Was anyone able to take a look at the trace I provided?  Curious if I
> should just give up hope and start from scratch.  Any feedback is
> appreciated.
>
> Thank you,
> Cisco
>
> On Mon, Sep 26, 2016 at 10:29 AM, Francisco Parada
> <advanceandconquer@gmail.com> wrote:
>> Hi all,
>>
>> It doesn't seem like my response from last night, made it to the list:
>>
>>
>> Hi Shaohua and all,
>>
>> I was finally able to upgrade my Ubuntu server to a newer version of
>> the kernel and mdadm:
>> ==========================
>> $ uname -r; mdadm -V
>>
>> 4.8.0-rc7-custom
>>
>> mdadm - v3.4 - 28th January 2016
>> ==========================
>>
>>
>> I rebuilt the kernel with the options that Shaohua asked me to build it with:
>> ======================================
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 5883ef0..db484ca 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -62,6 +62,9 @@
>>  #include "raid0.h"
>>  #include "bitmap.h"
>>
>> +#undef pr_debug
>> +#define pr_debug trace_printk
>> +
>>  #define cpu_to_group(cpu) cpu_to_node(cpu)
>>  #define ANY_GROUP NUMA_NO_NODE
>> ======================================
>>
>>
>> Here's how things look so far, nothing different yet:
>> ======================================
>> $ cat /proc/mdstat
>> Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
>> [raid4] [raid10]
>> md127 : inactive sdd[10](S) sdk[0](S) sdj[2](S) sdh[3](S) sde[11](S)
>> sdg[9](S) sdf[7](S) sdb[13](S) sdc[12](S)
>>       26371219608 blocks super 1.2
>>
>> unused devices: <none>
>> ======================================
>>
>>
>> Here's an event snapshot of my array, just keep in mind that
>> "/dev/sdi" is my failed drive, so I omitted it from the examination:
>> ======================================
>> # mdadm -E /dev/sd[b-h,j,k] |grep Events
>>          Events : 280033
>>          Events : 280033
>>          Events : 280033
>>          Events : 280033
>>          Events : 280033
>>          Events : 280033
>>          Events : 280011
>>          Events : 280033
>>          Events : 280033
>> ======================================
>>
>>
>> It's important to note, that since I haven't done anything yet, my CPU is idle:
>> ======================================
>> top - 20:22:00 up  5:56,  2 users,  load average: 0.04, 0.03, 0.00
>>
>> Tasks: 221 total,   1 running, 220 sleeping,   0 stopped,   0 zombie
>>
>> %Cpu(s):  1.0 us,  1.0 sy,  0.0 ni, 97.5 id,  0.5 wa,  0.0 hi,  0.0 si,  0.0 st
>>
>> KiB Mem :  1525400 total,   103836 free,   696208 used,   725356 buff/cache
>>
>> KiB Swap: 25153532 total, 25117380 free,    36152 used.   454808 avail Mem
>>  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
>>  2093 cisco     20   0 1761112 153108  55640 S   1.0 10.0   0:12.61 gnome-shell
>>  4322 root      20   0   40520   3684   3100 R   1.0  0.2   0:00.22 top
>>     1 root      20   0  119692   5540   3992 S   0.0  0.4   0:02.44 systemd
>>     2 root      20   0       0      0      0 S   0.0  0.0   0:00.00 kthreadd
>>     3 root      20   0       0      0      0 S   0.0  0.0   0:00.09 ksoftirqd/0
>>     5 root       0 -20       0      0      0 S   0.0  0.0   0:00.00 kworker/0:
>> ======================================
>>
>>
>> Now onto the fun part.  I stopped "/dev/md127":
>> ======================================
>> # mdadm --stop /dev/md127
>> mdadm: stopped /dev/md127
>> # cat /proc/mdstat
>> Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
>> [raid4] [raid10]
>> unused devices: <none>
>> ======================================
>>
>>
>> For completion, here's the trace output after stopping the array, and
>> before reassembling:
>> ======================================
>> # tracer: nop
>> #
>> # entries-in-buffer/entries-written: 0/0   #P:2
>> #
>> #                              _-----=> irqs-off
>> #                             / _----=> need-resched
>> #                            | / _---=> hardirq/softirq
>> #                            || / _--=> preempt-depth
>> #                            ||| /     delay
>> #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>> #              | |       |   ||||       |         |
>> /sys/kernel/debug/tracing/trace (END)
>> ======================================
>>
>>
>> Then I reassembled the array:
>> ======================================
>> # mdadm -Afv /dev/md127 /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf
>> /dev/sdg /dev/sdh /dev/sdj /dev/sdk
>> mdadm: looking for devices for /dev/md127
>> mdadm: /dev/sdb is busy - skipping
>> mdadm: Merging with already-assembled /dev/md/en1
>> mdadm: /dev/sdb is identified as a member of /dev/md/en1, slot 7.
>> mdadm: /dev/sdc is identified as a member of /dev/md/en1, slot 8.
>> mdadm: /dev/sdd is identified as a member of /dev/md/en1, slot 6.
>> mdadm: /dev/sde is identified as a member of /dev/md/en1, slot 9.
>> mdadm: /dev/sdf is identified as a member of /dev/md/en1, slot 4.
>> mdadm: /dev/sdg is identified as a member of /dev/md/en1, slot 1.
>> mdadm: /dev/sdh is identified as a member of /dev/md/en1, slot 3.
>> mdadm: /dev/sdj is identified as a member of /dev/md/en1, slot 2.
>> mdadm: /dev/sdk is identified as a member of /dev/md/en1, slot 0.
>> mdadm: Marking array /dev/md/en1 as 'clean'
>> mdadm: /dev/md/en1 has an active reshape - checking if critical
>> section needs to be restored
>> mdadm: No backup metadata on device-7
>> mdadm: No backup metadata on device-8
>> mdadm: No backup metadata on device-9
>> mdadm: added /dev/sdg to /dev/md/en1 as 1
>> mdadm: added /dev/sdj to /dev/md/en1 as 2
>> mdadm: added /dev/sdh to /dev/md/en1 as 3 (possibly out of date)
>> mdadm: added /dev/sdf to /dev/md/en1 as 4
>> mdadm: no uptodate device for slot 5 of /dev/md/en1
>> mdadm: added /dev/sdd to /dev/md/en1 as 6
>> mdadm: /dev/sdb is already in /dev/md/en1 as 7
>> mdadm: added /dev/sdc to /dev/md/en1 as 8
>> mdadm: added /dev/sde to /dev/md/en1 as 9
>> mdadm: added /dev/sdk to /dev/md/en1 as 0
>> ======================================
>>
>>
>> And of course, CPU shoots to 100%:
>> ======================================
>> top - 20:38:44 up  6:13,  3 users,  load average: 5.05, 3.25, 1.41
>> Tasks: 239 total,   3 running, 236 sleeping,   0 stopped,   0 zombie
>> %Cpu(s):  5.9 us, 52.7 sy,  0.0 ni,  0.0 id, 41.4 wa,  0.0 hi,  0.0 si,  0.0 st
>> KiB Mem :  1525400 total,    73124 free,   739576 used,   712700 buff/cache
>> KiB Swap: 25153532 total, 25111140 free,    42392 used.   415840 avail Mem
>>
>>  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
>>  6423 root      20   0       0      0      0 R  99.0  0.0   4:43.51 md127_raid6
>>  1166 root      20   0  280588   8780   6192 S   3.0  0.6   0:06.56 polkitd
>>  4022 cisco     20   0  394756  32884  26064 S   3.0  2.2   0:08.77 gnome-disks
>>  1903 cisco     20   0  256660  34060  26280 S   2.0  2.2   0:29.56 Xorg
>>  2093 cisco     20   0 1760364 153572  55572 S   2.0 10.1   0:17.96 gnome-shell
>> ======================================
>>
>>
>> Then surely the array reshape speed goes back down to nothing:
>> ======================================
>> # cat /proc/mdstat
>> Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
>> [raid4] [raid10]
>> md127 : active raid6 sdk[0] sde[11] sdc[12] sdd[10] sdf[7] sdj[2] sdg[9] sdb[13]
>>       14650675200 blocks super 1.2 level 6, 512k chunk, algorithm 2
>> [10/8] [UUU_U_UUUU]
>>       [=======>.............]  reshape = 39.1% (1146348512/2930135040)
>> finish=22057575.1min speed=1K/sec
>>       bitmap: 0/22 pages [0KB], 65536KB chunk
>>
>> unused devices: <none>
>> ======================================
>>
>>
>> The size of the trace file is gigantic, so hopefully it doesn't get
>> trimmed in the email, but any help would be appreciated, thanks in
>> advance:
>> ================
>> # tracer: nop
>> #
>> # entries-in-buffer/entries-written: 44739/81554230   #P:2
>> #
>> #                              _-----=> irqs-off
>> #                             / _----=> need-resched
>> #                            | / _---=> hardirq/softirq
>> #                            || / _--=> preempt-depth
>> #                            ||| /     delay
>> #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>> #              | |       |   ||||       |         |
>>      md127_raid6-6423  [001] .... 22228.159918: analyse_stripe: check
>> 7: state 0x13 read           (null) write           (null) written
>>       (null)
>>      md127_raid6-6423  [001] .... 22228.159919: analyse_stripe: check
>> 6: state 0xa01 read           (null) write           (null) written
>>        (null)
>>      md127_raid6-6423  [001] .... 22228.159919: analyse_stripe: check
>> 5: state 0x801 read           (null) write           (null) written
>>        (null)
>>      md127_raid6-6423  [001] .... 22228.159920: analyse_stripe: check
>> 4: state 0x811 read           (null) write           (null) written
>>        (null)
>>      md127_raid6-6423  [001] .... 22228.159921: analyse_stripe: check
>> 3: state 0x801 read           (null) write           (null) written
>>        (null)
>>      md127_raid6-6423  [001] .... 22228.159921: analyse_stripe: check
>> 2: state 0x811 read           (null) write           (null) written
>>        (null)
>>      md127_raid6-6423  [001] .... 22228.159922: analyse_stripe: check
>> 1: state 0xa01 read           (null) write           (null) written
>>        (null)
>>      md127_raid6-6423  [001] .... 22228.159923: analyse_stripe: check
>> 0: state 0x811 read           (null) write           (null) written
>>        (null)
>>      md127_raid6-6423  [001] .... 22228.159924: handle_stripe:
>> locked=2 uptodate=10 to_read=0 to_write=0 failed=4 failed_num=6,5
>>      md127_raid6-6423  [001] .... 22228.159925:
>> schedule_reconstruction: schedule_reconstruction: stripe 2292697672
>> locked: 4 ops_request: 10
>>      md127_raid6-6423  [001] .... 22228.159925: raid_run_ops:
>> ops_run_reconstruct6: stripe 2292697672
>>      md127_raid6-6423  [001] .... 22228.159943:
>> ops_complete_reconstruct: ops_complete_reconstruct: stripe 2292697672
>>      md127_raid6-6423  [001] .... 22228.159944: handle_stripe:
>> handling stripe 2292697680, state=0x1401 cnt=1, pd_idx=7, qd_idx=8
>> , check:0, reconstruct:6
>> ===========================================
>>
>> I trimmed it because of the failure to send issue.  However, if
>> someone needs a lengthier snip of the trace, let me know.
>>
>> Thanks,
>>
>> 'Cisco

^ permalink raw reply

* md-cluster - Assemble/Scan During Resync
From: Marc Smith @ 2016-10-05 20:43 UTC (permalink / raw)
  To: linux-raid

Hi,

First, I believe this issue may have been reported/solved with this
thread ("[PATCH 3/3] MD: hold mddev lock for md-cluster receive
thread"):
http://www.spinics.net/lists/raid/msg53121.html

But I'm not totally sure, and I'm looking for confirmation, or maybe
this is a new one... I'm trying to hold out for Linux 4.9 in my
project, and I am hoping to just cherry pick any patches until then.

Testing md-cluster with Linux 4.5.2 (yes, I know it's dated)... two
nodes connected to shared SAS storage, and I'm using DM Multipath in
front of the individual SAS disks (two I/O modules with dual-domain
SAS disks).

On tgtnode2 I created the array like this: mdadm --create --verbose
--run /dev/md/test4 --name=test4 --level=raid1 --raid-devices=2
--chunk=64 --bitmap=clustered /dev/dm-4 /dev/dm-5

And then, without waiting for the resync to complete, on the second
node (tgtnode1) I do this: mdadm --assemble --scan

Then I end up with this on tgtnode1:
--snip--
Oct  5 16:02:26 tgtnode1 kernel: [687524.358611] BUG: unable to handle
kernel NULL pointer dereference at 0000000000000098
Oct  5 16:02:26 tgtnode1 kernel: [687524.358637] IP:
[<ffffffff8182434a>] recv_daemon+0x104/0x366
Oct  5 16:02:26 tgtnode1 kernel: [687524.358660] PGD 0
Oct  5 16:02:26 tgtnode1 kernel: [687524.358669] Oops: 0000 [#1] SMP
Oct  5 16:02:26 tgtnode1 kernel: [687524.358683] Modules linked in:
fcst(O) scst_changer(O) scst_tape(O) scst_vdisk(O) scst_disk(O)
ib_srpt(O) iscsi_scst(O) qla2x00tgt(O) scst(O) qla2xxx bonding
mlx5_core bna ib_umad rdma_ucm ib_uverbs ib_srp iw_nes iw_cxgb4 cxgb4
iw_cxgb3 ib_qib mlx4_ib ib_mthca [last unloaded: scst]
Oct  5 16:02:26 tgtnode1 kernel: [687524.358791] CPU: 8 PID: 4840
Comm: md127_cluster_r Tainted: G           O    4.5.2-esos.prod #1
Oct  5 16:02:26 tgtnode1 kernel: [687524.358809] Hardware name: Dell
Inc. PowerEdge R710/00NH4P, BIOS 6.4.0 07/23/2013
Oct  5 16:02:26 tgtnode1 kernel: [687524.359038] task:
ffff880618991600 ti: ffff8806198a0000 task.ti: ffff8806198a0000
Oct  5 16:02:26 tgtnode1 kernel: [687524.359271] RIP:
0010:[<ffffffff8182434a>]  [<ffffffff8182434a>]
recv_daemon+0x104/0x366
Oct  5 16:02:26 tgtnode1 kernel: [687524.359515] RSP:
0018:ffff8806198a3df8  EFLAGS: 00010286
Oct  5 16:02:26 tgtnode1 kernel: [687524.359639] RAX: 0000000000000000
RBX: ffff8806189ce000 RCX: 00000000004cd980
Oct  5 16:02:26 tgtnode1 kernel: [687524.359885] RDX: 00000000004dd980
RSI: 0000000000000001 RDI: ffff8806189ce000
Oct  5 16:02:26 tgtnode1 kernel: [687524.360124] RBP: ffff88031a5ce700
R08: 0000000000016ec0 R09: ffff88061e85dfc0
Oct  5 16:02:26 tgtnode1 kernel: [687524.360367] R10: ffffffff8182431d
R11: 0000000000000002 R12: ffff88061e85dfc0
Oct  5 16:02:26 tgtnode1 kernel: [687524.360600] R13: ffff8800aeb60480
R14: 0000000000000000 R15: ffff8800aeb60b80
Oct  5 16:02:26 tgtnode1 kernel: [687524.360827] FS:
0000000000000000(0000) GS:ffff88062fc80000(0000)
knlGS:0000000000000000
Oct  5 16:02:26 tgtnode1 kernel: [687524.361059] CS:  0010 DS: 0000
ES: 0000 CR0: 000000008005003b
Oct  5 16:02:26 tgtnode1 kernel: [687524.361184] CR2: 0000000000000098
CR3: 0000000002012000 CR4: 00000000000006e0
Oct  5 16:02:26 tgtnode1 kernel: [687524.361422] Stack:
Oct  5 16:02:26 tgtnode1 kernel: [687524.361535]  ffff88031a5ce730
00000000004dd980 00000000004cd980 0000000000000001
Oct  5 16:02:26 tgtnode1 kernel: [687524.361771]  00000000004cd980
00000000004dd980 0000000000000000 0000000000000000
Oct  5 16:02:26 tgtnode1 kernel: [687524.362007]  0000000000000000
0000000093f3fcfe ffff88061efde3c0 7fffffffffffffff
Oct  5 16:02:26 tgtnode1 kernel: [687524.362251] Call Trace:
Oct  5 16:02:26 tgtnode1 kernel: [687524.362369]  [<ffffffff8183df32>]
? md_thread+0x112/0x128
Oct  5 16:02:26 tgtnode1 kernel: [687524.362491]  [<ffffffff8108b4d6>]
? wait_woken+0x69/0x69
Oct  5 16:02:26 tgtnode1 kernel: [687524.362611]  [<ffffffff8183de20>]
? md_wait_for_blocked_rdev+0x102/0x102
Oct  5 16:02:26 tgtnode1 kernel: [687524.362736]  [<ffffffff81077eb1>]
? kthread+0xc1/0xc9
Oct  5 16:02:26 tgtnode1 kernel: [687524.362855]  [<ffffffff81077df0>]
? kthread_create_on_node+0x163/0x163
Oct  5 16:02:26 tgtnode1 kernel: [687524.362979]  [<ffffffff81a3111f>]
? ret_from_fork+0x3f/0x70
Oct  5 16:02:26 tgtnode1 kernel: [687524.363099]  [<ffffffff81077df0>]
? kthread_create_on_node+0x163/0x163
Oct  5 16:02:26 tgtnode1 kernel: [687524.363223] Code: c0 49 89 c4 0f
84 86 00 00 00 48 8b 54 24 08 48 8b 4c 24 10 48 89 df 44 89 30 be 01
00 00 00 48 89 48 08 48 89 50 10 48 8b 43 08 <ff> 90 98 00 00 00 48 8b
43 08 31 f6 48 89 df ff 90 98 00 00 00
Oct  5 16:02:26 tgtnode1 kernel: [687524.363707] RIP
[<ffffffff8182434a>] recv_daemon+0x104/0x366
Oct  5 16:02:26 tgtnode1 kernel: [687524.363832]  RSP <ffff8806198a3df8>
Oct  5 16:02:26 tgtnode1 kernel: [687524.363952] CR2: 0000000000000098
Oct  5 16:02:26 tgtnode1 kernel: [687524.364395] ---[ end trace
18dcff928d33f203 ]---
Oct  5 16:02:27 tgtnode1 kernel: [687525.358844]
gather_all_resync_info:700 Resync[5036416..5101952] in progress on 0
Oct  5 16:02:27 tgtnode1 kernel: [687525.758862] bitmap_read_sb:587 bm
slot: 2 offset: 24
Oct  5 16:02:27 tgtnode1 kernel: [687525.759203] created bitmap (1
pages) for device md127
Oct  5 16:02:27 tgtnode1 kernel: [687525.759536] md127: bitmap
initialized from disk: read 1 pages, set 0 of 1093 bits
Oct  5 16:02:27 tgtnode1 kernel: [687525.759990] bitmap_read_sb:587 bm
slot: 3 offset: 32
Oct  5 16:02:27 tgtnode1 kernel: [687525.760335] created bitmap (1
pages) for device md127
Oct  5 16:02:27 tgtnode1 kernel: [687525.760650] md127: bitmap
initialized from disk: read 1 pages, set 0 of 1093 bits
Oct  5 16:02:27 tgtnode1 kernel: [687525.761137] bitmap_read_sb:587 bm
slot: 1 offset: 16
Oct  5 16:02:27 tgtnode1 kernel: [687525.761459] created bitmap (1
pages) for device md127
Oct  5 16:02:27 tgtnode1 kernel: [687525.761793] md127: bitmap
initialized from disk: read 1 pages, set 0 of 1093 bits
Oct  5 16:03:22 tgtnode1 kernel: <28>[687580.180227] udevd[482]:
worker [4803] /devices/virtual/block/dm-5 is taking a long time
Oct  5 16:03:22 tgtnode1 kernel: <28>[687580.180515] udevd[482]:
worker [4804] /devices/virtual/block/dm-4 is taking a long time
--snip--

And it appears the resync task hangs then and makes no more progress...

On tgtnode2:
# cat /proc/mdstat
Personalities : [linear] [raid0] [raid1] [raid10] [raid6] [raid5] [raid4]
md127 : active raid1 dm-5[1] dm-4[0]
      71621824 blocks super 1.2 [2/2] [UU]
      [>....................]  resync =  3.5% (2518208/71621824)
finish=212.1min speed=5427K/sec
      bitmap: 1/1 pages [4KB], 65536KB chunk

unused devices: <none>


On tgtnode1:
# cat /proc/mdstat
Personalities : [linear] [raid0] [raid1] [raid10] [raid6] [raid5] [raid4]
md127 : active raid1 dm-4[0] dm-5[1]
      71621824 blocks super 1.2 [2/2] [UU]
        resync=PENDING
      bitmap: 0/1 pages [0KB], 65536KB chunk

unused devices: <none>


So, again, this may already be fixed, just looking for confirmation if
the aforementioned patch / thread is related to this bug (or maybe
another).

I appreciate your time.


--Marc

^ permalink raw reply

* Re: kernel BUG at block/bio.c:1785 while trying to issue a discard to LVM on RAID1 md
From: Sitsofe Wheeler @ 2016-10-05 21:31 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Jens Axboe, linux-raid, linux-block, linux-kernel@vger.kernel.org
In-Reply-To: <CALjAwxgg-m2bgFf10hLB3j=2ntVxqt0vDSKuQyOavqvKXC2G5Q@mail.gmail.com>

On 3 October 2016 at 17:47, Sitsofe Wheeler <sitsofe@gmail.com> wrote:
>
> While trying to do a discard (via blkdiscard --length 1048576
> /dev/<pathtodevice>) to an LVM device atop a two disk md RAID1 the
> following oops was generated:
>
> [  103.306243] md: resync of RAID array md127
> [  103.306246] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
> [  103.306248] md: using maximum available idle IO bandwidth (but not
> more than 200000 KB/sec) for resync.
> [  103.306251] md: using 128k window, over a total of 244194432k.
> [  103.308158] ------------[ cut here ]------------
> [  103.308205] kernel BUG at block/bio.c:1785!

This still seems to be here but slightly modified with a 4.8.0 kernel:

[  156.501373] ------------[ cut here ]------------
[  156.502374] kernel BUG at block/bio.c:1785!
[  156.503139] invalid opcode: 0000 [#1] SMP
[  156.503816] Modules linked in: vmw_vsock_vmci_transport vsock
sb_edac edac_core intel_powerclamp coretemp raid1 crct10dif_pclmul
crc32_pclmul ghash_clmulni_intel ppdev intel_rapl_perf vmxnet3
vmw_balloon pcspkr joydev acpi_cpufreq tpm_tis i2c_piix4 tpm_tis_core
vmw_vmci parport_pc fjes tpm shpchp parport dm_multipath vmwgfx
drm_kms_helper ttm drm crc32c_intel serio_raw vmw_pvscsi ata_generic
pata_acpi
[  156.510208] CPU: 0 PID: 407 Comm: md127_raid1 Not tainted
4.8.0-1.vanilla.knurd.1.fc24.x86_64 #1
[  156.511559] Hardware name: VMware, Inc. VMware Virtual
Platform/440BX Desktop Reference Platform, BIOS 6.00 09/30/2014
[  156.513209] task: ffff9944f9ed9d80 task.stack: ffff9944f903c000
[  156.514159] RIP: 0010:[<ffffffffbc3aa71a>]  [<ffffffffbc3aa71a>]
bio_split+0x8a/0x90
[  156.515471] RSP: 0018:ffff9944f903fb38  EFLAGS: 00010246
[  156.516323] RAX: 00000000fffdd1f3 RBX: 0000000000000000 RCX: ffff9944ff05cf00
[  156.517456] RDX: 0000000002400000 RSI: 0000000000000000 RDI: ffff9944f830b700
[  156.518573] RBP: ffff9944f903fb50 R08: 0000000000000001 R09: ffff9944f830b700
[  156.519680] R10: ffff9944f903fbb0 R11: 0000000000000000 R12: 0000000000000000
[  156.520807] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9944ff1388e0
[  156.521916] FS:  0000000000000000(0000) GS:ffff9944fec00000(0000)
knlGS:0000000000000000
[  156.523187] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  156.524080] CR2: 0000561c09a99108 CR3: 000000003ab2e000 CR4: 00000000001406f0
[  156.525228] Stack:
[  156.525571]  0000000000000000 0000000000000200 0000000000000000
ffff9944f903fbf0
[  156.526812]  ffffffffbc3b7f0e ffff9944f8928000 ffff9944fec19580
ffff9944ff05cf00
[  156.528072]  0005803fff747000 ffff9944f903fc00 00000001f8928080
ffff9944f830b700
[  156.529308] Call Trace:
[  156.529706]  [<ffffffffbc3b7f0e>] blk_queue_split+0x18e/0x640
[  156.530609]  [<ffffffffbc3b2fe3>] blk_queue_bio+0x53/0x3c0
[  156.531487]  [<ffffffffbc3b1482>] generic_make_request+0xf2/0x1d0
[  156.532436]  [<ffffffffbc3b15dd>] submit_bio+0x7d/0x150
[  156.533272]  [<ffffffffbc3aa47e>] ? bio_clone_bioset+0x14e/0x360
[  156.534211]  [<ffffffffbc3a7a0d>] submit_bio_wait+0x5d/0x90
[  156.535108]  [<ffffffffc051a3b5>] raid1d+0x405/0x1010 [raid1]
[  156.536040]  [<ffffffffbc0d4d49>] ? set_next_entity+0x49/0xd0
[  156.536938]  [<ffffffffbc64ffe9>] md_thread+0x139/0x150
[  156.537766]  [<ffffffffbc0e39b0>] ? prepare_to_wait_event+0xf0/0xf0
[  156.538758]  [<ffffffffbc64feb0>] ? find_pers+0x70/0x70
[  156.539578]  [<ffffffffbc0c0328>] kthread+0xd8/0xf0
[  156.540355]  [<ffffffffbc7fd43f>] ret_from_fork+0x1f/0x40
[  156.541208]  [<ffffffffbc0c0250>] ? kthread_worker_fn+0x180/0x180
[  156.542186] Code: 44 89 e2 4c 89 ef e8 d6 51 03 00 41 8b 75 28 48
89 df e8 3a d3 ff ff 5b 4c 89 e8 41 5c 41 5d 5d c3 e8 1b fc ff ff 49
89 c5 eb b6 <0f> 0b 0f 0b 66 90 0f 1f 44 00 00 48 8b 07 55 48 89 e5 48
85 c0
[  156.546655] RIP  [<ffffffffbc3aa71a>] bio_split+0x8a/0x90
[  156.547535]  RSP <ffff9944f903fb38>
[  156.548127] ---[ end trace bdaed0bbc089e451 ]---
[  156.549888] BUG: unable to handle kernel paging request at 000000004ebddcbe
[  156.551080] IP: [<ffffffffbc0e323b>] __wake_up_common+0x2b/0x80
[  156.552036] PGD 0
[  156.552430] Oops: 0000 [#2] SMP
[  156.552958] Modules linked in: vmw_vsock_vmci_transport vsock
sb_edac edac_core intel_powerclamp coretemp raid1 crct10dif_pclmul
crc32_pclmul ghash_clmulni_intel ppdev intel_rapl_perf vmxnet3
vmw_balloon pcspkr joydev acpi_cpufreq tpm_tis i2c_piix4 tpm_tis_core
vmw_vmci parport_pc fjes tpm shpchp parport dm_multipath vmwgfx
drm_kms_helper ttm drm crc32c_intel serio_raw vmw_pvscsi ata_generic
pata_acpi
[  156.559295] CPU: 0 PID: 407 Comm: md127_raid1 Tainted: G      D
    4.8.0-1.vanilla.knurd.1.fc24.x86_64 #1
[  156.560844] Hardware name: VMware, Inc. VMware Virtual
Platform/440BX Desktop Reference Platform, BIOS 6.00 09/30/2014
[  156.562491] task: ffff9944f9ed9d80 task.stack: ffff9944f903c000
[  156.563418] RIP: 0010:[<ffffffffbc0e323b>]  [<ffffffffbc0e323b>]
__wake_up_common+0x2b/0x80
[  156.564740] RSP: 0018:ffff9944f903fe38  EFLAGS: 00010096
[  156.565588] RAX: 0000000000000282 RBX: ffff9944f903ff10 RCX: 0000000000000000
[  156.566703] RDX: 000000004ebddcbe RSI: 0000000000000003 RDI: ffff9944f903ff10
[  156.567819] RBP: ffff9944f903fe70 R08: 0000000000000000 R09: 0000000000000000
[  156.568924] R10: 0000000000000000 R11: 00000000fa83b2da R12: ffff9944f903ff18
[  156.570038] R13: 0000000000000282 R14: 0000000000000001 R15: 0000000000000003
[  156.571158] FS:  0000000000000000(0000) GS:ffff9944fec00000(0000)
knlGS:0000000000000000
[  156.572414] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  156.573303] CR2: 000000004ebddcbe CR3: 00000000388ca000 CR4: 00000000001406f0
[  156.574498] Stack:
[  156.574829]  00000001f8920000 0000000000000000 ffff9944f903ff10
ffff9944f903ff08
[  156.576098]  0000000000000282 0000000000000001 0000000000000000
ffff9944f903fe80
[  156.577333]  ffffffffbc0e32f3 ffff9944f903fea8 ffffffffbc0e3e57
0000000000000000
[  156.578567] Call Trace:
[  156.578962]  [<ffffffffbc0e32f3>] __wake_up_locked+0x13/0x20
[  156.579904]  [<ffffffffbc0e3e57>] complete+0x37/0x50
[  156.580699]  [<ffffffffbc09e13c>] mm_release+0xbc/0x140
[  156.581513]  [<ffffffffbc0a47f5>] do_exit+0x155/0xb10
[  156.582331]  [<ffffffffbc7feac7>] rewind_stack_do_exit+0x17/0x20
[  156.583262]  [<ffffffffbc0c0250>] ? kthread_worker_fn+0x180/0x180
[  156.584230] Code: 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 41
54 4c 8d 67 08 53 41 89 f7 48 83 ec 10 89 55 cc 48 8b 57 08 4c 89 45
d0 49 39 d4 <48> 8b 32 74 40 48 8d 42 e8 4c 8d 6e e8 41 89 ce 8b 18 48
8b 4d
[  156.588679] RIP  [<ffffffffbc0e323b>] __wake_up_common+0x2b/0x80
[  156.589646]  RSP <ffff9944f903fe38>
[  156.590203] CR2: 000000004ebddcbe
[  156.590754] ---[ end trace bdaed0bbc089e452 ]---
[  156.591478] ------------[ cut here ]------------
[  156.592235] WARNING: CPU: 0 PID: 407 at kernel/exit.c:737 do_exit+0x62/0xb10

The difference is the last oops seems to be endlessly repeated over
and over again essentially making it feel like the system has locked
up. Partner issue is over on
http://www.gossamer-threads.com/lists/linux/kernel/2538757?do=post_view_threaded#2538757
.

-- 
Sitsofe | http://sucs.org/~sits/

^ permalink raw reply

* Re: kernel BUG at block/bio.c:1785 while trying to issue a discard to LVM on RAID1 md
From: Shaohua Li @ 2016-10-05 21:39 UTC (permalink / raw)
  To: Sitsofe Wheeler
  Cc: Jens Axboe, linux-raid, linux-block, linux-kernel@vger.kernel.org
In-Reply-To: <CALjAwxhYh0JDKh7E_t9sm-vBBPX-3dzL5hDMi4rA6uE2RqaJoQ@mail.gmail.com>

On Wed, Oct 05, 2016 at 10:31:11PM +0100, Sitsofe Wheeler wrote:
> On 3 October 2016 at 17:47, Sitsofe Wheeler <sitsofe@gmail.com> wrote:
> >
> > While trying to do a discard (via blkdiscard --length 1048576
> > /dev/<pathtodevice>) to an LVM device atop a two disk md RAID1 the
> > following oops was generated:
> >
> > [  103.306243] md: resync of RAID array md127
> > [  103.306246] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
> > [  103.306248] md: using maximum available idle IO bandwidth (but not
> > more than 200000 KB/sec) for resync.
> > [  103.306251] md: using 128k window, over a total of 244194432k.
> > [  103.308158] ------------[ cut here ]------------
> > [  103.308205] kernel BUG at block/bio.c:1785!
> 
> This still seems to be here but slightly modified with a 4.8.0 kernel:

Does this fix the issue? It looks like there is an IO error


diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 21dc00e..349eb11 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2196,7 +2196,6 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 			wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
 		}
 
-		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
 		wbio->bi_iter.bi_sector = r1_bio->sector;
 		wbio->bi_iter.bi_size = r1_bio->sectors << 9;
 

^ permalink raw reply related

* Re: Prefetch in /lib/raid6/avx2.c
From: Shaohua Li @ 2016-10-05 23:17 UTC (permalink / raw)
  To: Doug Dumitru
  Cc: linux-raid, gayatri.kammela, ravi.v.shankar, hpa, yu-cheng.yu,
	yuanhan.liu
In-Reply-To: <CAFx4rwS5-TCWKxRYpXHeRsfTiJ=mTV0gxoL-yUuqoEbpXst08A@mail.gmail.com>

On Sun, Oct 02, 2016 at 03:40:09PM -0700, Doug Dumitru wrote:
> I have been doing some high bandwidth testing of raid-6, and the
> prefetch in raid6_avx24_gen_syndrome appears to be less than optimal.
> 
> This is my patch (against 4.4.0-38 [Ubuntu 16.04 LTS])
> 
> --- cut here ---
> --- lib/raid6/avx2.c0   2016-10-01 21:42:25.280347868 -0700
> +++ lib/raid6/avx2.c    2016-10-02 15:35:48.168480760 -0700
> @@ -189,10 +189,8 @@
> 
>                 for (z = z0; z >= 0; z--) {
> 
> -                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
> -                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
> -                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
> -                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
> +                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+128]));
> +                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+192]));
> 
>                         asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
>                         asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
> --- cut here ---
> 
> In perf, the cpu cycles goes from 5.3% to 3.0% for
> raid6_avx24_gen_syndrome in my test and throughput increases from
> about 8.2GB/sec to almost 10GB/sec.  It is a very "synthetic" test,
> but the avx2 code does seem to be a factor.
> 
> I suspect other SSE and AVX "unroll variants" have similar issues, but
> I have not tested those.
> 
> My test system is an E5-1650 v3 (single socket) with DDR4.  This might
> help dual sockets even more.

CC some intel folks to see if they have ideas

^ permalink raw reply

* Re: Bug#837964: 95a05b3 broke mdadm --add on my superblock 1.0 array
From: NeilBrown @ 2016-10-06  1:26 UTC (permalink / raw)
  To: Guoqing Jiang, 837964, Anthony DeRobertis, linux-raid
In-Reply-To: <57E34496.8020108@suse.com>

[-- Attachment #1: Type: text/plain, Size: 4054 bytes --]

On Thu, Sep 22 2016, Guoqing Jiang wrote:

> On 09/21/2016 02:45 AM, Guoqing Jiang wrote:
>>
>>
>> On 09/20/2016 02:31 PM, Anthony DeRobertis wrote:
>>> Sorry for the amount of emails I'm sending, but I noticed something 
>>> that's probably important. I'm also appending some gdb log from 
>>> tracing through the function (trying to answer why it's doing cluster 
>>> mode stuff at all).
>>>
>>> While tracing through, I noticed that *before* the write-bitmap loop, 
>>> mdadm -E considers the superblock valid. That agrees with what I saw 
>>> from strace, I suppose. To my first glance, it figures out how much 
>>> to write by calling this function:
>>>
>>> static unsigned int calc_bitmap_size(bitmap_super_t *bms, unsigned 
>>> int boundary)
>>> {
>>>     unsigned long long bits, bytes;
>>>
>>>     bits = __le64_to_cpu(bms->sync_size) / 
>>> (__le32_to_cpu(bms->chunksize)>>9);
>>>     bytes = (bits+7) >> 3;
>>>     bytes += sizeof(bitmap_super_t);
>>>     bytes = ROUND_UP(bytes, boundary);
>>>
>>>     return bytes;
>>> }
>>>
>>> That code looked familiar, and I figured out where—it's also in 
>>> 95a05b37e8eb2bc0803b1a0298fce6adc60eff16, the commit that I found 
>>> originally broke it. But that commit is making a change to it: it 
>>> changed the ROUND_UP line from 512 to 4096 (and from the gdb trace, 
>>> boundary==4096).
>>>
>>> I tested changing that line to "bytes = ROUND_UP(bytes, 512);", and 
>>> it works. Adds the new disk to the array and produces no warnings or 
>>> errors.
>>
>> I think it is a coincidence that the above change works,  4a3d29e 
>> commit made
>> the change but it didn't change the logic at all.
>
> Hmm, seems bitmap is aligned to 512 in previous mdadm, but with commit 
> 95a05b3
> we made it aligned to 4k, so it causes the latest mdadm can't work with 
> previous
> created array.
>
> Does the below change work? Thanks.
>
> diff --git a/super1.c b/super1.c
> index 9f62d23..6a0b075 100644
> --- a/super1.c
> +++ b/super1.c
> @@ -2433,7 +2433,10 @@ static int write_bitmap1(struct supertype *st, 
> int fd, enum bitmap_update update
>                          memset(buf, 0xff, 4096);
>                  memcpy(buf, (char *)bms, sizeof(bitmap_super_t));
>
> -               towrite = calc_bitmap_size(bms, 4096);
> +               if (__le32_to_cpu(bms->nodes) == 0)
> +                       towrite = calc_bitmap_size(bms, 512);
> +               else
> +                       towrite = calc_bitmap_size(bms, 4096);
>                  while (towrite > 0) {

(sorry for the late reply ... travel, jetlag, ....)

I think a better, simpler, fix is:

> -               towrite = calc_bitmap_size(bms, 4096);
> +               towrite = calc_bitmap_size(bms, 512);

The only reason that we are rounding up here is that we are using
O_DIRECT writes and they require 512-byte alignment.

Any bytes beyond the end of the actual bitmap will be ignored, so it
doesn't matter whether they are written or not.

Current mdadm always aligns bitmaps on a 4K boundary, but older versions
of mdadm didn't.  If the bitmap was less than 4K before the superblock
(quite possible), writing 4K for bitmap would corrupt the superblock.
This can certainly happen with 1.0 metadata.

However ... the reason that everything is now 4K aligned is that some
drives use a 4K block size.  For those, we really should be doing 4K
writes, not 512-byte writes.

So it would make sense to round up to 4K sometimes, and use 512 at other
times.  However the correct test isn't whether cluster-raid is in use.

The metadata has always been aligned on a 4K boundary.
If data_offset and bblog_offset and bitmap_offset all have 4K alignment,
then rounding up to 4K for the bitmap writes would be correct.
If any of them has a smaller alignment, then it isn't necessary and so
should be avoided.

So the best fix would be to test those 3 offsets, and round up to a
multiple of 4096 only if all of them are on a 4K boundary.

NeilBrown


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

* Re: kernel BUG at block/bio.c:1785 while trying to issue a discard to LVM on RAID1 md
From: Sitsofe Wheeler @ 2016-10-06  6:57 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Jens Axboe, linux-raid, linux-block, linux-kernel@vger.kernel.org
In-Reply-To: <20161005213945.GA84123@kernel.org>

On 5 October 2016 at 22:39, Shaohua Li <shli@kernel.org> wrote:
> On Wed, Oct 05, 2016 at 10:31:11PM +0100, Sitsofe Wheeler wrote:
>> On 3 October 2016 at 17:47, Sitsofe Wheeler <sitsofe@gmail.com> wrote:
>> >
>> > While trying to do a discard (via blkdiscard --length 1048576
>> > /dev/<pathtodevice>) to an LVM device atop a two disk md RAID1 the
>> > following oops was generated:
>> >
>> > [  103.306243] md: resync of RAID array md127
>> > [  103.306246] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
>> > [  103.306248] md: using maximum available idle IO bandwidth (but not
>> > more than 200000 KB/sec) for resync.
>> > [  103.306251] md: using 128k window, over a total of 244194432k.
>> > [  103.308158] ------------[ cut here ]------------
>> > [  103.308205] kernel BUG at block/bio.c:1785!
>>
>> This still seems to be here but slightly modified with a 4.8.0 kernel:
>
> Does this fix the issue? Looks there is IO error
>
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 21dc00e..349eb11 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -2196,7 +2196,6 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
>                         wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
>                 }
>
> -               bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
>                 wbio->bi_iter.bi_sector = r1_bio->sector;
>                 wbio->bi_iter.bi_size = r1_bio->sectors << 9;
>

Yes, the patch above fixes the issue and makes blkdiscard just report
that the BLKDISCARD ioctl failed. Since having this patch applied
means the issue seen in
http://www.gossamer-threads.com/lists/linux/kernel/2538757?do=post_view_threaded#2538757
(BUG at arch/x86/kernel/pci-nommu.c:66 / BUG at
./include/linux/scatterlist.h:90) can't be reached does that mean
whatever was seen there is also spurious?

Additionally as this issue seems to have been a problem going back to
at least the 3.18 kernels, would a fix similar to this be eligible for
stable kernels?

-- 
Sitsofe | http://sucs.org/~sits/

^ permalink raw reply

* [PATCH v3 0/8] raid5-cache: enabling cache features
From: Song Liu @ 2016-10-06  7:14 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu

This is the 3rd version of the patches to enable the write cache part of
raid5-cache. The journal part was released with kernel 4.4.

The caching part uses same disk format of raid456 journal, and provides
acceleration to writes. Write operations are committed (bio_endio) once
the data is secured in journal. Reconstruct and RMW are postponed to
reclaim path, which is (hopefully) not on the critical path.

The changes are organized in 8 patches (details below).

Patch for chunk_aligned_read in earlier RFC is not included yet
(http://marc.info/?l=linux-raid&m=146432700719277). But we may still need
some optimizations later, especially for SSD raid devices.

Changes from PATCH v2 (http://marc.info/?l=linux-raid&m=147493266208102):
  1. Incorporate feedback from Shaohua
  2. Reorganize the patches, for hopefully easier review
  3. Make sure no change to write through mode (journal only)
  4. Change reclaim design to avoid deadlock due to log space

Thanks,
Song

Song Liu (8):
  md/r5cache: Check array size in r5l_init_log
  md/r5cache: move some code to raid5.h
  md/r5cache: State machine for raid5-cache write back mode
  md/r5cache: write part of r5cache
  md/r5cache: reclaim support
  md/r5cache: sysfs entry r5c_state
  md/r5cache: r5c recovery
  md/r5cache: handle SYNC and FUA

 drivers/md/raid5-cache.c | 1597 ++++++++++++++++++++++++++++++++++++++++------
 drivers/md/raid5.c       |  258 +++++---
 drivers/md/raid5.h       |  150 ++++-
 3 files changed, 1712 insertions(+), 293 deletions(-)

--
2.9.3

^ permalink raw reply

* [PATCH v3 1/8] md/r5cache: Check array size in r5l_init_log
From: Song Liu @ 2016-10-06  7:14 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161006071416.3295093-1-songliubraving@fb.com>

Currently, r5l_write_stripe checks meta size for each stripe write,
which is not necessary.

With this patch, r5l_init_log checks maximal meta size of the array,
which is (r5l_meta_block + raid_disks x r5l_payload_data_parity).
If this is too big to fit in one page, r5l_init_log aborts.

With the current metadata, r5l_log supports raid_disks up to 203.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 1b1ab4a..7557791b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -441,7 +441,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 {
 	int write_disks = 0;
 	int data_pages, parity_pages;
-	int meta_size;
 	int reserve;
 	int i;
 	int ret = 0;
@@ -473,15 +472,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	parity_pages = 1 + !!(sh->qd_idx >= 0);
 	data_pages = write_disks - parity_pages;
 
-	meta_size =
-		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
-		 * data_pages) +
-		sizeof(struct r5l_payload_data_parity) +
-		sizeof(__le32) * parity_pages;
-	/* Doesn't work with very big raid array */
-	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
-		return -EINVAL;
-
 	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
 	/*
 	 * The stripe must enter state machine again to finish the write, so
@@ -1184,6 +1174,22 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	if (PAGE_SIZE != 4096)
 		return -EINVAL;
+
+	/*
+	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
+	 * raid_disks r5l_payload_data_parity.
+	 *
+	 * Write journal and cache does not work for very big array
+	 * (raid_disks > 203)
+	 */
+	if (sizeof(struct r5l_meta_block) +
+	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
+	     conf->raid_disks) > PAGE_SIZE) {
+		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
+		       mdname(conf->mddev), conf->raid_disks);
+		return -EINVAL;
+	}
+
 	log = kzalloc(sizeof(*log), GFP_KERNEL);
 	if (!log)
 		return -ENOMEM;
-- 
2.9.3


^ permalink raw reply related

* [PATCH v3 2/8] md/r5cache: move some code to raid5.h
From: Song Liu @ 2016-10-06  7:14 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161006071416.3295093-1-songliubraving@fb.com>

Move some defines and inline functions to raid5.h, so they can be
used in raid5-cache.c

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 71 -------------------------------------------------
 drivers/md/raid5.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 71 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f94472d..67d4f49 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,19 +70,6 @@ module_param(devices_handle_discard_safely, bool, 0644);
 MODULE_PARM_DESC(devices_handle_discard_safely,
 		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
 static struct workqueue_struct *raid5_wq;
-/*
- * Stripe cache
- */
-
-#define NR_STRIPES		256
-#define STRIPE_SIZE		PAGE_SIZE
-#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
-#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
-#define	IO_THRESHOLD		1
-#define BYPASS_THRESHOLD	1
-#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
-#define HASH_MASK		(NR_HASH - 1)
-#define MAX_STRIPE_BATCH	8
 
 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 {
@@ -126,64 +113,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 	local_irq_enable();
 }
 
-/* bio's attached to a stripe+device for I/O are linked together in bi_sector
- * order without overlap.  There may be several bio's per stripe+device, and
- * a bio could span several devices.
- * When walking this list for a particular stripe+device, we must never proceed
- * beyond a bio that extends past this device, as the next bio might no longer
- * be valid.
- * This function is used to determine the 'next' bio in the list, given the sector
- * of the current stripe+device
- */
-static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
-{
-	int sectors = bio_sectors(bio);
-	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
-		return bio->bi_next;
-	else
-		return NULL;
-}
-
-/*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	atomic_inc(segments);
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
-	unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	int old, new;
-
-	do {
-		old = atomic_read(segments);
-		new = (old & 0xffff) | (cnt << 16);
-	} while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	atomic_set(segments, cnt);
-}
-
 /* Find first data disk in a raid6 stripe */
 static inline int raid6_d0(struct stripe_head *sh)
 {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 517d4b6..46cfe93 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -410,6 +410,83 @@ struct disk_info {
 	struct md_rdev	*rdev, *replacement;
 };
 
+/*
+ * Stripe cache
+ */
+
+#define NR_STRIPES		256
+#define STRIPE_SIZE		PAGE_SIZE
+#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
+#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
+#define	IO_THRESHOLD		1
+#define BYPASS_THRESHOLD	1
+#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
+#define HASH_MASK		(NR_HASH - 1)
+#define MAX_STRIPE_BATCH	8
+
+/* bio's attached to a stripe+device for I/O are linked together in bi_sector
+ * order without overlap.  There may be several bio's per stripe+device, and
+ * a bio could span several devices.
+ * When walking this list for a particular stripe+device, we must never proceed
+ * beyond a bio that extends past this device, as the next bio might no longer
+ * be valid.
+ * This function is used to determine the 'next' bio in the list, given the
+ * sector of the current stripe+device
+ */
+static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
+{
+	int sectors = bio_sectors(bio);
+
+	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
+		return bio->bi_next;
+	else
+		return NULL;
+}
+
+/*
+ * We maintain a biased count of active stripes in the bottom 16 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ */
+static inline int raid5_bi_processed_stripes(struct bio *bio)
+{
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+	return (atomic_read(segments) >> 16) & 0xffff;
+}
+
+static inline int raid5_dec_bi_active_stripes(struct bio *bio)
+{
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+	return atomic_sub_return(1, segments) & 0xffff;
+}
+
+static inline void raid5_inc_bi_active_stripes(struct bio *bio)
+{
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+	atomic_inc(segments);
+}
+
+static inline void raid5_set_bi_processed_stripes(struct bio *bio,
+	unsigned int cnt)
+{
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	int old, new;
+
+	do {
+		old = atomic_read(segments);
+		new = (old & 0xffff) | (cnt << 16);
+	} while (atomic_cmpxchg(segments, old, new) != old);
+}
+
+static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
+{
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+	atomic_set(segments, cnt);
+}
+
 /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
  * This is because we sometimes take all the spinlocks
  * and creating that much locking depth can cause
-- 
2.9.3


^ permalink raw reply related

* [PATCH v3 3/8] md/r5cache: State machine for raid5-cache write back mode
From: Song Liu @ 2016-10-06  7:14 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161006071416.3295093-1-songliubraving@fb.com>

The raid5-cache write back mode has 2 states for each stripe: write state
and reclaim state. This patch adds a bare state machine for these two
states.

2 flags are added to sh->state for raid5-cache states:
 - STRIPE_R5C_FROZEN
 - STRIPE_R5C_WRITTEN

STRIPE_R5C_FROZEN is the key flag to differentiate write state
and reclaim state.

STRIPE_R5C_WRITTEN is a helper flag to bring the stripe back from
reclaim state back to write state.

In write through mode, every stripe also goes between write state
and reclaim state (in r5c_handle_stripe_dirtying() and
r5c_handle_stripe_written()).

Please note: this is a "no-op" patch for raid5-cache write through
mode.

The following detailed explanation is copied from raid5-cache.c:

/*
 * raid5 cache state machine
 *
 * The RAID cache works in two major states for each stripe: write state
 * and reclaim state. These states are controlled by flags STRIPE_R5C_FROZEN
 * and STRIPE_R5C_WRITTEN
 *
 * STRIPE_R5C_FROZEN is the key flag to differentiate write state and reclaim
 * state. The write state runs w/ STRIPE_R5C_FROZEN == 0. While the reclaim
 * state runs w/ STRIPE_R5C_FROZEN == 1.
 *
 * STRIPE_R5C_WRITTEN is a helper flag to bring the stripe back from reclaim
 * state to write state. Specifically, STRIPE_R5C_WRITTEN triggers clean up
 * process in r5c_handle_stripe_written. STRIPE_R5C_WRITTEN is set when data
 * and parity of a stripe is all in journal device; and cleared when the data
 * and parity are all in RAID disks.
 *
 * The following is another way to show how STRIPE_R5C_FROZEN and
 * STRIPE_R5C_WRITTEN work:
 *
 * write state: STRIPE_R5C_FROZEN = 0 STRIPE_R5C_WRITTEN = 0
 * reclaim state: STRIPE_R5C_FROZEN = 1
 *
 * write => reclaim: set STRIPE_R5C_FROZEN in r5c_freeze_stripe_for_reclaim
 * reclaim => write:
 * 1. write parity to journal, when finished, set STRIPE_R5C_WRITTEN
 * 2. write data/parity to raid disks, when finished, clear both
 *    STRIPE_R5C_FROZEN and STRIPE_R5C_WRITTEN
 *
 * In write through mode (journal only) the stripe still goes through these
 * state change, except that STRIPE_R5C_FROZEN is set on write in
 * r5c_handle_stripe_dirtying().
 */

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 125 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/raid5.c       |  20 ++++++--
 drivers/md/raid5.h       |  10 +++-
 3 files changed, 148 insertions(+), 7 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 7557791b..9e05850 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -40,6 +40,47 @@
  */
 #define R5L_POOL_SIZE	4
 
+enum r5c_state {
+	R5C_STATE_NO_CACHE = 0,
+	R5C_STATE_WRITE_THROUGH = 1,
+	R5C_STATE_WRITE_BACK = 2,
+	R5C_STATE_CACHE_BROKEN = 3,
+};
+
+/*
+ * raid5 cache state machine
+ *
+ * The RAID cache works in two major states for each stripe: write state and
+ * reclaim state. These states are controlled by flags STRIPE_R5C_FROZEN and
+ * STRIPE_R5C_WRITTEN
+ *
+ * STRIPE_R5C_FROZEN is the key flag to differentiate write state and reclaim
+ * state. The write state runs w/ STRIPE_R5C_FROZEN == 0. While the reclaim
+ * state runs w/ STRIPE_R5C_FROZEN == 1.
+ *
+ * STRIPE_R5C_WRITTEN is a helper flag to bring the stripe back from reclaim
+ * state to write state. Specifically, STRIPE_R5C_WRITTEN triggers clean up
+ * process in r5c_handle_stripe_written. STRIPE_R5C_WRITTEN is set when data
+ * and parity of a stripe is all in journal device; and cleared when the data
+ * and parity are all in RAID disks.
+ *
+ * The following is another way to show how STRIPE_R5C_FROZEN and
+ * STRIPE_R5C_WRITTEN work:
+ *
+ * write state: STRIPE_R5C_FROZEN = 0 STRIPE_R5C_WRITTEN = 0
+ * reclaim state: STRIPE_R5C_FROZEN = 1
+ *
+ * write => reclaim: set STRIPE_R5C_FROZEN in r5c_freeze_stripe_for_reclaim
+ * reclaim => write:
+ * 1. write parity to journal, when finished, set STRIPE_R5C_WRITTEN
+ * 2. write data/parity to raid disks, when finished, clear both
+ *    STRIPE_R5C_FROZEN and STRIPE_R5C_WRITTEN
+ *
+ * In write through mode (journal only) the stripe also goes through these
+ * state change, except that STRIPE_R5C_FROZEN is set on write in
+ * r5c_handle_stripe_dirtying().
+ */
+
 struct r5l_log {
 	struct md_rdev *rdev;
 
@@ -96,6 +137,9 @@ struct r5l_log {
 	spinlock_t no_space_stripes_lock;
 
 	bool need_cache_flush;
+
+	/* for r5c_cache */
+	enum r5c_state r5c_state;
 };
 
 /*
@@ -133,6 +177,11 @@ enum r5l_io_unit_state {
 	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
 };
 
+bool r5c_is_writeback(struct r5l_log *log)
+{
+	return (log != NULL && log->r5c_state == R5C_STATE_WRITE_BACK);
+}
+
 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
 {
 	start += inc;
@@ -168,12 +217,44 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 	io->state = state;
 }
 
+/*
+ * Freeze the stripe, thus send the stripe into reclaim path.
+ *
+ * In current implementation, STRIPE_R5C_FROZEN is also set in write through
+ * mode (in r5c_handle_stripe_dirtying). This does not change the behavior
+ * of write through mode.
+ */
+void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct r5l_log *log = conf->log;
+
+	if (!log)
+		return;
+	WARN_ON(test_bit(STRIPE_R5C_FROZEN, &sh->state));
+	set_bit(STRIPE_R5C_FROZEN, &sh->state);
+}
+
+static void r5c_finish_cache_stripe(struct stripe_head *sh)
+{
+	struct r5l_log *log = sh->raid_conf->log;
+
+	if (log->r5c_state == R5C_STATE_WRITE_THROUGH) {
+		BUG_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
+		set_bit(STRIPE_R5C_WRITTEN, &sh->state);
+	} else
+		BUG(); /* write back logic in next patch */
+}
+
 static void r5l_io_run_stripes(struct r5l_io_unit *io)
 {
 	struct stripe_head *sh, *next;
 
 	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
 		list_del_init(&sh->log_list);
+
+		r5c_finish_cache_stripe(sh);
+
 		set_bit(STRIPE_HANDLE, &sh->state);
 		raid5_release_stripe(sh);
 	}
@@ -412,18 +493,19 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 		r5l_append_payload_page(log, sh->dev[i].page);
 	}
 
-	if (sh->qd_idx >= 0) {
+	if (parity_pages == 2) {
 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
 					sh->dev[sh->qd_idx].log_checksum, true);
 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
 		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
-	} else {
+	} else if (parity_pages == 1) {
 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
 					0, false);
 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
-	}
+	} else
+		BUG_ON(parity_pages != 0);
 
 	list_add_tail(&sh->log_list, &io->stripe_list);
 	atomic_inc(&io->pending_stripe);
@@ -455,6 +537,8 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 		return -EAGAIN;
 	}
 
+	WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
+
 	for (i = 0; i < sh->disks; i++) {
 		void *addr;
 
@@ -1101,6 +1185,39 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 }
 
+int r5c_handle_stripe_dirtying(struct r5conf *conf,
+			       struct stripe_head *sh,
+			       struct stripe_head_state *s,
+			       int disks)
+{
+	struct r5l_log *log = conf->log;
+
+	if (!log || test_bit(STRIPE_R5C_FROZEN, &sh->state))
+		return -EAGAIN;
+
+	if (conf->log->r5c_state == R5C_STATE_WRITE_THROUGH ||
+	    conf->mddev->degraded != 0) {
+		/* write through mode */
+		r5c_freeze_stripe_for_reclaim(sh);
+		return -EAGAIN;
+	}
+	BUG();  /* write back logic in next commit */
+	return 0;
+}
+
+/*
+ * clean up the stripe (clear STRIPE_R5C_FROZEN etc.) after the stripe is
+ * committed to RAID disks
+*/
+void r5c_handle_stripe_written(struct r5conf *conf,
+			       struct stripe_head *sh)
+{
+	if (!test_and_clear_bit(STRIPE_R5C_WRITTEN, &sh->state))
+		return;
+	WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
+	clear_bit(STRIPE_R5C_FROZEN, &sh->state);
+}
+
 static int r5l_load_log(struct r5l_log *log)
 {
 	struct md_rdev *rdev = log->rdev;
@@ -1236,6 +1353,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	INIT_LIST_HEAD(&log->no_space_stripes);
 	spin_lock_init(&log->no_space_stripes_lock);
 
+	log->r5c_state = R5C_STATE_WRITE_THROUGH;
+
 	if (r5l_load_log(log))
 		goto error;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 67d4f49..2e3e61a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3506,6 +3506,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	int rmw = 0, rcw = 0, i;
 	sector_t recovery_cp = conf->mddev->recovery_cp;
 
+	if (r5c_handle_stripe_dirtying(conf, sh, s, disks) == 0)
+		return;
+
 	/* Check whether resync is now happening or should start.
 	 * If yes, then the array is dirty (after unclean shutdown or
 	 * initial creation), so parity in some stripes might be inconsistent.
@@ -4396,13 +4399,23 @@ static void handle_stripe(struct stripe_head *sh)
 	    || s.expanding)
 		handle_stripe_fill(sh, &s, disks);
 
-	/* Now to consider new write requests and what else, if anything
-	 * should be read.  We do not handle new writes when:
+	/*
+	 * When the stripe finishes full journal write cycle (write to journal
+	 * and raid disk), this is the clean up procedure so it is ready for
+	 * next operation.
+	 */
+	r5c_handle_stripe_written(conf, sh);
+
+	/*
+	 * Now to consider new write requests, cache write back and what else,
+	 * if anything should be read.  We do not handle new writes when:
 	 * 1/ A 'write' operation (copy+xor) is already in flight.
 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
 	 *    block.
+	 * 3/ A r5c cache log write is in flight.
 	 */
-	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
+	if ((s.to_write || test_bit(STRIPE_R5C_FROZEN, &sh->state)) &&
+	    !sh->reconstruct_state && !sh->check_state && !sh->log_io)
 		handle_stripe_dirtying(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
@@ -5122,6 +5135,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	 * data on failed drives.
 	 */
 	if (rw == READ && mddev->degraded == 0 &&
+	    !r5c_is_writeback(conf->log) &&
 	    mddev->reshape_position == MaxSector) {
 		bi = chunk_aligned_read(mddev, bi);
 		if (!bi)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 46cfe93..8bae64b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -345,7 +345,9 @@ enum {
 	STRIPE_BITMAP_PENDING,	/* Being added to bitmap, don't add
 				 * to batch yet.
 				 */
-	STRIPE_LOG_TRAPPED, /* trapped into log */
+	STRIPE_LOG_TRAPPED,	/* trapped into log */
+	STRIPE_R5C_FROZEN,	/* r5c_cache frozen and being written out */
+	STRIPE_R5C_WRITTEN,	/* ready for r5c_handle_stripe_written() */
 };
 
 #define STRIPE_EXPAND_SYNC_FLAGS \
@@ -712,4 +714,10 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
 extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
 extern void r5l_quiesce(struct r5l_log *log, int state);
 extern bool r5l_log_disk_error(struct r5conf *conf);
+extern bool r5c_is_writeback(struct r5l_log *log);
+extern int
+r5c_handle_stripe_dirtying(struct r5conf *conf, struct stripe_head *sh,
+			   struct stripe_head_state *s, int disks);
+extern void
+r5c_handle_stripe_written(struct r5conf *conf, struct stripe_head *sh);
 #endif
-- 
2.9.3


^ permalink raw reply related

* [PATCH v3 4/8] md/r5cache: write part of r5cache
From: Song Liu @ 2016-10-06  7:14 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161006071416.3295093-1-songliubraving@fb.com>

This is the write part of r5cache. The cache is integrated with
stripe cache of raid456. It leverages code of r5l_log to write
data to journal device.

r5cache splits the current write path into 2 parts: the write path
and the reclaim path. The write path is as follows:
1. write data to journal
   (r5c_handle_stripe_dirtying, r5c_cache_data)
2. call bio_endio
   (r5c_handle_data_cached, r5c_return_dev_pending_writes).

The reclaim path is then as follows:
1. Freeze the stripe (r5c_freeze_stripe_for_reclaim)
2. Calculate parity (reconstruct or RMW)
3. Write parity (and maybe some other data) to journal device
4. Write data and parity to RAID disks

Reclaim path of the cache is implemented in the next patch.

With r5cache, a write operation does not wait for parity calculation
and write out, so the write latency is lower (1 write to journal
device vs. read and then write to raid disks). Also, r5cache will
reduce RAID overhead (multiple IOs due to read-modify-write of
parity) and provide more opportunities for full stripe writes.

This patch adds 2 flags to stripe_head.state:
 - STRIPE_R5C_PARTIAL_STRIPE,
 - STRIPE_R5C_FULL_STRIPE,

Instead of inactive_list, stripes with cached data are tracked in
r5conf->r5c_full_stripe_list and r5conf->r5c_partial_stripe_list.
STRIPE_R5C_FULL_STRIPE and STRIPE_R5C_PARTIAL_STRIPE are flags for
stripes in these lists. Note: stripes in r5c_full/partial_stripe_list
are not considered as "active".

For RMW, the code allocates an extra page for each data block
being updated.  This is stored in r5dev->page and the old data
is read into it.  Then the prexor calculation subtracts ->page
from the parity block, and the reconstruct calculation adds the
->orig_page data back into the parity block.

r5cache naturally excludes SkipCopy. With R5_Wantcache bit set,
async_copy_data will not skip copy.

There are some known limitations of the cache implementation:

1. Write cache only covers full page writes (R5_OVERWRITE). Writes
   of smaller granularity are write through.
2. Only one log io (sh->log_io) for each stripe at any time. Later
   writes for the same stripe have to wait. This can be improved by
   moving log_io to r5dev.
3. With writeback cache, read path must enter state machine, which
   is a significant bottleneck for some workloads.
4. There is no per stripe checkpoint (with r5l_payload_flush) in
   the log, so recovery code has to replay more data than necessary
   (sometimes all the log from last_checkpoint). This reduces
   availability of the array.

This patch includes a fix proposed by ZhengYuan Liu
<liuzhengyuan@kylinos.cn>

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 204 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/raid5.c       | 137 ++++++++++++++++++++++++++-----
 drivers/md/raid5.h       |  22 +++++
 3 files changed, 336 insertions(+), 27 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 9e05850..92d3d7b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -20,6 +20,7 @@
 #include <linux/random.h>
 #include "md.h"
 #include "raid5.h"
+#include "bitmap.h"
 
 /*
  * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -217,6 +218,44 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 	io->state = state;
 }
 
+static void
+r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
+			      struct bio_list *return_bi)
+{
+	struct bio *wbi, *wbi2;
+
+	wbi = dev->written;
+	dev->written = NULL;
+	while (wbi && wbi->bi_iter.bi_sector <
+	       dev->sector + STRIPE_SECTORS) {
+		wbi2 = r5_next_bio(wbi, dev->sector);
+		if (!raid5_dec_bi_active_stripes(wbi)) {
+			md_write_end(conf->mddev);
+			bio_list_add(return_bi, wbi);
+		}
+		wbi = wbi2;
+	}
+}
+
+void r5c_handle_cached_data_endio(struct r5conf *conf,
+	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
+{
+	int i;
+
+	for (i = sh->disks; i--; ) {
+		if (test_bit(R5_InCache, &sh->dev[i].flags) &&
+		    sh->dev[i].written) {
+			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+			r5c_return_dev_pending_writes(conf, &sh->dev[i],
+						      return_bi);
+			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+					STRIPE_SECTORS,
+					!test_bit(STRIPE_DEGRADED, &sh->state),
+					0);
+		}
+	}
+}
+
 /*
  * Freeze the stripe, thus send the stripe into reclaim path.
  *
@@ -233,6 +272,48 @@ void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh)
 		return;
 	WARN_ON(test_bit(STRIPE_R5C_FROZEN, &sh->state));
 	set_bit(STRIPE_R5C_FROZEN, &sh->state);
+
+	if (log->r5c_state == R5C_STATE_WRITE_THROUGH)
+		return;
+
+	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		atomic_inc(&conf->preread_active_stripes);
+
+	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_partial_stripes);
+	}
+
+	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_full_stripes);
+	}
+}
+
+static void r5c_handle_data_cached(struct stripe_head *sh)
+{
+	int i;
+
+	for (i = sh->disks; i--; )
+		if (test_and_clear_bit(R5_Wantcache, &sh->dev[i].flags)) {
+			set_bit(R5_InCache, &sh->dev[i].flags);
+			clear_bit(R5_LOCKED, &sh->dev[i].flags);
+			atomic_inc(&sh->dev_in_cache);
+		}
+}
+
+/*
+ * this journal write must contain full parity,
+ * it may also contain some data pages
+ */
+static void r5c_handle_parity_cached(struct stripe_head *sh)
+{
+	int i;
+
+	for (i = sh->disks; i--; )
+		if (test_bit(R5_InCache, &sh->dev[i].flags))
+			set_bit(R5_Wantwrite, &sh->dev[i].flags);
+	set_bit(STRIPE_R5C_WRITTEN, &sh->state);
 }
 
 static void r5c_finish_cache_stripe(struct stripe_head *sh)
@@ -242,8 +323,10 @@ static void r5c_finish_cache_stripe(struct stripe_head *sh)
 	if (log->r5c_state == R5C_STATE_WRITE_THROUGH) {
 		BUG_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
 		set_bit(STRIPE_R5C_WRITTEN, &sh->state);
-	} else
-		BUG(); /* write back logic in next patch */
+	} else if (test_bit(STRIPE_R5C_FROZEN, &sh->state))
+		r5c_handle_parity_cached(sh);
+	else
+		r5c_handle_data_cached(sh);
 }
 
 static void r5l_io_run_stripes(struct r5l_io_unit *io)
@@ -483,7 +566,8 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	io = log->current_io;
 
 	for (i = 0; i < sh->disks; i++) {
-		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) &&
+		    !test_bit(R5_Wantcache, &sh->dev[i].flags))
 			continue;
 		if (i == sh->pd_idx || i == sh->qd_idx)
 			continue;
@@ -514,7 +598,6 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	return 0;
 }
 
-static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
 /*
  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
  * data from log to raid disks), so we shouldn't wait for reclaim here
@@ -544,6 +627,10 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 
 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
 			continue;
+
+		if (test_bit(R5_InCache, &sh->dev[i].flags))
+			continue;
+
 		write_disks++;
 		/* checksum is already calculated in last run */
 		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
@@ -809,7 +896,6 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	}
 }
 
-
 static void r5l_do_reclaim(struct r5l_log *log)
 {
 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
@@ -872,7 +958,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
 	r5l_do_reclaim(log);
 }
 
-static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 {
 	unsigned long target;
 	unsigned long new = (unsigned long)space; /* overflow in theory */
@@ -1191,6 +1277,8 @@ int r5c_handle_stripe_dirtying(struct r5conf *conf,
 			       int disks)
 {
 	struct r5l_log *log = conf->log;
+	int i;
+	struct r5dev *dev;
 
 	if (!log || test_bit(STRIPE_R5C_FROZEN, &sh->state))
 		return -EAGAIN;
@@ -1201,21 +1289,121 @@ int r5c_handle_stripe_dirtying(struct r5conf *conf,
 		r5c_freeze_stripe_for_reclaim(sh);
 		return -EAGAIN;
 	}
-	BUG();  /* write back logic in next commit */
+
+	s->to_cache = 0;
+
+	for (i = disks; i--; ) {
+		dev = &sh->dev[i];
+		/* if non-overwrite, use the reclaim path (write through) */
+		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
+		    !test_bit(R5_InCache, &dev->flags)) {
+			r5c_freeze_stripe_for_reclaim(sh);
+			return -EAGAIN;
+		}
+	}
+
+	for (i = disks; i--; ) {
+		dev = &sh->dev[i];
+		if (dev->towrite) {
+			set_bit(R5_Wantcache, &dev->flags);
+			set_bit(R5_Wantdrain, &dev->flags);
+			set_bit(R5_LOCKED, &dev->flags);
+			s->to_cache++;
+		}
+	}
+
+	if (s->to_cache)
+		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+
 	return 0;
 }
 
 /*
  * clean up the stripe (clear STRIPE_R5C_FROZEN etc.) after the stripe is
  * committed to RAID disks
-*/
+ */
 void r5c_handle_stripe_written(struct r5conf *conf,
 			       struct stripe_head *sh)
 {
+	int i;
+	int do_wakeup = 0;
+
 	if (!test_and_clear_bit(STRIPE_R5C_WRITTEN, &sh->state))
 		return;
 	WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
 	clear_bit(STRIPE_R5C_FROZEN, &sh->state);
+
+	if (conf->log->r5c_state == R5C_STATE_WRITE_THROUGH)
+		return;
+
+	for (i = sh->disks; i--; ) {
+		if (test_and_clear_bit(R5_InCache, &sh->dev[i].flags))
+			atomic_dec(&sh->dev_in_cache);
+		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+			do_wakeup = 1;
+	}
+
+	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+		if (atomic_dec_and_test(&conf->pending_full_writes))
+			md_wakeup_thread(conf->mddev->thread);
+
+	if (do_wakeup)
+		wake_up(&conf->wait_for_overlap);
+}
+
+int
+r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+	       struct stripe_head_state *s)
+{
+	int pages;
+	int reserve;
+	int i;
+	int ret = 0;
+	int page_count = 0;
+
+	BUG_ON(!log);
+
+	for (i = 0; i < sh->disks; i++) {
+		void *addr;
+
+		if (!test_bit(R5_Wantcache, &sh->dev[i].flags))
+			continue;
+		addr = kmap_atomic(sh->dev[i].page);
+		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
+						    addr, PAGE_SIZE);
+		kunmap_atomic(addr);
+		page_count++;
+	}
+	WARN_ON(page_count != s->to_cache);
+	pages = s->to_cache;
+
+	/*
+	 * The stripe must enter state machine again to call endio, so
+	 * don't delay.
+	 */
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	atomic_inc(&sh->count);
+
+	mutex_lock(&log->io_mutex);
+	/* meta + data */
+	reserve = (1 + pages) << (PAGE_SHIFT - 9);
+	if (!r5l_has_free_space(log, reserve)) {
+		spin_lock(&log->no_space_stripes_lock);
+		list_add_tail(&sh->log_list, &log->no_space_stripes);
+		spin_unlock(&log->no_space_stripes_lock);
+
+		r5l_wake_reclaim(log, reserve);
+	} else {
+		ret = r5l_log_stripe(log, sh, pages, 0);
+		if (ret) {
+			spin_lock_irq(&log->io_list_lock);
+			list_add_tail(&sh->log_list, &log->no_mem_stripes);
+			spin_unlock_irq(&log->io_list_lock);
+		}
+	}
+
+	mutex_unlock(&log->io_mutex);
+	return 0;
 }
 
 static int r5l_load_log(struct r5l_log *log)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2e3e61a..27fd183 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -245,8 +245,25 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
-		if (!test_bit(STRIPE_EXPANDING, &sh->state))
-			list_add_tail(&sh->lru, temp_inactive_list);
+		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+			if (atomic_read(&sh->dev_in_cache) == 0) {
+				list_add_tail(&sh->lru, temp_inactive_list);
+			} else if (atomic_read(&sh->dev_in_cache) ==
+				   conf->raid_disks - conf->max_degraded) {
+				/* full stripe */
+				if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
+					atomic_inc(&conf->r5c_cached_full_stripes);
+				if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+					atomic_dec(&conf->r5c_cached_partial_stripes);
+				list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+			} else {
+				/* partial stripe */
+				if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
+						      &sh->state))
+					atomic_inc(&conf->r5c_cached_partial_stripes);
+				list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
+			}
+		}
 	}
 }
 
@@ -830,6 +847,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
+	if (s->to_cache) {
+		r5c_cache_data(conf->log, sh, s);
+		return;
+	}
+
 	if (r5l_write_stripe(conf->log, sh) == 0)
 		return;
 	for (i = disks; i--; ) {
@@ -1044,7 +1066,7 @@ again:
 static struct dma_async_tx_descriptor *
 async_copy_data(int frombio, struct bio *bio, struct page **page,
 	sector_t sector, struct dma_async_tx_descriptor *tx,
-	struct stripe_head *sh)
+	struct stripe_head *sh, int no_skipcopy)
 {
 	struct bio_vec bvl;
 	struct bvec_iter iter;
@@ -1084,7 +1106,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 			if (frombio) {
 				if (sh->raid_conf->skip_copy &&
 				    b_offset == 0 && page_offset == 0 &&
-				    clen == STRIPE_SIZE)
+				    clen == STRIPE_SIZE &&
+				    !no_skipcopy)
 					*page = bio_page;
 				else
 					tx = async_memcpy(*page, bio_page, page_offset,
@@ -1166,7 +1189,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, &dev->page,
-					dev->sector, tx, sh);
+						     dev->sector, tx, sh, 0);
 				rbi = r5_next_bio(rbi, dev->sector);
 			}
 		}
@@ -1293,7 +1316,8 @@ static int set_syndrome_sources(struct page **srcs,
 		if (i == sh->qd_idx || i == sh->pd_idx ||
 		    (srctype == SYNDROME_SRC_ALL) ||
 		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
-		     test_bit(R5_Wantdrain, &dev->flags)) ||
+		     (test_bit(R5_Wantdrain, &dev->flags) ||
+		      test_bit(R5_InCache, &dev->flags))) ||
 		    (srctype == SYNDROME_SRC_WRITTEN &&
 		     dev->written))
 			srcs[slot] = sh->dev[i].page;
@@ -1472,9 +1496,25 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 static void ops_complete_prexor(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
+	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
+
+	if (!r5c_is_writeback(sh->raid_conf->log))
+		return;
+
+	/*
+	 * raid5-cache write back uses orig_page during prexor. after prexor,
+	 * it is time to free orig_page
+	 */
+	for (i = sh->disks; i--; )
+		if (sh->dev[i].page != sh->dev[i].orig_page) {
+			struct page *p = sh->dev[i].page;
+
+			sh->dev[i].page = sh->dev[i].orig_page;
+			put_page(p);
+		}
 }
 
 static struct dma_async_tx_descriptor *
@@ -1496,7 +1536,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (test_bit(R5_Wantdrain, &dev->flags))
+		if (test_bit(R5_Wantdrain, &dev->flags) ||
+		    test_bit(R5_InCache, &dev->flags))
 			xor_srcs[count++] = dev->page;
 	}
 
@@ -1547,6 +1588,10 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 again:
 			dev = &sh->dev[i];
+			if (test_and_clear_bit(R5_InCache, &dev->flags)) {
+				BUG_ON(atomic_read(&sh->dev_in_cache) == 0);
+				atomic_dec(&sh->dev_in_cache);
+			}
 			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
@@ -1554,7 +1599,13 @@ again:
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
 			spin_unlock_irq(&sh->stripe_lock);
-			WARN_ON(dev->page != dev->orig_page);
+
+			/*
+			 * in biodrain stage, prexor for data in raid5-cache
+			 * is the only case where page != orig_page
+			 */
+			if (!test_bit(R5_Wantcache, &dev->flags))
+				WARN_ON(dev->page != dev->orig_page);
 
 			while (wbi && wbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
@@ -1566,8 +1617,10 @@ again:
 					set_bit(R5_Discard, &dev->flags);
 				else {
 					tx = async_copy_data(1, wbi, &dev->page,
-						dev->sector, tx, sh);
-					if (dev->page != dev->orig_page) {
+							     dev->sector, tx, sh,
+							     test_bit(R5_Wantcache, &dev->flags));
+					if (dev->page != dev->orig_page &&
+					    !test_bit(R5_Wantcache, &dev->flags)) {
 						set_bit(R5_SkipCopy, &dev->flags);
 						clear_bit(R5_UPTODATE, &dev->flags);
 						clear_bit(R5_OVERWRITE, &dev->flags);
@@ -1675,7 +1728,8 @@ again:
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (head_sh->dev[i].written)
+			if (head_sh->dev[i].written ||
+			    test_bit(R5_InCache, &head_sh->dev[i].flags))
 				xor_srcs[count++] = dev->page;
 		}
 	} else {
@@ -1930,6 +1984,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
 		atomic_set(&sh->count, 1);
+		atomic_set(&sh->dev_in_cache, 0);
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
 
@@ -2816,6 +2871,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				if (!expand)
 					clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
+			} else if (test_bit(R5_InCache, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
 			}
 		}
 		/* if we are not expanding this is a proper write request, and
@@ -2855,6 +2913,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				set_bit(R5_LOCKED, &dev->flags);
 				clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
+			} else if (test_bit(R5_InCache, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
 			}
 		}
 		if (!s->locked)
@@ -3529,9 +3590,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+		     test_bit(R5_InCache, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		    !((test_bit(R5_UPTODATE, &dev->flags) &&
+		       (!test_bit(R5_InCache, &dev->flags) ||
+			dev->page != dev->orig_page)) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rmw++;
@@ -3543,13 +3607,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		    test_bit(R5_Wantcompute, &dev->flags))) {
+		      test_bit(R5_InCache, &dev->flags) ||
+		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rcw++;
 			else
 				rcw += 2*disks;
 		}
 	}
+
 	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
 		(unsigned long long)sh->sector, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -3561,10 +3627,18 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+			if (test_bit(R5_InCache, &dev->flags) &&
+			    dev->page == dev->orig_page)
+				dev->page = alloc_page(GFP_NOIO);  /* prexor */
+
+			if ((dev->towrite ||
+			     i == sh->pd_idx || i == sh->qd_idx ||
+			     test_bit(R5_InCache, &dev->flags)) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !(test_bit(R5_UPTODATE, &dev->flags) ||
-			    test_bit(R5_Wantcompute, &dev->flags)) &&
+			    !((test_bit(R5_UPTODATE, &dev->flags) &&
+			       (!test_bit(R5_InCache, &dev->flags) ||
+				dev->page != dev->orig_page)) ||
+			      test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (test_bit(STRIPE_PREREAD_ACTIVE,
 					     &sh->state)) {
@@ -3590,6 +3664,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    i != sh->pd_idx && i != sh->qd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			      test_bit(R5_InCache, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
 				if (test_bit(R5_Insync, &dev->flags) &&
@@ -3629,7 +3704,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	 */
 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
-	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+	     !test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		schedule_reconstruction(sh, s, rcw == 0, 0);
 }
 
@@ -4120,6 +4195,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			if (rdev && !test_bit(Faulty, &rdev->flags))
 				do_recovery = 1;
 		}
+		if (test_bit(R5_InCache, &dev->flags) && dev->written)
+			s->just_cached++;
+		if (test_bit(R5_Wantcache, &dev->flags) && dev->written)
+			s->want_cache++;
 	}
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
@@ -4285,6 +4364,17 @@ static void handle_stripe(struct stripe_head *sh)
 
 	analyse_stripe(sh, &s);
 
+	if (s.want_cache) {
+		/* In the last run of handle_stripe, we finished
+		 * r5c_handle_stripe_dirtying and ops_run_biodrain, but
+		 * r5c_cache_data didn't finish because the journal device
+		 * didn't have enough space. This time we should continue
+		 * r5c_cache_data.
+		 */
+		s.to_cache = s.want_cache;
+		goto finish;
+	}
+
 	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
 		goto finish;
 
@@ -4348,7 +4438,7 @@ static void handle_stripe(struct stripe_head *sh)
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&
 				(i == sh->pd_idx || i == sh->qd_idx ||
-				 dev->written)) {
+				 dev->written || test_bit(R5_InCache, &dev->flags))) {
 				pr_debug("Writing block %d\n", i);
 				set_bit(R5_Wantwrite, &dev->flags);
 				if (prexor)
@@ -4388,6 +4478,10 @@ static void handle_stripe(struct stripe_head *sh)
 				 test_bit(R5_Discard, &qdev->flags))))))
 		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
+	if (s.just_cached)
+		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+	r5l_stripe_write_finished(sh);
+
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
@@ -6526,6 +6620,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
 		INIT_LIST_HEAD(conf->temp_inactive_list + i);
 
+	atomic_set(&conf->r5c_cached_full_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
+	atomic_set(&conf->r5c_cached_partial_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+
 	conf->level = mddev->new_level;
 	conf->chunk_sectors = mddev->new_chunk_sectors;
 	if (raid5_alloc_percpu(conf) != 0)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 8bae64b..ac6d7c7 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -226,6 +226,7 @@ struct stripe_head {
 
 	struct r5l_io_unit	*log_io;
 	struct list_head	log_list;
+	atomic_t		dev_in_cache;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -263,6 +264,7 @@ struct stripe_head_state {
 	 */
 	int syncing, expanding, expanded, replacing;
 	int locked, uptodate, to_read, to_write, failed, written;
+	int to_cache, want_cache, just_cached;
 	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num[2];
 	int p_failed, q_failed;
@@ -313,6 +315,8 @@ enum r5dev_flags {
 			 */
 	R5_Discard,	/* Discard the stripe */
 	R5_SkipCopy,	/* Don't copy data from bio to stripe cache */
+	R5_Wantcache,	/* Want to write data to the write cache */
+	R5_InCache,	/* Data in cache */
 };
 
 /*
@@ -348,6 +352,12 @@ enum {
 	STRIPE_LOG_TRAPPED,	/* trapped into log */
 	STRIPE_R5C_FROZEN,	/* r5c_cache frozen and being written out */
 	STRIPE_R5C_WRITTEN,	/* ready for r5c_handle_stripe_written() */
+	STRIPE_R5C_PARTIAL_STRIPE,	/* in r5c cache (to-be/being handled or
+					 * in conf->r5c_partial_stripe_list)
+					 */
+	STRIPE_R5C_FULL_STRIPE,	/* in r5c cache (to-be/being handled or
+				 * in conf->r5c_full_stripe_list)
+				 */
 };
 
 #define STRIPE_EXPAND_SYNC_FLAGS \
@@ -600,6 +610,12 @@ struct r5conf {
 	 */
 	atomic_t		active_stripes;
 	struct list_head	inactive_list[NR_STRIPE_HASH_LOCKS];
+
+	atomic_t		r5c_cached_full_stripes;
+	struct list_head	r5c_full_stripe_list;
+	atomic_t		r5c_cached_partial_stripes;
+	struct list_head	r5c_partial_stripe_list;
+
 	atomic_t		empty_inactive_list_nr;
 	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_quiescent;
@@ -720,4 +736,10 @@ r5c_handle_stripe_dirtying(struct r5conf *conf, struct stripe_head *sh,
 			   struct stripe_head_state *s, int disks);
 extern void
 r5c_handle_stripe_written(struct r5conf *conf, struct stripe_head *sh);
+extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+extern void r5c_handle_cached_data_endio(struct r5conf *conf,
+	struct stripe_head *sh, int disks, struct bio_list *return_bi);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+			  struct stripe_head_state *s);
+extern void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh);
 #endif
-- 
2.9.3


^ permalink raw reply related


This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.