Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH] imsm: retrieve nvme serial from sysfs
From: Artur Paszkiewicz @ 2016-10-06  9:13 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz

Don't rely on SCSI ioctl for reading NVMe serials - SCSI emulation for
NVMe devices can be disabled in the kernel config. Instead, try to get a
serial from /sys/block/nvme*/device/serial. If that fails for whatever
reason (i.e. no such attribute in old kernels) - fall back to the SCSI
method.

This also moves some SCSI-specific code from imsm_read_serial() to
scsi_get_serial().

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Reviewed-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
Reviewed-by: Alexey Obitotskiy <aleksey.obitotskiy@intel.com>
---
 sg_io.c       | 23 +++++++++++++++++++----
 super-intel.c | 46 +++++++++++++++++++++++++++-------------------
 2 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/sg_io.c b/sg_io.c
index 50ad180..42c91e1 100644
--- a/sg_io.c
+++ b/sg_io.c
@@ -23,20 +23,35 @@
 
 int scsi_get_serial(int fd, void *buf, size_t buf_len)
 {
-	unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, buf_len, 0};
+	unsigned char rsp_buf[255];
+	unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, sizeof(rsp_buf), 0};
 	unsigned char sense[32];
 	struct sg_io_hdr io_hdr;
+	int rv;
+	unsigned int rsp_len;
 
 	memset(&io_hdr, 0, sizeof(io_hdr));
 	io_hdr.interface_id = 'S';
 	io_hdr.cmdp = inq_cmd;
 	io_hdr.cmd_len = sizeof(inq_cmd);
-	io_hdr.dxferp = buf;
-	io_hdr.dxfer_len = buf_len;
+	io_hdr.dxferp = rsp_buf;
+	io_hdr.dxfer_len = sizeof(rsp_buf);
 	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
 	io_hdr.sbp = sense;
 	io_hdr.mx_sb_len = sizeof(sense);
 	io_hdr.timeout = 5000;
 
-	return ioctl(fd, SG_IO, &io_hdr);
+	rv = ioctl(fd, SG_IO, &io_hdr);
+
+	if (rv)
+		return rv;
+
+	rsp_len = rsp_buf[3];
+
+	if (!rsp_len || buf_len < rsp_len)
+		return -1;
+
+	memcpy(buf, &rsp_buf[4], rsp_len);
+
+	return 0;
 }
diff --git a/super-intel.c b/super-intel.c
index cfc8904..1304737 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -3331,23 +3331,40 @@ static void fd2devname(int fd, char *name)
 	}
 }
 
+static int nvme_get_serial(int fd, void *buf, size_t buf_len)
+{
+	char path[60];
+	char *name = fd2kname(fd);
+
+	if (!name)
+		return 1;
+
+	if (strncmp(name, "nvme", 4) != 0)
+		return 1;
+
+	snprintf(path, sizeof(path) - 1, "/sys/block/%s/device/serial", name);
+
+	return load_sys(path, buf, buf_len);
+}
+
 extern int scsi_get_serial(int fd, void *buf, size_t buf_len);
 
 static int imsm_read_serial(int fd, char *devname,
 			    __u8 serial[MAX_RAID_SERIAL_LEN])
 {
-	unsigned char scsi_serial[255];
+	char buf[50];
 	int rv;
-	int rsp_len;
 	int len;
 	char *dest;
 	char *src;
-	char *rsp_buf;
-	int i;
+	unsigned int i;
+
+	memset(buf, 0, sizeof(buf));
 
-	memset(scsi_serial, 0, sizeof(scsi_serial));
+	rv = nvme_get_serial(fd, buf, sizeof(buf));
 
-	rv = scsi_get_serial(fd, scsi_serial, sizeof(scsi_serial));
+	if (rv)
+		rv = scsi_get_serial(fd, buf, sizeof(buf));
 
 	if (rv && check_env("IMSM_DEVNAME_AS_SERIAL")) {
 		memset(serial, 0, MAX_RAID_SERIAL_LEN);
@@ -3362,20 +3379,11 @@ static int imsm_read_serial(int fd, char *devname,
 		return rv;
 	}
 
-	rsp_len = scsi_serial[3];
-	if (!rsp_len) {
-		if (devname)
-			pr_err("Failed to retrieve serial for %s\n",
-			       devname);
-		return 2;
-	}
-	rsp_buf = (char *) &scsi_serial[4];
-
 	/* trim all whitespace and non-printable characters and convert
 	 * ':' to ';'
 	 */
-	for (i = 0, dest = rsp_buf; i < rsp_len; i++) {
-		src = &rsp_buf[i];
+	for (i = 0, dest = buf; i < sizeof(buf) && buf[i]; i++) {
+		src = &buf[i];
 		if (*src > 0x20) {
 			/* ':' is reserved for use in placeholder serial
 			 * numbers for missing disks
@@ -3386,8 +3394,8 @@ static int imsm_read_serial(int fd, char *devname,
 				*dest++ = *src;
 		}
 	}
-	len = dest - rsp_buf;
-	dest = rsp_buf;
+	len = dest - buf;
+	dest = buf;
 
 	/* truncate leading characters */
 	if (len > MAX_RAID_SERIAL_LEN) {
-- 
2.10.0


^ permalink raw reply related

* [PATCH 16/54] md/raid1: Delete an error message for a failed memory allocation
From: SF Markus Elfring @ 2016-10-06  9:12 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall, Wolfram Sang
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 18:15:20 +0200

Omit an extra message for a memory allocation failure in this function.

Link: http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid1.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7dc45ba..15b9652 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2882,9 +2882,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
 	conf->thread = md_register_thread(raid1d, mddev, "raid1");
 	if (!conf->thread) {
-		printk(KERN_ERR
-		       "md/raid1:%s: couldn't allocate thread\n",
-		       mdname(mddev));
 		err = -ENOMEM;
 		goto destroy_pool;
 	}
-- 
2.10.1


^ permalink raw reply related

* [PATCH 15/54] md/raid1: Less function calls in setup_conf() after error detection
From: SF Markus Elfring @ 2016-10-06  9:11 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 18:01:02 +0200

Resource release functions were called in up to four cases
by the setup_conf() function during error handling even if
the passed data structure members contained a null pointer.

Adjust jump targets according to the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid1.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4d2fcbc..7dc45ba 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2786,19 +2786,19 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 				GFP_KERNEL);
 	if (!conf->mirrors) {
 		err = -ENOMEM;
-		goto abort;
+		goto free_conf;
 	}
 
 	conf->tmppage = alloc_page(GFP_KERNEL);
 	if (!conf->tmppage) {
 		err = -ENOMEM;
-		goto abort;
+		goto free_mirrors;
 	}
 
 	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
 	if (!conf->poolinfo) {
 		err = -ENOMEM;
-		goto abort;
+		goto put_page;
 	}
 
 	conf->poolinfo->raid_disks = mddev->raid_disks * 2;
@@ -2807,7 +2807,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 					  conf->poolinfo);
 	if (!conf->r1bio_pool) {
 		err = -ENOMEM;
-		goto abort;
+		goto free_poolinfo;
 	}
 
 	conf->poolinfo->mddev = mddev;
@@ -2825,7 +2825,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
 		if (disk->rdev) {
 			err = -EINVAL;
-			goto abort;
+			goto destroy_pool;
 		}
 
 		disk->rdev = rdev;
@@ -2866,7 +2866,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 				if (!test_bit(In_sync, &disk->rdev->flags)) {
 					/* Original is not in_sync - bad */
 					err = -EIO;
-					goto abort;
+					goto destroy_pool;
 				}
 			}
 		}
@@ -2886,16 +2886,19 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		       "md/raid1:%s: couldn't allocate thread\n",
 		       mdname(mddev));
 		err = -ENOMEM;
-		goto abort;
+		goto destroy_pool;
 	}
 
 	return conf;
-
- abort:
+destroy_pool:
 	mempool_destroy(conf->r1bio_pool);
+free_poolinfo:
 	kfree(conf->poolinfo);
+put_page:
 	safe_put_page(conf->tmppage);
+free_mirrors:
 	kfree(conf->mirrors);
+free_conf:
 	kfree(conf);
 	return ERR_PTR(err);
 }
-- 
2.10.1

^ permalink raw reply related

* [PATCH 14/54] md/raid1: Move assignments for the variable "err" in setup_conf()
From: SF Markus Elfring @ 2016-10-06  9:10 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 17:34:10 +0200

One local variable was set to an error code in a few cases before
a concrete error situation was detected. Thus move the corresponding
assignments into if branches to indicate a software failure there.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid1.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 74346f5..4d2fcbc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2775,7 +2775,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	int i;
 	struct raid1_info *disk;
 	struct md_rdev *rdev;
-	int err = -ENOMEM;
+	int err;
 
 	conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
 	if (!conf)
@@ -2784,26 +2784,33 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->mirrors = kcalloc(mddev->raid_disks * 2,
 				sizeof(*conf->mirrors),
 				GFP_KERNEL);
-	if (!conf->mirrors)
+	if (!conf->mirrors) {
+		err = -ENOMEM;
 		goto abort;
+	}
 
 	conf->tmppage = alloc_page(GFP_KERNEL);
-	if (!conf->tmppage)
+	if (!conf->tmppage) {
+		err = -ENOMEM;
 		goto abort;
+	}
 
 	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
-	if (!conf->poolinfo)
+	if (!conf->poolinfo) {
+		err = -ENOMEM;
 		goto abort;
+	}
+
 	conf->poolinfo->raid_disks = mddev->raid_disks * 2;
 	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
 					  r1bio_pool_free,
 					  conf->poolinfo);
-	if (!conf->r1bio_pool)
+	if (!conf->r1bio_pool) {
+		err = -ENOMEM;
 		goto abort;
+	}
 
 	conf->poolinfo->mddev = mddev;
-
-	err = -EINVAL;
 	spin_lock_init(&conf->device_lock);
 	rdev_for_each(rdev, mddev) {
 		struct request_queue *q;
@@ -2816,8 +2823,11 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		else
 			disk = conf->mirrors + disk_idx;
 
-		if (disk->rdev)
+		if (disk->rdev) {
+			err = -EINVAL;
 			goto abort;
+		}
+
 		disk->rdev = rdev;
 		q = bdev_get_queue(rdev->bdev);
 
@@ -2838,8 +2848,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
 	conf->start_next_window = MaxSector;
 	conf->current_window_requests = conf->next_window_requests = 0;
-
-	err = -EIO;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
 		disk = conf->mirrors + i;
@@ -2854,9 +2862,13 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 				disk->rdev =
 					disk[conf->raid_disks].rdev;
 				disk[conf->raid_disks].rdev = NULL;
-			} else if (!test_bit(In_sync, &disk->rdev->flags))
-				/* Original is not in_sync - bad */
-				goto abort;
+			} else {
+				if (!test_bit(In_sync, &disk->rdev->flags)) {
+					/* Original is not in_sync - bad */
+					err = -EIO;
+					goto abort;
+				}
+			}
 		}
 
 		if (!disk->rdev ||
@@ -2868,12 +2880,12 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		}
 	}
 
-	err = -ENOMEM;
 	conf->thread = md_register_thread(raid1d, mddev, "raid1");
 	if (!conf->thread) {
 		printk(KERN_ERR
 		       "md/raid1:%s: couldn't allocate thread\n",
 		       mdname(mddev));
+		err = -ENOMEM;
 		goto abort;
 	}
 
-- 
2.10.1

^ permalink raw reply related

* [PATCH 13/54] md/raid1: Return directly after a failed kzalloc() in setup_conf()
From: SF Markus Elfring @ 2016-10-06  9:09 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 15:56:41 +0200

* Return directly after a call of the function "kzalloc" failed
  at the beginning.

* Delete a repeated check for the local variable "conf"
  which became unnecessary with this refactoring.

* Reorder two calls for the function "kfree" at the end.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid1.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5969711..74346f5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2779,7 +2779,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
 	conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
 	if (!conf)
-		goto abort;
+		return ERR_PTR(-ENOMEM);
 
 	conf->mirrors = kcalloc(mddev->raid_disks * 2,
 				sizeof(*conf->mirrors),
@@ -2880,13 +2880,11 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	return conf;
 
  abort:
-	if (conf) {
-		mempool_destroy(conf->r1bio_pool);
-		kfree(conf->mirrors);
-		safe_put_page(conf->tmppage);
-		kfree(conf->poolinfo);
-		kfree(conf);
-	}
+	mempool_destroy(conf->r1bio_pool);
+	kfree(conf->poolinfo);
+	safe_put_page(conf->tmppage);
+	kfree(conf->mirrors);
+	kfree(conf);
 	return ERR_PTR(err);
 }
 
-- 
2.10.1

^ permalink raw reply related

* [PATCH 12/54] md/raid1: Use kcalloc() in setup_conf()
From: SF Markus Elfring @ 2016-10-06  9:07 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 15:30:28 +0200

* A multiplication for the size determination of a memory allocation
  indicated that an array data structure should be processed.
  Thus use the corresponding function "kcalloc".

* Replace the specification of a data structure by a pointer dereference
  to make the corresponding size determination a bit safer according to
  the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid1.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e75ae87..5969711 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2781,9 +2781,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
-	conf->mirrors = kzalloc(sizeof(struct raid1_info)
-				* mddev->raid_disks * 2,
-				 GFP_KERNEL);
+	conf->mirrors = kcalloc(mddev->raid_disks * 2,
+				sizeof(*conf->mirrors),
+				GFP_KERNEL);
 	if (!conf->mirrors)
 		goto abort;
 
-- 
2.10.1

^ permalink raw reply related

* [PATCH 11/54] md/raid1: Use kcalloc() in raid1_reshape()
From: SF Markus Elfring @ 2016-10-06  9:06 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 15:17:13 +0200

* A multiplication for the size determination of a memory allocation
  indicated that an array data structure should be processed.
  Thus use the corresponding function "kcalloc".

* Replace the specification of a data structure by a pointer dereference
  to make the corresponding size determination a bit safer according to
  the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d2759f8..e75ae87 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3081,8 +3081,8 @@ static int raid1_reshape(struct mddev *mddev)
 		kfree(newpoolinfo);
 		return -ENOMEM;
 	}
-	newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
-			     GFP_KERNEL);
+
+	newmirrors = kcalloc(raid_disks * 2, sizeof(*newmirrors), GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
 		mempool_destroy(newpool);
-- 
2.10.1


^ permalink raw reply related

* [PATCH 10/54] md/raid1: Use kcalloc() in alloc_behind_pages()
From: SF Markus Elfring @ 2016-10-06  9:05 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 15:01:04 +0200

* A multiplication for the size determination of a memory allocation
  indicated that an array data structure should be processed.
  Thus use the corresponding function "kcalloc".

  This issue was detected by using the Coccinelle software.

* Replace the specification of a data structure by a pointer dereference
  to make the corresponding size determination a bit safer according to
  the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid1.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1961d82..d2759f8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -955,8 +955,7 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
-					GFP_NOIO);
+	struct bio_vec *bvecs = kcalloc(bio->bi_vcnt, sizeof(*bvecs), GFP_NOIO);
 	if (unlikely(!bvecs))
 		return;
 
-- 
2.10.1

^ permalink raw reply related

* [PATCH 09/54] md/raid0: Add some spaces for better code readability
From: SF Markus Elfring @ 2016-10-06  9:03 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 14:48:34 +0200

Use space characters at some source code places according to
the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid0.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 1abe1ee..39d407c 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -55,7 +55,7 @@ static void dump_zones(struct mddev *mddev)
 	int raid_disks = conf->strip_zone[0].nb_dev;
 	pr_info("configuration for %s - %d zone%s\n",
 	       mdname(mddev),
-	       conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
+	       conf->nr_strip_zones, conf->nr_strip_zones == 1 ? "" : "s");
 	for (j = 0; j < conf->nr_strip_zones; j++) {
 		pr_info("zone%d=[", j);
 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
@@ -106,9 +106,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		rdev_for_each(rdev2, mddev) {
 			pr_debug("%s:   comparing %s(%llu) with %s(%llu)\n",
 				 mdname(mddev),
-				 bdevname(rdev1->bdev,b),
+				 bdevname(rdev1->bdev, b),
 				 (unsigned long long)rdev1->sectors,
-				 bdevname(rdev2->bdev,b2),
+				 bdevname(rdev2->bdev, b2),
 				 (unsigned long long)rdev2->sectors);
 			if (rdev2 == rdev1) {
 				pr_debug("%s:   END\n",
@@ -232,7 +232,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		smallest = NULL;
 		c = 0;
 
-		for (j=0; j<cnt; j++) {
+		for (j = 0; j < cnt; j++) {
 			rdev = conf->devlist[j];
 			if (rdev->sectors <= zone->dev_start) {
 				pr_debug("%s: checking %s ... nope\n",
@@ -418,8 +418,8 @@ static int raid0_run(struct mddev *mddev)
 		 */
 		int stripe = mddev->raid_disks *
 			(mddev->chunk_sectors << 9) / PAGE_SIZE;
-		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
-			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
+		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 	}
 
 	dump_zones(mddev);
-- 
2.10.1


^ permalink raw reply related

* [PATCH 08/54] md/raid0: Delete an unnecessary return statement in raid0_status()
From: SF Markus Elfring @ 2016-10-06  9:01 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 14:34:33 +0200

The script "checkpatch.pl" pointed information out like the following.

WARNING: void function return statements are not generally useful

Thus remove such a statement here.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid0.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 9b76eae..1abe1ee 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -503,7 +503,6 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 static void raid0_status(struct seq_file *seq, struct mddev *mddev)
 {
 	seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
-	return;
 }
 
 static void *raid0_takeover_raid45(struct mddev *mddev)
-- 
2.10.1


^ permalink raw reply related

* [PATCH 07/54] md/raid0: Move two misplaced braces
From: SF Markus Elfring @ 2016-10-06  9:00 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 14:25:01 +0200

The script "checkpatch.pl" pointed information out like the following.

ERROR: that open brace { should be on the previous line

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid0.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 06cb172..9b76eae 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -222,8 +222,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	curr_zone_end = zone->zone_end;
 
 	/* now do the other zones */
-	for (i = 1; i < conf->nr_strip_zones; i++)
-	{
+	for (i = 1; i < conf->nr_strip_zones; i++) {
 		int j;
 
 		zone = conf->strip_zone + i;
@@ -665,8 +664,7 @@ static void raid0_quiesce(struct mddev *mddev, int state)
 {
 }
 
-static struct md_personality raid0_personality=
-{
+static struct md_personality raid0_personality = {
 	.name		= "raid0",
 	.level		= 0,
 	.owner		= THIS_MODULE,
-- 
2.10.1


^ permalink raw reply related

* [PATCH 06/54] md/raid0: Delete four unwanted spaces behind function names
From: SF Markus Elfring @ 2016-10-06  8:59 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 14:21:21 +0200

The script "checkpatch.pl" pointed information out like the following.

WARNING: space prohibited between function name and open parenthesis '('

Thus fix the affected source code places.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid0.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 0315f1e..06cb172 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -680,14 +680,14 @@ static struct md_personality raid0_personality=
 	.congested	= raid0_congested,
 };
 
-static int __init raid0_init (void)
+static int __init raid0_init(void)
 {
-	return register_md_personality (&raid0_personality);
+	return register_md_personality(&raid0_personality);
 }
 
-static void raid0_exit (void)
+static void raid0_exit(void)
 {
-	unregister_md_personality (&raid0_personality);
+	unregister_md_personality(&raid0_personality);
 }
 
 module_init(raid0_init);
-- 
2.10.1


^ permalink raw reply related

* [PATCH 05/54] md/raid0: Move another variable assignment in create_strip_zones()
From: SF Markus Elfring @ 2016-10-06  8:57 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 13:54:36 +0200

One local variable was set to an error code before a concrete
error situation was detected. Thus move the corresponding assignment
to the end to indicate a software failure there.
Use it finally in four if branches for exception handling.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid0.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 3079c3e..0315f1e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -173,7 +173,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	cnt = 0;
 	smallest = NULL;
 	dev = conf->devlist;
-	err = -EINVAL;
 	rdev_for_each(rdev1, mddev) {
 		int j = rdev1->raid_disk;
 
@@ -194,17 +193,17 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		if (j < 0) {
 			pr_err("%s: remove inactive devices before converting to RAID0\n",
 			       mdname(mddev));
-			goto free_device_list;
+			goto e_inval;
 		}
 		if (j >= mddev->raid_disks) {
 			pr_err("%s: bad disk number %d%s",
 			       mdname(mddev), j, " - aborting!\n");
-			goto free_device_list;
+			goto e_inval;
 		}
 		if (dev[j]) {
 			pr_err("%s: multiple devices for %d%s",
 			       mdname(mddev), j, " - aborting!\n");
-			goto free_device_list;
+			goto e_inval;
 		}
 		dev[j] = rdev1;
 
@@ -215,7 +214,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	if (cnt != mddev->raid_disks) {
 		pr_err("%s: too few disks (%d of %d)%s",
 		       mdname(mddev), cnt, mddev->raid_disks, " - aborting!\n");
-		goto free_device_list;
+		goto e_inval;
 	}
 	zone->nb_dev = cnt;
 	zone->zone_end = smallest->sectors * cnt;
@@ -280,6 +279,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	kfree(conf);
 	*private_conf = ERR_PTR(err);
 	return err;
+e_inval:
+	err = -EINVAL;
+	goto free_device_list;
 }
 
 /* Find the zone which holds a particular offset
-- 
2.10.1


^ permalink raw reply related

* [PATCH 04/54] md/raid0: Replace printk() calls by the usage of higher level interfaces
From: SF Markus Elfring @ 2016-10-06  8:56 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 13:30:39 +0200

1. Add a definition for the macros "MY_LOG_PREFIX" and "pr_fmt"
   so that their information can be used for consistent message output.

2. Prefer usage of some higher level macros over calling "printk" directly
   in this software module.

3. Remove prefixes from strings which were passed to some function
   and macro calls.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid0.c | 100 +++++++++++++++++++++++++----------------------------
 1 file changed, 47 insertions(+), 53 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 71bd398..3079c3e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -17,6 +17,8 @@
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
+#define MY_LOG_PREFIX KBUILD_MODNAME ": "
+#define pr_fmt(fmt) MY_LOG_PREFIX fmt
 #include <linux/blkdev.h>
 #include <linux/seq_file.h>
 #include <linux/module.h>
@@ -51,20 +53,20 @@ static void dump_zones(struct mddev *mddev)
 	char b[BDEVNAME_SIZE];
 	struct r0conf *conf = mddev->private;
 	int raid_disks = conf->strip_zone[0].nb_dev;
-	printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n",
+	pr_info("configuration for %s - %d zone%s\n",
 	       mdname(mddev),
 	       conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
 	for (j = 0; j < conf->nr_strip_zones; j++) {
-		printk(KERN_INFO "md: zone%d=[", j);
+		pr_info("zone%d=[", j);
 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
-			printk(KERN_CONT "%s%s", k?"/":"",
-			bdevname(conf->devlist[j*raid_disks
-						+ k]->bdev, b));
-		printk(KERN_CONT "]\n");
-
+			pr_cont("%s%s",
+				k ? "/" : "",
+				bdevname(conf->devlist[j * raid_disks + k]
+					 ->bdev,
+					 b));
+		pr_cont("]\n");
 		zone_size  = conf->strip_zone[j].zone_end - zone_start;
-		printk(KERN_INFO "      zone-offset=%10lluKB, "
-				"device-offset=%10lluKB, size=%10lluKB\n",
+		pr_info("      zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n",
 			(unsigned long long)zone_start>>1,
 			(unsigned long long)conf->strip_zone[j].dev_start>>1,
 			(unsigned long long)zone_size>>1);
@@ -88,7 +90,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	if (!conf)
 		return -ENOMEM;
 	rdev_for_each(rdev1, mddev) {
-		pr_debug("md/raid0:%s: looking at %s\n",
+		pr_debug("%s: looking at %s\n",
 			 mdname(mddev),
 			 bdevname(rdev1->bdev, b));
 		c = 0;
@@ -102,15 +104,14 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 				      rdev1->bdev->bd_disk->queue));
 
 		rdev_for_each(rdev2, mddev) {
-			pr_debug("md/raid0:%s:   comparing %s(%llu)"
-				 " with %s(%llu)\n",
+			pr_debug("%s:   comparing %s(%llu) with %s(%llu)\n",
 				 mdname(mddev),
 				 bdevname(rdev1->bdev,b),
 				 (unsigned long long)rdev1->sectors,
 				 bdevname(rdev2->bdev,b2),
 				 (unsigned long long)rdev2->sectors);
 			if (rdev2 == rdev1) {
-				pr_debug("md/raid0:%s:   END\n",
+				pr_debug("%s:   END\n",
 					 mdname(mddev));
 				break;
 			}
@@ -119,30 +120,30 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 				 * Not unique, don't count it as a new
 				 * group
 				 */
-				pr_debug("md/raid0:%s:   EQUAL\n",
+				pr_debug("%s:   EQUAL\n",
 					 mdname(mddev));
 				c = 1;
 				break;
 			}
-			pr_debug("md/raid0:%s:   NOT EQUAL\n",
+			pr_debug("%s:   NOT EQUAL\n",
 				 mdname(mddev));
 		}
 		if (!c) {
-			pr_debug("md/raid0:%s:   ==> UNIQUE\n",
+			pr_debug("%s:   ==> UNIQUE\n",
 				 mdname(mddev));
 			conf->nr_strip_zones++;
-			pr_debug("md/raid0:%s: %d zones\n",
+			pr_debug("%s: %d zones\n",
 				 mdname(mddev), conf->nr_strip_zones);
 		}
 	}
-	pr_debug("md/raid0:%s: FINAL %d zones\n",
+	pr_debug("%s: FINAL %d zones\n",
 		 mdname(mddev), conf->nr_strip_zones);
 	/*
 	 * now since we have the hard sector sizes, we can make sure
 	 * chunk size is a multiple of that sector size
 	 */
 	if ((mddev->chunk_sectors << 9) % blksize) {
-		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
+		pr_err("%s: chunk_size of %d not multiple of block size %d\n",
 		       mdname(mddev),
 		       mddev->chunk_sectors << 9, blksize);
 		err = -EINVAL;
@@ -191,19 +192,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		}
 
 		if (j < 0) {
-			printk(KERN_ERR
-			       "md/raid0:%s: remove inactive devices before converting to RAID0\n",
+			pr_err("%s: remove inactive devices before converting to RAID0\n",
 			       mdname(mddev));
 			goto free_device_list;
 		}
 		if (j >= mddev->raid_disks) {
-			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
-			       "aborting!\n", mdname(mddev), j);
+			pr_err("%s: bad disk number %d%s",
+			       mdname(mddev), j, " - aborting!\n");
 			goto free_device_list;
 		}
 		if (dev[j]) {
-			printk(KERN_ERR "md/raid0:%s: multiple devices for %d - "
-			       "aborting!\n", mdname(mddev), j);
+			pr_err("%s: multiple devices for %d%s",
+			       mdname(mddev), j, " - aborting!\n");
 			goto free_device_list;
 		}
 		dev[j] = rdev1;
@@ -213,8 +213,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		cnt++;
 	}
 	if (cnt != mddev->raid_disks) {
-		printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
-		       "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);
+		pr_err("%s: too few disks (%d of %d)%s",
+		       mdname(mddev), cnt, mddev->raid_disks, " - aborting!\n");
 		goto free_device_list;
 	}
 	zone->nb_dev = cnt;
@@ -229,8 +229,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 
 		zone = conf->strip_zone + i;
 		dev = conf->devlist + i * mddev->raid_disks;
-
-		pr_debug("md/raid0:%s: zone %d\n", mdname(mddev), i);
+		pr_debug("%s: zone %d\n", mdname(mddev), i);
 		zone->dev_start = smallest->sectors;
 		smallest = NULL;
 		c = 0;
@@ -238,20 +237,19 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		for (j=0; j<cnt; j++) {
 			rdev = conf->devlist[j];
 			if (rdev->sectors <= zone->dev_start) {
-				pr_debug("md/raid0:%s: checking %s ... nope\n",
+				pr_debug("%s: checking %s ... nope\n",
 					 mdname(mddev),
 					 bdevname(rdev->bdev, b));
 				continue;
 			}
-			pr_debug("md/raid0:%s: checking %s ..."
-				 " contained as device %d\n",
+			pr_debug("%s: checking %s ... contained as device %d\n",
 				 mdname(mddev),
 				 bdevname(rdev->bdev, b), c);
 			dev[c] = rdev;
 			c++;
 			if (!smallest || rdev->sectors < smallest->sectors) {
 				smallest = rdev;
-				pr_debug("md/raid0:%s:  (%llu) is smallest!.\n",
+				pr_debug("%s: (%llu) is smallest!.\n",
 					 mdname(mddev),
 					 (unsigned long long)rdev->sectors);
 			}
@@ -259,19 +257,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 
 		zone->nb_dev = c;
 		sectors = (smallest->sectors - zone->dev_start) * c;
-		pr_debug("md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n",
+		pr_debug("%s: zone->nb_dev: %d, sectors: %llu\n",
 			 mdname(mddev),
 			 zone->nb_dev, (unsigned long long)sectors);
 
 		curr_zone_end += sectors;
 		zone->zone_end = curr_zone_end;
-
-		pr_debug("md/raid0:%s: current zone start: %llu\n",
+		pr_debug("%s: current zone start: %llu\n",
 			 mdname(mddev),
 			 (unsigned long long)smallest->sectors);
 	}
 
-	pr_debug("md/raid0:%s: done.\n", mdname(mddev));
+	pr_debug("%s: done.\n", mdname(mddev));
 	*private_conf = conf;
 
 	return 0;
@@ -364,8 +361,7 @@ static int raid0_run(struct mddev *mddev)
 	int ret;
 
 	if (mddev->chunk_sectors == 0) {
-		printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n",
-		       mdname(mddev));
+		pr_err("%s: chunk size must be set.\n", mdname(mddev));
 		return -EINVAL;
 	}
 	if (md_check_no_bitmap(mddev))
@@ -405,10 +401,9 @@ static int raid0_run(struct mddev *mddev)
 
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
-
-	printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n",
-	       mdname(mddev),
-	       (unsigned long long)mddev->array_sectors);
+	pr_info("%s: md_size is %llu sectors.\n",
+		mdname(mddev),
+		(unsigned long long)mddev->array_sectors);
 
 	if (mddev->queue) {
 		/* calculate the max read-ahead size.
@@ -516,7 +511,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
 	struct r0conf *priv_conf;
 
 	if (mddev->degraded != 1) {
-		printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
+		pr_err("%s: raid5 must be degraded! Degraded disks: %d\n",
 		       mdname(mddev),
 		       mddev->degraded);
 		return ERR_PTR(-EINVAL);
@@ -525,7 +520,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
 	rdev_for_each(rdev, mddev) {
 		/* check slot number for a disk */
 		if (rdev->raid_disk == mddev->raid_disks-1) {
-			printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
+			pr_err("%s: raid5 must have missing parity disk!\n",
 			       mdname(mddev));
 			return ERR_PTR(-EINVAL);
 		}
@@ -556,18 +551,18 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
 	 *  - all mirrors must be already degraded
 	 */
 	if (mddev->layout != ((1 << 8) + 2)) {
-		printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
+		pr_err("%s: Raid0 cannot takeover layout: 0x%x\n",
 		       mdname(mddev),
 		       mddev->layout);
 		return ERR_PTR(-EINVAL);
 	}
 	if (mddev->raid_disks & 1) {
-		printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
+		pr_err("%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
 		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 	if (mddev->degraded != (mddev->raid_disks>>1)) {
-		printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n",
+		pr_err("%s: All mirrors must be already degraded!\n",
 		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
@@ -595,7 +590,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
 	 *  - (N - 1) mirror drives must be already faulty
 	 */
 	if ((mddev->raid_disks - 1) != mddev->degraded) {
-		printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
+		pr_err("%s: (N - 1) mirrors drives must be already faulty!\n",
 		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
@@ -638,7 +633,7 @@ static void *raid0_takeover(struct mddev *mddev)
 	 */
 
 	if (mddev->bitmap) {
-		printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n",
+		pr_err("%s: cannot takeover array with bitmap\n",
 		       mdname(mddev));
 		return ERR_PTR(-EBUSY);
 	}
@@ -649,7 +644,7 @@ static void *raid0_takeover(struct mddev *mddev)
 		if (mddev->layout == ALGORITHM_PARITY_N)
 			return raid0_takeover_raid45(mddev);
 
-		printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
+		pr_err("%s: Raid can only takeover Raid5 with layout: %d\n",
 		       mdname(mddev), ALGORITHM_PARITY_N);
 	}
 
@@ -659,9 +654,8 @@ static void *raid0_takeover(struct mddev *mddev)
 	if (mddev->level == 1)
 		return raid0_takeover_raid1(mddev);
 
-	printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n",
+	pr_err("Takeover from raid%i to raid0 not supported\n",
 		mddev->level);
-
 	return ERR_PTR(-EINVAL);
 }
 
-- 
2.10.1

^ permalink raw reply related

* [PATCH 03/54] md/raid0: Move a variable assignment in create_strip_zones()
From: SF Markus Elfring @ 2016-10-06  8:54 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 12:02:29 +0200

One local variable was set to an error code before a concrete
error situation was detected. Thus move the corresponding assignment into
two if branches to indicate a memory allocation failure there.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid0.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 3c76451..71bd398 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -149,17 +149,21 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		goto free_conf;
 	}
 
-	err = -ENOMEM;
 	conf->strip_zone = kcalloc(conf->nr_strip_zones,
 				   sizeof(*conf->strip_zone),
 				   GFP_KERNEL);
-	if (!conf->strip_zone)
+	if (!conf->strip_zone) {
+		err = -ENOMEM;
 		goto free_conf;
+	}
+
 	conf->devlist = kcalloc(conf->nr_strip_zones * mddev->raid_disks,
 				sizeof(*conf->devlist),
 				GFP_KERNEL);
-	if (!conf->devlist)
+	if (!conf->devlist) {
+		err = -ENOMEM;
 		goto free_zone;
+	}
 
 	/* The first zone must contain all devices, so here we check that
 	 * there is a proper alignment of slots to devices and find them all
-- 
2.10.1

^ permalink raw reply related

* [PATCH 02/54] md/raid0: Less function calls in create_strip_zones() after error detection
From: SF Markus Elfring @ 2016-10-06  8:53 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 10:43:33 +0200

The kfree() function was called in up to two cases
by the create_strip_zones() function during error handling even if
the passed data structure member (or variable) contained a null pointer.

Adjust jump targets according to the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid0.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 50e8a63..3c76451 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -146,7 +146,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		       mdname(mddev),
 		       mddev->chunk_sectors << 9, blksize);
 		err = -EINVAL;
-		goto abort;
+		goto free_conf;
 	}
 
 	err = -ENOMEM;
@@ -154,12 +154,12 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 				   sizeof(*conf->strip_zone),
 				   GFP_KERNEL);
 	if (!conf->strip_zone)
-		goto abort;
+		goto free_conf;
 	conf->devlist = kcalloc(conf->nr_strip_zones * mddev->raid_disks,
 				sizeof(*conf->devlist),
 				GFP_KERNEL);
 	if (!conf->devlist)
-		goto abort;
+		goto free_zone;
 
 	/* The first zone must contain all devices, so here we check that
 	 * there is a proper alignment of slots to devices and find them all
@@ -190,17 +190,17 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 			printk(KERN_ERR
 			       "md/raid0:%s: remove inactive devices before converting to RAID0\n",
 			       mdname(mddev));
-			goto abort;
+			goto free_device_list;
 		}
 		if (j >= mddev->raid_disks) {
 			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
 			       "aborting!\n", mdname(mddev), j);
-			goto abort;
+			goto free_device_list;
 		}
 		if (dev[j]) {
 			printk(KERN_ERR "md/raid0:%s: multiple devices for %d - "
 			       "aborting!\n", mdname(mddev), j);
-			goto abort;
+			goto free_device_list;
 		}
 		dev[j] = rdev1;
 
@@ -211,7 +211,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	if (cnt != mddev->raid_disks) {
 		printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
 		       "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);
-		goto abort;
+		goto free_device_list;
 	}
 	zone->nb_dev = cnt;
 	zone->zone_end = smallest->sectors * cnt;
@@ -271,9 +271,11 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	*private_conf = conf;
 
 	return 0;
-abort:
-	kfree(conf->strip_zone);
+free_device_list:
 	kfree(conf->devlist);
+free_zone:
+	kfree(conf->strip_zone);
+free_conf:
 	kfree(conf);
 	*private_conf = ERR_PTR(err);
 	return err;
-- 
2.10.1

^ permalink raw reply related

* [PATCH 01/54] md/raid0: Use kcalloc() in create_strip_zones()
From: SF Markus Elfring @ 2016-10-06  8:51 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Oct 2016 10:10:52 +0200

* Multiplications for the size determination of memory allocations
  indicated that array data structures should be processed.
  Thus use the corresponding function "kcalloc".

  This issue was detected by using the Coccinelle software.

* Replace the specification of data types by pointer dereferences
  to make the corresponding size determination a bit safer according to
  the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/raid0.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 258986a..50e8a63 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -150,12 +150,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	}
 
 	err = -ENOMEM;
-	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
-				conf->nr_strip_zones, GFP_KERNEL);
+	conf->strip_zone = kcalloc(conf->nr_strip_zones,
+				   sizeof(*conf->strip_zone),
+				   GFP_KERNEL);
 	if (!conf->strip_zone)
 		goto abort;
-	conf->devlist = kzalloc(sizeof(struct md_rdev*)*
-				conf->nr_strip_zones*mddev->raid_disks,
+	conf->devlist = kcalloc(conf->nr_strip_zones * mddev->raid_disks,
+				sizeof(*conf->devlist),
 				GFP_KERNEL);
 	if (!conf->devlist)
 		goto abort;
-- 
2.10.1

^ permalink raw reply related

* Re: MD-RAID: Fine-tuning for several function implementations
From: Christoph Hellwig @ 2016-10-06  8:49 UTC (permalink / raw)
  To: SF Markus Elfring
  Cc: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak, LKML,
	kernel-janitors, Julia Lawall
In-Reply-To: <786843ef-4b6f-eb04-7326-2f6f5b408826@users.sourceforge.net>

Please stop it NOW.  This is not fine tuning but a waste of everyones
time.

^ permalink raw reply

* MD-RAID: Fine-tuning for several function implementations
From: SF Markus Elfring @ 2016-10-06  8:46 UTC (permalink / raw)
  To: linux-raid, Christoph Hellwig, Guoqing Jiang, Jens Axboe,
	Mike Christie, Neil Brown, Shaohua Li, Tomasz Majchrzak
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <566ABCD9.1060404@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 6 Oct 2016 10:10:01 +0200

Several update suggestions were taken into account
from static source code analysis.

Markus Elfring (54):
  raid0: Use kcalloc() in create_strip_zones()
  raid0: Less function calls in create_strip_zones() after error detection
  raid0: Move a variable assignment in create_strip_zones()
  raid0: Replace printk() calls by the usage of higher level interfaces
  raid0: Move another variable assignment in create_strip_zones()
  raid0: Delete four unwanted spaces behind function names
  raid0: Move two misplaced braces
  raid0: Delete an unnecessary return statement in raid0_status()
  raid0: Add some spaces for better code readability
  raid1: Use kcalloc() in alloc_behind_pages()
  raid1: Use kcalloc() in raid1_reshape()
  raid1: Use kcalloc() in setup_conf()
  raid1: Return directly after a failed kzalloc() in setup_conf()
  raid1: Move assignments for the variable "err" in setup_conf()
  raid1: Less function calls in setup_conf() after error detection
  raid1: Delete an error message for a failed memory allocation
  raid1: Move a brace for a designated initialiser
  raid1: Adjust 12 checks for null pointers
  raid1: Replace printk() calls by the usage of higher level interfaces
  raid1: Add some spaces for better code readability
  raid1: Delete three unwanted spaces behind asterisks
  raid1: Delete three unwanted spaces before increment operators
  raid1: Replace a seq_printf() call by seq_puts() in raid1_status()
  raid1: Improve another size determination in setup_conf()
  raid5: Use kcalloc() in three functions
  raid5: Improve another size determination in setup_conf()
  raid5: Return directly after a failed kzalloc() in setup_conf()
  raid5: Rename a jump label in setup_conf()
  raid5: Return directly after a failed kcalloc() in alloc_thread_groups()
  raid5: Delete two error messages for a failed memory allocation
  raid5: Adjust two function calls together with a variable assignment
  raid5: Move a brace for three designated initialisers
  raid5: Replace printk() calls by the usage of higher level interfaces
  raid5: Delete indentation for two jump labels
  raid5: Adjust 13 checks for null pointers
  raid5: Delete four unwanted spaces behind function names
  raid5: Replace a seq_printf() call by seq_puts() in raid5_status()
  raid5: Move four asterisks
  raid5: Add some spaces for better code readability
  raid10: Use kcalloc() in two functions
  raid10: Improve another size determination in setup_conf()
  raid10: Delete an error message for a failed memory allocation
  raid10: Return directly after detection of unsupported settings in setup_conf()
  raid10: Return directly after a failed kzalloc() in setup_conf()
  raid10: Move assignments for the variable "err" in setup_conf()
  raid10: Less function calls in setup_conf() after error detection
  raid10: Improve another size determination in raid10_start_reshape()
  raid10: Move a brace for a designated initialiser
  raid10: Replace printk() calls by the usage of higher level interfaces
  raid10: Delete indentation for one jump label
  raid10: Adjust 22 checks for null pointers
  raid10: Replace a seq_printf() call by seq_puts() in raid10_status()
  raid10: Delete two unwanted spaces behind asterisks
  raid10: Add some spaces for better code readability

 drivers/md/raid0.c  | 170 ++++++++---------
 drivers/md/raid1.c  | 285 ++++++++++++++--------------
 drivers/md/raid10.c | 366 ++++++++++++++++++------------------
 drivers/md/raid5.c  | 523 ++++++++++++++++++++++++----------------------------
 4 files changed, 649 insertions(+), 695 deletions(-)

-- 
2.10.1

^ permalink raw reply

* AW: Prefetch in /lib/raid6/avx2.c
From: Markus Stockhausen @ 2016-10-06  7:27 UTC (permalink / raw)
  To: Shaohua Li, Doug Dumitru
  Cc: linux-raid, gayatri.kammela@intel.com, ravi.v.shankar@intel.com,
	hpa@zytor.com, yu-cheng.yu@intel.com, yuanhan.liu@intel.com
In-Reply-To: <20161005231710.GB2804@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2643 bytes --]

> Von: linux-raid-owner@vger.kernel.org [linux-raid-owner@vger.kernel.org]&quot; im Auftrag von &quot;Shaohua Li [shli@kernel.org]
> Gesendet: Donnerstag, 6. Oktober 2016 01:17
> An: Doug Dumitru
> Cc: linux-raid; gayatri.kammela@intel.com; ravi.v.shankar@intel.com; hpa@zytor.com; yu-cheng.yu@intel.com; yuanhan.liu@intel.com
> Betreff: Re: Prefetch in /lib/raid6/avx2.c
> 
> On Sun, Oct 02, 2016 at 03:40:09PM -0700, Doug Dumitru wrote:
> > I have been doing some high bandwidth testing of raid-6, and the
> > pretetch in raid6_avx24_gen_syndrome appears to be less than optimal.
> >
> > This is my patch (against 4.4.0-38 [Ubuntu 16.04LTS)
> >
> > --- cut here ---
> > --- lib/raid6/avx2.c0   2016-10-01 21:42:25.280347868 -0700
> > +++ lib/raid6/avx2.c    2016-10-02 15:35:48.168480760 -0700
> > @@ -189,10 +189,8 @@
> >
> >                 for (z = z0; z >= 0; z--) {
> >
> > -                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
> > -                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
> > -                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
> > -                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
> > +                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+128]));
> > +                       asm volatile("prefetchnta %0" : : "m" (dptr[z][d+192]));

From the first look that looks strange. 

1) It will add 2 prefetches for the last blocks beyond the data. Feels like bad coding.
2) The prefetch for the next block is already in the next loop (d+128)

Maybe the prefetcher takes longer than expected. And thus the next loop
will benefit from the "relocated" hint.

> >
> >                         asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
> >                         asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
> > --- cut here ---
> >
> > In perf, the cpu cycles goes from 5.3% to 3.0% for
> > raid6_avx24_gen_syndrome in my test and throughput increases from
> > about 8.2GB/sec to almost 10GB/sec.  It is a very "synthetic" test,
> > but the avx2 code does seem to be a factor.
> >
> > I suspect other SSE and AVX "unroll variants" have similar issues, but
> > I have not tested those.
> >
> > My test system is an E5-1650 v3 (single socket) with DDR4.  This might
> > help dual sockets even more.
> 
> CC some intel folks to see if they have ideas
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
=

[-- Attachment #2: InterScan_Disclaimer.txt --]
[-- Type: text/plain, Size: 1650 bytes --]

****************************************************************************
Diese E-Mail enthÃ¤lt vertrauliche und/oder rechtlich geschÃ¼tzte
Informationen. Wenn Sie nicht der richtige Adressat sind oder diese E-Mail
irrtÃ¼mlich erhalten haben, informieren Sie bitte sofort den Absender und
vernichten Sie diese Mail. Das unerlaubte Kopieren sowie die unbefugte
Weitergabe dieser Mail ist nicht gestattet.

Ãœber das Internet versandte E-Mails kÃ¶nnen unter fremden Namen erstellt oder
manipuliert werden. Deshalb ist diese als E-Mail verschickte Nachricht keine
rechtsverbindliche WillenserklÃ¤rung.

Collogia
Unternehmensberatung AG
Ubierring 11
D-50678 KÃ¶ln

Vorstand:
Kadir Akin
Dr. Michael HÃ¶hnerbach

Vorsitzender des Aufsichtsrates:
Hans Kristian Langva

Registergericht: Amtsgericht KÃ¶ln
Registernummer: HRB 52 497

This e-mail may contain confidential and/or privileged information. If you
are not the intended recipient (or have received this e-mail in error)
please notify the sender immediately and destroy this e-mail. Any
unauthorized copying, disclosure or distribution of the material in this
e-mail is strictly forbidden.

e-mails sent over the internet may have been written under a wrong name or
been manipulated. That is why this message sent as an e-mail is not a
legally binding declaration of intention.

Collogia
Unternehmensberatung AG
Ubierring 11
D-50678 KÃ¶ln

executive board:
Kadir Akin
Dr. Michael HÃ¶hnerbach

President of the supervisory board:
Hans Kristian Langva

Registry office: district court Cologne
Register number: HRB 52 497

****************************************************************************

^ permalink raw reply

* [PATCH v3 8/8] md/r5cache: handle SYNC and FUA
From: Song Liu @ 2016-10-06  7:14 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161006071416.3295093-1-songliubraving@fb.com>

With raid5 cache, we committing data from journal device. When
there is flush request, we need to flush journal device's cache.
This was not needed in raid5 journal, because we will flush the
journal before committing data to raid disks.

This is similar to FUA, except that we also need flush journal for
FUA. Otherwise, corruptions in earlier meta data will stop recovery
from reaching FUA data.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 162 +++++++++++++++++++++++++++++++++++++++++------
 drivers/md/raid5.c       |   8 +++
 drivers/md/raid5.h       |   1 +
 3 files changed, 153 insertions(+), 18 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 4f72dc2..5230cc4 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -19,6 +19,7 @@
 #include <linux/raid/md_p.h>
 #include <linux/crc32c.h>
 #include <linux/random.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
@@ -158,6 +159,9 @@ struct r5l_log {
 						 */
 	spinlock_t stripe_in_cache_lock;
 	atomic_t stripe_in_cache_count;
+
+	/* to submit async io_units, to fulfill ordering of flush */
+	struct work_struct deferred_io_work;
 };
 
 /*
@@ -184,6 +188,18 @@ struct r5l_io_unit {
 
 	int state;
 	bool need_split_bio;
+	struct bio *split_bio;
+
+	unsigned int has_flush:1;      /* include flush request */
+	unsigned int has_fua:1;        /* include fua request */
+	unsigned int has_null_flush:1; /* include empty flush request */
+	/*
+	 * io isn't sent yet, flush/fua request can only be submitted till it's
+	 * the first IO in running_ios list
+	 */
+	unsigned int io_deferred:1;
+
+	struct bio_list flush_barriers;   /* size == 0 flush bios */
 };
 
 /* r5l_io_unit state */
@@ -488,9 +504,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
 	}
 }
 
+static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
 static void r5l_log_endio(struct bio *bio)
 {
 	struct r5l_io_unit *io = bio->bi_private;
+	struct r5l_io_unit *io_deferred;
 	struct r5l_log *log = io->log;
 	unsigned long flags;
 
@@ -506,18 +524,89 @@ static void r5l_log_endio(struct bio *bio)
 		r5l_move_to_end_ios(log);
 	else
 		r5l_log_run_stripes(log);
+	if (!list_empty(&log->running_ios)) {
+		/*
+		 * FLUSH/FUA io_unit is deferred because of ordering, now we
+		 * can dispatch it
+		 */
+		io_deferred = list_first_entry(&log->running_ios,
+					       struct r5l_io_unit, log_sibling);
+		if (io_deferred->io_deferred)
+			schedule_work(&log->deferred_io_work);
+	}
+
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 
 	if (log->need_cache_flush)
 		md_wakeup_thread(log->rdev->mddev->thread);
+
+	if (io->has_null_flush) {
+		struct bio *bi;
+
+		WARN_ON(bio_list_empty(&io->flush_barriers));
+		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
+			bio_endio(bi);
+			atomic_dec(&io->pending_stripe);
+		}
+		if (atomic_read(&io->pending_stripe) == 0)
+			__r5l_stripe_write_finished(io);
+	}
+}
+
+static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+	if (io->has_flush)
+		bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH);
+	if (io->has_fua)
+		bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA);
+	submit_bio(io->current_bio);
+
+	if (!io->split_bio)
+		return;
+
+	if (io->has_flush)
+		bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH);
+	if (io->has_fua)
+		bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA);
+	submit_bio(io->split_bio);
+}
+
+/* deferred io_unit will be dispatched here */
+static void r5l_submit_io_async(struct work_struct *work)
+{
+	struct r5l_log *log = container_of(work, struct r5l_log,
+					   deferred_io_work);
+	struct r5l_io_unit *io = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	if (!list_empty(&log->running_ios)) {
+		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
+				      log_sibling);
+		if (!io->io_deferred)
+			io = NULL;
+		else
+			io->io_deferred = 0;
+	}
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+	if (io)
+		r5l_do_submit_io(log, io);
 }
 
 static void r5l_submit_current_io(struct r5l_log *log)
 {
 	struct r5l_io_unit *io = log->current_io;
+	struct bio *bio;
 	struct r5l_meta_block *block;
 	unsigned long flags;
 	u32 crc;
+	bool do_submit = true;
 
 	if (!io)
 		return;
@@ -526,13 +615,20 @@ static void r5l_submit_current_io(struct r5l_log *log)
 	block->meta_size = cpu_to_le32(io->meta_offset);
 	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
 	block->checksum = cpu_to_le32(crc);
+	bio = io->current_bio;
 
 	log->current_io = NULL;
 	spin_lock_irqsave(&log->io_list_lock, flags);
-	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+	if (io->has_flush || io->has_fua) {
+		if (io != list_first_entry(&log->running_ios,
+					   struct r5l_io_unit, log_sibling)) {
+			io->io_deferred = 1;
+			do_submit = false;
+		}
+	}
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
-
-	submit_bio(io->current_bio);
+	if (do_submit)
+		r5l_do_submit_io(log, io);
 }
 
 static struct bio *r5l_bio_alloc(struct r5l_log *log)
@@ -577,6 +673,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
+	bio_list_init(&io->flush_barriers);
 	io->state = IO_UNIT_RUNNING;
 
 	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
@@ -647,12 +744,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 	struct r5l_io_unit *io = log->current_io;
 
 	if (io->need_split_bio) {
-		struct bio *prev = io->current_bio;
-
+		BUG_ON(io->split_bio);
+		io->split_bio = io->current_bio;
 		io->current_bio = r5l_bio_alloc(log);
-		bio_chain(io->current_bio, prev);
-
-		submit_bio(prev);
+		bio_chain(io->current_bio, io->split_bio);
+		io->need_split_bio = false;
 	}
 
 	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
@@ -682,12 +778,23 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 
 	io = log->current_io;
 
+	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
+		io->has_flush = 1;
+
 	for (i = 0; i < sh->disks; i++) {
 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) &&
 		    !test_bit(R5_Wantcache, &sh->dev[i].flags))
 			continue;
 		if (i == sh->pd_idx || i == sh->qd_idx)
 			continue;
+		if (test_bit(R5_WantFUA, &sh->dev[i].flags)) {
+			io->has_fua = 1;
+			/*
+			 * we need to flush journal to make sure recovery can
+			 * reach the data with fua flag
+			 */
+			io->has_flush = 1;
+		}
 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
 					raid5_compute_blocknr(sh, i, 0),
 					sh->dev[i].log_checksum, 0, false);
@@ -839,17 +946,34 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 {
 	if (!log)
 		return -ENODEV;
-	/*
-	 * we flush log disk cache first, then write stripe data to raid disks.
-	 * So if bio is finished, the log disk cache is flushed already. The
-	 * recovery guarantees we can recovery the bio from log disk, so we
-	 * don't need to flush again
-	 */
-	if (bio->bi_iter.bi_size == 0) {
-		bio_endio(bio);
-		return 0;
+
+	if (log->r5c_state == R5C_STATE_WRITE_THROUGH) {
+		/*
+		 * in write through (journal only)
+		 * we flush log disk cache first, then write stripe data to
+		 * raid disks. So if bio is finished, the log disk cache is
+		 * flushed already. The recovery guarantees we can recovery
+		 * the bio from log disk, so we don't need to flush again
+		 */
+		if (bio->bi_iter.bi_size == 0) {
+			bio_endio(bio);
+			return 0;
+		}
+		bio->bi_opf &= ~REQ_PREFLUSH;
+	} else {
+		/* write back (with cache) */
+		if (bio->bi_iter.bi_size == 0) {
+			mutex_lock(&log->io_mutex);
+			r5l_get_meta(log, 0);
+			bio_list_add(&log->current_io->flush_barriers, bio);
+			log->current_io->has_flush = 1;
+			log->current_io->has_null_flush = 1;
+			atomic_inc(&log->current_io->pending_stripe);
+			r5l_submit_current_io(log);
+			mutex_unlock(&log->io_mutex);
+			return 0;
+		}
 	}
-	bio->bi_opf &= ~REQ_PREFLUSH;
 	return -EAGAIN;
 }
 
@@ -2322,6 +2446,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	INIT_LIST_HEAD(&log->no_space_stripes);
 	spin_lock_init(&log->no_space_stripes_lock);
 
+	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+
 	log->r5c_state = R5C_STATE_WRITE_THROUGH;
 	INIT_LIST_HEAD(&log->stripe_in_cache_list);
 	spin_lock_init(&log->stripe_in_cache_lock);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ec51129..5c193ab 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5226,6 +5226,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	int remaining;
 	DEFINE_WAIT(w);
 	bool do_prepare;
+	bool do_flush = false;
 
 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
 		int ret = r5l_handle_flush_request(conf->log, bi);
@@ -5237,6 +5238,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 			return;
 		}
 		/* ret == -EAGAIN, fallback */
+		do_flush = true;
 	}
 
 	md_write_start(mddev, bi);
@@ -5376,6 +5378,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 				do_prepare = true;
 				goto retry;
 			}
+			if (do_flush) {
+				set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+				/* we only need flush for one stripe */
+				do_flush = false;
+			}
+
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if ((!sh->batch_head || sh == sh->batch_head) &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 6898a76..317f204 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -360,6 +360,7 @@ enum {
 	STRIPE_R5C_FULL_STRIPE,	/* in r5c cache (to-be/being handled or
 				 * in conf->r5c_full_stripe_list)
 				 */
+	STRIPE_R5C_PREFLUSH,	/* need to flush journal device */
 };
 
 #define STRIPE_EXPAND_SYNC_FLAGS \
-- 
2.9.3


^ permalink raw reply related

* [PATCH v3 7/8] md/r5cache: r5c recovery
From: Song Liu @ 2016-10-06  7:14 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161006071416.3295093-1-songliubraving@fb.com>

This is the recovery part of raid5-cache.

With cache feature, there are 2 different scenarios of recovery:
1. Data-Parity stripe: a stripe with complete parity in journal.
2. Data-Only stripe: a stripe with only data in journal (or partial
   parity).

The code differentiate Data-Parity stripe from Data-Only stripe with
flag (STRIPE_R5C_WRITTEN).

For Data-Parity stripes, we use the same procedure as raid5 journal,
where all the data and parity are replayed to the RAID devices.

For Data-Only strips, we need to finish complete calculate parity and
finish the full reconstruct write or RMW write. For simplicity, in
the recovery, we load the stripe to stripe cache. Once the array is
started, the stripe cache state machine will handle these stripes
through normal write path.

r5c_recovery_flush_log contains the main procedure of recovery. The
recovery code first scans through the journal and loads data to
stripe cache. The code keeps tracks of all these stripes in a list
(use sh->lru and ctx->cached_list), stripes in the list are
organized in the order of its first appearance on the journal.
During the scan, the recovery code assesses each stripe as
Data-Parity or Data-Only.

During scan, the array may run out of stripe cache. In these cases,
the recovery code tries to release some stripe head by replaying
existing Data-Parity stripes. Once these replays are done, these
stripes can be released. When releasing Data-Parity stripes is not
enough, the recovery code will also call raid5_set_cache_size to
increase stripe cache size.

At the end of scan, the recovery code replays all Data-Parity
stripes, and sets proper states for Data-Only stripes. The recovery
code also increases seq number by 10 and rewrites all Data-Only
stripes to journal. This is to avoid confusion after repeated
crashes. More details is explained in raid5-cache.c before
r5c_recovery_rewrite_data_only_stripes().

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 684 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 550 insertions(+), 134 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index c1288a7..4f72dc2 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
+ * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -1203,10 +1204,13 @@ struct r5l_recovery_ctx {
 	sector_t meta_total_blocks;	/* total size of current meta and data */
 	sector_t pos;			/* recovery position */
 	u64 seq;			/* recovery position seq */
+	int data_parity_stripes;	/* number of data_parity stripes */
+	int data_only_stripes;		/* number of data_only stripes */
+	struct list_head cached_list;
 };
 
-static int r5l_read_meta_block(struct r5l_log *log,
-			       struct r5l_recovery_ctx *ctx)
+static int r5l_recovery_read_meta_block(struct r5l_log *log,
+					struct r5l_recovery_ctx *ctx)
 {
 	struct page *page = ctx->meta_page;
 	struct r5l_meta_block *mb;
@@ -1238,170 +1242,580 @@ static int r5l_read_meta_block(struct r5l_log *log,
 	return 0;
 }
 
-static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
-					 struct r5l_recovery_ctx *ctx,
-					 sector_t stripe_sect,
-					 int *offset, sector_t *log_offset)
+/*
+ * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite
+ * to mark valid (potentially not flushed) data in the journal.
+ *
+ * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
+ * so there should not be any mismatch here.
+ */
+static void r5l_recovery_load_data(struct r5l_log *log,
+				   struct stripe_head *sh,
+				   struct r5l_recovery_ctx *ctx,
+				   struct r5l_payload_data_parity *payload,
+				   sector_t log_offset)
 {
-	struct r5conf *conf = log->rdev->mddev->private;
-	struct stripe_head *sh;
-	struct r5l_payload_data_parity *payload;
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
 	int disk_index;
 
-	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
-	while (1) {
-		payload = page_address(ctx->meta_page) + *offset;
+	raid5_compute_sector(conf,
+			     le64_to_cpu(payload->location), 0,
+			     &disk_index, sh);
+	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+		     sh->dev[disk_index].page, REQ_OP_READ, 0, false);
+	sh->dev[disk_index].log_checksum =
+		le32_to_cpu(payload->checksum[0]);
+	ctx->meta_total_blocks += BLOCK_SECTORS;
 
-		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
-			raid5_compute_sector(conf,
-					     le64_to_cpu(payload->location), 0,
-					     &disk_index, sh);
+	set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
+}
 
-			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
-				     sh->dev[disk_index].page, REQ_OP_READ, 0,
-				     false);
-			sh->dev[disk_index].log_checksum =
-				le32_to_cpu(payload->checksum[0]);
-			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
-			ctx->meta_total_blocks += BLOCK_SECTORS;
-		} else {
-			disk_index = sh->pd_idx;
-			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
-				     sh->dev[disk_index].page, REQ_OP_READ, 0,
-				     false);
-			sh->dev[disk_index].log_checksum =
-				le32_to_cpu(payload->checksum[0]);
-			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
-
-			if (sh->qd_idx >= 0) {
-				disk_index = sh->qd_idx;
-				sync_page_io(log->rdev,
-					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
-					     PAGE_SIZE, sh->dev[disk_index].page,
-					     REQ_OP_READ, 0, false);
-				sh->dev[disk_index].log_checksum =
-					le32_to_cpu(payload->checksum[1]);
-				set_bit(R5_Wantwrite,
-					&sh->dev[disk_index].flags);
-			}
-			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
-		}
+static void r5l_recovery_load_parity(struct r5l_log *log,
+				     struct stripe_head *sh,
+				     struct r5l_recovery_ctx *ctx,
+				     struct r5l_payload_data_parity *payload,
+				     sector_t log_offset)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
 
-		*log_offset = r5l_ring_add(log, *log_offset,
-					   le32_to_cpu(payload->size));
-		*offset += sizeof(struct r5l_payload_data_parity) +
-			sizeof(__le32) *
-			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
-		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
-			break;
+	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
+	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+		     sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
+	sh->dev[sh->pd_idx].log_checksum =
+		le32_to_cpu(payload->checksum[0]);
+	set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
+
+	if (sh->qd_idx >= 0) {
+		sync_page_io(log->rdev,
+			     r5l_ring_add(log, log_offset, BLOCK_SECTORS),
+			     PAGE_SIZE, sh->dev[sh->qd_idx].page,
+			     REQ_OP_READ, 0, false);
+		sh->dev[sh->qd_idx].log_checksum =
+			le32_to_cpu(payload->checksum[1]);
+		set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
 	}
+	set_bit(STRIPE_R5C_WRITTEN, &sh->state);
+}
 
-	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
-		void *addr;
-		u32 checksum;
+static void r5l_recovery_reset_stripe(struct stripe_head *sh)
+{
+	int i;
 
+	sh->state = 0;
+	sh->log_start = MaxSector;
+	for (i = sh->disks; i--; )
+		sh->dev[i].flags = 0;
+}
+
+static void
+r5l_recovery_replay_one_stripe(struct r5conf *conf,
+			       struct stripe_head *sh,
+			       struct r5l_recovery_ctx *ctx)
+{
+	struct md_rdev *rdev, *rrdev;
+	int disk_index;
+	int data_count = 0;
+
+	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
 		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
 			continue;
-		addr = kmap_atomic(sh->dev[disk_index].page);
-		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
-		kunmap_atomic(addr);
-		if (checksum != sh->dev[disk_index].log_checksum)
-			goto error;
+		if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
+			continue;
+		data_count++;
 	}
+	/* stripes only have parity are already flushed to RAID */
+	if (data_count == 0)
+		goto out;
 
 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
-		struct md_rdev *rdev, *rrdev;
-
-		if (!test_and_clear_bit(R5_Wantwrite,
-					&sh->dev[disk_index].flags))
+		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
 			continue;
 
 		/* in case device is broken */
 		rdev = rcu_dereference(conf->disks[disk_index].rdev);
 		if (rdev)
-			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
+			sync_page_io(rdev, sh->sector, PAGE_SIZE,
 				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
 				     false);
 		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
 		if (rrdev)
-			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
+			sync_page_io(rrdev, sh->sector, PAGE_SIZE,
 				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
 				     false);
 	}
-	raid5_release_stripe(sh);
+	ctx->data_parity_stripes++;
+out:
+	r5l_recovery_reset_stripe(sh);
+}
+
+static void
+r5l_recovery_create_emtpy_meta_block(struct r5l_log *log,
+				     struct page *page,
+				     sector_t pos, u64 seq)
+{
+	struct r5l_meta_block *mb;
+	u32 crc;
+
+	mb = page_address(page);
+	clear_page(mb);
+	mb->magic = cpu_to_le32(R5LOG_MAGIC);
+	mb->version = R5LOG_VERSION;
+	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
+	mb->seq = cpu_to_le64(seq);
+	mb->position = cpu_to_le64(pos);
+	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+	mb->checksum = cpu_to_le32(crc);
+}
+
+static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
+					  u64 seq)
+{
+	struct page *page;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+	r5l_recovery_create_emtpy_meta_block(log, page, pos, seq);
+	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
+			  WRITE_FUA, false)) {
+		__free_page(page);
+		return -EIO;
+	}
+	__free_page(page);
 	return 0;
+}
 
-error:
-	for (disk_index = 0; disk_index < sh->disks; disk_index++)
-		sh->dev[disk_index].flags = 0;
-	raid5_release_stripe(sh);
-	return -EINVAL;
+static struct stripe_head *
+r5c_recovery_alloc_stripe(struct r5conf *conf,
+			  struct list_head *recovery_list,
+			  sector_t stripe_sect,
+			  sector_t log_start)
+{
+	struct stripe_head *sh;
+
+	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
+	if (!sh)
+		return NULL;  /* no more stripe available */
+
+	r5l_recovery_reset_stripe(sh);
+	sh->log_start = log_start;
+
+	return sh;
 }
 
-static int r5l_recovery_flush_one_meta(struct r5l_log *log,
-				       struct r5l_recovery_ctx *ctx)
+static struct stripe_head *
+r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
 {
-	struct r5conf *conf = log->rdev->mddev->private;
+	struct stripe_head *sh;
+
+	list_for_each_entry(sh, list, lru)
+		if (sh->sector == sect)
+			return sh;
+	return NULL;
+}
+
+static void
+r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
+			    struct r5l_recovery_ctx *ctx)
+{
+	struct stripe_head *sh, *next;
+
+	list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
+		if (test_bit(STRIPE_R5C_WRITTEN, &sh->state)) {
+			r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
+			list_del_init(&sh->lru);
+			raid5_release_stripe(sh);
+		}
+}
+
+/* returns 0 for match; 1 for mismtach */
+static int
+r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
+				  sector_t log_offset, __le32 log_checksum)
+{
+	void *addr;
+	u32 checksum;
+
+	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+		     page, REQ_OP_READ, 0, false);
+	addr = kmap_atomic(page);
+	checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
+	kunmap_atomic(addr);
+	return le32_to_cpu(log_checksum) != checksum;
+}
+
+/*
+ * before loading data to stripe cache, we need verify checksum for all data,
+ * if there is mismatch for any data page, we drop all data in the mata block
+ */
+static int
+r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
+					 struct r5l_recovery_ctx *ctx)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+	struct r5l_meta_block *mb = page_address(ctx->meta_page);
+	sector_t mb_offset = sizeof(struct r5l_meta_block);
+	sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+	struct page *page;
 	struct r5l_payload_data_parity *payload;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	while (mb_offset < le32_to_cpu(mb->meta_size)) {
+		payload = (void *)mb + mb_offset;
+
+		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+			if (r5l_recovery_verify_data_checksum(
+				    log, page, log_offset,
+				    payload->checksum[0]))
+				goto mismatch;
+		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
+			if (r5l_recovery_verify_data_checksum(
+				    log, page, log_offset,
+				    payload->checksum[0]))
+				goto mismatch;
+			if (conf->max_degraded == 2 && /* q for RAID 6 */
+			    r5l_recovery_verify_data_checksum(
+				    log, page,
+				    r5l_ring_add(log, log_offset,
+						 BLOCK_SECTORS),
+				    payload->checksum[1]))
+				goto mismatch;
+		} else
+			goto mismatch;
+
+		log_offset = r5l_ring_add(log, log_offset,
+					  le32_to_cpu(payload->size));
+
+		mb_offset += sizeof(struct r5l_payload_data_parity) +
+			sizeof(__le32) *
+			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+	}
+
+	put_page(page);
+	return 0;
+
+mismatch:
+	put_page(page);
+	return -EINVAL;
+}
+
+static int
+r5c_recovery_analyze_meta_block(struct r5l_log *log,
+				struct r5l_recovery_ctx *ctx,
+				struct list_head *cached_stripe_list)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
 	struct r5l_meta_block *mb;
-	int offset;
+	struct r5l_payload_data_parity *payload;
+	int mb_offset;
 	sector_t log_offset;
-	sector_t stripe_sector;
+	sector_t stripe_sect;
+	struct stripe_head *sh;
+	int ret;
+
+	/*
+	 * for mismatch in data blocks, we will drop all data in this mb, but
+	 * we will still read next mb for other data with FLUSH flag, as
+	 * io_unit could finish out of order.
+	 */
+	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
+	if (ret == -EINVAL)
+		return -EAGAIN;
+	else if (ret)
+		return ret;
 
 	mb = page_address(ctx->meta_page);
-	offset = sizeof(struct r5l_meta_block);
+	mb_offset = sizeof(struct r5l_meta_block);
 	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
 
-	while (offset < le32_to_cpu(mb->meta_size)) {
+	while (mb_offset < le32_to_cpu(mb->meta_size)) {
 		int dd;
 
-		payload = (void *)mb + offset;
-		stripe_sector = raid5_compute_sector(conf,
-						     le64_to_cpu(payload->location), 0, &dd, NULL);
-		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
-						  &offset, &log_offset))
+		payload = (void *)mb + mb_offset;
+		stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
+			raid5_compute_sector(
+				conf, le64_to_cpu(payload->location), 0, &dd,
+				NULL)
+			: le64_to_cpu(payload->location);
+
+		sh = r5c_recovery_lookup_stripe(cached_stripe_list,
+						stripe_sect);
+
+		if (!sh) {
+			sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list,
+						       stripe_sect, ctx->pos);
+			/*
+			 * cannot get stripe from raid5_get_active_stripe
+			 * try replay some stripes
+			 */
+			if (!sh) {
+				r5c_recovery_replay_stripes(
+					cached_stripe_list, ctx);
+				sh = r5c_recovery_alloc_stripe(
+					conf, cached_stripe_list,
+					stripe_sect, ctx->pos);
+			}
+			if (!sh) {
+				pr_info("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
+					mdname(mddev),
+					conf->min_nr_stripes * 2);
+				raid5_set_cache_size(mddev,
+						     conf->min_nr_stripes * 2);
+				sh = r5c_recovery_alloc_stripe(
+					conf, cached_stripe_list, stripe_sect,
+					ctx->pos);
+			}
+			if (!sh) {
+				pr_err("md/raid:%s: Cannot get enough stripe_cache. Recovery interrupted.\n",
+				       mdname(mddev));
+				return -ENOMEM;
+			}
+			list_add_tail(&sh->lru, cached_stripe_list);
+		}
+		if (!sh)
+			return -ENOMEM;
+
+		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+			if (test_bit(STRIPE_R5C_WRITTEN, &sh->state)) {
+				r5l_recovery_reset_stripe(sh);
+				sh->log_start = ctx->pos;
+				list_move_tail(&sh->lru, cached_stripe_list);
+			}
+			r5l_recovery_load_data(log, sh, ctx, payload,
+					       log_offset);
+		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
+			r5l_recovery_load_parity(log, sh, ctx, payload,
+						 log_offset);
+		else
 			return -EINVAL;
+
+		log_offset = r5l_ring_add(log, log_offset,
+					  le32_to_cpu(payload->size));
+
+		mb_offset += sizeof(struct r5l_payload_data_parity) +
+			sizeof(__le32) *
+			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
 	}
+
 	return 0;
 }
 
-/* copy data/parity from log to raid disks */
-static void r5l_recovery_flush_log(struct r5l_log *log,
+/*
+ * Load the stripe into cache. The stripe will be written out later by
+ * the stripe cache state machine.
+ */
+static void r5c_recovery_load_one_stripe(struct r5l_log *log,
+					 struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct r5dev *dev;
+	int i;
+
+	atomic_set(&sh->dev_in_cache, 0);
+	for (i = sh->disks; i--; ) {
+		dev = sh->dev + i;
+		if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
+			set_bit(R5_InCache, &dev->flags);
+			atomic_inc(&sh->dev_in_cache);
+		}
+	}
+	set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+	atomic_inc(&conf->r5c_cached_partial_stripes);
+	list_add_tail(&sh->r5c, &log->stripe_in_cache_list);
+}
+
+/*
+ * Scan through the log for all to-be-flushed data
+ *
+ * For stripes with data and parity, namely Data-Parity stripe
+ * (STRIPE_R5C_WRITTEN == 0), we simply replay all the writes.
+ *
+ * For stripes with only data, namely Data-Only stripe
+ * (STRIPE_R5C_WRITTEN == 1), we load them to stripe cache state machine.
+ *
+ * For a stripe, if we see data after parity, we should discard all previous
+ * data and parity for this stripe, as these data are already flushed to
+ * the array.
+ *
+ * At the end of the scan, we return the new journal_tail, which points to
+ * first data-only stripe on the journal device, or next invalid meta block.
+ */
+static void r5c_recovery_flush_log(struct r5l_log *log,
 				   struct r5l_recovery_ctx *ctx)
 {
+	struct stripe_head *sh, *next;
+	int ret;
+
+	/* scan through the log */
 	while (1) {
-		if (r5l_read_meta_block(log, ctx))
-			return;
-		if (r5l_recovery_flush_one_meta(log, ctx))
-			return;
+		if (r5l_recovery_read_meta_block(log, ctx))
+			break;
+
+		ret = r5c_recovery_analyze_meta_block(log, ctx,
+						      &ctx->cached_list);
+		/*
+		 * -EAGAIN means mismatch in data block, in this case, we still
+		 * try scan the next metablock
+		 */
+		if (ret && ret != -EAGAIN)
+			break;
 		ctx->seq++;
 		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
 	}
+
+	/* replay data-parity stripes */
+	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
+
+	/* load data-only stripes to stripe cache */
+	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+		WARN_ON(test_bit(STRIPE_R5C_WRITTEN, &sh->state));
+		r5c_recovery_load_one_stripe(log, sh);
+		list_del_init(&sh->lru);
+		raid5_release_stripe(sh);
+		ctx->data_only_stripes++;
+	}
+
+	return;
 }
 
-static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
-					  u64 seq)
+/*
+ * we did a recovery. Now ctx.pos points to an invalid meta block. New
+ * log will start here. but we can't let superblock point to last valid
+ * meta block. The log might looks like:
+ * | meta 1| meta 2| meta 3|
+ * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
+ * superblock points to meta 1, we write a new valid meta 2n.  if crash
+ * happens again, new recovery will start from meta 1. Since meta 2n is
+ * valid now, recovery will think meta 3 is valid, which is wrong.
+ * The solution is we create a new meta in meta2 with its seq == meta
+ * 1's seq + 10 and let superblock points to meta2. The same recovery will
+ * not think meta 3 is a valid meta, because its seq doesn't match
+ */
+
+/*
+ * Before recovery, the log looks like the following
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^
+ *   |- log->last_checkpoint
+ *   |- log->last_cp_seq
+ *
+ * Now we scan through the log until we see invalid entry
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^                            ^
+ *   |- log->last_checkpoint      |- ctx->pos
+ *   |- log->last_cp_seq          |- ctx->seq
+ *
+ * From this point, we need to increase seq number by 10 to avoid
+ * confusing next recovery.
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^                              ^
+ *   |- log->last_checkpoint        |- ctx->pos+1
+ *   |- log->last_cp_seq            |- ctx->seq+11
+ *
+ * However, it is not safe to start the state machine yet, because data only
+ * parities are not yet secured in RAID. To save these data only parities, we
+ * rewrite them from seq+11.
+ *
+ *   -----------------------------------------------------------------
+ *   |           valid log        | data only stripes | invalid log  |
+ *   -----------------------------------------------------------------
+ *   ^                                                ^
+ *   |- log->last_checkpoint                          |- ctx->pos+n
+ *   |- log->last_cp_seq                              |- ctx->seq+10+n
+ *
+ * If failure happens again during this process, the recovery can safe start
+ * again from log->last_checkpoint.
+ *
+ * Once data only stripes are rewritten to journal, we move log_tail
+ *
+ *   -----------------------------------------------------------------
+ *   |     old log        |    data only stripes    | invalid log  |
+ *   -----------------------------------------------------------------
+ *                        ^                         ^
+ *                        |- log->last_checkpoint   |- ctx->pos+n
+ *                        |- log->last_cp_seq       |- ctx->seq+10+n
+ *
+ * Then we can safely start the state machine. If failure happens from this
+ * point on, the recovery will start from new log->last_checkpoint.
+ */
+static int
+r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
+				       struct r5l_recovery_ctx *ctx)
 {
+	struct stripe_head *sh;
+	struct mddev *mddev = log->rdev->mddev;
 	struct page *page;
-	struct r5l_meta_block *mb;
-	u32 crc;
 
-	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-	if (!page)
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
+		       mdname(mddev));
 		return -ENOMEM;
-	mb = page_address(page);
-	mb->magic = cpu_to_le32(R5LOG_MAGIC);
-	mb->version = R5LOG_VERSION;
-	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
-	mb->seq = cpu_to_le64(seq);
-	mb->position = cpu_to_le64(pos);
-	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
-	mb->checksum = cpu_to_le32(crc);
+	}
 
-	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
-			  WRITE_FUA, false)) {
-		__free_page(page);
-		return -EIO;
+	ctx->seq += 10;
+	list_for_each_entry(sh, &ctx->cached_list, lru) {
+		struct r5l_meta_block *mb;
+		int i;
+		int offset;
+		sector_t write_pos;
+
+		WARN_ON(test_bit(STRIPE_R5C_WRITTEN, &sh->state));
+		r5l_recovery_create_emtpy_meta_block(log, page,
+						     ctx->pos, ctx->seq);
+		mb = page_address(page);
+		offset = le32_to_cpu(mb->meta_size);
+		write_pos = ctx->pos + BLOCK_SECTORS;
+
+		for (i = sh->disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			struct r5l_payload_data_parity *payload;
+			void *addr;
+
+			if (test_bit(R5_InCache, &dev->flags)) {
+				payload = (void *)mb + offset;
+				payload->header.type = cpu_to_le16(
+					R5LOG_PAYLOAD_DATA);
+				payload->size = BLOCK_SECTORS;
+				payload->location = cpu_to_le64(
+					raid5_compute_blocknr(sh, i, 0));
+				addr = kmap_atomic(dev->page);
+				payload->checksum[0] = cpu_to_le32(
+					crc32c_le(log->uuid_checksum, addr,
+						  PAGE_SIZE));
+				kunmap_atomic(addr);
+				sync_page_io(log->rdev, write_pos, PAGE_SIZE,
+					     dev->page, REQ_OP_WRITE, 0, false);
+				write_pos = r5l_ring_add(log, write_pos,
+							 BLOCK_SECTORS);
+				offset += sizeof(__le32) +
+					sizeof(struct r5l_payload_data_parity);
+
+			}
+		}
+		mb->meta_size = cpu_to_le32(offset);
+		mb->checksum = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
+			     REQ_OP_WRITE, WRITE_FUA, false);
+		sh->log_start = ctx->pos;
+		ctx->pos = write_pos;
+		ctx->seq += 1;
 	}
 	__free_page(page);
 	return 0;
@@ -1409,43 +1823,45 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
 
 static int r5l_recovery_log(struct r5l_log *log)
 {
+	struct mddev *mddev = log->rdev->mddev;
 	struct r5l_recovery_ctx ctx;
 
 	ctx.pos = log->last_checkpoint;
 	ctx.seq = log->last_cp_seq;
 	ctx.meta_page = alloc_page(GFP_KERNEL);
+	ctx.data_only_stripes = 0;
+	ctx.data_parity_stripes = 0;
+	INIT_LIST_HEAD(&ctx.cached_list);
+
 	if (!ctx.meta_page)
 		return -ENOMEM;
 
-	r5l_recovery_flush_log(log, &ctx);
+	r5c_recovery_flush_log(log, &ctx);
+
 	__free_page(ctx.meta_page);
 
-	/*
-	 * we did a recovery. Now ctx.pos points to an invalid meta block. New
-	 * log will start here. but we can't let superblock point to last valid
-	 * meta block. The log might looks like:
-	 * | meta 1| meta 2| meta 3|
-	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
-	 * superblock points to meta 1, we write a new valid meta 2n.  if crash
-	 * happens again, new recovery will start from meta 1. Since meta 2n is
-	 * valid now, recovery will think meta 3 is valid, which is wrong.
-	 * The solution is we create a new meta in meta2 with its seq == meta
-	 * 1's seq + 10 and let superblock points to meta2. The same recovery will
-	 * not think meta 3 is a valid meta, because its seq doesn't match
-	 */
-	if (ctx.seq > log->last_cp_seq + 1) {
-		int ret;
-
-		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
-		if (ret)
-			return ret;
-		log->seq = ctx.seq + 11;
-		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
-		r5l_write_super(log, ctx.pos);
-	} else {
-		log->log_start = ctx.pos;
-		log->seq = ctx.seq;
+	if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
+		pr_info("md/raid:%s: starting from clean shutdown\n",
+			mdname(mddev));
+	else {
+		pr_info("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n",
+			mdname(mddev), ctx.data_only_stripes,
+			ctx.data_parity_stripes);
+
+		if (ctx.data_only_stripes > 0)
+			if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+				pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
+				       mdname(mddev));
+				return -EIO;
+			}
 	}
+
+	log->log_start = ctx.pos;
+	log->next_checkpoint = ctx.pos;
+	log->seq = ctx.seq;
+	r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq);
+	r5l_write_super(log, ctx.pos);
+
 	return 0;
 }
 
-- 
2.9.3


^ permalink raw reply related

* [PATCH v3 6/8] md/r5cache: sysfs entry r5c_state
From: Song Liu @ 2016-10-06  7:14 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161006071416.3295093-1-songliubraving@fb.com>

r5c_state have 4 states:
* no-cache;
* write-through (write journal only);
* write-back (w/ write cache);
* cache-broken (journal missing or Faulty)

When there is functional write cache, r5c_state is a knob to
switch between write-back and write-through.

When the journal device is broken, the raid array is forced
in readonly mode. In this case, r5c_state can be used to
remove "journal feature", and thus make the array read-write
without journal. By writing into r5c_cache_mode, the array
can transit from cache-broken to no-cache, which removes
journal feature for the array.

To remove the journal feature:
- When journal fails, the raid array is forced readonly mode
  (enforced by kernel)
- User uses the new interface to remove journal (writing 0
  to r5c_state, I will add a mdadm option for that later)
- User forces array read-write;
- Kernel updates superblock and array can run read/write.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c       |  1 +
 drivers/md/raid5.h       |  1 +
 3 files changed, 60 insertions(+)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 688dae1..c1288a7 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -57,6 +57,8 @@ enum r5c_state {
 	R5C_STATE_CACHE_BROKEN = 3,
 };
 
+static char *r5c_state_str[] = {"no-cache", "write-through",
+				"write-back", "cache-broken"};
 /*
  * raid5 cache state machine
  *
@@ -1516,6 +1518,62 @@ int r5c_flush_cache(struct r5conf *conf, int num)
 	return count;
 }
 
+ssize_t r5c_state_show(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	int val = 0;
+	int ret = 0;
+
+	if (conf->log)
+		val = conf->log->r5c_state;
+	else if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
+		val = R5C_STATE_CACHE_BROKEN;
+	ret += snprintf(page, PAGE_SIZE - ret, "%d: %s\n",
+			val, r5c_state_str[val]);
+	return ret;
+}
+
+ssize_t r5c_state_store(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	struct r5l_log *log = conf->log;
+	int val;
+
+	if (kstrtoint(page, 10, &val))
+		return -EINVAL;
+	if (!log && val != R5C_STATE_NO_CACHE)
+		return -EINVAL;
+
+	if (val < R5C_STATE_NO_CACHE || val > R5C_STATE_WRITE_BACK)
+		return -EINVAL;
+	if (val == R5C_STATE_NO_CACHE) {
+		if (conf->log &&
+		    !test_bit(Faulty, &log->rdev->flags)) {
+			pr_err("md/raid:%s: journal device is in use, cannot remove it\n",
+			       mdname(mddev));
+			return -EINVAL;
+		}
+	}
+
+	if (log) {
+		mddev_suspend(mddev);
+		conf->log->r5c_state = val;
+		mddev_resume(mddev);
+	}
+
+	if (val == R5C_STATE_NO_CACHE) {
+		clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+		set_bit(MD_UPDATE_SB_FLAGS, &mddev->flags);
+	}
+	pr_info("md/raid:%s: setting r5c cache mode to %d: %s\n",
+		mdname(mddev), val, r5c_state_str[val]);
+	return len;
+}
+
+struct md_sysfs_entry
+r5c_state = __ATTR(r5c_state, S_IRUGO | S_IWUSR,
+		   r5c_state_show, r5c_state_store);
+
 int r5c_handle_stripe_dirtying(struct r5conf *conf,
 			       struct stripe_head *sh,
 			       struct stripe_head_state *s,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5977d44..ec51129 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6297,6 +6297,7 @@ static struct attribute *raid5_attrs[] =  {
 	&raid5_group_thread_cnt.attr,
 	&raid5_skip_copy.attr,
 	&raid5_rmw_level.attr,
+	&r5c_state.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d17eed4..6898a76 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -758,5 +758,6 @@ extern void r5c_do_reclaim(struct r5conf *conf);
 extern int r5c_flush_cache(struct r5conf *conf, int num);
 extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+extern struct md_sysfs_entry r5c_state;
 
 #endif
-- 
2.9.3


^ permalink raw reply related

* [PATCH v3 5/8] md/r5cache: reclaim support
From: Song Liu @ 2016-10-06  7:14 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161006071416.3295093-1-songliubraving@fb.com>

There are two limited resources, stripe cache and journal disk space.
For better performance, we priotize reclaim of full stripe writes.
To free up more journal space, we free earliest data on the journal.

In current implementation, reclaim happens when:
1. every R5C_RECLAIM_WAKEUP_INTERVAL (5 seconds)
2. when there are R5C_FULL_STRIPE_FLUSH_BATCH (8) cached full stripes
   (r5c_check_cached_full_stripe)
3. when raid5_get_active_stripe sees pressure in stripe cache space
   (r5c_check_stripe_cache_usage)
4. when there is pressure in journal space.

1-3 above are straightforward. The following explains details of 4.

To avoid deadlock due to log space, we need to reserve enough space
to flush cached data. The size of required log space depends on total
number of cached stripes (stripe_in_cache_count). In current
implementation, the reclaim path automatically include pending
data writes with parity writes (similar to write through case).
Therefore, we need up to (conf->raid_disks + 1) pages for each cached
stripe (1 page for meta data, raid_disks pages for all data and
parity). r5c_log_required_to_flush_cache() calculates log space
required to flush cache. In the following, we refer to the space
calculated by r5c_log_required_to_flush_cache() as
reclaim_required_space.

Two flags are added to r5conf->cache_state: R5C_LOG_TIGHT and
R5C_LOG_CRITICAL. R5C_LOG_TIGHT is set when free space on the log
device is less than 3x of reclaim_required_space. R5C_LOG_CRITICAL
is set when free space on the log device is less than 2x of
reclaim_required_space.

r5c_cache keeps all data in cache (not fully committed to RAID) in
a list (stripe_in_cache_list). These stripes are in the order of their
first appearance on the journal. So the log tail (last_checkpoint)
should point to the journal_start of the first item in the list.

When R5C_LOG_TIGHT is set, r5l_reclaim_thread starts flushing out
stripes at the head of stripe_in_cache. When R5C_LOG_CRITICAL is
set, the state machine only writes data that are already in the
log device (in stripe_in_cache_list).

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 362 +++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/raid5.c       |  21 ++-
 drivers/md/raid5.h       |  39 +++--
 3 files changed, 384 insertions(+), 38 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 92d3d7b..688dae1 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -29,12 +29,21 @@
 #define BLOCK_SECTORS (8)
 
 /*
- * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent
- * recovery scans a very long log
+ * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
+ *
+ * In write through mode, the reclaim runs every log->max_free_space.
+ * This can prevent the recovery scans for too long
  */
 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
 
+/* wake up reclaim thread periodically */
+#define R5C_RECLAIM_WAKEUP_INTERVAL (5 * HZ)
+/* start flush with these full stripes */
+#define R5C_FULL_STRIPE_FLUSH_BATCH 8
+/* reclaim stripes in groups */
+#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
+
 /*
  * We only need 2 bios per I/O unit to make progress, but ensure we
  * have a few more available to not get too tight.
@@ -141,6 +150,11 @@ struct r5l_log {
 
 	/* for r5c_cache */
 	enum r5c_state r5c_state;
+	struct list_head stripe_in_cache_list;	/* all stripes in r5cache, with
+						 * sh->log_start in order
+						 */
+	spinlock_t stripe_in_cache_lock;
+	atomic_t stripe_in_cache_count;
 };
 
 /*
@@ -256,6 +270,91 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
 	}
 }
 
+static inline int r5c_total_cached_stripes(struct r5conf *conf)
+{
+	return atomic_read(&conf->r5c_cached_partial_stripes) +
+		atomic_read(&conf->r5c_cached_full_stripes);
+}
+
+/*
+ * check whether we should flush some stripes to free up stripe cache
+ */
+void r5c_check_stripe_cache_usage(struct r5conf *conf)
+{
+	if (!r5c_is_writeback(conf->log))
+		return;
+	spin_lock(&conf->device_lock);
+	if ((r5c_total_cached_stripes(conf) >
+	     conf->min_nr_stripes * 3 / 4) ||
+	    atomic_read(&conf->empty_inactive_list_nr) > 0)
+		r5c_flush_cache(conf, R5C_RECLAIM_STRIPE_GROUP);
+	spin_unlock(&conf->device_lock);
+}
+
+/*
+ * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
+ * stripes in the cache
+ */
+void r5c_check_cached_full_stripe(struct r5conf *conf)
+{
+	if (!r5c_is_writeback(conf->log))
+		return;
+	if (atomic_read(&conf->r5c_cached_full_stripes) >=
+	    R5C_FULL_STRIPE_FLUSH_BATCH)
+		r5l_wake_reclaim(conf->log, 0);
+}
+
+/*
+ * Total log space (in sectors) needed to flush all data in cache
+ *
+ * Currently, reclaim path automatically includes all pending writes
+ * to the same sector. So the reclaim of each stripe takes up to
+ * (conf->raid_disks + 1) pages of log space.
+ *
+ * To totally avoid deadlock due to log space, the code reserves
+ * (conf->raid_disks + 1) pages for each stripe in cache, which is not
+ * necessary in most cases.
+ *
+ * To improve this, we will need reclaim path to be able to NOT include
+ * pending writes, which will reduce the requirement to
+ * (conf->max_degraded + 1) pages per stripe in cache.
+ */
+static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
+{
+	struct r5l_log *log = conf->log;
+
+	if (!r5c_is_writeback(log))
+		return 0;
+
+	return BLOCK_SECTORS * (conf->raid_disks + 1) *
+		atomic_read(&log->stripe_in_cache_count);
+}
+
+/* evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL */
+static inline void r5c_update_log_state(struct r5l_log *log)
+{
+	struct r5conf *conf = log->rdev->mddev->private;
+	sector_t free_space = r5l_ring_distance(log, log->log_start,
+						log->last_checkpoint);
+	sector_t reclaim_space = r5c_log_required_to_flush_cache(conf);
+
+	if (!r5c_is_writeback(log))
+		return;
+	if (free_space < reclaim_space)
+		BUG();
+	else if (free_space < 2 * reclaim_space) {
+		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
+		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
+	} else if (free_space < 3 * reclaim_space) {
+		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
+		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
+	} else if (free_space > 4 * reclaim_space) {
+		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
+		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
+	} else
+		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
+}
+
 /*
  * Freeze the stripe, thus send the stripe into reclaim path.
  *
@@ -290,6 +389,19 @@ void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh)
 	}
 }
 
+/*
+ * do not release a stripe to cached lists in quiesce
+ */
+void r5c_prepare_stripe_for_release_in_quiesce(struct stripe_head *sh)
+{
+	if (!test_bit(STRIPE_HANDLE, &sh->state) &&
+	    atomic_read(&sh->dev_in_cache) != 0) {
+		if (!test_bit(STRIPE_R5C_FROZEN, &sh->state))
+			r5c_freeze_stripe_for_reclaim(sh);
+		set_bit(STRIPE_HANDLE, &sh->state);
+	}
+}
+
 static void r5c_handle_data_cached(struct stripe_head *sh)
 {
 	int i;
@@ -435,6 +547,7 @@ static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
 {
 	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
 
+	r5c_update_log_state(log);
 	/*
 	 * If we filled up the log device start from the beginning again,
 	 * which will require a new bio.
@@ -552,6 +665,7 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	int meta_size;
 	int ret;
 	struct r5l_io_unit *io;
+	unsigned long flags;
 
 	meta_size =
 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
@@ -595,6 +709,18 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	atomic_inc(&io->pending_stripe);
 	sh->log_io = io;
 
+	if (log->r5c_state == R5C_STATE_WRITE_THROUGH)
+		return 0;
+
+	if (sh->log_start == MaxSector) {
+		BUG_ON(!list_empty(&sh->r5c));
+		sh->log_start = io->log_start;
+		spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
+		list_add_tail(&sh->r5c,
+			      &log->stripe_in_cache_list);
+		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+		atomic_inc(&log->stripe_in_cache_count);
+	}
 	return 0;
 }
 
@@ -604,6 +730,7 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
  */
 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 {
+	struct r5conf *conf = sh->raid_conf;
 	int write_disks = 0;
 	int data_pages, parity_pages;
 	int reserve;
@@ -654,13 +781,36 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	mutex_lock(&log->io_mutex);
 	/* meta + data */
 	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
-	if (!r5l_has_free_space(log, reserve)) {
-		spin_lock(&log->no_space_stripes_lock);
-		list_add_tail(&sh->log_list, &log->no_space_stripes);
-		spin_unlock(&log->no_space_stripes_lock);
 
-		r5l_wake_reclaim(log, reserve);
-	} else {
+	if (log->r5c_state == R5C_STATE_WRITE_THROUGH) {
+		if (!r5l_has_free_space(log, reserve)) {
+			spin_lock(&log->no_space_stripes_lock);
+			list_add_tail(&sh->log_list, &log->no_space_stripes);
+			spin_unlock(&log->no_space_stripes_lock);
+			r5l_wake_reclaim(log, reserve);
+		} else {
+			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
+			if (ret) {
+				spin_lock_irq(&log->io_list_lock);
+				list_add_tail(&sh->log_list,
+					      &log->no_mem_stripes);
+				spin_unlock_irq(&log->io_list_lock);
+			}
+		}
+	} else {  /* R5C_STATE_WRITE_BACK */
+		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) {
+			/* log space critical, only process stripes in cache */
+			if (list_empty(&sh->r5c)) {
+				spin_lock(&log->no_space_stripes_lock);
+				list_add_tail(&sh->log_list,
+					      &log->no_space_stripes);
+				spin_unlock(&log->no_space_stripes_lock);
+				mutex_unlock(&log->io_mutex);
+				r5l_wake_reclaim(log, reserve);
+				return -ENOSPC;
+			}
+		}
+		BUG_ON(!r5l_has_free_space(log, reserve));
 		ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
 		if (ret) {
 			spin_lock_irq(&log->io_list_lock);
@@ -716,10 +866,40 @@ static void r5l_run_no_space_stripes(struct r5l_log *log)
 	spin_unlock(&log->no_space_stripes_lock);
 }
 
+/*
+ * calculate new last_checkpoint
+ * for write through mode, returns log->next_checkpoint
+ * for write back, returns log_start of first sh in stripe_in_cache_list
+ */
+static sector_t r5c_calculate_new_cp(struct r5conf *conf)
+{
+	struct stripe_head *sh;
+	struct r5l_log *log = conf->log;
+	sector_t end = MaxSector;
+	unsigned long flags;
+
+	if (log->r5c_state == R5C_STATE_WRITE_THROUGH)
+		return log->next_checkpoint;
+
+	spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
+	if (list_empty(&conf->log->stripe_in_cache_list)) {
+		/* all stripes flushed */
+		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+		return log->next_checkpoint;
+	}
+	sh = list_first_entry(&conf->log->stripe_in_cache_list,
+			      struct stripe_head, r5c);
+	end = sh->log_start;
+	spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+	return end;
+}
+
 static sector_t r5l_reclaimable_space(struct r5l_log *log)
 {
+	struct r5conf *conf = log->rdev->mddev->private;
+
 	return r5l_ring_distance(log, log->last_checkpoint,
-				 log->next_checkpoint);
+				 r5c_calculate_new_cp(conf));
 }
 
 static void r5l_run_no_mem_stripe(struct r5l_log *log)
@@ -765,6 +945,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
 {
 	struct r5l_log *log = io->log;
+	struct r5conf *conf = log->rdev->mddev->private;
 	unsigned long flags;
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
@@ -775,7 +956,8 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
 		return;
 	}
 
-	if (r5l_reclaimable_space(log) > log->max_free_space)
+	if (r5l_reclaimable_space(log) > log->max_free_space ||
+	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
 		r5l_wake_reclaim(log, 0);
 
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
@@ -898,10 +1080,10 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 
 static void r5l_do_reclaim(struct r5l_log *log)
 {
+	struct r5conf *conf = log->rdev->mddev->private;
 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
 	sector_t reclaimable;
 	sector_t next_checkpoint;
-	u64 next_cp_seq;
 
 	spin_lock_irq(&log->io_list_lock);
 	/*
@@ -924,8 +1106,7 @@ static void r5l_do_reclaim(struct r5l_log *log)
 				    log->io_list_lock);
 	}
 
-	next_checkpoint = log->next_checkpoint;
-	next_cp_seq = log->next_cp_seq;
+	next_checkpoint = r5c_calculate_new_cp(conf);
 	spin_unlock_irq(&log->io_list_lock);
 
 	BUG_ON(reclaimable < 0);
@@ -941,7 +1122,7 @@ static void r5l_do_reclaim(struct r5l_log *log)
 
 	mutex_lock(&log->io_mutex);
 	log->last_checkpoint = next_checkpoint;
-	log->last_cp_seq = next_cp_seq;
+	r5c_update_log_state(log);
 	mutex_unlock(&log->io_mutex);
 
 	r5l_run_no_space_stripes(log);
@@ -955,6 +1136,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
 
 	if (!log)
 		return;
+	r5c_do_reclaim(conf);
 	r5l_do_reclaim(log);
 }
 
@@ -963,6 +1145,8 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 	unsigned long target;
 	unsigned long new = (unsigned long)space; /* overflow in theory */
 
+	if (!log)
+		return;
 	do {
 		target = log->reclaim_target;
 		if (new < target)
@@ -990,7 +1174,7 @@ void r5l_quiesce(struct r5l_log *log, int state)
 		/* make sure r5l_write_super_and_discard_space exits */
 		mddev = log->rdev->mddev;
 		wake_up(&mddev->sb_wait);
-		r5l_wake_reclaim(log, -1L);
+		r5l_wake_reclaim(log, MaxSector);
 		md_unregister_thread(&log->reclaim_thread);
 		r5l_do_reclaim(log);
 	}
@@ -1271,6 +1455,67 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 }
 
+/*
+ * r5c_flush_stripe will move stripe from cached list to handle_list
+ *
+ * must hold conf->device_lock
+ */
+static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+	BUG_ON(list_empty(&sh->lru));
+
+	if (!test_bit(STRIPE_R5C_FROZEN, &sh->state))
+		r5c_freeze_stripe_for_reclaim(sh);
+	if (!test_and_set_bit(STRIPE_HANDLE, &sh->state))
+		atomic_inc(&conf->active_stripes);
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	clear_bit(STRIPE_BIT_DELAY, &sh->state);
+
+	list_del_init(&sh->lru);
+	atomic_inc(&sh->count);
+	raid5_release_stripe(sh);
+}
+
+/*
+ * if num <= 0, flush all stripes
+ * if num > 0, flush at most num stripes
+ */
+int r5c_flush_cache(struct r5conf *conf, int num)
+{
+	int count = 0;
+	struct stripe_head *sh, *next;
+
+	assert_spin_locked(&conf->device_lock);
+	if (!conf->log)
+		return 0;
+
+	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
+		r5c_flush_stripe(conf, sh);
+		count++;
+		if (num > 0 && count >= num && count >=
+		    R5C_FULL_STRIPE_FLUSH_BATCH)
+			return count;
+	}
+
+	list_for_each_entry_safe(sh, next,
+				 &conf->r5c_partial_stripe_list, lru) {
+		r5c_flush_stripe(conf, sh);
+		count++;
+		if (num > 0 && count == num)
+			return count;
+	}
+
+	if (num <= 0) {
+		list_for_each_entry_safe(sh, next, &conf->delayed_list, lru) {
+			if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
+			    test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
+				r5c_flush_stripe(conf, sh);
+		}
+		r5l_run_no_space_stripes(conf->log);
+	}
+	return count;
+}
+
 int r5c_handle_stripe_dirtying(struct r5conf *conf,
 			       struct stripe_head *sh,
 			       struct stripe_head_state *s,
@@ -1327,6 +1572,7 @@ void r5c_handle_stripe_written(struct r5conf *conf,
 {
 	int i;
 	int do_wakeup = 0;
+	unsigned long flags;
 
 	if (!test_and_clear_bit(STRIPE_R5C_WRITTEN, &sh->state))
 		return;
@@ -1349,12 +1595,22 @@ void r5c_handle_stripe_written(struct r5conf *conf,
 
 	if (do_wakeup)
 		wake_up(&conf->wait_for_overlap);
+
+	if (conf->log->r5c_state == R5C_STATE_WRITE_THROUGH)
+		return;
+
+	spin_lock_irqsave(&conf->log->stripe_in_cache_lock, flags);
+	list_del_init(&sh->r5c);
+	spin_unlock_irqrestore(&conf->log->stripe_in_cache_lock, flags);
+	sh->log_start = MaxSector;
+	atomic_dec(&conf->log->stripe_in_cache_count);
 }
 
 int
 r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
 	       struct stripe_head_state *s)
 {
+	struct r5conf *conf = sh->raid_conf;
 	int pages;
 	int reserve;
 	int i;
@@ -1387,25 +1643,71 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
 	mutex_lock(&log->io_mutex);
 	/* meta + data */
 	reserve = (1 + pages) << (PAGE_SHIFT - 9);
-	if (!r5l_has_free_space(log, reserve)) {
-		spin_lock(&log->no_space_stripes_lock);
-		list_add_tail(&sh->log_list, &log->no_space_stripes);
-		spin_unlock(&log->no_space_stripes_lock);
 
-		r5l_wake_reclaim(log, reserve);
-	} else {
-		ret = r5l_log_stripe(log, sh, pages, 0);
-		if (ret) {
-			spin_lock_irq(&log->io_list_lock);
-			list_add_tail(&sh->log_list, &log->no_mem_stripes);
-			spin_unlock_irq(&log->io_list_lock);
+	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) {
+		/* if log space critical, only process data already cached */
+		if (list_empty(&sh->r5c)) {
+			spin_lock(&log->no_space_stripes_lock);
+			list_add_tail(&sh->log_list, &log->no_space_stripes);
+			spin_unlock(&log->no_space_stripes_lock);
+			mutex_unlock(&log->io_mutex);
+			return -ENOSPC;
 		}
 	}
+	/*
+	 * r5cache reserves enough log space for reclaim. If there is not
+	 * enough space to process, there must be a bug in reclaim
+	 */
+	if (!r5l_has_free_space(log, reserve))
+		BUG();
+	ret = r5l_log_stripe(log, sh, pages, 0);
+	if (ret) {
+		spin_lock_irq(&log->io_list_lock);
+		list_add_tail(&sh->log_list, &log->no_mem_stripes);
+		spin_unlock_irq(&log->io_list_lock);
+	}
 
 	mutex_unlock(&log->io_mutex);
 	return 0;
 }
 
+void r5c_do_reclaim(struct r5conf *conf)
+{
+	struct r5l_log *log = conf->log;
+	struct stripe_head *sh, *next;
+	int count = 0;
+	unsigned long flags;
+	sector_t last_checkpoint;
+
+	if (!r5c_is_writeback(log))
+		return;
+
+	/* flush all full stripes */
+	spin_lock_irqsave(&conf->device_lock, flags);
+	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru)
+		r5c_flush_stripe(conf, sh);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+
+	/* if log space is tight, start flushing data near last_checkpoint */
+	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
+		spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
+		spin_lock(&conf->device_lock);
+		last_checkpoint = (list_first_entry(&log->stripe_in_cache_list,
+						    struct stripe_head, r5c))->log_start;
+		list_for_each_entry(sh, &log->stripe_in_cache_list, r5c) {
+			if (!list_empty(&sh->lru)) {
+				r5c_flush_stripe(conf, sh);
+				count++;
+			}
+			if (count >= R5C_RECLAIM_STRIPE_GROUP)
+				break;
+		}
+		spin_unlock(&conf->device_lock);
+		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+	}
+	md_wakeup_thread(conf->mddev->thread);
+}
+
 static int r5l_load_log(struct r5l_log *log)
 {
 	struct md_rdev *rdev = log->rdev;
@@ -1463,6 +1765,9 @@ create:
 	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
 		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
 	log->last_checkpoint = cp;
+	mutex_lock(&log->io_mutex);
+	r5c_update_log_state(log);
+	mutex_unlock(&log->io_mutex);
 
 	__free_page(page);
 
@@ -1534,6 +1839,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
 		goto reclaim_thread;
+	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+
 	init_waitqueue_head(&log->iounit_wait);
 
 	INIT_LIST_HEAD(&log->no_mem_stripes);
@@ -1542,6 +1849,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	spin_lock_init(&log->no_space_stripes_lock);
 
 	log->r5c_state = R5C_STATE_WRITE_THROUGH;
+	INIT_LIST_HEAD(&log->stripe_in_cache_list);
+	spin_lock_init(&log->stripe_in_cache_lock);
+	atomic_set(&log->stripe_in_cache_count, 0);
 
 	if (r5l_load_log(log))
 		goto error;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 27fd183..5977d44 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -220,6 +220,11 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 {
 	BUG_ON(!list_empty(&sh->lru));
 	BUG_ON(atomic_read(&conf->active_stripes)==0);
+
+	/* When quiesce in r5c write back, make sure the stripe is handled*/
+	if (conf->quiesce && r5c_is_writeback(conf->log))
+		r5c_prepare_stripe_for_release_in_quiesce(sh);
+
 	if (test_bit(STRIPE_HANDLE, &sh->state)) {
 		if (test_bit(STRIPE_DELAYED, &sh->state) &&
 		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
@@ -256,6 +261,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 				if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
 					atomic_dec(&conf->r5c_cached_partial_stripes);
 				list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+				r5c_check_cached_full_stripe(conf);
 			} else {
 				/* partial stripe */
 				if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
@@ -626,9 +632,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 			}
 			if (noblock && sh == NULL)
 				break;
+
+			r5c_check_stripe_cache_usage(conf);
 			if (!sh) {
 				set_bit(R5_INACTIVE_BLOCKED,
 					&conf->cache_state);
+				r5l_wake_reclaim(conf->log, 0);
 				wait_event_lock_irq(
 					conf->wait_for_stripe,
 					!list_empty(conf->inactive_list + hash) &&
@@ -844,6 +853,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 	struct r5conf *conf = sh->raid_conf;
 	int i, disks = sh->disks;
 	struct stripe_head *head_sh = sh;
+	int ret;
 
 	might_sleep();
 
@@ -852,8 +862,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		return;
 	}
 
-	if (r5l_write_stripe(conf->log, sh) == 0)
+	ret = r5l_write_stripe(conf->log, sh);
+	if (ret == 0 || ret == -ENOSPC)
 		return;
+
 	for (i = disks; i--; ) {
 		int op, op_flags = 0;
 		int replace_only = 0;
@@ -1983,8 +1995,10 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		spin_lock_init(&sh->batch_lock);
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
+		INIT_LIST_HEAD(&sh->r5c);
 		atomic_set(&sh->count, 1);
 		atomic_set(&sh->dev_in_cache, 0);
+		sh->log_start = MaxSector;
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
 
@@ -4739,6 +4753,10 @@ static int raid5_congested(struct mddev *mddev, int bits)
 
 	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 		return 1;
+
+	/* Also checks whether there is pressure on r5cache log space */
+	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
+		return 1;
 	if (conf->quiesce)
 		return 1;
 	if (atomic_read(&conf->empty_inactive_list_nr))
@@ -7704,6 +7722,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
 		 */
+		r5c_flush_cache(conf, 0);
 		conf->quiesce = 2;
 		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ac6d7c7..d17eed4 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -227,6 +227,8 @@ struct stripe_head {
 	struct r5l_io_unit	*log_io;
 	struct list_head	log_list;
 	atomic_t		dev_in_cache;
+	sector_t		log_start; /* first meta block on the journal */
+	struct list_head	r5c; /* for r5c_cache->stripe_in_cache */
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -521,6 +523,26 @@ struct r5worker_group {
 	int stripes_cnt;
 };
 
+enum r5_cache_state {
+	R5_INACTIVE_BLOCKED,	/* release of inactive stripes blocked,
+				 * waiting for 25% to be free
+				 */
+	R5_ALLOC_MORE,		/* It might help to allocate another
+				 * stripe.
+				 */
+	R5_DID_ALLOC,		/* A stripe was allocated, don't allocate
+				 * more until at least one has been
+				 * released.  This avoids flooding
+				 * the cache.
+				 */
+	R5C_LOG_TIGHT,		/* journal device space tight, need to
+				 * prioritize stripes at last_checkpoint
+				 */
+	R5C_LOG_CRITICAL,	/* journal device is running out of space,
+				 * only process stripes at last_checkpoint
+				 */
+};
+
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
 	/* only protect corresponding hash list and inactive_list */
@@ -622,17 +644,6 @@ struct r5conf {
 	wait_queue_head_t	wait_for_stripe;
 	wait_queue_head_t	wait_for_overlap;
 	unsigned long		cache_state;
-#define R5_INACTIVE_BLOCKED	1	/* release of inactive stripes blocked,
-					 * waiting for 25% to be free
-					 */
-#define R5_ALLOC_MORE		2	/* It might help to allocate another
-					 * stripe.
-					 */
-#define R5_DID_ALLOC		4	/* A stripe was allocated, don't allocate
-					 * more until at least one has been
-					 * released.  This avoids flooding
-					 * the cache.
-					 */
 	struct shrinker		shrinker;
 	int			pool_size; /* number of disks in stripeheads in pool */
 	spinlock_t		device_lock;
@@ -742,4 +753,10 @@ extern void r5c_handle_cached_data_endio(struct r5conf *conf,
 extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
 			  struct stripe_head_state *s);
 extern void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh);
+extern void r5c_prepare_stripe_for_release_in_quiesce(struct stripe_head *sh);
+extern void r5c_do_reclaim(struct r5conf *conf);
+extern int r5c_flush_cache(struct r5conf *conf, int num);
+extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
+extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+
 #endif
-- 
2.9.3


^ permalink raw reply related

* [PATCH v3 4/8] md/r5cache: write part of r5cache
From: Song Liu @ 2016-10-06  7:14 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161006071416.3295093-1-songliubraving@fb.com>

This is the write part of r5cache. The cache is integrated with
stripe cache of raid456. It leverages code of r5l_log to write
data to journal device.

r5cache split current write path into 2 parts: the write path
and the reclaim path. The write path is as following:
1. write data to journal
   (r5c_handle_stripe_dirtying, r5c_cache_data)
2. call bio_endio
   (r5c_handle_data_cached, r5c_return_dev_pending_writes).

Then the reclaim path is as:
1. Freeze the stripe (r5c_freeze_stripe_for_reclaim)
2. Calcualte parity (reconstruct or RMW)
3. Write parity (and maybe some other data) to journal device
4. Write data and parity to RAID disks

Reclaim path of the cache is implemented in the next patch.

With r5cache, write operation does not wait for parity calculation
and write out, so the write latency is lower (1 write to journal
device vs. read and then write to raid disks). Also, r5cache will
reduce RAID overhead (multipile IO due to read-modify-write of
parity) and provide more opportunities of full stripe writes.

This patch adds 2 flags to stripe_head.state:
 - STRIPE_R5C_PARTIAL_STRIPE,
 - STRIPE_R5C_FULL_STRIPE,

Instead of inactive_list, stripes with cached data are tracked in
r5conf->r5c_full_stripe_list and r5conf->r5c_partial_stripe_list.
STRIPE_R5C_FULL_STRIPE and STRIPE_R5C_PARTIAL_STRIPE are flags for
stripes in these lists. Note: stripes in r5c_full/partial_stripe_list
are not considered as "active".

For RMW, the code allocates an extra page for each data block
being updated.  This is stored in r5dev->page and the old data
is read into it.  Then the prexor calculation subtracts ->page
from the parity block, and the reconstruct calculation adds the
->orig_page data back into the parity block.

r5cache naturally excludes SkipCopy. With R5_Wantcache bit set,
async_copy_data will not skip copy.

There are some known limitations of the cache implementation:

1. Write cache only covers full page writes (R5_OVERWRITE). Writes
   of smaller granularity are write through.
2. Only one log io (sh->log_io) for each stripe at anytime. Later
   writes for the same stripe have to wait. This can be improved by
   moving log_io to r5dev.
3. With writeback cache, read path must enter state machine, which
   is a significant bottleneck for some workloads.
4. There is no per stripe checkpoint (with r5l_payload_flush) in
   the log, so recovery code has to replay more than necessary data
   (sometimes all the log from last_checkpoint). This reduces
   availability of the array.

This patch includes a fix proposed by ZhengYuan Liu
<liuzhengyuan@kylinos.cn>

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 204 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/raid5.c       | 137 ++++++++++++++++++++++++++-----
 drivers/md/raid5.h       |  22 +++++
 3 files changed, 336 insertions(+), 27 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 9e05850..92d3d7b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -20,6 +20,7 @@
 #include <linux/random.h>
 #include "md.h"
 #include "raid5.h"
+#include "bitmap.h"
 
 /*
  * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -217,6 +218,44 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 	io->state = state;
 }
 
+static void
+r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
+			      struct bio_list *return_bi)
+{
+	struct bio *wbi, *wbi2;
+
+	wbi = dev->written;
+	dev->written = NULL;
+	while (wbi && wbi->bi_iter.bi_sector <
+	       dev->sector + STRIPE_SECTORS) {
+		wbi2 = r5_next_bio(wbi, dev->sector);
+		if (!raid5_dec_bi_active_stripes(wbi)) {
+			md_write_end(conf->mddev);
+			bio_list_add(return_bi, wbi);
+		}
+		wbi = wbi2;
+	}
+}
+
+void r5c_handle_cached_data_endio(struct r5conf *conf,
+	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
+{
+	int i;
+
+	for (i = sh->disks; i--; ) {
+		if (test_bit(R5_InCache, &sh->dev[i].flags) &&
+		    sh->dev[i].written) {
+			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+			r5c_return_dev_pending_writes(conf, &sh->dev[i],
+						      return_bi);
+			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+					STRIPE_SECTORS,
+					!test_bit(STRIPE_DEGRADED, &sh->state),
+					0);
+		}
+	}
+}
+
 /*
  * Freeze the stripe, thus send the stripe into reclaim path.
  *
@@ -233,6 +272,48 @@ void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh)
 		return;
 	WARN_ON(test_bit(STRIPE_R5C_FROZEN, &sh->state));
 	set_bit(STRIPE_R5C_FROZEN, &sh->state);
+
+	if (log->r5c_state == R5C_STATE_WRITE_THROUGH)
+		return;
+
+	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		atomic_inc(&conf->preread_active_stripes);
+
+	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_partial_stripes);
+	}
+
+	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_full_stripes);
+	}
+}
+
+static void r5c_handle_data_cached(struct stripe_head *sh)
+{
+	int i;
+
+	for (i = sh->disks; i--; )
+		if (test_and_clear_bit(R5_Wantcache, &sh->dev[i].flags)) {
+			set_bit(R5_InCache, &sh->dev[i].flags);
+			clear_bit(R5_LOCKED, &sh->dev[i].flags);
+			atomic_inc(&sh->dev_in_cache);
+		}
+}
+
+/*
+ * this journal write must contain full parity,
+ * it may also contain some data pages
+ */
+static void r5c_handle_parity_cached(struct stripe_head *sh)
+{
+	int i;
+
+	for (i = sh->disks; i--; )
+		if (test_bit(R5_InCache, &sh->dev[i].flags))
+			set_bit(R5_Wantwrite, &sh->dev[i].flags);
+	set_bit(STRIPE_R5C_WRITTEN, &sh->state);
 }
 
 static void r5c_finish_cache_stripe(struct stripe_head *sh)
@@ -242,8 +323,10 @@ static void r5c_finish_cache_stripe(struct stripe_head *sh)
 	if (log->r5c_state == R5C_STATE_WRITE_THROUGH) {
 		BUG_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
 		set_bit(STRIPE_R5C_WRITTEN, &sh->state);
-	} else
-		BUG(); /* write back logic in next patch */
+	} else if (test_bit(STRIPE_R5C_FROZEN, &sh->state))
+		r5c_handle_parity_cached(sh);
+	else
+		r5c_handle_data_cached(sh);
 }
 
 static void r5l_io_run_stripes(struct r5l_io_unit *io)
@@ -483,7 +566,8 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	io = log->current_io;
 
 	for (i = 0; i < sh->disks; i++) {
-		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) &&
+		    !test_bit(R5_Wantcache, &sh->dev[i].flags))
 			continue;
 		if (i == sh->pd_idx || i == sh->qd_idx)
 			continue;
@@ -514,7 +598,6 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	return 0;
 }
 
-static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
 /*
  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
  * data from log to raid disks), so we shouldn't wait for reclaim here
@@ -544,6 +627,10 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 
 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
 			continue;
+
+		if (test_bit(R5_InCache, &sh->dev[i].flags))
+			continue;
+
 		write_disks++;
 		/* checksum is already calculated in last run */
 		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
@@ -809,7 +896,6 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	}
 }
 
-
 static void r5l_do_reclaim(struct r5l_log *log)
 {
 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
@@ -872,7 +958,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
 	r5l_do_reclaim(log);
 }
 
-static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 {
 	unsigned long target;
 	unsigned long new = (unsigned long)space; /* overflow in theory */
@@ -1191,6 +1277,8 @@ int r5c_handle_stripe_dirtying(struct r5conf *conf,
 			       int disks)
 {
 	struct r5l_log *log = conf->log;
+	int i;
+	struct r5dev *dev;
 
 	if (!log || test_bit(STRIPE_R5C_FROZEN, &sh->state))
 		return -EAGAIN;
@@ -1201,21 +1289,121 @@ int r5c_handle_stripe_dirtying(struct r5conf *conf,
 		r5c_freeze_stripe_for_reclaim(sh);
 		return -EAGAIN;
 	}
-	BUG();  /* write back logic in next commit */
+
+	s->to_cache = 0;
+
+	for (i = disks; i--; ) {
+		dev = &sh->dev[i];
+		/* if none-overwrite, use the reclaim path (write through) */
+		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
+		    !test_bit(R5_InCache, &dev->flags)) {
+			r5c_freeze_stripe_for_reclaim(sh);
+			return -EAGAIN;
+		}
+	}
+
+	for (i = disks; i--; ) {
+		dev = &sh->dev[i];
+		if (dev->towrite) {
+			set_bit(R5_Wantcache, &dev->flags);
+			set_bit(R5_Wantdrain, &dev->flags);
+			set_bit(R5_LOCKED, &dev->flags);
+			s->to_cache++;
+		}
+	}
+
+	if (s->to_cache)
+		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+
 	return 0;
 }
 
 /*
  * clean up the stripe (clear STRIPE_R5C_FROZEN etc.) after the stripe is
  * committed to RAID disks
-*/
+ */
 void r5c_handle_stripe_written(struct r5conf *conf,
 			       struct stripe_head *sh)
 {
+	int i;
+	int do_wakeup = 0;
+
 	if (!test_and_clear_bit(STRIPE_R5C_WRITTEN, &sh->state))
 		return;
 	WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
 	clear_bit(STRIPE_R5C_FROZEN, &sh->state);
+
+	if (conf->log->r5c_state == R5C_STATE_WRITE_THROUGH)
+		return;
+
+	for (i = sh->disks; i--; ) {
+		if (test_and_clear_bit(R5_InCache, &sh->dev[i].flags))
+			atomic_dec(&sh->dev_in_cache);
+		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+			do_wakeup = 1;
+	}
+
+	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+		if (atomic_dec_and_test(&conf->pending_full_writes))
+			md_wakeup_thread(conf->mddev->thread);
+
+	if (do_wakeup)
+		wake_up(&conf->wait_for_overlap);
+}
+
+int
+r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+	       struct stripe_head_state *s)
+{
+	int pages;
+	int reserve;
+	int i;
+	int ret = 0;
+	int page_count = 0;
+
+	BUG_ON(!log);
+
+	for (i = 0; i < sh->disks; i++) {
+		void *addr;
+
+		if (!test_bit(R5_Wantcache, &sh->dev[i].flags))
+			continue;
+		addr = kmap_atomic(sh->dev[i].page);
+		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
+						    addr, PAGE_SIZE);
+		kunmap_atomic(addr);
+		page_count++;
+	}
+	WARN_ON(page_count != s->to_cache);
+	pages = s->to_cache;
+
+	/*
+	 * The stripe must enter state machine again to call endio, so
+	 * don't delay.
+	 */
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	atomic_inc(&sh->count);
+
+	mutex_lock(&log->io_mutex);
+	/* meta + data */
+	reserve = (1 + pages) << (PAGE_SHIFT - 9);
+	if (!r5l_has_free_space(log, reserve)) {
+		spin_lock(&log->no_space_stripes_lock);
+		list_add_tail(&sh->log_list, &log->no_space_stripes);
+		spin_unlock(&log->no_space_stripes_lock);
+
+		r5l_wake_reclaim(log, reserve);
+	} else {
+		ret = r5l_log_stripe(log, sh, pages, 0);
+		if (ret) {
+			spin_lock_irq(&log->io_list_lock);
+			list_add_tail(&sh->log_list, &log->no_mem_stripes);
+			spin_unlock_irq(&log->io_list_lock);
+		}
+	}
+
+	mutex_unlock(&log->io_mutex);
+	return 0;
 }
 
 static int r5l_load_log(struct r5l_log *log)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2e3e61a..27fd183 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -245,8 +245,25 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
-		if (!test_bit(STRIPE_EXPANDING, &sh->state))
-			list_add_tail(&sh->lru, temp_inactive_list);
+		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+			if (atomic_read(&sh->dev_in_cache) == 0) {
+				list_add_tail(&sh->lru, temp_inactive_list);
+			} else if (atomic_read(&sh->dev_in_cache) ==
+				   conf->raid_disks - conf->max_degraded) {
+				/* full stripe */
+				if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
+					atomic_inc(&conf->r5c_cached_full_stripes);
+				if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+					atomic_dec(&conf->r5c_cached_partial_stripes);
+				list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+			} else {
+				/* partial stripe */
+				if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
+						      &sh->state))
+					atomic_inc(&conf->r5c_cached_partial_stripes);
+				list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
+			}
+		}
 	}
 }
 
@@ -830,6 +847,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
+	if (s->to_cache) {
+		r5c_cache_data(conf->log, sh, s);
+		return;
+	}
+
 	if (r5l_write_stripe(conf->log, sh) == 0)
 		return;
 	for (i = disks; i--; ) {
@@ -1044,7 +1066,7 @@ again:
 static struct dma_async_tx_descriptor *
 async_copy_data(int frombio, struct bio *bio, struct page **page,
 	sector_t sector, struct dma_async_tx_descriptor *tx,
-	struct stripe_head *sh)
+	struct stripe_head *sh, int no_skipcopy)
 {
 	struct bio_vec bvl;
 	struct bvec_iter iter;
@@ -1084,7 +1106,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 			if (frombio) {
 				if (sh->raid_conf->skip_copy &&
 				    b_offset == 0 && page_offset == 0 &&
-				    clen == STRIPE_SIZE)
+				    clen == STRIPE_SIZE &&
+				    !no_skipcopy)
 					*page = bio_page;
 				else
 					tx = async_memcpy(*page, bio_page, page_offset,
@@ -1166,7 +1189,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, &dev->page,
-					dev->sector, tx, sh);
+						     dev->sector, tx, sh, 0);
 				rbi = r5_next_bio(rbi, dev->sector);
 			}
 		}
@@ -1293,7 +1316,8 @@ static int set_syndrome_sources(struct page **srcs,
 		if (i == sh->qd_idx || i == sh->pd_idx ||
 		    (srctype == SYNDROME_SRC_ALL) ||
 		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
-		     test_bit(R5_Wantdrain, &dev->flags)) ||
+		     (test_bit(R5_Wantdrain, &dev->flags) ||
+		      test_bit(R5_InCache, &dev->flags))) ||
 		    (srctype == SYNDROME_SRC_WRITTEN &&
 		     dev->written))
 			srcs[slot] = sh->dev[i].page;
@@ -1472,9 +1496,25 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 static void ops_complete_prexor(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
+	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
+
+	if (!r5c_is_writeback(sh->raid_conf->log))
+		return;
+
+	/*
+	 * raid5-cache write back uses orig_page during prexor. after prexor,
+	 * it is time to free orig_page
+	 */
+	for (i = sh->disks; i--; )
+		if (sh->dev[i].page != sh->dev[i].orig_page) {
+			struct page *p = sh->dev[i].page;
+
+			sh->dev[i].page = sh->dev[i].orig_page;
+			put_page(p);
+		}
 }
 
 static struct dma_async_tx_descriptor *
@@ -1496,7 +1536,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (test_bit(R5_Wantdrain, &dev->flags))
+		if (test_bit(R5_Wantdrain, &dev->flags) ||
+		    test_bit(R5_InCache, &dev->flags))
 			xor_srcs[count++] = dev->page;
 	}
 
@@ -1547,6 +1588,10 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 again:
 			dev = &sh->dev[i];
+			if (test_and_clear_bit(R5_InCache, &dev->flags)) {
+				BUG_ON(atomic_read(&sh->dev_in_cache) == 0);
+				atomic_dec(&sh->dev_in_cache);
+			}
 			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
@@ -1554,7 +1599,13 @@ again:
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
 			spin_unlock_irq(&sh->stripe_lock);
-			WARN_ON(dev->page != dev->orig_page);
+
+			/*
+			 * in biodrain stage, prexor for data in raid5-cache
+			 * is the only case where page != orig_page
+			 */
+			if (!test_bit(R5_Wantcache, &dev->flags))
+				WARN_ON(dev->page != dev->orig_page);
 
 			while (wbi && wbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
@@ -1566,8 +1617,10 @@ again:
 					set_bit(R5_Discard, &dev->flags);
 				else {
 					tx = async_copy_data(1, wbi, &dev->page,
-						dev->sector, tx, sh);
-					if (dev->page != dev->orig_page) {
+							     dev->sector, tx, sh,
+							     test_bit(R5_Wantcache, &dev->flags));
+					if (dev->page != dev->orig_page &&
+					    !test_bit(R5_Wantcache, &dev->flags)) {
 						set_bit(R5_SkipCopy, &dev->flags);
 						clear_bit(R5_UPTODATE, &dev->flags);
 						clear_bit(R5_OVERWRITE, &dev->flags);
@@ -1675,7 +1728,8 @@ again:
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (head_sh->dev[i].written)
+			if (head_sh->dev[i].written ||
+			    test_bit(R5_InCache, &head_sh->dev[i].flags))
 				xor_srcs[count++] = dev->page;
 		}
 	} else {
@@ -1930,6 +1984,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
 		atomic_set(&sh->count, 1);
+		atomic_set(&sh->dev_in_cache, 0);
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
 
@@ -2816,6 +2871,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				if (!expand)
 					clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
+			} else if (test_bit(R5_InCache, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
 			}
 		}
 		/* if we are not expanding this is a proper write request, and
@@ -2855,6 +2913,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				set_bit(R5_LOCKED, &dev->flags);
 				clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
+			} else if (test_bit(R5_InCache, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
 			}
 		}
 		if (!s->locked)
@@ -3529,9 +3590,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+		     test_bit(R5_InCache, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		    !((test_bit(R5_UPTODATE, &dev->flags) &&
+		       (!test_bit(R5_InCache, &dev->flags) ||
+			dev->page != dev->orig_page)) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rmw++;
@@ -3543,13 +3607,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		    test_bit(R5_Wantcompute, &dev->flags))) {
+		      test_bit(R5_InCache, &dev->flags) ||
+		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rcw++;
 			else
 				rcw += 2*disks;
 		}
 	}
+
 	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
 		(unsigned long long)sh->sector, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -3561,10 +3627,18 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+			if (test_bit(R5_InCache, &dev->flags) &&
+			    dev->page == dev->orig_page)
+				dev->page = alloc_page(GFP_NOIO);  /* prexor */
+
+			if ((dev->towrite ||
+			     i == sh->pd_idx || i == sh->qd_idx ||
+			     test_bit(R5_InCache, &dev->flags)) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !(test_bit(R5_UPTODATE, &dev->flags) ||
-			    test_bit(R5_Wantcompute, &dev->flags)) &&
+			    !((test_bit(R5_UPTODATE, &dev->flags) &&
+			       (!test_bit(R5_InCache, &dev->flags) ||
+				dev->page != dev->orig_page)) ||
+			      test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (test_bit(STRIPE_PREREAD_ACTIVE,
 					     &sh->state)) {
@@ -3590,6 +3664,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    i != sh->pd_idx && i != sh->qd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			      test_bit(R5_InCache, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
 				if (test_bit(R5_Insync, &dev->flags) &&
@@ -3629,7 +3704,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	 */
 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
-	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+	     !test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		schedule_reconstruction(sh, s, rcw == 0, 0);
 }
 
@@ -4120,6 +4195,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			if (rdev && !test_bit(Faulty, &rdev->flags))
 				do_recovery = 1;
 		}
+		if (test_bit(R5_InCache, &dev->flags) && dev->written)
+			s->just_cached++;
+		if (test_bit(R5_Wantcache, &dev->flags) && dev->written)
+			s->want_cache++;
 	}
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
@@ -4285,6 +4364,17 @@ static void handle_stripe(struct stripe_head *sh)
 
 	analyse_stripe(sh, &s);
 
+	if (s.want_cache) {
+		/* In last run of handle_stripe, we have finished
+		 * r5c_handle_stripe_dirtying and ops_run_biodrain, but
+		 * r5c_cache_data didn't finish because the journal device
+		 * didn't have enough space. This time we should continue
+		 * r5c_cache_data
+		 */
+		s.to_cache = s.want_cache;
+		goto finish;
+	}
+
 	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
 		goto finish;
 
@@ -4348,7 +4438,7 @@ static void handle_stripe(struct stripe_head *sh)
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&
 				(i == sh->pd_idx || i == sh->qd_idx ||
-				 dev->written)) {
+				 dev->written || test_bit(R5_InCache, &dev->flags))) {
 				pr_debug("Writing block %d\n", i);
 				set_bit(R5_Wantwrite, &dev->flags);
 				if (prexor)
@@ -4388,6 +4478,10 @@ static void handle_stripe(struct stripe_head *sh)
 				 test_bit(R5_Discard, &qdev->flags))))))
 		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
+	if (s.just_cached)
+		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+	r5l_stripe_write_finished(sh);
+
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
@@ -6526,6 +6620,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
 		INIT_LIST_HEAD(conf->temp_inactive_list + i);
 
+	atomic_set(&conf->r5c_cached_full_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
+	atomic_set(&conf->r5c_cached_partial_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+
 	conf->level = mddev->new_level;
 	conf->chunk_sectors = mddev->new_chunk_sectors;
 	if (raid5_alloc_percpu(conf) != 0)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 8bae64b..ac6d7c7 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -226,6 +226,7 @@ struct stripe_head {
 
 	struct r5l_io_unit	*log_io;
 	struct list_head	log_list;
+	atomic_t		dev_in_cache;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -263,6 +264,7 @@ struct stripe_head_state {
 	 */
 	int syncing, expanding, expanded, replacing;
 	int locked, uptodate, to_read, to_write, failed, written;
+	int to_cache, want_cache, just_cached;
 	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num[2];
 	int p_failed, q_failed;
@@ -313,6 +315,8 @@ enum r5dev_flags {
 			 */
 	R5_Discard,	/* Discard the stripe */
 	R5_SkipCopy,	/* Don't copy data from bio to stripe cache */
+	R5_Wantcache,	/* Want write data to write cache */
+	R5_InCache,	/* Data in cache */
 };
 
 /*
@@ -348,6 +352,12 @@ enum {
 	STRIPE_LOG_TRAPPED,	/* trapped into log */
 	STRIPE_R5C_FROZEN,	/* r5c_cache frozen and being written out */
 	STRIPE_R5C_WRITTEN,	/* ready for r5c_handle_stripe_written() */
+	STRIPE_R5C_PARTIAL_STRIPE,	/* in r5c cache (to-be/being handled or
+					 * in conf->r5c_partial_stripe_list)
+					 */
+	STRIPE_R5C_FULL_STRIPE,	/* in r5c cache (to-be/being handled or
+				 * in conf->r5c_full_stripe_list)
+				 */
 };
 
 #define STRIPE_EXPAND_SYNC_FLAGS \
@@ -600,6 +610,12 @@ struct r5conf {
 	 */
 	atomic_t		active_stripes;
 	struct list_head	inactive_list[NR_STRIPE_HASH_LOCKS];
+
+	atomic_t		r5c_cached_full_stripes;
+	struct list_head	r5c_full_stripe_list;
+	atomic_t		r5c_cached_partial_stripes;
+	struct list_head	r5c_partial_stripe_list;
+
 	atomic_t		empty_inactive_list_nr;
 	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_quiescent;
@@ -720,4 +736,10 @@ r5c_handle_stripe_dirtying(struct r5conf *conf, struct stripe_head *sh,
 			   struct stripe_head_state *s, int disks);
 extern void
 r5c_handle_stripe_written(struct r5conf *conf, struct stripe_head *sh);
+extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+extern void r5c_handle_cached_data_endio(struct r5conf *conf,
+	struct stripe_head *sh, int disks, struct bio_list *return_bi);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+			  struct stripe_head_state *s);
+extern void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh);
 #endif
-- 
2.9.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox