linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Logan Gunthorpe <logang@deltatee.com>
To: linux-raid@vger.kernel.org, Jes Sorensen <jes@trained-monkey.org>
Cc: Guoqing Jiang <guoqing.jiang@linux.dev>, Xiao Ni <xni@redhat.com>,
	Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>,
	Coly Li <colyli@suse.de>,
	Chaitanya Kulkarni <chaitanyak@nvidia.com>,
	Jonmichael Hands <jm@chia.net>,
	Stephen Bates <sbates@raithlin.com>,
	Martin Oliveira <Martin.Oliveira@eideticom.com>,
	David Sloan <David.Sloan@eideticom.com>,
	Logan Gunthorpe <logang@deltatee.com>
Subject: [PATCH mdadm v4 5/7] mdadm: Add --write-zeros option for Create
Date: Fri,  7 Oct 2022 14:10:35 -0600	[thread overview]
Message-ID: <20221007201037.20263-6-logang@deltatee.com> (raw)
In-Reply-To: <20221007201037.20263-1-logang@deltatee.com>

Add the --write-zeros option for Create which will send a write zeros
request to all the disks before assembling the array. After zeroing
the array, the disks will be in a known clean state and the initial
sync may be skipped.

Writing zeroes is best used when there is a hardware offload method
to zero the data. But even still, zeroing can take several minutes on
a large device. Because of this, all disks are zeroed in parallel using
their own forked process and a message is printed to the user. The main
process will proceed only after all the zeroing processes have completed
successfully.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>

fixup! mdadm: Add --write-zeros option for Create
---
 Create.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 ReadMe.c |  2 ++
 mdadm.c  |  9 ++++++
 mdadm.h  |  5 ++++
 4 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/Create.c b/Create.c
index 4acda30c5256..36d63244f9ad 100644
--- a/Create.c
+++ b/Create.c
@@ -26,6 +26,8 @@
 #include	"md_u.h"
 #include	"md_p.h"
 #include	<ctype.h>
+#include	<fcntl.h>
+#include	<sys/wait.h>
 
 static int round_size_and_verify(unsigned long long *size, int chunk)
 {
@@ -91,9 +93,77 @@ int default_layout(struct supertype *st, int level, int verbose)
 	return layout;
 }
 
+static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st,
+			       struct mddev_dev *dv)
+
+{
+	unsigned long long offset_bytes, size_bytes;
+	int ret = 0;
+	pid_t pid;
+
+	size_bytes = KIB_TO_BYTES(s->size);
+
+	/*
+	 * If size_bytes is zero, this is a zoned raid array where
+	 * each disk is of a different size and uses its full
+	 * disk. Thus zero the entire disk.
+	 */
+	if (!size_bytes && !get_dev_size(fd, dv->devname, &size_bytes))
+		return -1;
+
+	if (dv->data_offset != INVALID_SECTORS)
+		offset_bytes = SEC_TO_BYTES(dv->data_offset);
+	else
+		offset_bytes = SEC_TO_BYTES(st->data_offset);
+
+	pr_info("zeroing data from %lld to %lld on: %s\n",
+		offset_bytes, size_bytes, dv->devname);
+
+	pid = fork();
+	if (pid < 0) {
+		pr_err("Could not fork to zero disks: %m\n");
+		return pid;
+	} else if (pid != 0) {
+		return pid;
+	}
+
+	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
+		      offset_bytes, size_bytes)) {
+		pr_err("zeroing %s failed: %m\n", dv->devname);
+		ret = 1;
+	}
+
+	exit(ret);
+}
+
+static int wait_for_zero_forks(int *zero_pids, int count)
+{
+	int wstatus, ret = 0, i;
+	bool waited = false;
+
+	for (i = 0; i < count; i++) {
+		if (!zero_pids[i])
+			continue;
+
+		waited = true;
+		waitpid(zero_pids[i], &wstatus, 0);
+
+		if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
+			ret = 1;
+	}
+
+	if (ret)
+		pr_err("zeroing failed!\n");
+	else if (waited)
+		pr_info("zeroing finished\n");
+
+	return ret;
+}
+
 static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
 		struct supertype *st, struct mddev_dev *dv,
-		struct mdinfo *info, int have_container, int major_num)
+		struct mdinfo *info, int have_container, int major_num,
+		int *zero_pid)
 {
 	dev_t rdev;
 	int fd;
@@ -148,6 +218,14 @@ static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
 	}
 	st->ss->getinfo_super(st, info, NULL);
 
+	if (fd >= 0 && s->write_zeroes) {
+		*zero_pid = write_zeroes_fork(fd, s, st, dv);
+		if (*zero_pid <= 0) {
+			ioctl(mdfd, STOP_ARRAY, NULL);
+			return 1;
+		}
+	}
+
 	if (have_container && c->verbose > 0)
 		pr_err("Using %s for device %d\n",
 		       map_dev(info->disk.major, info->disk.minor, 0),
@@ -224,10 +302,12 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 {
 	struct mddev_dev *moved_disk = NULL;
 	int pass, raid_disk_num, dnum;
+	int zero_pids[total_slots];
 	struct mddev_dev *dv;
 	struct mdinfo *infos;
 	int ret = 0;
 
+	memset(zero_pids, 0, sizeof(zero_pids));
 	infos = xmalloc(sizeof(*infos) * total_slots);
 	enable_fds(total_slots);
 	for (pass = 1; pass <= 2; pass++) {
@@ -261,7 +341,7 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 
 				ret = add_disk_to_super(mdfd, s, c, st, dv,
 						&infos[dnum], have_container,
-						major_num);
+						major_num, &zero_pids[dnum]);
 				if (ret)
 					goto out;
 
@@ -287,6 +367,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 		}
 
 		if (pass == 1) {
+			ret = wait_for_zero_forks(zero_pids, total_slots);
+			if (ret)
+				goto out;
+
 			ret = update_metadata(mdfd, s, st, map, info,
 					      chosen_name);
 			if (ret)
@@ -295,6 +379,8 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 	}
 
 out:
+	if (ret)
+		wait_for_zero_forks(zero_pids, total_slots);
 	free(infos);
 	return ret;
 }
diff --git a/ReadMe.c b/ReadMe.c
index 50a5e36d05fc..9424bfc3eeca 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -138,6 +138,7 @@ struct option long_options[] = {
     {"size",	  1, 0, 'z'},
     {"auto",	  1, 0, Auto}, /* also for --assemble */
     {"assume-clean",0,0, AssumeClean },
+    {"write-zeroes",0,0, WriteZeroes },
     {"metadata",  1, 0, 'e'}, /* superblock format */
     {"bitmap",	  1, 0, Bitmap},
     {"bitmap-chunk", 1, 0, BitmapChunk},
@@ -390,6 +391,7 @@ char Help_create[] =
 "  --write-journal=      : Specify journal device for RAID-4/5/6 array\n"
 "  --consistency-policy= : Specify the policy that determines how the array\n"
 "                     -k : maintains consistency in case of unexpected shutdown.\n"
+"  --write-zeroes        : Write zeroes to the disks before creating. This will bypass initial sync.\n"
 "\n"
 ;
 
diff --git a/mdadm.c b/mdadm.c
index 972adb524dfb..141838bd394f 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -602,6 +602,10 @@ int main(int argc, char *argv[])
 			s.assume_clean = 1;
 			continue;
 
+		case O(CREATE, WriteZeroes):
+			s.write_zeroes = 1;
+			continue;
+
 		case O(GROW,'n'):
 		case O(CREATE,'n'):
 		case O(BUILD,'n'): /* number of raid disks */
@@ -1306,6 +1310,11 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	if (s.write_zeroes && !s.assume_clean) {
+		pr_info("Disk zeroing requested, setting --assume-clean to skip resync\n");
+		s.assume_clean = 1;
+	}
+
 	if (!mode && devs_found) {
 		mode = MISC;
 		devmode = 'Q';
diff --git a/mdadm.h b/mdadm.h
index 18c24915e94c..82e920fb523a 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -273,6 +273,9 @@ static inline void __put_unaligned32(__u32 val, void *p)
 
 #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
 
+#define KIB_TO_BYTES(x)	((x) << 10)
+#define SEC_TO_BYTES(x)	((x) << 9)
+
 extern const char Name[];
 
 struct md_bb_entry {
@@ -433,6 +436,7 @@ extern char Version[], Usage[], Help[], OptionHelp[],
  */
 enum special_options {
 	AssumeClean = 300,
+	WriteZeroes,
 	BitmapChunk,
 	WriteBehind,
 	ReAdd,
@@ -593,6 +597,7 @@ struct shape {
 	int	bitmap_chunk;
 	char	*bitmap_file;
 	int	assume_clean;
+	bool	write_zeroes;
 	int	write_behind;
 	unsigned long long size;
 	unsigned long long data_offset;
-- 
2.30.2


  parent reply	other threads:[~2022-10-07 20:11 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-10-07 20:10 [PATCH mdadm v4 0/7] Write Zeroes option for Creating Arrays Logan Gunthorpe
2022-10-07 20:10 ` [PATCH mdadm v4 1/7] Create: goto abort_locked instead of return 1 in error path Logan Gunthorpe
2022-10-07 20:10 ` [PATCH mdadm v4 2/7] Create: remove safe_mode_delay local variable Logan Gunthorpe
2022-10-07 20:10 ` [PATCH mdadm v4 3/7] Create: Factor out add_disks() helpers Logan Gunthorpe
2022-10-07 20:10 ` [PATCH mdadm v4 4/7] mdadm: Introduce pr_info() Logan Gunthorpe
2022-10-07 20:10 ` Logan Gunthorpe [this message]
2022-10-07 20:10 ` [PATCH mdadm v4 6/7] tests/00raid5-zero: Introduce test to exercise --write-zeros Logan Gunthorpe
2022-10-07 20:10 ` [PATCH mdadm v4 7/7] manpage: Add --write-zeroes option to manpage Logan Gunthorpe
2022-10-12  1:09 ` [PATCH mdadm v4 0/7] Write Zeroes option for Creating Arrays Xiao Ni
2022-10-12 16:59   ` Logan Gunthorpe
2022-10-13  1:33     ` Martin K. Petersen
2022-10-13  7:51       ` Xiao Ni
2022-10-26  2:41         ` Martin K. Petersen
2022-10-27  8:44           ` Xiao Ni
2022-11-16 17:11       ` Logan Gunthorpe
2022-11-03  8:14 ` Kinga Tanska

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221007201037.20263-6-logang@deltatee.com \
    --to=logang@deltatee.com \
    --cc=David.Sloan@eideticom.com \
    --cc=Martin.Oliveira@eideticom.com \
    --cc=chaitanyak@nvidia.com \
    --cc=colyli@suse.de \
    --cc=guoqing.jiang@linux.dev \
    --cc=jes@trained-monkey.org \
    --cc=jm@chia.net \
    --cc=linux-raid@vger.kernel.org \
    --cc=mariusz.tkaczyk@linux.intel.com \
    --cc=sbates@raithlin.com \
    --cc=xni@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).