Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH] imsm: count arrays under VMD HBAs correctly
From: Alexey Obitotskiy @ 2017-01-09 12:12 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen

OROM defines maximum number of arrays supported. On array creation mdadm
checks if number of arrays doesn't exceed that limit, however it is not
calculated correctly for VMD now.

The current code performs a lookup of HBA using the id. VMD HBAs have
the same id so each lookup returns the same structure (first
encountered). Take a different approach for VMD HBAs. As id is not
unique and cannot be used for lookups, iterate over all VMD HBAs and
compare both id and HBA path.

Signed-off-by: Alexey Obitotskiy <aleksey.obitotskiy@intel.com>
---
 platform-intel.c | 10 ++++++++++
 platform-intel.h |  1 +
 super-intel.c    | 48 ++++++++++++++++++++++++++++++++++++------------
 3 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/platform-intel.c b/platform-intel.c
index c60fd9e..72a4f19 100644
--- a/platform-intel.c
+++ b/platform-intel.c
@@ -178,6 +178,16 @@ struct sys_dev *device_by_id(__u16 device_id)
 	return NULL;
 }
 
+struct sys_dev *device_by_id_and_path(__u16 device_id, const char *path)
+{
+	struct sys_dev *iter;
+
+	for (iter = intel_devices; iter != NULL; iter = iter->next)
+		if ((iter->dev_id == device_id) && strstr(iter->path, path))
+			return iter;
+	return NULL;
+}
+
 static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long long *val)
 {
 	char path[strlen(dev_path) + strlen(entry) + 2];
diff --git a/platform-intel.h b/platform-intel.h
index a8ae85f..4b4e67f 100644
--- a/platform-intel.h
+++ b/platform-intel.h
@@ -244,4 +244,5 @@ const char *get_sys_dev_type(enum sys_dev_type);
 const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id);
 const struct imsm_orom *get_orom_by_device_id(__u16 device_id);
 struct sys_dev *device_by_id(__u16 device_id);
+struct sys_dev *device_by_id_and_path(__u16 device_id, const char *path);
 char *vmd_domain_to_controller(struct sys_dev *hba, char *buf);
diff --git a/super-intel.c b/super-intel.c
index 0407d43..93cea8c 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -6475,20 +6475,20 @@ count_volumes_list(struct md_list *devlist, char *homehost,
 	return count;
 }
 
-static int
-count_volumes(struct intel_hba *hba, int dpa, int verbose)
+static int __count_volumes(char *hba_path, int dpa, int verbose,
+			   int cmp_hba_path)
 {
 	struct sys_dev *idev, *intel_devices = find_intel_devices();
 	int count = 0;
 	const struct orom_entry *entry;
 	struct devid_list *dv, *devid_list;
 
-	if (!hba || !hba->path)
+	if (!hba_path)
 		return 0;
 
 	for (idev = intel_devices; idev; idev = idev->next) {
-		if (strstr(idev->path, hba->path))
-				break;
+		if (strstr(idev->path, hba_path))
+			break;
 	}
 
 	if (!idev || !idev->dev_id)
@@ -6502,22 +6502,28 @@ count_volumes(struct intel_hba *hba, int dpa, int verbose)
 	devid_list = entry->devid_list;
 	for (dv = devid_list; dv; dv = dv->next) {
 		struct md_list *devlist;
-		struct sys_dev *device = device_by_id(dv->devid);
-		char *hba_path;
+		struct sys_dev *device = NULL;
+		char *hpath;
 		int found = 0;
 
+		if (cmp_hba_path)
+			device = device_by_id_and_path(dv->devid, hba_path);
+		else
+			device = device_by_id(dv->devid);
+
 		if (device)
-			hba_path = device->path;
+			hpath = device->path;
 		else
 			return 0;
 
-		devlist = get_devices(hba_path);
+		devlist = get_devices(hpath);
 		/* if no intel devices return zero volumes */
 		if (devlist == NULL)
 			return 0;
 
-		count += active_arrays_by_format("imsm", hba_path, &devlist, dpa, verbose);
-		dprintf("path: %s active arrays: %d\n", hba_path, count);
+		count += active_arrays_by_format("imsm", hpath, &devlist, dpa,
+						 verbose);
+		dprintf("path: %s active arrays: %d\n", hpath, count);
 		if (devlist == NULL)
 			return 0;
 		do  {
@@ -6529,7 +6535,7 @@ count_volumes(struct intel_hba *hba, int dpa, int verbose)
 			dprintf("found %d count: %d\n", found, count);
 		} while (found);
 
-		dprintf("path: %s total number of volumes: %d\n", hba_path, count);
+		dprintf("path: %s total number of volumes: %d\n", hpath, count);
 
 		while (devlist) {
 			struct md_list *dv = devlist;
@@ -6541,6 +6547,24 @@ count_volumes(struct intel_hba *hba, int dpa, int verbose)
 	return count;
 }
 
+static int count_volumes(struct intel_hba *hba, int dpa, int verbose)
+{
+	if (!hba)
+		return 0;
+	if (hba->type == SYS_DEV_VMD) {
+		struct sys_dev *dev;
+		int count = 0;
+
+		for (dev = find_intel_devices(); dev; dev = dev->next) {
+			if (dev->type == SYS_DEV_VMD)
+				count += __count_volumes(dev->path, dpa,
+							 verbose, 1);
+		}
+		return count;
+	}
+	return __count_volumes(hba->path, dpa, verbose, 0);
+}
+
 static int imsm_default_chunk(const struct imsm_orom *orom)
 {
 	/* up to 512 if the plaform supports it, otherwise the platform max.
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH] imsm: count arrays under VMD HBAs correctly
From: Jes Sorensen @ 2017-01-09 12:49 UTC (permalink / raw)
  To: Alexey Obitotskiy; +Cc: linux-raid
In-Reply-To: <1483963942-24774-1-git-send-email-aleksey.obitotskiy@intel.com>

Alexey Obitotskiy <aleksey.obitotskiy@intel.com> writes:
> OROM defines maximum number of arrays supported. On array creation mdadm
> checks if number of arrays doesn't exceed that limit, however it is not
> calculated correctly for VMD now.
>
> The current code performs a lookup of HBA using the id. VMD HBAs have
> the same id so each lookup returns the same structure (first
> encountered). Take a different approach for VMD HBAs. As id is not
> unique and cannot be used for lookups, iterate over all VMD HBAs and
> compare both id and HBA path.
>
> Signed-off-by: Alexey Obitotskiy <aleksey.obitotskiy@intel.com>
> ---
>  platform-intel.c | 10 ++++++++++
>  platform-intel.h |  1 +
>  super-intel.c    | 48 ++++++++++++++++++++++++++++++++++++------------
>  3 files changed, 47 insertions(+), 12 deletions(-)

Applied!

Thanks,
Jes

^ permalink raw reply

* Re: [MDADM PATCH 1/1] Add detail information when can not connect monitor
From: Jes Sorensen @ 2017-01-09 12:50 UTC (permalink / raw)
  To: Xiao Ni; +Cc: linux-raid, osamarin68
In-Reply-To: <1483844394-7647-1-git-send-email-xni@redhat.com>

Xiao Ni <xni@redhat.com> writes:
> If it can't connect monitor, now the error message is just
> Error waiting for xxx to be clean. Add detail error message
> in connect_monitor.
>
> Suggested-by: Oleg Samarin <osamarin68@gmail.com>
> Signed-off-by: Xiao Ni <xni@redhat.com>
> ---
>  msg.c | 2 ++
>  1 file changed, 2 insertions(+)

Applied!

Thanks,
Jes

^ permalink raw reply

* Re: [PATCH v2 7/7] uapi: export all headers under uapi directories
From: Christoph Hellwig @ 2017-01-09 12:56 UTC (permalink / raw)
  To: Nicolas Dichtel
  Cc: linux-mips, alsa-devel, linux-ia64, linux-doc, airlied,
	linux-fbdev, dri-devel, linux-mtd, sparclinux, linux-arch,
	linux-s390, linux-am33-list, linux-c6x-dev, linux-rdma,
	linux-hexagon, linux-sh, coreteam, fcoe-devel, xen-devel,
	linux-snps-arc, linux-media, uclinux-h8-devel, linux-xtensa, arnd,
	linux-kbuild, adi-buildroot-devel, linux-raid, linux-m68k,
	openrisc, linux-metag, linux-arm
In-Reply-To: <1483695839-18660-8-git-send-email-nicolas.dichtel@6wind.com>

On Fri, Jan 06, 2017 at 10:43:59AM +0100, Nicolas Dichtel wrote:
> Regularly, when a new header is created in include/uapi/, the developer
> forgets to add it in the corresponding Kbuild file. This error is usually
> detected after the release is out.
> 
> In fact, all headers under uapi directories should be exported, thus it's
> useless to have an exhaustive list.
> 
> After this patch, the following files, which were not exported, are now
> exported (with make headers_install_all):

... snip ...

> linux/genwqe/.install
> linux/genwqe/..install.cmd
> linux/cifs/.install
> linux/cifs/..install.cmd

I'm pretty sure these should not be exported!

^ permalink raw reply

* [PATCH] imsm: show correct size for arrays with 4k disks
From: Maksymilian Kunt @ 2017-01-09 14:16 UTC (permalink / raw)
  To: linux-raid; +Cc: jes.sorensen, Maksymilian Kunt

Number of blocks used to calculate array size is based on 512 block size
so the size displayed is incorrect for arrays with 4k disks.

Signed-off-by: Maksymilian Kunt <maksymilian.kunt@intel.com>
---
 super-intel.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index 0817a1f..87ed8ec 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -1480,8 +1480,10 @@ static void print_imsm_dev(struct intel_super *super,
 	printf("    Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
 }
 
-static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved)
-{
+static void print_imsm_disk(struct imsm_disk *disk,
+			    int index,
+			    __u32 reserved,
+			    unsigned int sector_size) {
 	char str[MAX_RAID_SERIAL_LEN + 1];
 	__u64 sz;
 
@@ -1499,7 +1501,8 @@ static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved)
 					    is_failed(disk) ? " failed" : "");
 	printf("             Id : %08x\n", __le32_to_cpu(disk->scsi_id));
 	sz = total_blocks(disk) - reserved;
-	printf("    Usable Size : %llu%s\n", (unsigned long long)sz,
+	printf("    Usable Size : %llu%s\n",
+	       (unsigned long long)sz * 512 / sector_size,
 	       human_size(sz * 512));
 }
 
@@ -1829,7 +1832,8 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
 	printf("    MPB Sectors : %d\n", mpb_sectors(mpb, super->sector_size));
 	printf("          Disks : %d\n", mpb->num_disks);
 	printf("   RAID Devices : %d\n", mpb->num_raid_devs);
-	print_imsm_disk(__get_imsm_disk(mpb, super->disks->index), super->disks->index, reserved);
+	print_imsm_disk(__get_imsm_disk(mpb, super->disks->index),
+			super->disks->index, reserved, super->sector_size);
 	if (get_imsm_bbm_log_size(super->bbm_log)) {
 		struct bbm_log *log = super->bbm_log;
 
@@ -1851,12 +1855,14 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
 	for (i = 0; i < mpb->num_disks; i++) {
 		if (i == super->disks->index)
 			continue;
-		print_imsm_disk(__get_imsm_disk(mpb, i), i, reserved);
+		print_imsm_disk(__get_imsm_disk(mpb, i), i, reserved,
+				super->sector_size);
 	}
 
 	for (dl = super->disks; dl; dl = dl->next)
 		if (dl->index == -1)
-			print_imsm_disk(&dl->disk, -1, reserved);
+			print_imsm_disk(&dl->disk, -1, reserved,
+					super->sector_size);
 
 	examine_migr_rec_imsm(super);
 }
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH] imsm: show correct size for arrays with 4k disks
From: Jes Sorensen @ 2017-01-09 14:26 UTC (permalink / raw)
  To: Maksymilian Kunt; +Cc: linux-raid
In-Reply-To: <1483971364-7343-1-git-send-email-maksymilian.kunt@intel.com>

Maksymilian Kunt <maksymilian.kunt@intel.com> writes:
> Number of blocks used to calculate array size is based on 512 block size
> so the size displayed is incorrect for arrays with 4k disks.
>
> Signed-off-by: Maksymilian Kunt <maksymilian.kunt@intel.com>
> ---
>  super-intel.c | 18 ++++++++++++------
>  1 file changed, 12 insertions(+), 6 deletions(-)

Applied!

Thanks,
Jes

^ permalink raw reply

* [LSF/MM TOPIC] [LSF/MM ATTEND] md raid general discussion
From: Coly Li @ 2017-01-09 16:38 UTC (permalink / raw)
  To: lsf-pc
  Cc: open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	linux-kernel, linux-nvme, Shaohua Li, NeilBrown, songliubraving,
	Guoqing Jiang, pawel.baldysiak, mariusz.dabrowski,
	artur.paszkiewicz, Jes.Sorensen, Hannes Reinecke

Hi Folks,

I'd like to propose a general md raid discussion. It is quite necessary
for most of the active md raid developers to sit together to discuss the
current challenges of Linux software raid and its development trends.

In the last years, we have many development activities in md raid, e.g.
raid5 cache, raid1 clustering, partial parity log, fast fail
upstreaming, and some effort for raid1 & raid0 performance improvement.

I see there is some functionality overlap between r5cache
(raid5 cache) and PPL (partial parity log); currently I have no idea
where we will go with these two development activities.
Also I receive reports from users that raid1 performance is desired when
it is built on NVMe SSDs as a cache (maybe bcache or dm-cache). I am
working on some raid1 performance improvement (e.g. new raid1 I/O
barrier and lockless raid1 I/O submit), and have some more ideas to discuss.

Therefore, if md raid developers have a chance to sit together and
discuss how to collaborate efficiently in the next year, it will be much
more productive than communicating on the mailing list.

Finally, let me introduce myself for people who don't know me. My name is
Coly Li, I used to work on OCFS2 and Ext4 for SUSE Linux, now I work with
Neil Brown and Hannes Reinecke to maintain block layer code for SUSE
Linux, mostly focusing on drivers/md/*

Thanks.

Coly Li

^ permalink raw reply

* Re: [PATCH 1/1] md/raid5: Use correct IS_ERR() variation on pointer check
From: Shaohua Li @ 2017-01-09 21:34 UTC (permalink / raw)
  To: Jes.Sorensen; +Cc: linux-raid
In-Reply-To: <20170107003135.8276-2-Jes.Sorensen@redhat.com>

On Fri, Jan 06, 2017 at 07:31:35PM -0500, Jes.Sorensen@redhat.com wrote:
> From: Jes Sorensen <Jes.Sorensen@redhat.com>
> 
> This fixes a build error on certain architectures, such as ppc64.

Yep, we should use IS_ERR here, applied this. I'm curious why there is a
compile error. The IS_ERR_VALUE casts the pointer to 'unsigned long'.

Thanks,
Shaohua
> Fixes: 6995f0b247e("md: takeover should clear unrelated bits")
> Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
> ---
>  drivers/md/raid5.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 06d7279..0e0646b 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -7829,7 +7829,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
>  	mddev->new_chunk_sectors = chunksect;
>  
>  	ret = setup_conf(mddev);
> -	if (!IS_ERR_VALUE(ret))
> +	if (!IS_ERR(ret))
>  		clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
>  	return ret;
>  }
> -- 
> 2.9.3
> 

^ permalink raw reply

* Re: [PATCH 1/1] md/raid5: Use correct IS_ERR() variation on pointer check
From: Jes Sorensen @ 2017-01-09 21:38 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <20170109213412.we7sxcivzbaiyb3i@kernel.org>

Shaohua Li <shli@kernel.org> writes:
> On Fri, Jan 06, 2017 at 07:31:35PM -0500, Jes.Sorensen@redhat.com wrote:
>> From: Jes Sorensen <Jes.Sorensen@redhat.com>
>> 
>> This fixes a build error on certain architectures, such as ppc64.
>
> Yep, we should use IS_ERR here, applied this. I'm curious why there is a
> compile error. The IS_ERR_VALUE casts the pointer to 'unsigned long'.

When building on ppc64 I get a -Wsign-compare build error due to -Werror.

Cheers,
Jes

^ permalink raw reply

* Re: PROBLEM: Kernel BUG with raid5 soft + Xen + DRBD - invalid opcode
From: Shaohua Li @ 2017-01-09 22:44 UTC (permalink / raw)
  To: MasterPrenium
  Cc: linux-kernel, xen-users, linux-raid, MasterPrenium@gmail.com,
	xen-devel
In-Reply-To: <2717981a-4308-3f7b-15c6-f384a41fd445@gmail.com>

On Sun, Jan 08, 2017 at 02:31:15PM +0100, MasterPrenium wrote:
> Hello,
> 
> Replies below + :
> - I don't know if this can help but after the crash, when the system
> reboots, the Raid 5 stack is re-synchronizing
> [   37.028239] md10: Warning: Device sdc1 is misaligned
> [   37.028541] created bitmap (15 pages) for device md10
> [   37.030433] md10: bitmap initialized from disk: read 1 pages, set 59 of
> 29807 bits
> 
> - Sometimes the kernel completely crash (lost serial + network connection),
> sometimes only got the "BUG" dump, but still have network access (but a
> reboot is impossible, need to reset the system).
> 
> - You can find blktrace here (while running fio), I hope it's complete since
> the end of the file is when the kernel crashed : https://goo.gl/X9jZ50

Looks most are normal full stripe writes.
 
> > I'm trying to reproduce, but no success. So
> > ext4->btrfs->raid5, crash
> > btrfs->raid5, no crash
> > right? does subvolume matter? When you create the raid5 array, does adding
> > '--assume-clean' option change the behavior? I'd like to narrow down the issue.
> > If you can capture the blktrace to the raid5 array, it would be great to hint
> > us what kind of IO it is.
> Yes Correct.
> The subvolume doesn't matter.
> -- assume-clean doesn't change the behaviour.

so it's not a resync issue.

> Don't forget that the system needs to be running on xen to crash, without
> (on native kernel) it doesn't crash (or at least, I was not able to make it
> crash).
> > > Regarding your patch, I can't find it. Is it the one sent by Konstantin
> > > Khlebnikov ?
> > Right.
> It doesn't help :(. Maybe the crash is happening a little bit later.

ok, the patch is unlikely helpful, since the IO size isn't very big.

Don't have good idea yet. My best guess so far is virtual machine introduces
extra delay, which might trigger some race conditions which aren't seen in
native.  I'll check if I could find something locally.

Thanks,
Shaohua

^ permalink raw reply

* ANNOUNCE: mdadm 4.0 - A tool for managing md Soft RAID under Linux
From: Jes Sorensen @ 2017-01-10  1:11 UTC (permalink / raw)
  To: linux-raid@vger.kernel.org; +Cc: LKML, Brown, Neil

I am pleased to announce the availability of
   mdadm version 4.0

It is available at the usual places:
   http://www.kernel.org/pub/linux/utils/raid/mdadm/
and via git at
   git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git
   http://git.kernel.org/cgit/utils/mdadm/

The update in major version number primarily indicates this is a
release by its new maintainer. In addition it contains a large number
of fixes in particular for IMSM RAID and clustered RAID support.  In
addition this release includes support for IMSM 4k sector drives,
failfast and better documentation for journaled RAID.

This is my first release of mdadm. Please thank Neil Brown for his
previous work as maintainer and blame me for all the bugs I caused
since taking over.

Jes Sorensen, 2017-01-09

^ permalink raw reply

* Re: Recommendation on new system Arrays
From: Benjammin2068 @ 2017-01-10  8:48 UTC (permalink / raw)
  To: Linux-RAID
In-Reply-To: <d1f5b65b-5a0b-92e1-7fe7-d2a0c45c8998@fnarfbargle.com>

On 01/08/2017 10:34 PM, Brad Campbell wrote:
>
> This can be a real trap.
>
> In general CCTV software works great for about the first 2 complete database writes. As the database ages you will find increasing levels of write fragmentation that kill your write latency. It also makes each write smaller as you are no longer streaming bulk data to disk but filling the holes left by expired footage. This destroys your array write latency and makes replay and searching for footage frustratingly slow.
>
> Make sure your CCTV storage RAID is not used for anything else, select a small chunk size to minimise stripe size and be prepared for fairly high search and playback latency.
>
> Most of the major CCTV systems suffer in the same way, and they generally get around it by using "hardware" RAID cards with large RAM buffers (until the BBU wears out and you start getting massive footage loss because they can't get the streams to disk fast enough).
>
> If performance is not an issue then a RAID6 being written from a Windows VM on a Linux host works just fine. Pass the md straight through to the Windows VM and let it manage the raw block device.
>
> It has been a couple of years since I tested Milestone, but I've just finished a test with Indigo Vision, Aimetis, Genetec, Geutebruck, Bosch, Avigilon & March Networks. They all suffer to a certain degree and all use different storage architectures depending on their legacy. March works really well over SMB shares though. I'm just not fond of the product.
>
> Milestone sent me a new test license before Christmas but I've not got around to spooling it up yet.
>
> All of this stuff gets tested on KVM VM's writing to a couple of small RAID-6 arrays (10-14 drives).

Excellent -- the kind of info I was looking for (and wondering about.)

Thanks for the tip!

 -Ben



^ permalink raw reply

* Re: Recommendation on new system Arrays
From: Wols Lists @ 2017-01-10 15:55 UTC (permalink / raw)
  To: Benjammin2068, Linux-RAID
In-Reply-To: <bfb6ebec-defe-186d-76c8-f7fbbaf65a70@gmail.com>

On 10/01/17 08:48, Benjammin2068 wrote:
> Most of the major CCTV systems suffer in the same way, and they generally get around it by using "hardware" RAID cards with large RAM buffers (until the BBU wears out and you start getting massive footage loss because they can't get the streams to disk fast enough).
>>
>> If performance is not an issue then a RAID6 being written from a Windows VM on a Linux host works just fine. Pass the md straight through to the Windows VM and let it manage the raw block device.

This made me think. Bear in mind ext tends to over-allocate space to try
and avoid fragmentation. I don't know to what extent it happens
automatically, but this sounds similar to what Brad is recommending. If
you can match your file system to your raid array, such that the
file-system's default allocation unit is one stride of the raid, this
will help avoid RMW thrashing. Pretty obvious, in hindsight, this will
mean strides (mostly) never get split across files so are only ever
allocated and freed as whole units.

Cheers,
Wol

^ permalink raw reply

* Re: ANNOUNCE: mdadm 4.0 - A tool for managing md Soft RAID under Linux
From: Bruce Dubbs @ 2017-01-10 17:49 UTC (permalink / raw)
  To: Jes Sorensen, linux-raid@vger.kernel.org; +Cc: LKML, Brown, Neil
In-Reply-To: <1cd97490-e650-d98b-466a-095292dc5b98@gmail.com>

Jes Sorensen wrote:
> I am pleased to announce the availability of
>     mdadm version 4.0
>
> It is available at the usual places:
>     http://www.kernel.org/pub/linux/utils/raid/mdadm/
> and via git at
>     git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git
>     http://git.kernel.org/cgit/utils/mdadm/
>
> The update in major version number primarily indicates this is a
> release by its new maintainer. In addition it contains a large number
> of fixes in particular for IMSM RAID and clustered RAID support.  In
> addition this release includes support for IMSM 4k sector drives,
> failfast and better documentation for journaled RAID.

Thank you for the new release.  Unfortunately I get 9 failures running the 
test suite:

tests/00raid1...          FAILED
tests/07autoassemble...   FAILED
tests/07changelevels...   FAILED
tests/07revert-grow...    FAILED
tests/07revert-inplace... FAILED
tests/07testreshape5...   FAILED
tests/10ddf-fail-twice... FAILED
tests/20raid5journal...   FAILED
tests/10ddf-incremental-wrong-order...  FAILED

The procedure I used was

make
sudo ./test --keep-going --logdir=test-logs --save-logs

I'll also note that there is an irritating message when a test fails:
cp: cannot stat '/var/tmp/log': No such file or directory

This can be fixed easily enough with:
sed -i 's# if.* == "1"#& -a -e $targetdir/log#' test

I don't know if this mailing list is the right place to report bugs or 
not.  I do not want to spam the list with the logs but they are available at:

http://anduin.linuxfromscratch.org/~bdubbs/mdadm-logs/

   -- Bruce Dubbs
      linuxfromscratch.org

^ permalink raw reply

* Re: Interesting mdadm quirk ...
From: NeilBrown @ 2017-01-10 22:07 UTC (permalink / raw)
  To: Wols Lists, linux-raid; +Cc: jes.sorensen
In-Reply-To: <587334E6.406@youngman.org.uk>

[-- Attachment #1: Type: text/plain, Size: 1949 bytes --]

On Mon, Jan 09 2017, Wols Lists wrote:

> On 08/01/17 22:44, NeilBrown wrote:
>> On Sun, Jan 08 2017, Wols Lists wrote:
>> 
>>> Just been doing some raid testing, and this happened ...
>>>
>>> linux-lfqf:/dev # mdadm md/parity
>>> md/parity: 31.97GiB raid5 3 devices, 0 spares. Use mdadm --detail for
>>> more detail.
>>> linux-lfqf:/dev # mdadm --stop md/parity
>>> mdadm: Cannot open md/parity
>>> linux-lfqf:/dev # mdadm --stop /dev/md/parity
>>> mdadm: stopped /dev/md/parity
>>>
>>> Weird - why can it successfully stop it when passed an absolute path,
>>> but not when passed a relative path? When I did the first variant, I
>>> used tab completion, and then when I edited it I really did edit it, not
>>> retype it, so I can't see any way the two arguments could refer to
>>> different objects.
>> 
>> If you give mdadm a name of an array that starts with "/", it is assumed
>> to be a path name (usually in /dev).
>> If it doesn't start with "/", then it is an array name.  That might mean
>> different things in different contexts, I'm not 100% sure.
>> However, for --stop, it is a name like you would find in /sys/block or
>> /proc/mdstat.
>> So "mdadm --stop md0" or "mdadm --stop md_parity" might do what you
>> want.
>> Probably the error message could be more useful here.
>> 
> Does that mean an array name can contain a "/"?

No, it cannot.

>
> Assuming it can't, surely it's better to alter the logic slightly...
>
> if char[0] ne '/' then
>   open array_name
> end
> if not successful then
>   open path_name
>   if error then go error_handler
> end
> carry on ...
>
> That way naive users like me won't get a surprise. And it is rather
> inconsistent for it to work with one sort of path but not another ...
> and actually that logic will work fine even if the name does contain a
> "/" :-)

If you can convert the above into a patch.....

I agree with your logic.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* [PATCH v2 1/2] EXPORT_SYMBOL radix_tree_replace_slot
From: Song Liu @ 2017-01-11  1:42 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	liuyun01, Song Liu, Jes.Sorensen

It will be used in drivers/md/raid5-cache.c

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 lib/radix-tree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6f382e0..1ee7449 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1099,6 +1099,7 @@ void radix_tree_replace_slot(struct radix_tree_root *root,
 {
 	replace_slot(root, NULL, slot, item, true);
 }
+EXPORT_SYMBOL(radix_tree_replace_slot);
 
 /**
  * radix_tree_iter_replace - replace item in a slot
-- 
2.9.3


^ permalink raw reply related

* [PATCH v2 2/2] md/r5cache: enable chunk_aligned_read with write back cache
From: Song Liu @ 2017-01-11  1:42 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	liuyun01, Song Liu, Jes.Sorensen
In-Reply-To: <20170111014251.3236610-1-songliubraving@fb.com>

Chunk aligned read significantly reduces CPU usage of raid456.
However, it is not safe to fully bypass the write back cache.
This patch enables chunk aligned read with write back cache.

For chunk aligned read, we track stripes in write back cache at
a bigger granularity, "big_stripe". Each chunk may contain more
than one stripe (for example, a 256kB chunk contains 64 4kB-page,
so this chunk contain 64 stripes). For chunk_aligned_read, these
stripes are grouped into one big_stripe, so we only need one lookup
for the whole chunk.

For each big_stripe, we count how many stripes of this big_stripe
are in the write back cache. These counters are tracked in a radix
tree (big_stripe_tree); the radix tree item pointer itself is used
as the counter. r5c_tree_index() is used to calculate keys for the
radix tree.

chunk_aligned_read() calls r5c_big_stripe_cached() to look up
big_stripe of each chunk in the tree. If this big_stripe is in the
tree, chunk_aligned_read() aborts. This look up is protected by
rcu_read_lock().

It is necessary to remember whether a stripe is counted in
big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
two flags is set, the stripe is counted in big_stripe_tree. This
requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
r5c_try_caching_write(); and moving clear_bit of
STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
r5c_finish_stripe_write_out().

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 164 ++++++++++++++++++++++++++++++++++++++++++-----
 drivers/md/raid5.c       |  19 ++++--
 drivers/md/raid5.h       |   1 +
 3 files changed, 160 insertions(+), 24 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3e3e5dc..2ff2510 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -20,6 +20,7 @@
 #include <linux/crc32c.h>
 #include <linux/random.h>
 #include <linux/kthread.h>
+#include <linux/types.h>
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
@@ -162,9 +163,59 @@ struct r5l_log {
 
 	/* to submit async io_units, to fulfill ordering of flush */
 	struct work_struct deferred_io_work;
+
+	/* for chunk_aligned_read in writeback mode, details below */
+	spinlock_t tree_lock;
+	struct radix_tree_root big_stripe_tree;
 };
 
 /*
+ * Enable chunk_aligned_read() with write back cache.
+ *
+ * Each chunk may contain more than one stripe (for example, a 256kB
+ * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For
+ * chunk_aligned_read, these stripes are grouped into one "big_stripe".
+ * For each big_stripe, we count how many stripes of this big_stripe
+ * are in the write back cache. These data are tracked in a radix tree
+ * (big_stripe_tree). We use radix_tree item pointer as the counter.
+ * r5c_tree_index() is used to calculate keys for the radix tree.
+ *
+ * chunk_aligned_read() calls r5c_big_stripe_cached() to look up
+ * big_stripe of each chunk in the tree. If this big_stripe is in the
+ * tree, chunk_aligned_read() aborts. This look up is protected by
+ * rcu_read_lock().
+ *
+ * It is necessary to remember whether a stripe is counted in
+ * big_stripe_tree. Instead of adding new flag, we reuses existing flags:
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
+ * two flags are set, the stripe is counted in big_stripe_tree. This
+ * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
+ * r5c_try_caching_write(); and moving clear_bit of
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
+ * r5c_finish_stripe_write_out().
+ */
+
+/*
+ * The radix tree requires the lowest 2 bits of the data pointer to be
+ * 2b'00, so we add 4 for each stripe
+ */
+#define R5C_RADIX_COUNT_UNIT 4
+
+/*
+ * calculate key for big_stripe_tree
+ *
+ * sect: align_bi->bi_iter.bi_sector or sh->sector
+ */
+static inline sector_t r5c_tree_index(struct r5conf *conf,
+				      sector_t sect)
+{
+	sector_t offset;
+
+	offset = sector_div(sect, conf->chunk_sectors);
+	return sect;
+}
+
+/*
  * an IO range starts from a meta data block and end at the next meta data
  * block. The io unit's the meta data block tracks data/parity followed it. io
  * unit is written to log disk with normal write, as we always flush log disk
@@ -410,16 +461,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh)
 
 	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 		atomic_inc(&conf->preread_active_stripes);
-
-	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
-		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
-		atomic_dec(&conf->r5c_cached_partial_stripes);
-	}
-
-	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
-		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
-		atomic_dec(&conf->r5c_cached_full_stripes);
-	}
 }
 
 static void r5c_handle_data_cached(struct stripe_head *sh)
@@ -2303,6 +2344,10 @@ int r5c_try_caching_write(struct r5conf *conf,
 	int i;
 	struct r5dev *dev;
 	int to_cache = 0;
+	void **pslot;
+	sector_t tree_index;
+	int ret;
+	uintptr_t refcount;
 
 	BUG_ON(!r5c_is_writeback(log));
 
@@ -2337,6 +2382,40 @@ int r5c_try_caching_write(struct r5conf *conf,
 		}
 	}
 
+	/* if the stripe is not counted in big_stripe_tree, add it now */
+	if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+	    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+		tree_index = r5c_tree_index(conf, sh->sector);
+		spin_lock(&log->tree_lock);
+		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+					       tree_index);
+		if (pslot) {
+			refcount = (uintptr_t)radix_tree_deref_slot(pslot);
+			radix_tree_replace_slot(
+				&log->big_stripe_tree, pslot,
+				(void *)(refcount + R5C_RADIX_COUNT_UNIT));
+		} else {
+			/* this radix_tree_insert can fail safely, so no
+			 * need to call radix_tree_preload()
+			 */
+			ret = radix_tree_insert(
+				&log->big_stripe_tree, tree_index,
+				(void *)R5C_RADIX_COUNT_UNIT);
+			if (ret) {
+				spin_unlock(&log->tree_lock);
+				r5c_make_stripe_write_out(sh);
+				return -EAGAIN;
+			}
+		}
+		spin_unlock(&log->tree_lock);
+
+		/* set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is
+		 * counted in the radix tree (and as a cached partial stripe)
+		 */
+		set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+		atomic_inc(&conf->r5c_cached_partial_stripes);
+	}
+
 	for (i = disks; i--; ) {
 		dev = &sh->dev[i];
 		if (dev->towrite) {
@@ -2411,17 +2490,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 				 struct stripe_head *sh,
 				 struct stripe_head_state *s)
 {
+	struct r5l_log *log = conf->log;
 	int i;
 	int do_wakeup = 0;
+	sector_t tree_index;
+	void **pslot;
+	uintptr_t refcount;
 
-	if (!conf->log ||
-	    !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
+	if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
 		return;
 
 	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
 	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
 
-	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
 		return;
 
 	for (i = sh->disks; i--; ) {
@@ -2443,12 +2525,41 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 	if (do_wakeup)
 		wake_up(&conf->wait_for_overlap);
 
-	spin_lock_irq(&conf->log->stripe_in_journal_lock);
+	spin_lock_irq(&log->stripe_in_journal_lock);
 	list_del_init(&sh->r5c);
-	spin_unlock_irq(&conf->log->stripe_in_journal_lock);
+	spin_unlock_irq(&log->stripe_in_journal_lock);
 	sh->log_start = MaxSector;
-	atomic_dec(&conf->log->stripe_in_journal_count);
-	r5c_update_log_state(conf->log);
+
+	atomic_dec(&log->stripe_in_journal_count);
+	r5c_update_log_state(log);
+
+	/* stop counting this stripe in big_stripe_tree */
+	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
+	    test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+		tree_index = r5c_tree_index(conf, sh->sector);
+		spin_lock(&log->tree_lock);
+		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+					       tree_index);
+		BUG_ON(pslot == NULL);
+		refcount = (uintptr_t)radix_tree_deref_slot(pslot);
+		if (refcount == R5C_RADIX_COUNT_UNIT)
+			radix_tree_delete(&log->big_stripe_tree, tree_index);
+		else
+			radix_tree_replace_slot(
+				&log->big_stripe_tree, pslot,
+				(void *)(refcount - R5C_RADIX_COUNT_UNIT));
+		spin_unlock(&log->tree_lock);
+	}
+
+	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_partial_stripes);
+	}
+
+	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_full_stripes);
+	}
 }
 
 int
@@ -2508,6 +2619,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
 	return 0;
 }
 
+/* check whether this big stripe is in write back cache. */
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
+{
+	struct r5l_log *log = conf->log;
+	sector_t tree_index;
+	void **pslot;
+
+	if (!log)
+		return false;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	tree_index = r5c_tree_index(conf, sect);
+	pslot = radix_tree_lookup_slot(&log->big_stripe_tree, tree_index);
+	return pslot != NULL;
+}
+
 static int r5l_load_log(struct r5l_log *log)
 {
 	struct md_rdev *rdev = log->rdev;
@@ -2641,6 +2768,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->meta_pool)
 		goto out_mempool;
 
+	spin_lock_init(&log->tree_lock);
+	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_ATOMIC);
+
 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5a42f4b..4394ebc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -287,13 +287,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 						atomic_dec(&conf->r5c_cached_partial_stripes);
 					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
 					r5c_check_cached_full_stripe(conf);
-				} else {
-					/* partial stripe */
-					if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
-							      &sh->state))
-						atomic_inc(&conf->r5c_cached_partial_stripes);
+				} else
+					/* STRIPE_R5C_PARTIAL_STRIPE is set in
+					 * r5c_try_caching_write(). No need to
+					 * set it again.
+					 */
 					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
-				}
 			}
 		}
 	}
@@ -5059,6 +5058,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		      rdev->recovery_offset >= end_sector)))
 			rdev = NULL;
 	}
+
+	if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
+		rcu_read_unlock();
+		bio_put(align_bi);
+		return 0;
+	}
+
 	if (rdev) {
 		sector_t first_bad;
 		int bad_sectors;
@@ -5415,7 +5421,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	 * data on failed drives.
 	 */
 	if (rw == READ && mddev->degraded == 0 &&
-	    !r5c_is_writeback(conf->log) &&
 	    mddev->reshape_position == MaxSector) {
 		bi = chunk_aligned_read(mddev, bi);
 		if (!bi)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 876c75f..06b240e 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -795,4 +795,5 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
 extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
+extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 #endif
-- 
2.9.3


^ permalink raw reply related

* Re: ANNOUNCE: mdadm 4.0 - A tool for managing md Soft RAID under Linux
From: Eyal Lebedinsky @ 2017-01-11  1:47 UTC (permalink / raw)
  To: linux-raid@vger.kernel.org
In-Reply-To: <1cd97490-e650-d98b-466a-095292dc5b98@gmail.com>

On 10/01/17 12:11, Jes Sorensen wrote:
> I am pleased to announce the availability of
>    mdadm version 4.0
>
> It is available at the usual places:
>    http://www.kernel.org/pub/linux/utils/raid/mdadm/
> and via git at
>    git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git
>    http://git.kernel.org/cgit/utils/mdadm/
>
> The update in major version number primarily indicates this is a
> release by it's new maintainer. In addition it contains a large number
> of fixes in particular for IMSM RAID and clustered RAID support.  In
> addition this release includes support for IMSM 4k sector drives,
> failfast and better documentation for journaled RAID.
>
> This is my first release of mdadm. Please thank Neil Brown for his
> previous work as maintainer and blame me for all the bugs I caused
> since taking over.
>
> Jes Sorensen, 2017-01-09

Hi Jes,

Not sure if this is interesting but

mdadm-4.0]$ make everything
cc -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter -ggdb -DSendmail=\""/usr/sbin/sendmail -t"\" -DCONFFILE=\"/etc/mdadm.conf\" -DCONFFILE2=\"/etc/mdadm/mdadm.conf\" -DMAP_DIR=\"/run/mdadm\" -DMAP_FILE=\"map\" -DMDMON_DIR=\"/run/mdadm\" -DFAILED_SLOTS_DIR=\"/run/mdadm/failed-slots\" -DNO_COROSYNC -DNO_DLM   -DUSE_PTHREADS  -static -o mdadm.static mdadm.o config.o policy.o mdstat.o  ReadMe.o util.o maps.o lib.o Manage.o Assemble.o Build.o Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o Incremental.o Dump.o mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o super-mbr.o super-gpt.o restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o platform-intel.o probe_roms.o crc32c.o pwgr.o -ldl
/usr/bin/ld: cannot find -ldl
/usr/bin/ld: cannot find -lc
collect2: error: ld returned 1 exit status
make: *** [mdadm.static] Error 1

And while I have the podium: there is no rule to install raid6check (intended?). For my convenience I added:

install-raid6check: raid6check raid6check.8
	$(INSTALL) -D -m 644 raid6check.8 $(DESTDIR)$(MAN8DIR)/raid6check.8
	$(INSTALL) -D $(STRIP) -m 755 raid6check $(DESTDIR)$(BINDIR)/raid6check

Regards

-- 
Eyal Lebedinsky (eyal@eyal.emu.id.au)

^ permalink raw reply

* Re: ANNOUNCE: mdadm 4.0 - A tool for managing md Soft RAID under Linux
From: Guoqing Jiang @ 2017-01-11  3:37 UTC (permalink / raw)
  To: Eyal Lebedinsky, linux-raid@vger.kernel.org
In-Reply-To: <1b17f316-f974-a119-d981-fc52d1932fe2@eyal.emu.id.au>



On 01/11/2017 09:47 AM, Eyal Lebedinsky wrote:
> On 10/01/17 12:11, Jes Sorensen wrote:
>> I am pleased to announce the availability of
>>    mdadm version 4.0
>>
>> It is available at the usual places:
>>    http://www.kernel.org/pub/linux/utils/raid/mdadm/
>> and via git at
>>    git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git
>>    http://git.kernel.org/cgit/utils/mdadm/
>>
>> The update in major version number primarily indicates this is a
>> release by it's new maintainer. In addition it contains a large number
>> of fixes in particular for IMSM RAID and clustered RAID support.  In
>> addition this release includes support for IMSM 4k sector drives,
>> failfast and better documentation for journaled RAID.
>>
>> This is my first release of mdadm. Please thank Neil Brown for his
>> previous work as maintainer and blame me for all the bugs I caused
>> since taking over.
>>
>> Jes Sorensen, 2017-01-09
>
> Hi Jes,
>
> Not sure if this is interesting but
>
> mdadm-4.0]$ make everything
> cc -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter 
> -ggdb -DSendmail=\""/usr/sbin/sendmail -t"\" 
> -DCONFFILE=\"/etc/mdadm.conf\" -DCONFFILE2=\"/etc/mdadm/mdadm.conf\" 
> -DMAP_DIR=\"/run/mdadm\" -DMAP_FILE=\"map\" -DMDMON_DIR=\"/run/mdadm\" 
> -DFAILED_SLOTS_DIR=\"/run/mdadm/failed-slots\" -DNO_COROSYNC 
> -DNO_DLM   -DUSE_PTHREADS  -static -o mdadm.static mdadm.o config.o 
> policy.o mdstat.o  ReadMe.o util.o maps.o lib.o Manage.o Assemble.o 
> Build.o Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o 
> Query.o Incremental.o Dump.o mdopen.o super0.o super1.o super-ddf.o 
> super-intel.o bitmap.o super-mbr.o super-gpt.o restripe.o sysfs.o 
> sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o platform-intel.o 
> probe_roms.o crc32c.o pwgr.o -ldl
> /usr/bin/ld: cannot find -ldl
> /usr/bin/ld: cannot find -lc
> collect2: error: ld returned 1 exit status
> make: *** [mdadm.static] Error 1

It happened when compiling mdadm.static, so please make sure the static 
glibc is installed;
actually these errors disappeared after I installed the glibc-devel-static package.

>
> And while I have the podium: there is no rule to install raid6check 
> (intended?). For my convenience I added:
>
> install-raid6check: raid6check raid6check.8
>     $(INSTALL) -D -m 644 raid6check.8 $(DESTDIR)$(MAN8DIR)/raid6check.8
>     $(INSTALL) -D $(STRIP) -m 755 raid6check 
> $(DESTDIR)$(BINDIR)/raid6check

Or maybe it was ignored in the original patch.

Thanks,
Guoqing

^ permalink raw reply

* Re: [PATCH v2 2/2] md/r5cache: enable chunk_aligned_read with write back cache
From: NeilBrown @ 2017-01-11  4:10 UTC (permalink / raw)
  To: linux-raid
  Cc: shli, kernel-team, dan.j.williams, hch, liuzhengyuan, liuyun01,
	Song Liu, Jes.Sorensen
In-Reply-To: <20170111014251.3236610-2-songliubraving@fb.com>

[-- Attachment #1: Type: text/plain, Size: 1960 bytes --]

On Wed, Jan 11 2017, Song Liu wrote:

> Chunk aligned read significantly reduces CPU usage of raid456.
> However, it is not safe to fully bypass the write back cache.
> This patch enables chunk aligned read with write back cache.
>
> For chunk aligned read, we track stripes in write back cache at
> a bigger granularity, "big_stripe". Each chunk may contain more
> than one stripe (for example, a 256kB chunk contains 64 4kB-page,
> so this chunk contain 64 stripes). For chunk_aligned_read, these
> stripes are grouped into one big_stripe, so we only need one lookup
> for the whole chunk.
>
> For each big_stripe, struct big_stripe_info tracks how many stripes
> of this big_stripe are in the write back cache. We count how many
> stripes of this big_stripe are in the write back cache. These
> counters are tracked in a radix tree (big_stripe_tree).
> r5c_tree_index() is used to calculate keys for the radix tree.
>
> chunk_aligned_read() calls r5c_big_stripe_cached() to look up
> big_stripe of each chunk in the tree. If this big_stripe is in the
> tree, chunk_aligned_read() aborts. This look up is protected by
> rcu_read_lock().
>
> It is necessary to remember whether a stripe is counted in
> big_stripe_tree. Instead of adding new flag, we reuses existing flags:
> STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
> two flags are set, the stripe is counted in big_stripe_tree. This
> requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
> r5c_try_caching_write(); and moving clear_bit of
> STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
> r5c_finish_stripe_write_out().
>
> Signed-off-by: Song Liu <songliubraving@fb.com>

Thanks, this looks quite good.

One thing I wonder about is reshape.  If the chunksize is being
reshaped, that would confuse things.
But maybe reshape isn't supported when the journal is in use, in
which case it wouldn't matter.

Reviewed-by: NeilBrown <neilb@suse.com>

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH v2 0/7] uapi: export all headers under uapi directories
From: Jesper Nilsson @ 2017-01-11 12:42 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linuxppc-dev, linux-kbuild, Nicolas Dichtel, linux-mips,
	alsa-devel, linux-ia64, linux-doc, airlied, linux-fbdev,
	dri-devel, linux-mtd, sparclinux, linux-arch, linux-s390,
	linux-am33-list, linux-c6x-dev, linux-rdma, linux-hexagon,
	linux-sh, coreteam, fcoe-devel, xen-devel, linux-snps-arc,
	linux-media, uclinux-h8-devel, adi-buildroot-devel
In-Reply-To: <3131144.4Ej3KFWRbz@wuerfel>

On Mon, Jan 09, 2017 at 12:33:58PM +0100, Arnd Bergmann wrote:
> On Friday, January 6, 2017 10:43:52 AM CET Nicolas Dichtel wrote:
> > Here is the v2 of this series. The first 5 patches are just cleanup: some
> > exported headers were still under a non-uapi directory.
> 
> Since this is meant as a cleanup, I commented on this to point out a cleaner
> way to do the same.
> 
> > The patch 6 was spotted by code review: there is no in-tree user of this
> > functionality.
> > The last patch remove the use of header-y. Now all files under an uapi
> > directory are exported.
> 
> Very nice!
> 
> > asm is a bit special, most of architectures export asm/<arch>/include/uapi/asm
> > only, but there is two exceptions:
> >  - cris which exports arch/cris/include/uapi/arch-v[10|32];
> 
> This is interesting, though not your problem. Maybe someone who understands
> cris better can comment on this: How is the decision made about which of
> the arch/user.h headers gets used? I couldn't find that in the sources,
> but it appears to be based on kernel compile-time settings, which is
> wrong for user space header files that should be independent of the kernel
> config.

I believe it's since the CRISv10 and CRISv32 are very different beasts,
and that is selected via kernel config...

This part of the CRIS port has been transformed a couple of times from
the original layout without uapi, and there's still some legacy silliness,
where some files might have been exported but never used from userspace
except for some corner cases.

> >  - tile which exports arch/tile/include/uapi/arch.
> > Because I don't know if the output of 'make headers_install_all' can be changed,
> > I introduce subdir-y in Kbuild file. The headers_install_all target copies all
> > asm/<arch>/include/uapi/asm to usr/include/asm-<arch> but
> > arch/cris/include/uapi/arch-v[10|32] and arch/tile/include/uapi/arch are not
> > prefixed (they are put asis in usr/include/). If it's acceptable to modify the
> > output of 'make headers_install_all' to export asm headers in
> > usr/include/asm-<arch>/asm, then I could remove this new subdir-y and exports
> > everything under arch/<arch>/include/uapi/.
> 
> I don't know if anyone still uses "make headers_install_all", I suspect
> distros these days all use "make headers_install", so it probably
> doesn't matter much.
> 
> In case of cris, it should be easy enough to move all the contents of the
> uapi/arch-*/*.h headers into the respective uapi/asm/*.h headers, they
> only seem to be referenced from there.

This would seem to be a reasonable change.

> For tile, I suspect that would not work as the arch/*.h headers are
> apparently defined as interfaces for both user space and kernel.
> 
> > Note also that exported files for asm are a mix of files listed by:
> >  - include/uapi/asm-generic/Kbuild.asm;
> >  - arch/x86/include/uapi/asm/Kbuild;
> >  - arch/x86/include/asm/Kbuild.
> > This complicates a lot the processing (arch/x86/include/asm/Kbuild is also
> > used by scripts/Makefile.asm-generic).
> > 
> > This series has been tested with a 'make headers_install' on x86 and a
> > 'make headers_install_all'. I've checked the result of both commands.
> > 
> > This patch is built against linus tree. I don't know if it should be
> > made against antoher tree.
> 
> The series should probably get merged through the kbuild tree, but testing
> it on mainline is fine here.
> 
> 	Arnd

/^JN - Jesper Nilsson
-- 
               Jesper Nilsson -- jesper.nilsson@axis.com

^ permalink raw reply

* Re: [RFC PATCH v2] crypto: Add IV generation algorithms
From: Ondrej Mosnáček @ 2017-01-11 14:55 UTC (permalink / raw)
  To: Binoy Jayan
  Cc: Oded, Ofir, Herbert Xu, David S. Miller, linux-crypto, Mark Brown,
	Arnd Bergmann, linux-kernel, Alasdair Kergon, Mike Snitzer,
	dm-devel, Shaohua Li, linux-raid, Rajendra
In-Reply-To: <1481618949-20086-2-git-send-email-binoy.jayan@linaro.org>

Hi Binoy,

2016-12-13 9:49 GMT+01:00 Binoy Jayan <binoy.jayan@linaro.org>:
> Currently, the iv generation algorithms are implemented in dm-crypt.c.
> The goal is to move these algorithms from the dm layer to the kernel
> crypto layer by implementing them as template ciphers so they can be
> implemented in hardware for performance. As part of this patchset, the
> iv-generation code is moved from the dm layer to the crypto layer and
> adapt the dm-layer to send a whole 'bio' (as defined in the block layer)
> at a time. Each bio contains the in memory representation of physically
> contiguous disk blocks. The dm layer sets up a chained scatterlist of
> these blocks split into physically contiguous segments in memory so that
> DMA can be performed. The iv generation algorithms implemented in geniv.c
> include plain, plain64, essiv, benbi, null, lmk and tcw.

I like what you are trying to achieve, however I don't think the
solution you are heading towards (passing sector number to a special
crypto template) would be the best approach here. Milan is currently
trying to add authenticated encryption support to dm-crypt (see [1])
and as part of this change, a new random IV mode would be introduced.
This mode generates a random IV for each sector write, includes it in
the authenticated data and stores it in the sector's metadata (in a
separate part of the disk). In this case dm-crypt will need to have
control over the IV generation (or at least be able to somehow
retrieve it after the crypto operation).

That said, I believe a different approach would be preferable here. I
would suggest, instead of moving the IV generation to the crypto
layer, to add a new type of request to skcipher API (let's call it
'skcipher_bulk_request'), which could be used to submit several
messages at once (together in a single sg list), each with their own
IV, to a skcipher. This would allow drivers to optimize handling of
such requests (e.g. the SIMD ciphers could call kernel_fpu_begin/end
just once for the whole request). It could be done in such a way, that
implementing this type of requests would be optional and a fallback
implementation, which would just split the request into regular
skcipher_requests, would be automatically set for the ciphers that do
not set it themselves. That way this would require no changes to
crypto drivers in the beginning and optimizations could be added
incrementally.

The advantage of this approach to handling such "bulk" requests is
that crypto drivers could just optimize regular algorithms (xts(aes),
cbc(aes), etc.) and wouldn't need to mess with dm-crypt-specific IV
generation. This also means that other users that could potentially
benefit from bulking requests (perhaps network stack?) could use the
same functionality.

I have been playing with this idea for some time now and I should have
an RFC patchset ready soon...

Binoy, Herbert, what do you think about such approach?

[1] https://www.redhat.com/archives/dm-devel/2017-January/msg00028.html

> When using multiple keys with the original dm-crypt, the key selection is
> made based on the sector number as:
>
> key_index = sector & (key_count - 1)
>
> This restricts the usage of the same key for encrypting/decrypting a
> single bio. One way to solve this is to move the key management code from
> dm-crypt to cryto layer. But this seems tricky when using template ciphers
> because, when multiple ciphers are instantiated from dm layer, each cipher
> instance set with a unique subkey (part of the bigger master key) and
> these instances themselves do not have access to each other's instances
> or contexts. This way, a single instance cannot encryt/decrypt a whole bio.
> This has to be fixed.

Please note that the "keycount" parameter was added to dm-crypt solely
for the purpose of implementing the loop-AES partition format. In
general, the security benefit gained by using keycount > 1 is
debatable, so it does not really make sense to use it for anything
else than accessing legacy loopAES partitions. Since Milan decided to
add it as a generic parameter, instead of hard-coding the
functionality for the LMK mode, it can be technically used also in
other combinations, but IMHO it is perfectly reasonable to just give
up on optimizing the cases when keycount > 1. I believe the loop-AES
partition support is just not that important :)

Thanks,
Ondrej

^ permalink raw reply

* Re: ANNOUNCE: mdadm 4.0 - A tool for managing md Soft RAID under Linux
From: Shaohua Li @ 2017-01-11 16:52 UTC (permalink / raw)
  To: Bruce Dubbs; +Cc: Jes Sorensen, linux-raid@vger.kernel.org, LKML, Brown, Neil
In-Reply-To: <58751E90.5090306@gmail.com>

On Tue, Jan 10, 2017 at 11:49:04AM -0600, Bruce Dubbs wrote:
> Jes Sorensen wrote:
> > I am pleased to announce the availability of
> >     mdadm version 4.0
> > 
> > It is available at the usual places:
> >     http://www.kernel.org/pub/linux/utils/raid/mdadm/
> > and via git at
> >     git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git
> >     http://git.kernel.org/cgit/utils/mdadm/
> > 
> > The update in major version number primarily indicates this is a
> > release by it's new maintainer. In addition it contains a large number
> > of fixes in particular for IMSM RAID and clustered RAID support.  In
> > addition this release includes support for IMSM 4k sector drives,
> > failfast and better documentation for journaled RAID.
> 
> Thank you for the new release.  Unfortunately I get 9 failures running the
> test suite:
> 
> tests/00raid1...          FAILED
> tests/07autoassemble...   FAILED
> tests/07changelevels...   FAILED
> tests/07revert-grow...    FAILED
> tests/07revert-inplace... FAILED
> tests/07testreshape5...   FAILED
> tests/10ddf-fail-twice... FAILED
> tests/20raid5journal...   FAILED
> tests/10ddf-incremental-wrong-order...  FAILED

Yep, several tests usually fail. It appears some checks aren't always good.  At
least the 'check' function for reshape/resync isn't reliable in my test, I saw
07changelevelintr fails frequently.

Thanks,
Shaohua

^ permalink raw reply

* Re: ANNOUNCE: mdadm 4.0 - A tool for managing md Soft RAID under Linux
From: Jes Sorensen @ 2017-01-11 16:59 UTC (permalink / raw)
  To: Shaohua Li, Bruce Dubbs; +Cc: linux-raid@vger.kernel.org, LKML, Brown, Neil
In-Reply-To: <20170111165241.yavdwc57v6yodx7g@kernel.org>

On 01/11/17 11:52, Shaohua Li wrote:
> On Tue, Jan 10, 2017 at 11:49:04AM -0600, Bruce Dubbs wrote:
>> Jes Sorensen wrote:
>>> I am pleased to announce the availability of
>>>     mdadm version 4.0
>>>
>>> It is available at the usual places:
>>>     http://www.kernel.org/pub/linux/utils/raid/mdadm/
>>> and via git at
>>>     git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git
>>>     http://git.kernel.org/cgit/utils/mdadm/
>>>
>>> The update in major version number primarily indicates this is a
>>> release by it's new maintainer. In addition it contains a large number
>>> of fixes in particular for IMSM RAID and clustered RAID support.  In
>>> addition this release includes support for IMSM 4k sector drives,
>>> failfast and better documentation for journaled RAID.
>>
>> Thank you for the new release.  Unfortunately I get 9 failures running the
>> test suite:
>>
>> tests/00raid1...          FAILED
>> tests/07autoassemble...   FAILED
>> tests/07changelevels...   FAILED
>> tests/07revert-grow...    FAILED
>> tests/07revert-inplace... FAILED
>> tests/07testreshape5...   FAILED
>> tests/10ddf-fail-twice... FAILED
>> tests/20raid5journal...   FAILED
>> tests/10ddf-incremental-wrong-order...  FAILED
> 
> Yep, several tests usually fail. It appears some checks aren't always good.  At
> least the 'check' function for reshape/resync isn't reliable in my test, I saw
> 07changelevelintr fails frequently.

That is my experience as well - some of them are affected by the kernel
version too. We probably need to look into making them more reliable.

I am also not sure how reliable the DDF tests are on systems without DDF
support.

Cheers,
Jes

^ permalink raw reply

* Re: [PATCH v2 1/2] EXPORT_SYMBOL radix_tree_lookup_slot
From: Shaohua Li @ 2017-01-11 17:54 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
	liuzhengyuan, liuyun01, Jes.Sorensen
In-Reply-To: <20170111014251.3236610-1-songliubraving@fb.com>

On Tue, Jan 10, 2017 at 05:42:50PM -0800, Song Liu wrote:
> It will be used in drivers/md/raid5-cache.c

title and patch don't match
 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  lib/radix-tree.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index 6f382e0..1ee7449 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -1099,6 +1099,7 @@ void radix_tree_replace_slot(struct radix_tree_root *root,
>  {
>  	replace_slot(root, NULL, slot, item, true);
>  }
> +EXPORT_SYMBOL(radix_tree_replace_slot);
>  
>  /**
>   * radix_tree_iter_replace - replace item in a slot
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox