Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* mdadm raid6 sequential read slower than reading from userspace
From: Stevie Trujillo @ 2017-02-03 22:24 UTC (permalink / raw)
  To: linux-raid, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1798 bytes --]

Hello

Kernel:     Linux version 4.9.0-1-amd64 (debian-kernel@lists.debian.org)
            (gcc version 6.3.0 20161229 (Debian 6.3.0-2) ) #1 SMP Debian
            4.9.2-2 (2017-01-12)
CPU:        2xE5-2665
Memory:     256GB
Drives:     6x8TB Seagate
Controller: LSI2008
md0 : active raid6 sdb1[1] sda1[0] sdd1[3] sde1[4] sdc1[2] sdf1[5]
      31255576576 blocks super 1.2 level 6, 512k chunk, algorithm 2
[6/6] [UUUUUU] bitmap: 0/59 pages [0KB], 65536KB chunk

When I read sequentially from one of the disks I get 230-245MB/s. If I
read from all of them at the same time, the performance stays the same
(even if I bind all the dd processes to the same core).
Conclusion: I think the controller is not a bottleneck.

I first tried Debian8 with 3.16 and got 400-500MB/s when dd-ing
from /dev/md0. Upgrading to Debian9 with 4.9.2 roughly doubled my
performance:
53687091200 bytes (54 GB, 50 GiB) copied, 62.0078 s, 866 MB/s
53687091200 bytes (54 GB, 50 GiB) copied, 57.9882 s, 926 MB/s

dd uses 40% cpu and I can't find any process that uses more, so I don't
think I'm limited by CPU.

I wrote a small program that reads directly from the disks and outputs
the same data as reading from md0 would do. It's faster and has
more stable runtime than reading from md0: it finishes in 44.0 +-
0.2seconds (that is ~1150MB/s).

Is it possible to make mdadm work faster? I was hoping it could read
6x240MB/s, but maybe that's not possible. At least I think it should be
able to do 1150MB/s like userspace?
How can I find out what bottleneck? I couldn't see anything obvious
like 100% cpu usage.
I tried copying different tuning instructions I found on Google, but
they usually made negative impact if any.

I attached the program, but I'm still learning programming so it's not
very good.

--
Stevie Trujillo

[-- Attachment #2: raid6read.cc --]
[-- Type: text/x-c++src, Size: 8079 bytes --]

#include <vector>
#include <queue>
#include <thread>
#include <mutex>
#include <condition_variable>
using namespace std;
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <inttypes.h>
#include <sys/poll.h>
#include <scsi/sg.h>
#include <assert.h>
#include <err.h>

#define READ_16 0x88
#define NUM_DISKS 6
#define MAX_READAHEAD 16
#define CHUNK_SIZE (512*1024)

namespace {

struct BufferStorage
{
	int ref_count;
	unsigned char *memory;
};

struct Buffer
{
	BufferStorage *storage;
	unsigned char *buf;
};

struct Request
{
	uint64_t output_idx;
	uint64_t chunk;
};

struct Response
{
	uint64_t output_idx;
	Buffer buffer;
};

struct PendingIO
{
	BufferStorage *storage;
	uint64_t output_idx[4];
};

struct Disk
{
	int sg_fd;
	PendingIO pending_io[MAX_READAHEAD];
	unsigned char slot_i;
	unsigned char slots[MAX_READAHEAD];
	int current_request;
	vector<Request> requests;
};

struct Raid6
{
	Disk disks[6];
};

struct ThreadData
{
	mutex m;
	condition_variable cv;
	uint64_t last_idx;
	vector<Response> responses;
};

static void
sg_read(int sg_fd, void *buf, int pack_id, uint64_t lba, uint64_t len_lba)
{
	uint64_t len_bytes = 512 * len_lba;

	unsigned char cdb[16] = {};
	cdb[0] = READ_16;
	cdb[2] = (lba >> 56) & 0xff;
	cdb[3] = (lba >> 48) & 0xff;
	cdb[4] = (lba >> 40) & 0xff;
	cdb[5] = (lba >> 32) & 0xff;
	cdb[6] = (lba >> 24) & 0xff;
	cdb[7] = (lba >> 16) & 0xff;
	cdb[8] = (lba >> 8) & 0xff;
	cdb[9] = (lba >> 0) & 0xff;
	cdb[10] = (len_lba >> 24) & 0xff;
	cdb[11] = (len_lba >> 16) & 0xff;
	cdb[12] = (len_lba >> 8) & 0xff;
	cdb[13] = (len_lba >> 0) & 0xff;

	sg_io_hdr_t io_hdr;
	memset(&io_hdr, '\0', sizeof(io_hdr));
	io_hdr.interface_id = 'S'; /* SCSI Generic Interface */
	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	io_hdr.cmd_len = sizeof(cdb);
	io_hdr.cmdp = cdb;
	io_hdr.dxfer_len = len_bytes;
	io_hdr.dxferp = buf;
	io_hdr.timeout = 20000;
	io_hdr.pack_id = pack_id;

	if (write(sg_fd, &io_hdr, sizeof(io_hdr)) != sizeof(io_hdr))
		err(1, "write");
}

void queue_requests(Disk *disk)
{
	while ((size_t) disk->current_request < disk->requests.size() && disk->slot_i < MAX_READAHEAD) {
		int batch_requests = 1;
		for (; batch_requests < 4 && (size_t) disk->current_request + batch_requests < disk->requests.size(); ++batch_requests) {
			if (disk->requests[disk->current_request].chunk + batch_requests
			 != disk->requests[disk->current_request + batch_requests].chunk
			)
				break;
		}

		unsigned char slot = disk->slots[disk->slot_i++];
		struct PendingIO *pending = &disk->pending_io[slot];
		for (int i = 0; i < 4; ++i)
			pending->output_idx[i] = ~(uint64_t) 0;

		for (int i = 0; i < batch_requests; ++i)
			pending->output_idx[i] = disk->requests[disk->current_request + i].output_idx;

		uint64_t len_bytes = batch_requests * CHUNK_SIZE;
		pending->storage = new BufferStorage;
		pending->storage->ref_count = 0;
		void *buf;
		posix_memalign(&buf, 0x1000, len_bytes);
		pending->storage->memory = (unsigned char *) buf;

		const Request &r = disk->requests[disk->current_request];
		sg_read(disk->sg_fd, pending->storage->memory, slot, r.chunk * CHUNK_SIZE / 512, batch_requests * CHUNK_SIZE / 512);

		disk->current_request += batch_requests;
	}
}

void read_response(vector<Response> &responses, Disk *disk)
{
	sg_io_hdr_t io_hdr;
	memset(&io_hdr, '\0', sizeof(io_hdr));
	io_hdr.interface_id = 'S'; /* SCSI Generic Interface */
	io_hdr.pack_id = -1;

	if (read(disk->sg_fd, &io_hdr, sizeof(io_hdr)) != sizeof(io_hdr))
		err(1, "read");

	assert(io_hdr.pack_id >= 0 && io_hdr.pack_id < MAX_READAHEAD);
	PendingIO *pending = &disk->pending_io[io_hdr.pack_id];

	for (int j = 0; j < 4; ++j) {
		if (pending->output_idx[j] == ~(uint64_t) 0)
			break;

		Buffer buffer;
		buffer.storage = pending->storage;
		buffer.storage->ref_count += 1;
		buffer.buf = buffer.storage->memory + CHUNK_SIZE * j;
		responses.push_back(Response{pending->output_idx[j], buffer});
	}

	disk->slots[--disk->slot_i] = io_hdr.pack_id;
}

/* write the data we read to stdout in correct order */
void writer_function(ThreadData *td)
{
	auto cmp = [](const Response &a, const Response &b) { return a.output_idx > b.output_idx; };
	priority_queue<Response, vector<Response>, decltype(cmp)> responses(cmp);
	uint64_t current_idx = 0;

	while (current_idx < td->last_idx) {
		if (responses.empty() || responses.top().output_idx != current_idx) {
			vector<Response> tmp;
			{
				unique_lock<mutex> lk(td->m);
				td->cv.wait(lk, [=](){ return !td->responses.empty(); });
				tmp = move(td->responses);
			}

			for (const Response &r : tmp)
				responses.push(r);

			continue;
		}

		Response r = responses.top();
		responses.pop();
		unsigned char *buf = r.buffer.buf;
		size_t size = CHUNK_SIZE;

		while (size) {
			ssize_t bytes_written = write(1, buf, size);
			if (bytes_written < 0)
				err(1, "write");

			buf += bytes_written;
			size -= bytes_written;
		}

		if (--r.buffer.storage->ref_count == 0) {
			free(r.buffer.storage->memory);
			delete r.buffer.storage;
		}

		++current_idx;
	}
}

/* run all the disks from the same thread */
void run_sg_poll(Raid6 *raid, ThreadData *writer_td)
{
	for (;;) {
		struct pollfd pfds[NUM_DISKS];
		int nfds = 0;

		for (int i = 0; i < NUM_DISKS; ++i) {
			Disk *disk = &raid->disks[i];
			if (disk->sg_fd < 0)
				continue;

			queue_requests(disk);
			if ((size_t) disk->current_request == disk->requests.size() && disk->slot_i == 0) {
				close(disk->sg_fd);
				disk->sg_fd = -1;
				continue;
			}

			pfds[nfds++] = (struct pollfd) { raid->disks[i].sg_fd, POLLIN, 0 };
		}

		if (!nfds)
			break;

		int ret = poll(pfds, nfds, -1);
		if (ret <= 0)
			err(1, "poll");

		vector<Response> responses;
		for (int i = 0; i < NUM_DISKS; ++i) {
			Disk *disk = &raid->disks[i];
			if (disk->sg_fd >= 0)
				read_response(responses, disk);
		}

		{
			unique_lock<mutex> lk(writer_td->m);
			writer_td->responses.insert(writer_td->responses.end(), responses.begin(), responses.end());
		}
		writer_td->cv.notify_one();
	}
}

/* run each disk from one thread */
void run_sg_single(Raid6 *raid, Disk *disk, ThreadData *writer_td)
{
	for (;;) {
		queue_requests(disk);
		if ((size_t) disk->current_request == disk->requests.size() && disk->slot_i == 0) {
			close(disk->sg_fd);
			disk->sg_fd = -1;
			break;
		}

		vector<Response> responses;
		read_response(responses, disk);

		{
			unique_lock<mutex> lk(writer_td->m);
			writer_td->responses.insert(writer_td->responses.end(), responses.begin(), responses.end());
		}
		writer_td->cv.notify_one();
	}
}

}

int main(int argc, char **argv)
{
	if (argc != 1 + NUM_DISKS)
		errx(1, "usage: disks");

	Raid6 raid;

	for (int i = 0; i < NUM_DISKS; ++i) {
		const char *path = argv[1 + i];
		Disk *disk = &raid.disks[i];

		disk->sg_fd = open(path, O_RDWR);
		if (disk->sg_fd < 0)
			err(1, "open(%s)", path);

		disk->current_request = 0;
		disk->slot_i = 0;
		for (int i = 0; i < MAX_READAHEAD; ++i)
			disk->slots[i] = i;
	}

	uint64_t num_chunks = 102400; // 50TB

	/* precompute all the chunks we want the disks to read */
	uint64_t output_idx = 0;
	for (uint64_t chunk = 0; chunk < num_chunks; ++chunk) {
		uint64_t data_offset = 2048 * 512 / (512*1024) /* from partitioning */
		                     + 256*1024*512 / (512*1024); /* from mdadm --examine */
		/*
		 * stripe0: bcde|fa
		 * stripe1: abcd|ef
		 * stripe2: fabc|de
		 */
		int64_t stripe = chunk / 4;
		uint64_t slot = chunk % 4;
		int64_t disk_idx = 1 - stripe + slot;
		disk_idx %= 6;
		disk_idx = disk_idx + (disk_idx >> 63 & 6);
		raid.disks[disk_idx].requests.push_back(Request{output_idx++, data_offset + stripe});
	}

	ThreadData writer_td;
	writer_td.last_idx = output_idx;

	thread writer_thread(writer_function, &writer_td);

	if (0) {
		run_sg_poll(&raid, &writer_td);
	} else {
		thread threads[6];

		for (int i = 0; i < NUM_DISKS; ++i)
			threads[i] = move(thread(run_sg_single, &raid, &raid.disks[i], &writer_td));

		for (int i = 0; i < NUM_DISKS; ++i)
			threads[i].join();
	}

	writer_thread.join();
	return 0;
}

^ permalink raw reply

* Re: [PATCH 3/6] async_tx: Handle DMA devices having support for fewer PQ coefficients
From: Dan Williams @ 2017-02-03 18:42 UTC (permalink / raw)
  To: Anup Patel
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback,
	dmaengine-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Device Tree,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA, linux-raid
In-Reply-To: <CAALAos8HEjPhdM0cibTVB==WsanktBx87e8b8diVsNm1EmCQHQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Fri, Feb 3, 2017 at 2:59 AM, Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org> wrote:
>
>
> On Thu, Feb 2, 2017 at 11:31 AM, Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> wrote:
>>
>> On Wed, Feb 1, 2017 at 8:47 PM, Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
>> wrote:
>> > The DMAENGINE framework assumes that if PQ offload is supported by a
>> > DMA device then all 256 PQ coefficients are supported. This assumption
>> > does not hold anymore because we now have BCM-SBA-RAID offload engine
>> > which supports PQ offload with limited number of PQ coefficients.
>> >
>> > This patch extends async_tx APIs to handle DMA devices with support
>> > for fewer PQ coefficients.
>> >
>> > Signed-off-by: Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
>> > Reviewed-by: Scott Branden <scott.branden-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
>> > ---
>> >  crypto/async_tx/async_pq.c          |  3 +++
>> >  crypto/async_tx/async_raid6_recov.c | 12 ++++++++++--
>> >  include/linux/dmaengine.h           | 19 +++++++++++++++++++
>> >  include/linux/raid/pq.h             |  3 +++
>> >  4 files changed, 35 insertions(+), 2 deletions(-)
>>
>> So, I hate the way async_tx does these checks on each operation, and
>> it's ok for me to say that because it's my fault. Really it's md that
>> should be validating engine offload capabilities once at the beginning
>> of time. I'd rather we move in that direction than continue to pile
>> onto a bad design.
>
>
> Yes, indeed. All async_tx APIs have lot of checks and for high throughput
> RAID offload engine these checks can add some overhead.
>
> I think doing checks in Linux md would be certainly better but this would
> mean lot of changes in Linux md as well as remove checks in async_tx.
>
> Also, async_tx APIs should not find DMA channel on its own instead it
> should rely on Linux md to provide DMA channel pointer as parameter.
>
> It's better to do checks cleanup in async_tx as separate patchset and
> keep this patchset simple.

That's been the problem with async_tx being broken like this for
years. Once you get this "small / simple" patch upstream, that
arguably makes async_tx a little bit worse, there is no longer any
motivation to fix the underlying issues. If you care about the long
term health of raid offload and are enabling new hardware support you
should first tackle the known problems with it before adding new
features.
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 1/6] mailbox: Add new API mbox_channel_device() for clients
From: Jassi Brar @ 2017-02-03 12:05 UTC (permalink / raw)
  To: Anup Patel
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Dan Williams, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine, Devicetree List,
	linux-arm-kernel@lists.infradead.org, Linux Kernel Mailing List,
	linux-crypto, linux-raid
In-Reply-To: <1486010836-25228-2-git-send-email-anup.patel@broadcom.com>

On Thu, Feb 2, 2017 at 10:17 AM, Anup Patel <anup.patel@broadcom.com> wrote:
> The remote processor can have DMAENGINE capabilities and client
> can pass data to be processed via main memory. In such cases,
> the client will require DMAble memory for remote processor.
>
> This patch adds new API mbox_channel_device() which can be
> used by clients to get struct device pointer of underlying
> mailbox controller. This struct device pointer of mailbox
> controller can be used by clients to allocate DMAble memory
> for remote processor.
>
IIUC, DT already provides a way for what you need.

^ permalink raw reply

* Re: sector size mismatch a problem for RAID 1?
From: Wols Lists @ 2017-02-03 11:20 UTC (permalink / raw)
  To: Boylan, Ross, linux-raid@vger.kernel.org
In-Reply-To: <d05e3579-38c4-40bd-8fb4-d296fc3ea9c5@EXHT02.net.ucsf.edu>

On 03/02/17 06:18, Boylan, Ross wrote:
> When I moved the new disk to an internal drive bay (after reading lvm-raid wiki's statements that RAID shouldn't be used over a USB link) the problems went away--at least so far.
> 
As I understand it, the problem is that the USB interface goes to sleep.
So when you try to write to it, it may not wake up quick enough, causing
havoc ...

> I wonder if this  is the source of my original problem: I was using a single disk RAID1 where the single disk had a USB connection.  However, I did get a failure off that drive doing a dd off the raw device.
> 
Quite likely. But I'm puzzled as to why the dd would fail, as this would
keep the USB interface active. Maybe others who know more will chime in
and explain.
> 
> Ross

Cheers,
Wol

^ permalink raw reply

* Re: [PATCH 3/6] async_tx: Handle DMA devices having support for fewer PQ coefficients
From: Anup Patel @ 2017-02-03 11:00 UTC (permalink / raw)
  To: Dan Williams
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine@vger.kernel.org,
	Device Tree, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-crypto, linux-raid
In-Reply-To: <CAPcyv4gBFu7skx_8cvsaL0sL5=44DMcWY_EjWSCfaNd8oh=Svw@mail.gmail.com>

On Thu, Feb 2, 2017 at 11:31 AM, Dan Williams <dan.j.williams@intel.com> wrote:
>
> On Wed, Feb 1, 2017 at 8:47 PM, Anup Patel <anup.patel@broadcom.com> wrote:
> > The DMAENGINE framework assumes that if PQ offload is supported by a
> > DMA device then all 256 PQ coefficients are supported. This assumption
> > does not hold anymore because we now have BCM-SBA-RAID offload engine
> > which supports PQ offload with limited number of PQ coefficients.
> >
> > This patch extends async_tx APIs to handle DMA devices with support
> > for fewer PQ coefficients.
> >
> > Signed-off-by: Anup Patel <anup.patel@broadcom.com>
> > Reviewed-by: Scott Branden <scott.branden@broadcom.com>
> > ---
> >  crypto/async_tx/async_pq.c          |  3 +++
> >  crypto/async_tx/async_raid6_recov.c | 12 ++++++++++--
> >  include/linux/dmaengine.h           | 19 +++++++++++++++++++
> >  include/linux/raid/pq.h             |  3 +++
> >  4 files changed, 35 insertions(+), 2 deletions(-)
>
> So, I hate the way async_tx does these checks on each operation, and
> it's ok for me to say that because it's my fault. Really it's md that
> should be validating engine offload capabilities once at the beginning
> of time. I'd rather we move in that direction than continue to pile
> onto a bad design.

Yes, indeed. All async_tx APIs have lot of checks and for high throughput
RAID offload engine these checks can add some overhead.

I think doing checks in Linux md would be certainly better but this would
mean lot of changes in Linux md as well as remove checks in async_tx.

Also, async_tx APIs should not find DMA channel on its own instead it
should rely on Linux md to provide DMA channel pointer as parameter.

It's better to do checks cleanup in async_tx as separate patchset and
keep this patchset simple.

Regards,
Anup

^ permalink raw reply

* RE: sector size mismatch a problem for RAID 1?
From: Boylan, Ross @ 2017-02-03  6:18 UTC (permalink / raw)
  To: linux-raid@vger.kernel.org
In-Reply-To: <3dcb5a7a-9bd4-4cfd-a655-91a88692c549@EXHT01.net.ucsf.edu>

When I moved the new disk to an internal drive bay (after reading lvm-raid wiki's statements that RAID shouldn't be used over a USB link) the problems went away--at least so far.

I wonder if this  is the source of my original problem: I was using a single disk RAID1 where the single disk had a USB connection.  However, I did get a failure off that drive doing a dd off the raw device.

Ross
________________________________________
From: linux-raid-owner@vger.kernel.org [linux-raid-owner@vger.kernel.org] on behalf of Boylan, Ross [Ross.Boylan@ucsf.edu]
Sent: Thursday, February 02, 2017 4:17 PM
To: linux-raid@vger.kernel.org
Subject: sector size mismatch a problem for RAID 1?

I got a replacement  for a failing disk, but the 2 drives seems to have slightly different formatting:
For the failing one
GNU Parted 2.3
Using /dev/sdb
Welcome to GNU Parted! Type 'help' to view a list of commands.
(parted) p
Model: WDC WD40 01FFSX-68JNUN0 (scsi)
Disk /dev/sdb: 4001GB
Sector size (logical/physical): 512B/4096B <<<------
Partition Table: gpt

And the replacement (sdk)
Model: WDC WD40 01FFSX-68JNUN0 (scsi)
Disk /dev/sdk: 7814037168s
Sector size (logical/physical): 512B/512B <<<<-----
Partition Table: gpt

So the first is 4k physical, and the second is 512b physical (I assume that's really a fib, but it's what the drive says).

Could the mismatch in the physical sizes cause trouble?

I ask because I did this:
mdadm --grow /dev/md/media4 --add /dev/sdk2 --raid-devices=2
which resulted in
the Feb  2 15:26:46 tempserver kernel: [ 8583.545160] md: bind<sdk2>
Feb  2 15:26:46 tempserver kernel: [ 8583.957047] RAID1 conf printout:
Feb  2 15:26:46 tempserver kernel: [ 8583.957051]  --- wd:1 rd:2
Feb  2 15:26:46 tempserver kernel: [ 8583.957052]  disk 0, wo:0, o:1, dev:sdb2
Feb  2 15:26:46 tempserver kernel: [ 8583.957054]  disk 1, wo:1, o:1, dev:sdk2
Feb  2 15:26:46 tempserver kernel: [ 8583.957108] md: recovery of RAID array md126
Feb  2 15:26:46 tempserver kernel: [ 8583.957111] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
Feb  2 15:26:46 tempserver kernel: [ 8583.957112] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery.
Feb  2 15:26:46 tempserver kernel: [ 8583.957115] md: using 128k window, over a total of 1953376064k.
Feb  2 15:26:50 tempserver kernel: [ 8587.312261] usb 2-3: USB disconnect, device number 4
Feb  2 15:26:50 tempserver kernel: [ 8587.315908] scsi 8:0:0:3: rejecting I/O to offline device
Feb  2 15:26:50 tempserver kernel: [ 8587.315912] scsi 8:0:0:3: [sdk] killing request
Feb  2 15:26:50 tempserver kernel: [ 8587.315934] scsi 8:0:0:3: [sdk] Unhandled error code
Feb  2 15:26:50 tempserver kernel: [ 8587.315936] scsi 8:0:0:3: [sdk]
Feb  2 15:26:50 tempserver kernel: [ 8587.315939] Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
Feb  2 15:26:50 tempserver kernel: [ 8587.315941] scsi 8:0:0:3: [sdk] CDB:
Feb  2 15:26:50 tempserver kernel: [ 8587.315943] Write(16): 8a 00 00 00 00 00 00 0c 3e 80 00 00 00 80 00 00
Feb  2 15:26:50 tempserver kernel: [ 8587.315951] end_request: I/O error, dev sdk, sector 802432
Feb  2 15:26:50 tempserver kernel: [ 8587.315963] md/raid1:md126: Disk failure on sdk2, disabling device.
Feb  2 15:26:50 tempserver kernel: [ 8587.315963] md/raid1:md126: Operation continuing on 1 devices.result of which from kern.log was
I then tried
 dd if=/dev/disk/by-id/ata-WDC_WD4001FFSX-68JNUN0_WD-WMC130FACU91 of=/dev/null skip=802500 count=1000
which produced no error.  The input is from the device previously known as sdk.  So I can at least read from the sector that was associated with the error message above.

BTW, the array I'm trying to mirror is on a part of the disk that I think is still good.
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* sector size mismatch a problem for RAID 1?
From: Boylan, Ross @ 2017-02-03  0:17 UTC (permalink / raw)
  To: linux-raid@vger.kernel.org

I got a replacement  for a failing disk, but the 2 drives seems to have slightly different formatting:
For the failing one
GNU Parted 2.3
Using /dev/sdb
Welcome to GNU Parted! Type 'help' to view a list of commands.
(parted) p
Model: WDC WD40 01FFSX-68JNUN0 (scsi)
Disk /dev/sdb: 4001GB
Sector size (logical/physical): 512B/4096B <<<------
Partition Table: gpt

And the replacement (sdk)
Model: WDC WD40 01FFSX-68JNUN0 (scsi)
Disk /dev/sdk: 7814037168s
Sector size (logical/physical): 512B/512B <<<<-----
Partition Table: gpt

So the first is 4k physical, and the second is 512b physical (I assume that's really a fib, but it's what the drive says).

Could the mismatch in the physical sizes cause trouble?

I ask because I did this:
mdadm --grow /dev/md/media4 --add /dev/sdk2 --raid-devices=2
which resulted in 
the Feb  2 15:26:46 tempserver kernel: [ 8583.545160] md: bind<sdk2>
Feb  2 15:26:46 tempserver kernel: [ 8583.957047] RAID1 conf printout:
Feb  2 15:26:46 tempserver kernel: [ 8583.957051]  --- wd:1 rd:2
Feb  2 15:26:46 tempserver kernel: [ 8583.957052]  disk 0, wo:0, o:1, dev:sdb2
Feb  2 15:26:46 tempserver kernel: [ 8583.957054]  disk 1, wo:1, o:1, dev:sdk2
Feb  2 15:26:46 tempserver kernel: [ 8583.957108] md: recovery of RAID array md126
Feb  2 15:26:46 tempserver kernel: [ 8583.957111] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
Feb  2 15:26:46 tempserver kernel: [ 8583.957112] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery.
Feb  2 15:26:46 tempserver kernel: [ 8583.957115] md: using 128k window, over a total of 1953376064k.
Feb  2 15:26:50 tempserver kernel: [ 8587.312261] usb 2-3: USB disconnect, device number 4
Feb  2 15:26:50 tempserver kernel: [ 8587.315908] scsi 8:0:0:3: rejecting I/O to offline device
Feb  2 15:26:50 tempserver kernel: [ 8587.315912] scsi 8:0:0:3: [sdk] killing request
Feb  2 15:26:50 tempserver kernel: [ 8587.315934] scsi 8:0:0:3: [sdk] Unhandled error code
Feb  2 15:26:50 tempserver kernel: [ 8587.315936] scsi 8:0:0:3: [sdk]  
Feb  2 15:26:50 tempserver kernel: [ 8587.315939] Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
Feb  2 15:26:50 tempserver kernel: [ 8587.315941] scsi 8:0:0:3: [sdk] CDB: 
Feb  2 15:26:50 tempserver kernel: [ 8587.315943] Write(16): 8a 00 00 00 00 00 00 0c 3e 80 00 00 00 80 00 00
Feb  2 15:26:50 tempserver kernel: [ 8587.315951] end_request: I/O error, dev sdk, sector 802432
Feb  2 15:26:50 tempserver kernel: [ 8587.315963] md/raid1:md126: Disk failure on sdk2, disabling device.
Feb  2 15:26:50 tempserver kernel: [ 8587.315963] md/raid1:md126: Operation continuing on 1 devices.result of which from kern.log was
I then tried
 dd if=/dev/disk/by-id/ata-WDC_WD4001FFSX-68JNUN0_WD-WMC130FACU91 of=/dev/null skip=802500 count=1000
which produced no error.  The input is from the device previously known as sdk.  So I can at least read from the sector that was associated with the error message above.

BTW, the array I'm trying to mirror is on a part of the disk that I think is still good.

^ permalink raw reply

* Re: Fail to assemble raid4 with replaced disk
From: Santiago DIEZ @ 2017-02-02 13:33 UTC (permalink / raw)
  To: Wols Lists; +Cc: Linux Raid LIST
In-Reply-To: <581769E1.6060403@youngman.org.uk>

Hi,

I never said THANKS.

Never too late ;o)

-------------------------
Santiago DIEZ
-------------------------
Quark Systems & CAOBA
23 rue du Buisson Saint-Louis, 75010 Paris
-------------------------


On Mon, Oct 31, 2016 at 4:57 PM, Wols Lists <antlists@youngman.org.uk> wrote:
> On 27/10/16 15:11, Santiago DIEZ wrote:
>> Hi,
>>
>> Indeed, here is what I had in terms of event count:
>> /dev/sda10: 81589
>> /dev/sdb10: 81626
>> /dev/sdc10: 81589
>>
>> Then the following procedure worked quite straightforward:
>> --------------------------------------------------------------------------------
>> # mdadm --assemble /dev/md10 --verbose --force /dev/sda10 /dev/sdb10 /dev/sdc10
>> # mdadm --manage /dev/md10 --add /dev/sdd10
>> --------------------------------------------------------------------------------
>>
>> And 6h+ later:
>> --------------------------------------------------------------------------------
>> # cat /proc/mdstat
>> Personalities : [raid1] [raid6] [raid5] [raid4]
>> md10 : active raid5 sdd10[3] sda10[0] sdc10[2] sdb10[1]
>>       5778741888 blocks level 5, 64k chunk, algorithm 2 [4/4] [UUUU]
>> --------------------------------------------------------------------------------
>>
>> Then I ran:
>> --------------------------------------------------------------------------------
>> # e2fsck -f -n -t -v /dev/md10
>> e2fsck 1.42.5 (29-Jul-2012)
>> Pass 1: Checking inodes, blocks, and sizes
>> Pass 2: Checking directory structure
>> Pass 3: Checking directory connectivity
>> Pass 4: Checking reference counts
>> Pass 5: Checking group summary information
>>
>>     15675837 inodes used (4.34%, out of 361177088)
>>       188798 non-contiguous files (1.2%)
>>        14751 non-contiguous directories (0.1%)
>>              # of inodes with ind/dind/tind blocks: 0/0/0
>>              Extent depth histogram: 15626455/47037/15
>>   1281308341 blocks used (88.69%, out of 1444685472)
>>            0 bad blocks
>>          101 large files
>>
>>     15311457 regular files
>>       361754 directories
>>            0 character device files
>>            0 block device files
>>            0 fifos
>>            0 links
>>         2607 symbolic links (2310 fast symbolic links)
>>           10 sockets
>> ------------
>>     15675828 files
>> Memory used: 50976k/1912k (20541k/30436k), time: 1304.00/334.06/ 8.00
>> I/O read: 4891MB, write: 0MB, rate: 3.75MB/s
>> --------------------------------------------------------------------------------
>>
>> Does it look OK enough to launch the mount?
>>
> sorry - I've been away for the weekend - daughter's wedding :-)
>
> But yes, that looks great. No errors on fsck either, I think :-)
>
> I think your array looks fine. Just look at the output from smartctl for
> your old drives and make sure that it doesn't look like another drive is
> going to fail soon. I'm not quite sure what to look for, mostly bad
> blocks and relocates, I think, but if you compare it with your new drive
> and stuff looks dodgy, you can always ask for help.
>
> Cheers,
> Wol
>

^ permalink raw reply

* Re: [PATCH] MD: add doc for raid5-cache
From: Jure Erznožnik @ 2017-02-02  6:54 UTC (permalink / raw)
  To: Shaohua Li, linux-raid
In-Reply-To: <3d68e5aa-5c2e-4cb0-ba57-45246041ffe6@gmail.com>

If I may, I'd also like to see the following in the manual:

1. Instructions on how to set up the cache. So far I have seen how to
change mode, but not how to even get to the part where you can (change
the mode)
2. List of all tweaking parameters with descriptions on what they do

Thanks for the fine work!

LP,
Jure

On Thu, Feb 2, 2017 at 7:33 AM, Ram Ramesh <rramesh2400@gmail.com> wrote:
> On 01/31/2017 01:18 PM, Shaohua Li wrote:
>>
>> I'm starting document of the raid5-cache feature. Please let me know
>> what else we should put into the document. Of course, comments are
>> welcome!
>>
>> Signed-off-by: Shaohua Li <shli@fb.com>
>> ---
>>   Documentation/md/raid5-cache.txt | 99
>> ++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 99 insertions(+)
>>   create mode 100644 Documentation/md/raid5-cache.txt
>>
>> diff --git a/Documentation/md/raid5-cache.txt
>> b/Documentation/md/raid5-cache.txt
>> new file mode 100644
>> index 0000000..17a6279
>> --- /dev/null
>> +++ b/Documentation/md/raid5-cache.txt
>> @@ -0,0 +1,99 @@
>> +RAID5 cache
>> +
>> +Raid 4/5/6 could include an extra disk for data cache. The cache could be
>> +in write-through or write-back mode. mdadm has a new option
>> +'--write-journal' to create array with cache. By default (raid array
>> +starts), the cache is in write-through mode. User can switch it to
>> +write-back mode by:
>> +
>> +echo "write-back" > /sys/block/md0/md/journal_mode
>> +
>> +And switch it back to write-through mode by:
>> +
>> +echo "write-through" > /sys/block/md0/md/journal_mode
>> +
>> +In both modes, all writes to the array will hit cache disk first. This
>> means
>> +the cache disk must be fast and sustainable (if you use a SSD as the
>> cache).
>> +
>> +-------------------------------------
>> +write-through mode:
>> +
>> +This mode mainly fixes 'write hole' issue. For RAID 4/5/6 array, an
>> +unclean shutdown could cause data in some stripes is not in consistent
>> +state, eg, data and parity don't match. The reason is a stripe write
>> +involves several raid disks and it's possible writes don't hit all raid
>> +disks yet before the unclean shutdown. After an unclean shutdown, MD try
>> +to 'resync' the array to put all stripes back into consistent state. In
>> +the resync, any disk failure will cause real data corruption. This
>> problem
>> +is called 'write hole'. So the 'write hole' issue occurs between unclean
>> +shutdown and 'resync'. This window isn't big. On the other hand, if one
>> +disk fails, other disks could fail soon, which happens sometimes if the
>> +disks are from the same vendor and manufactured in the same time. This
>> +will increase the chance of 'write whole', but overall the chance isn't
>> +big, so don't panic even not using cache disk.
>> +
>> +The write-through cache will cache all data in cache disk first. Until
>> the
>> +data hits into the cache disk, the data is flushed into RAID disks. The
>> +two-step write will guarantee MD can recover correct data after unclean
>> +shutdown even with disk failure. Thus the cache can close the 'write
>> +hole'.
>> +
>> +In write-through mode, MD reports IO finish to upper layer (usually
>> +filesystems) till the data hits RAID disks, so cache disk failure doesn't
>> +cause data lost. Of course cache disk failure means the array is exposed
>> +into 'write hole' again.
>> +
>> +--------------------------------------
>> +write-back mode:
>> +
>> +write-back mode fixes the 'write hole' issue too, since all write data is
>> +cached in cache disk. But the main goal of 'write-back' cache is to speed
>> up
>> +write. If a write crosses all raid disks of a stripe, we call it
>> full-stripe
>> +write. For non-full-stripe write, MD must do a read-modify-write. The
>> extra
>> +read (for data in other disks) and write (for parity) introduce a lot of
>> +overhead. Some writes which are sequential but not dispatched in the same
>> time
>> +will suffer from this overhead too. write-back cache will aggregate the
>> data
>> +and flush the data to raid disks till the data becomes a full stripe
>> write.
>> +This will completely avoid the overhead, so it's very helpful for some
>> +workloads. A typical workload which does sequential write and follows
>> fsync is
>> +an example.
>> +
>> +In write-back mode, MD reports IO finish to upper layer (usually
>> filesystems)
>> +right after the data hit cache disk. The data is flushed to raid disks
>> later
>> +after specific conditions met. So cache disk failure will cause data
>> lost.
>> +
>> +--------------------------------------
>> +The implementation:
>> +
>> +The write-through and write-back cache use the same disk format. The
>> cache disk
>> +is organized as a simple write log. The log consists of 'meta data' and
>> 'data'
>> +pairs. The meta data describes the data. It also includes checksum and
>> sequence
>> +ID for recovery identification. Data could be IO data and parity data.
>> Data is
>> +checksumed too. The checksum is stored in the meta data ahead of the
>> data. The
>> +checksum is an optimization because MD can write meta and data freely
>> without
>> +worry about the order. MD superblock has a field pointed to the valid
>> meta data
>> +of log head.
>> +
>> +The log implementation is pretty straightforward. The difficult part is
>> the
>> +order MD write data to cache disk and raid disks. Specifically, in
>> +write-through mode, MD calculates parity for IO data, writes both IO data
>> and
>> +parity to the log, write the data and parity to raid disks after the data
>> and
>> +parity is settled down in log and finally the IO is finished. Read just
>> reads
>> +from raid disks as usual.
>> +
>> +In write-back mode, MD writes IO data to the log and reports IO finish.
>> The
>> +data is also fully cached in memory at that time, which means read must
>> query
>> +memory cache. If some conditions are met, MD will flush the data to raid
>> disks.
>> +MD will calculate parity for the data and write parity into the log.
>> After this
>> +is finished, MD will write both data and parity into raid disks, then MD
>> can
>> +release the memory cache. The flush conditions could be stripe becomes a
>> full
>> +stripe write, free cache disk space is low or in-kernel memory cache
>> space is
>> +low.
>> +
>> +After an unclean shutdown, MD does recovery. MD reads all meta data and
>> data
>> +from the log. The sequence ID and checksum will help us detect corrupted
>> meta
>> +data and data. If MD finds a stripe with data and valid parities (1
>> parity for
>> +raid4/5 and 2 for raid6), MD will write the data and parities to raid
>> disks. If
>> +parities are incompleted, they are discarded. If part of data is
>> corrupted,
>> +they are discarded too. MD then loads valid data and writes them to raid
>> disks
>> +in normal way.
>
>
> Which version of mdadm/kernel supports this feature? Is it already released
> or in the process?
>
> Ramesh
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] MD: add doc for raid5-cache
From: Ram Ramesh @ 2017-02-02  6:33 UTC (permalink / raw)
  To: Shaohua Li, linux-raid
In-Reply-To: <25051bd79d94b45c7be24ce466a8b6eb2fba66c0.1485890144.git.shli@fb.com>

On 01/31/2017 01:18 PM, Shaohua Li wrote:
> I'm starting document of the raid5-cache feature. Please let me know
> what else we should put into the document. Of course, comments are
> welcome!
>
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>   Documentation/md/raid5-cache.txt | 99 ++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 99 insertions(+)
>   create mode 100644 Documentation/md/raid5-cache.txt
>
> diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt
> new file mode 100644
> index 0000000..17a6279
> --- /dev/null
> +++ b/Documentation/md/raid5-cache.txt
> @@ -0,0 +1,99 @@
> +RAID5 cache
> +
> +Raid 4/5/6 could include an extra disk for data cache. The cache could be
> +in write-through or write-back mode. mdadm has a new option
> +'--write-journal' to create array with cache. By default (raid array
> +starts), the cache is in write-through mode. User can switch it to
> +write-back mode by:
> +
> +echo "write-back" > /sys/block/md0/md/journal_mode
> +
> +And switch it back to write-through mode by:
> +
> +echo "write-through" > /sys/block/md0/md/journal_mode
> +
> +In both modes, all writes to the array will hit cache disk first. This means
> +the cache disk must be fast and sustainable (if you use a SSD as the cache).
> +
> +-------------------------------------
> +write-through mode:
> +
> +This mode mainly fixes 'write hole' issue. For RAID 4/5/6 array, an
> +unclean shutdown could cause data in some stripes is not in consistent
> +state, eg, data and parity don't match. The reason is a stripe write
> +involves several raid disks and it's possible writes don't hit all raid
> +disks yet before the unclean shutdown. After an unclean shutdown, MD try
> +to 'resync' the array to put all stripes back into consistent state. In
> +the resync, any disk failure will cause real data corruption. This problem
> +is called 'write hole'. So the 'write hole' issue occurs between unclean
> +shutdown and 'resync'. This window isn't big. On the other hand, if one
> +disk fails, other disks could fail soon, which happens sometimes if the
> +disks are from the same vendor and manufactured in the same time. This
> +will increase the chance of 'write whole', but overall the chance isn't
> +big, so don't panic even not using cache disk.
> +
> +The write-through cache will cache all data in cache disk first. Until the
> +data hits into the cache disk, the data is flushed into RAID disks. The
> +two-step write will guarantee MD can recover correct data after unclean
> +shutdown even with disk failure. Thus the cache can close the 'write
> +hole'.
> +
> +In write-through mode, MD reports IO finish to upper layer (usually
> +filesystems) till the data hits RAID disks, so cache disk failure doesn't
> +cause data lost. Of course cache disk failure means the array is exposed
> +into 'write hole' again.
> +
> +--------------------------------------
> +write-back mode:
> +
> +write-back mode fixes the 'write hole' issue too, since all write data is
> +cached in cache disk. But the main goal of 'write-back' cache is to speed up
> +write. If a write crosses all raid disks of a stripe, we call it full-stripe
> +write. For non-full-stripe write, MD must do a read-modify-write. The extra
> +read (for data in other disks) and write (for parity) introduce a lot of
> +overhead. Some writes which are sequential but not dispatched in the same time
> +will suffer from this overhead too. write-back cache will aggregate the data
> +and flush the data to raid disks till the data becomes a full stripe write.
> +This will completely avoid the overhead, so it's very helpful for some
> +workloads. A typical workload which does sequential write and follows fsync is
> +an example.
> +
> +In write-back mode, MD reports IO finish to upper layer (usually filesystems)
> +right after the data hit cache disk. The data is flushed to raid disks later
> +after specific conditions met. So cache disk failure will cause data lost.
> +
> +--------------------------------------
> +The implementation:
> +
> +The write-through and write-back cache use the same disk format. The cache disk
> +is organized as a simple write log. The log consists of 'meta data' and 'data'
> +pairs. The meta data describes the data. It also includes checksum and sequence
> +ID for recovery identification. Data could be IO data and parity data. Data is
> +checksumed too. The checksum is stored in the meta data ahead of the data. The
> +checksum is an optimization because MD can write meta and data freely without
> +worry about the order. MD superblock has a field pointed to the valid meta data
> +of log head.
> +
> +The log implementation is pretty straightforward. The difficult part is the
> +order MD write data to cache disk and raid disks. Specifically, in
> +write-through mode, MD calculates parity for IO data, writes both IO data and
> +parity to the log, write the data and parity to raid disks after the data and
> +parity is settled down in log and finally the IO is finished. Read just reads
> +from raid disks as usual.
> +
> +In write-back mode, MD writes IO data to the log and reports IO finish. The
> +data is also fully cached in memory at that time, which means read must query
> +memory cache. If some conditions are met, MD will flush the data to raid disks.
> +MD will calculate parity for the data and write parity into the log. After this
> +is finished, MD will write both data and parity into raid disks, then MD can
> +release the memory cache. The flush conditions could be stripe becomes a full
> +stripe write, free cache disk space is low or in-kernel memory cache space is
> +low.
> +
> +After an unclean shutdown, MD does recovery. MD reads all meta data and data
> +from the log. The sequence ID and checksum will help us detect corrupted meta
> +data and data. If MD finds a stripe with data and valid parities (1 parity for
> +raid4/5 and 2 for raid6), MD will write the data and parities to raid disks. If
> +parities are incompleted, they are discarded. If part of data is corrupted,
> +they are discarded too. MD then loads valid data and writes them to raid disks
> +in normal way.

Which version of mdadm/kernel supports this feature? Is it already 
released or in the process?

Ramesh


^ permalink raw reply

* Re: [PATCH 3/6] async_tx: Handle DMA devices having support for fewer PQ coefficients
From: Dan Williams @ 2017-02-02  6:01 UTC (permalink / raw)
  To: Anup Patel
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, bcm-kernel-feedback-list, dmaengine@vger.kernel.org,
	devicetree, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-crypto, linux-raid
In-Reply-To: <1486010836-25228-4-git-send-email-anup.patel@broadcom.com>

On Wed, Feb 1, 2017 at 8:47 PM, Anup Patel <anup.patel@broadcom.com> wrote:
> The DMAENGINE framework assumes that if PQ offload is supported by a
> DMA device then all 256 PQ coefficients are supported. This assumption
> does not hold anymore because we now have BCM-SBA-RAID offload engine
> which supports PQ offload with limited number of PQ coefficients.
>
> This patch extends async_tx APIs to handle DMA devices with support
> for fewer PQ coefficients.
>
> Signed-off-by: Anup Patel <anup.patel@broadcom.com>
> Reviewed-by: Scott Branden <scott.branden@broadcom.com>
> ---
>  crypto/async_tx/async_pq.c          |  3 +++
>  crypto/async_tx/async_raid6_recov.c | 12 ++++++++++--
>  include/linux/dmaengine.h           | 19 +++++++++++++++++++
>  include/linux/raid/pq.h             |  3 +++
>  4 files changed, 35 insertions(+), 2 deletions(-)

So, I hate the way async_tx does these checks on each operation, and
it's ok for me to say that because it's my fault. Really it's md that
should be validating engine offload capabilities once at the beginning
of time. I'd rather we move in that direction than continue to pile
onto a bad design.

^ permalink raw reply

* [PATCH 6/6] dt-bindings: Add DT bindings document for Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-02  4:47 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list-dY08KVG/lbpWk0Htik3J/w,
	dmaengine-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA,
	linux-raid-u79uwXL29TY76Z2rM5mHXA, Anup Patel
In-Reply-To: <1486010836-25228-1-git-send-email-anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>

This patch adds the DT bindings document for newly added Broadcom
SBA RAID driver.

Signed-off-by: Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
Reviewed-by: Ray Jui <ray.jui-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
Reviewed-by: Scott Branden <scott.branden-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
---
 .../devicetree/bindings/dma/brcm,iproc-sba.txt     | 29 ++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt

diff --git a/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt b/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt
new file mode 100644
index 0000000..092913a
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt
@@ -0,0 +1,29 @@
+* Broadcom SBA RAID engine
+
+Required properties:
+- compatible: Should be one of the following
+	      "brcm,iproc-sba"
+	      "brcm,iproc-sba-v2"
+  The "brcm,iproc-sba" has support for only 6 PQ coefficients
+  The "brcm,iproc-sba-v2" has support for only 30 PQ coefficients
+- mboxes: List of phandle and mailbox channel specifiers
+
+Example:
+
+raid_mbox: mbox@67400000 {
+	...
+	#mbox-cells = <3>;
+	...
+};
+
+raid0 {
+	compatible = "brcm,iproc-sba-v2";
+	mboxes = <&raid_mbox 0 0x1 0xffff>,
+		 <&raid_mbox 1 0x1 0xffff>,
+		 <&raid_mbox 2 0x1 0xffff>,
+		 <&raid_mbox 3 0x1 0xffff>,
+		 <&raid_mbox 4 0x1 0xffff>,
+		 <&raid_mbox 5 0x1 0xffff>,
+		 <&raid_mbox 6 0x1 0xffff>,
+		 <&raid_mbox 7 0x1 0xffff>;
+};
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH 5/6] dmaengine: Add Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-02  4:47 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1486010836-25228-1-git-send-email-anup.patel@broadcom.com>

The Broadcom stream buffer accelerator (SBA) provides offloading
capabilities for RAID operations. This SBA offload engine is
accessible via Broadcom SoC specific ring manager.

This patch adds Broadcom SBA RAID driver which provides one
DMA device with RAID capabilities using one or more Broadcom
SoC specific ring manager channels. The SBA RAID driver in its
current shape implements memcpy, xor, and pq operations.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
---
 drivers/dma/Kconfig        |   13 +
 drivers/dma/Makefile       |    1 +
 drivers/dma/bcm-sba-raid.c | 1309 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1323 insertions(+)
 create mode 100644 drivers/dma/bcm-sba-raid.c

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 263495d..58d0463 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -99,6 +99,19 @@ config AXI_DMAC
 	  controller is often used in Analog Device's reference designs for FPGA
 	  platforms.
 
+config BCM_SBA_RAID
+        tristate "Broadcom SBA RAID engine support"
+        depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
+        select DMA_ENGINE
+        select DMA_ENGINE_RAID
+	select ASYNC_TX_ENABLE_CHANNEL_SWITCH
+	default ARCH_BCM_IPROC
+        help
+	  Enable support for Broadcom SBA RAID Engine. The SBA RAID
+	  engine is available on most of the Broadcom iProc SoCs. It
+	  has the capability to offload memcpy, xor and pq computation
+	  for raid5/6.
+
 config COH901318
 	bool "ST-Ericsson COH901318 DMA support"
 	select DMA_ENGINE
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index a4fa336..ba96bdd 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_AMCC_PPC440SPE_ADMA) += ppc4xx/
 obj-$(CONFIG_AT_HDMAC) += at_hdmac.o
 obj-$(CONFIG_AT_XDMAC) += at_xdmac.o
 obj-$(CONFIG_AXI_DMAC) += dma-axi-dmac.o
+obj-$(CONFIG_BCM_SBA_RAID) += bcm-sba-raid.o
 obj-$(CONFIG_COH901318) += coh901318.o coh901318_lli.o
 obj-$(CONFIG_DMA_BCM2835) += bcm2835-dma.o
 obj-$(CONFIG_DMA_JZ4740) += dma-jz4740.o
diff --git a/drivers/dma/bcm-sba-raid.c b/drivers/dma/bcm-sba-raid.c
new file mode 100644
index 0000000..bf39f3f
--- /dev/null
+++ b/drivers/dma/bcm-sba-raid.c
@@ -0,0 +1,1309 @@
+/*
+ * Copyright (C) 2017 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Broadcom SBA RAID Driver
+ *
+ * The Broadcom stream buffer accelerator (SBA) provides offloading
+ * capabilities for RAID operations. The SBA offload engine is accessible
+ * via Broadcom SoC specific ring manager. Two or more offload engines
+ * can share same Broadcom SoC specific ring manager due to this Broadcom
+ * SoC specific ring manager driver is implemented as a mailbox controller
+ * driver and offload engine drivers are implemented as mallbox clients.
+ *
+ * Typically, Broadcom SoC specific ring manager will implement larger
+ * number of hardware rings over one or more SBA hardware devices. By
+ * design, the internal buffer size of SBA hardware device is limited
+ * but all offload operations supported by SBA can be broken down into
+ * multiple small size requests and executed parallely on multiple SBA
+ * hardware devices for achieving high through-put.
+ *
+ * The Broadcom SBA RAID driver does not require any register programming
+ * except submitting request to SBA hardware device via mailbox channels.
+ * This driver implements a DMA device with one DMA channel using a set
+ * of mailbox channels provided by Broadcom SoC specific ring manager
+ * driver. To exploit parallelism (as described above), all DMA request
+ * coming to SBA RAID DMA channel are broken down to smaller requests
+ * and submitted to multiple mailbox channels in round-robin fashion.
+ * For having more SBA DMA channels, we can create more SBA device nodes
+ * in Broadcom SoC specific DTS based on number of hardware rings supported
+ * by Broadcom SoC ring manager.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+#include <linux/list.h>
+#include <linux/mailbox_client.h>
+#include <linux/mailbox/brcm-message.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/slab.h>
+#include <linux/raid/pq.h>
+
+#include "dmaengine.h"
+
+/* SBA command helper macros */
+#define SBA_DEC(_d, _s, _m)		(((_d) >> (_s)) & (_m))
+#define SBA_ENC(_d, _v, _s, _m)					\
+		do {						\
+			(_d) &= ~((u64)(_m) << (_s));		\
+			(_d) |= (((u64)(_v) & (_m)) << (_s));	\
+		} while (0)
+
+/* SBA command related defines */
+#define SBA_TYPE_SHIFT					48
+#define SBA_TYPE_MASK					0x3
+#define SBA_TYPE_A					0x0
+#define SBA_TYPE_B					0x2
+#define SBA_TYPE_C					0x3
+#define SBA_USER_DEF_SHIFT				32
+#define SBA_USER_DEF_MASK				0xffff
+#define SBA_R_MDATA_SHIFT				24
+#define SBA_R_MDATA_MASK				0xff
+#define SBA_C_MDATA_MS_SHIFT				18
+#define SBA_C_MDATA_MS_MASK				0x3
+#define SBA_INT_SHIFT					17
+#define SBA_INT_MASK					0x1
+#define SBA_RESP_SHIFT					16
+#define SBA_RESP_MASK					0x1
+#define SBA_C_MDATA_SHIFT				8
+#define SBA_C_MDATA_MASK				0xff
+#define SBA_CMD_SHIFT					0
+#define SBA_CMD_MASK					0xf
+#define SBA_CMD_ZERO_ALL_BUFFERS			0x8
+#define SBA_CMD_LOAD_BUFFER				0x9
+#define SBA_CMD_XOR					0xa
+#define SBA_CMD_GALOIS_XOR				0xb
+#define SBA_CMD_ZERO_BUFFER				0x4
+#define SBA_CMD_WRITE_BUFFER				0xc
+
+/* SBA C_MDATA helper macros */
+#define SBA_C_MDATA_LOAD_VAL(__bnum0)		((__bnum0) & 0x3)
+#define SBA_C_MDATA_WRITE_VAL(__bnum0)		((__bnum0) & 0x3)
+#define SBA_C_MDATA_XOR_VAL(__bnum1, __bnum0)			\
+			({	u32 __v = ((__bnum0) & 0x3);	\
+				__v |= ((__bnum1) & 0x3) << 2;	\
+				__v;				\
+			})
+#define SBA_C_MDATA_PQ_VAL(__dnum, __bnum1, __bnum0)		\
+			({	u32 __v = ((__bnum0) & 0x3);	\
+				__v |= ((__bnum1) & 0x3) << 2;	\
+				__v |= ((__dnum) & 0x1f) << 5;	\
+				__v;				\
+			})
+#define SBA_C_MDATA_LS(__c_mdata_val)	((__c_mdata_val) & 0xff)
+#define SBA_C_MDATA_MS(__c_mdata_val)	(((__c_mdata_val) >> 8) & 0x3)
+
+/* Driver helper macros */
+#define to_sba_request(tx)		\
+	container_of(tx, struct sba_request, tx)
+#define to_sba_device(dchan)		\
+	container_of(dchan, struct sba_device, dma_chan)
+
+enum sba_request_state {
+	SBA_REQUEST_STATE_FREE = 1,
+	SBA_REQUEST_STATE_ALLOCED = 2,
+	SBA_REQUEST_STATE_PENDING = 3,
+	SBA_REQUEST_STATE_ACTIVE = 4,
+	SBA_REQUEST_STATE_COMPLETED = 5,
+	SBA_REQUEST_STATE_ABORTED = 6,
+};
+
+struct sba_request {
+	struct list_head node;
+	struct sba_device *sba;
+	enum sba_request_state state;
+	bool fence;
+	void *resp;
+	dma_addr_t resp_dma;
+	struct brcm_sba_command *cmds;
+	struct brcm_message *msgs;
+	struct brcm_message bmsg;
+	atomic_t msgs_pending_count;
+	struct dma_async_tx_descriptor tx;
+};
+
+enum sba_version {
+	SBA_VER_1 = 0,
+	SBA_VER_2
+};
+
+struct sba_device {
+	/* Underlying device */
+	struct device *dev;
+	/* DT configuration parameters */
+	enum sba_version ver;
+	u32 max_req;
+	u32 req_size;
+	/* Derived configuration parameters */
+	u32 hw_buf_size;
+	u32 hw_resp_size;
+	u32 max_pq_coefs;
+	u32 max_pq_srcs;
+	u32 max_msg_per_req;
+	u32 max_cmd_per_msg;
+	u32 max_cmd_per_req;
+	u32 max_xor_srcs;
+	u32 max_resp_pool_size;
+	u32 max_cmds_pool_size;
+	/* Maibox client and Mailbox channels */
+	struct mbox_client client;
+	int mchans_count;
+	atomic_t mchans_current;
+	struct mbox_chan **mchans;
+	struct device *mbox_dev;
+	/* DMA device and DMA channel */
+	struct dma_device dma_dev;
+	struct dma_chan dma_chan;
+	/* DMA channel resources */
+	void *resp_base;
+	dma_addr_t resp_dma_base;
+	void *cmds_base;
+	dma_addr_t cmds_dma_base;
+	spinlock_t reqs_lock;
+	struct sba_request *reqs;
+	bool reqs_fence;
+	struct list_head reqs_alloc_list;
+	struct list_head reqs_pending_list;
+	struct list_head reqs_active_list;
+	struct list_head reqs_completed_list;
+	struct list_head reqs_aborted_list;
+	struct list_head reqs_free_list;
+	int reqs_free_count;
+};
+
+/* ====== Channel resource management routines ===== */
+
+static struct sba_request *sba_alloc_request(struct sba_device *sba)
+{
+	unsigned long flags;
+	struct sba_request *req = NULL;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	if (!list_empty(&sba->reqs_free_list)) {
+		req = list_first_entry(&sba->reqs_free_list,
+				       struct sba_request,
+				       node);
+
+		req->state = SBA_REQUEST_STATE_ALLOCED;
+		req->fence = false;
+		atomic_set(&req->msgs_pending_count, 0);
+		list_move_tail(&req->node, &sba->reqs_alloc_list);
+		sba->reqs_free_count--;
+
+		dma_async_tx_descriptor_init(&req->tx, &sba->dma_chan);
+	}
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+
+	return req;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static void _sba_pending_request(struct sba_device *sba,
+				 struct sba_request *req)
+{
+	req->state = SBA_REQUEST_STATE_PENDING;
+	list_move_tail(&req->node, &sba->reqs_pending_list);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static bool _sba_active_request(struct sba_device *sba,
+				struct sba_request *req)
+{
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+	if (sba->reqs_fence)
+		return false;
+	req->state = SBA_REQUEST_STATE_ACTIVE;
+	list_move_tail(&req->node, &sba->reqs_active_list);
+	if (req->fence)
+		sba->reqs_fence = true;
+	return true;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static void _sba_abort_request(struct sba_device *sba,
+			       struct sba_request *req)
+{
+	req->state = SBA_REQUEST_STATE_ABORTED;
+	list_move_tail(&req->node, &sba->reqs_aborted_list);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static void _sba_free_request(struct sba_device *sba,
+			      struct sba_request *req)
+{
+	req->state = SBA_REQUEST_STATE_FREE;
+	list_move_tail(&req->node, &sba->reqs_free_list);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+	sba->reqs_free_count++;
+}
+
+static void sba_complete_request(struct sba_request *req)
+{
+	unsigned long flags;
+	struct sba_device *sba = req->sba;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+	req->state = SBA_REQUEST_STATE_COMPLETED;
+	list_move_tail(&req->node, &sba->reqs_completed_list);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_free_request(struct sba_request *req)
+{
+	unsigned long flags;
+	struct sba_device *sba = req->sba;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+	_sba_free_request(sba, req);
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static int sba_free_request_count(struct sba_device *sba)
+{
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+	ret = sba->reqs_free_count;
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+
+	return ret;
+}
+
+static void sba_cleanup_inflight_requests(struct sba_device *sba)
+{
+	unsigned long flags;
+	struct sba_request *req, *req1;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Freeup all alloced request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_alloc_list, node) {
+		_sba_free_request(sba, req);
+	}
+
+	/* Freeup all pending request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_pending_list, node) {
+		if (req->bmsg.batch.msgs_queued < req->bmsg.batch.msgs_count)
+			/* Set partially-queued request as aborted */
+			_sba_abort_request(sba, req);
+		else
+			/* Freeup rest of the pending request */
+			_sba_free_request(sba, req);
+	}
+
+	/* Freeup all completed request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_completed_list, node) {
+		_sba_free_request(sba, req);
+	}
+
+	/* Set all active requests as aborted */
+	list_for_each_entry_safe(req, req1, &sba->reqs_active_list, node) {
+		_sba_abort_request(sba, req);
+	}
+
+	/*
+	 * Note: We expect that aborted request will be eventually
+	 * freed by sba_receive_message()
+	 */
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+/* ====== DMAENGINE callbacks ===== */
+
+static int sba_alloc_chan_resources(struct dma_chan *dchan)
+{
+	/*
+	 * We only have one channel so we have pre-alloced
+	 * channel resources. Over here we just return number
+	 * of free request.
+	 */
+	return sba_free_request_count(to_sba_device(dchan));
+}
+
+static void sba_free_chan_resources(struct dma_chan *dchan)
+{
+	/*
+	 * Channel resources are pre-alloced so we just free-up
+	 * whatever we can so that we can re-use pre-alloced
+	 * channel resources next time.
+	 */
+	sba_cleanup_inflight_requests(to_sba_device(dchan));
+}
+
+static int sba_send_mbox_request(struct sba_device *sba,
+				 struct sba_request *req)
+{
+	int mchans_idx, ret = 0;
+
+	/* Select mailbox channel in round-robin fashion */
+	mchans_idx = atomic_inc_return(&sba->mchans_current);
+	mchans_idx = mchans_idx % sba->mchans_count;
+
+	/* Send batch message for the request */
+	req->bmsg.batch.msgs_queued = 0;
+	ret = mbox_send_message(sba->mchans[mchans_idx], &req->bmsg);
+	if (ret < 0) {
+		dev_info(sba->dev, "channel %d message %d (total %d)",
+			 mchans_idx, req->bmsg.batch.msgs_queued,
+			 req->bmsg.batch.msgs_count);
+		dev_err(sba->dev, "send message failed with error %d", ret);
+		return ret;
+	}
+	ret = req->bmsg.error;
+	if (ret < 0) {
+		dev_info(sba->dev,
+			 "mbox channel %d message %d (total %d)",
+			 mchans_idx, req->bmsg.batch.msgs_queued,
+			 req->bmsg.batch.msgs_count);
+		dev_err(sba->dev, "message error %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void sba_issue_pending(struct dma_chan *dchan)
+{
+	int ret;
+	unsigned long flags;
+	struct sba_request *req, *req1;
+	struct sba_device *sba = to_sba_device(dchan);
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Process all pending request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_pending_list, node) {
+		/* Try to make request active */
+		if (!_sba_active_request(sba, req))
+			break;
+
+		/* Send request to mailbox channel */
+		spin_unlock_irqrestore(&sba->reqs_lock, flags);
+		ret = sba_send_mbox_request(sba, req);
+		spin_lock_irqsave(&sba->reqs_lock, flags);
+
+		/* If something went wrong then keep request pending */
+		if (ret < 0) {
+			_sba_pending_request(sba, req);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static dma_cookie_t sba_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	unsigned long flags;
+	dma_cookie_t cookie;
+	struct sba_request *req;
+	struct sba_device *sba;
+
+	if (unlikely(!tx))
+		return -EINVAL;
+
+	sba = to_sba_device(tx->chan);
+	req = to_sba_request(tx);
+
+	/* Assign cookie and mark request pending */
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+	cookie = dma_cookie_assign(tx);
+	_sba_pending_request(sba, req);
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+
+	/* Try to submit pending request */
+	sba_issue_pending(&sba->dma_chan);
+
+	return cookie;
+}
+
+static enum dma_status sba_tx_status(struct dma_chan *dchan,
+				     dma_cookie_t cookie,
+				     struct dma_tx_state *txstate)
+{
+	int mchan_idx;
+	enum dma_status ret;
+	struct sba_device *sba = to_sba_device(dchan);
+
+	ret = dma_cookie_status(dchan, cookie, txstate);
+	if (ret == DMA_COMPLETE)
+		return ret;
+
+	for (mchan_idx = 0; mchan_idx < sba->mchans_count; mchan_idx++)
+		mbox_client_peek_data(sba->mchans[mchan_idx]);
+
+	return dma_cookie_status(dchan, cookie, txstate);
+}
+
+static unsigned int sba_fillup_memcpy_msg(struct sba_request *req,
+					  struct brcm_sba_command *cmds,
+					  struct brcm_message *msg,
+					  dma_addr_t msg_offset, size_t msg_len,
+					  dma_addr_t dst, dma_addr_t src)
+{
+	u64 cmd;
+	u32 c_mdata;
+	struct brcm_sba_command *cmdsp = cmds;
+
+	/* Type-B command to load data into buf0 */
+	cmd = 0;
+	SBA_ENC(cmd, SBA_TYPE_B, SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	SBA_ENC(cmd, msg_len,
+		SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	c_mdata = SBA_C_MDATA_LOAD_VAL(0);
+	SBA_ENC(cmd, SBA_C_MDATA_LS(c_mdata),
+		SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	SBA_ENC(cmd, SBA_CMD_LOAD_BUFFER,
+		SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+	cmdsp->data = src + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Type-A command to write buf0 */
+	cmd = 0;
+	SBA_ENC(cmd, SBA_TYPE_A, SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	SBA_ENC(cmd, msg_len,
+		SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	SBA_ENC(cmd, 0x1, SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = SBA_C_MDATA_WRITE_VAL(0);
+	SBA_ENC(cmd, SBA_C_MDATA_LS(c_mdata),
+		SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	SBA_ENC(cmd, SBA_CMD_WRITE_BUFFER,
+		SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = dst + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+
+	return cmdsp - cmds;
+}
+
+static struct dma_async_tx_descriptor *
+sba_prep_dma_memcpy(struct dma_chan *dchan, dma_addr_t dst, dma_addr_t src,
+		    size_t len, unsigned long flags)
+{
+	size_t msg_len;
+	dma_addr_t msg_offset = 0;
+	unsigned int msgs_count = 0, cmds_count, cmds_idx = 0;
+	struct sba_device *sba = to_sba_device(dchan);
+	struct sba_request *req = NULL;
+
+	/* Sanity checks */
+	if (unlikely(len > sba->req_size))
+		return NULL;
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request messages */
+	while (len) {
+		msg_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size;
+		cmds_count = sba_fillup_memcpy_msg(req,
+					&req->cmds[cmds_idx],
+					&req->msgs[msgs_count],
+					msg_offset, msg_len, dst, src);
+		msgs_count++;
+		cmds_idx += cmds_count;
+		msg_offset += msg_len;
+		len -= msg_len;
+	}
+	req->bmsg.type = BRCM_MESSAGE_BATCH;
+	req->bmsg.batch.msgs = &req->msgs[0];
+	req->bmsg.batch.msgs_queued = 0;
+	req->bmsg.batch.msgs_count = msgs_count;
+	req->bmsg.ctx = req;
+	req->bmsg.error = 0;
+	atomic_set(&req->msgs_pending_count, msgs_count);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return &req->tx;
+}
+
+static unsigned int sba_fillup_xor_msg(struct sba_request *req,
+				struct brcm_sba_command *cmds,
+				struct brcm_message *msg,
+				dma_addr_t msg_offset, size_t msg_len,
+				dma_addr_t dst, dma_addr_t *src, u32 src_cnt)
+{
+	u64 cmd;
+	u32 c_mdata;
+	unsigned int i;
+	struct brcm_sba_command *cmdsp = cmds;
+
+	/* Type-B command to load data into buf0 */
+	cmd = 0;
+	SBA_ENC(cmd, SBA_TYPE_B, SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	SBA_ENC(cmd, msg_len,
+		SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	c_mdata = SBA_C_MDATA_LOAD_VAL(0);
+	SBA_ENC(cmd, SBA_C_MDATA_LS(c_mdata),
+		SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	SBA_ENC(cmd, SBA_CMD_LOAD_BUFFER,
+		SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+	cmdsp->data = src[0] + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Type-B commands to xor data with buf0 and put it back in buf0 */
+	for (i = 1; i < src_cnt; i++) {
+		cmd = 0;
+		SBA_ENC(cmd, SBA_TYPE_B, SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		SBA_ENC(cmd, msg_len,
+			SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = SBA_C_MDATA_XOR_VAL(0, 0);
+		SBA_ENC(cmd, SBA_C_MDATA_LS(c_mdata),
+			SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		SBA_ENC(cmd, SBA_CMD_XOR, SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = src[i] + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf0 */
+	cmd = 0;
+	SBA_ENC(cmd, SBA_TYPE_A, SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	SBA_ENC(cmd, msg_len,
+		SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	SBA_ENC(cmd, 0x1, SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = SBA_C_MDATA_WRITE_VAL(0);
+	SBA_ENC(cmd, SBA_C_MDATA_LS(c_mdata),
+		SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	SBA_ENC(cmd, SBA_CMD_WRITE_BUFFER,
+		SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = dst + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+
+	return cmdsp - cmds;
+}
+
+static struct dma_async_tx_descriptor *
+sba_prep_dma_xor(struct dma_chan *dchan, dma_addr_t dst, dma_addr_t *src,
+		 u32 src_cnt, size_t len, unsigned long flags)
+{
+	size_t msg_len;
+	dma_addr_t msg_offset = 0;
+	unsigned int msgs_count = 0, cmds_count, cmds_idx = 0;
+	struct sba_device *sba = to_sba_device(dchan);
+	struct sba_request *req = NULL;
+
+	/* Sanity checks */
+	if (unlikely(len > sba->req_size))
+		return NULL;
+	if (unlikely(src_cnt > sba->max_xor_srcs))
+		return NULL;
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request messages */
+	while (len) {
+		msg_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size;
+		cmds_count = sba_fillup_xor_msg(req,
+				     &req->cmds[cmds_idx],
+				     &req->msgs[msgs_count],
+				     msg_offset, msg_len,
+				     dst, src, src_cnt);
+		msgs_count++;
+		cmds_idx += cmds_count;
+		msg_offset += msg_len;
+		len -= msg_len;
+	}
+	req->bmsg.type = BRCM_MESSAGE_BATCH;
+	req->bmsg.batch.msgs = &req->msgs[0];
+	req->bmsg.batch.msgs_queued = 0;
+	req->bmsg.batch.msgs_count = msgs_count;
+	req->bmsg.ctx = req;
+	req->bmsg.error = 0;
+	atomic_set(&req->msgs_pending_count, msgs_count);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return &req->tx;
+}
+
+static unsigned int sba_fillup_pq_msg(struct sba_request *req,
+				bool pq_continue,
+				struct brcm_sba_command *cmds,
+				struct brcm_message *msg,
+				dma_addr_t msg_offset, size_t msg_len,
+				dma_addr_t *dst_p, dma_addr_t *dst_q,
+				const u8 *scf, dma_addr_t *src, u32 src_cnt)
+{
+	u64 cmd;
+	u32 c_mdata;
+	unsigned int i;
+	struct brcm_sba_command *cmdsp = cmds;
+
+	if (pq_continue) {
+		/* Type-B command to load old P into buf0 */
+		if (dst_p) {
+			cmd = 0;
+			SBA_ENC(cmd, SBA_TYPE_B,
+				SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+			SBA_ENC(cmd, msg_len,
+				SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+			c_mdata = SBA_C_MDATA_LOAD_VAL(0);
+			SBA_ENC(cmd, SBA_C_MDATA_LS(c_mdata),
+				SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+			SBA_ENC(cmd, SBA_CMD_LOAD_BUFFER,
+				SBA_CMD_SHIFT, SBA_CMD_MASK);
+			cmdsp->cmd = cmd;
+			*cmdsp->cmd_dma = cpu_to_le64(cmd);
+			cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+			cmdsp->data = *dst_p + msg_offset;
+			cmdsp->data_len = msg_len;
+			cmdsp++;
+		}
+
+		/* Type-B command to load old Q into buf1 */
+		if (dst_q) {
+			cmd = 0;
+			SBA_ENC(cmd, SBA_TYPE_B,
+				SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+			SBA_ENC(cmd, msg_len,
+				SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+			c_mdata = SBA_C_MDATA_LOAD_VAL(1);
+			SBA_ENC(cmd, SBA_C_MDATA_LS(c_mdata),
+				SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+			SBA_ENC(cmd, SBA_CMD_LOAD_BUFFER,
+				SBA_CMD_SHIFT, SBA_CMD_MASK);
+			cmdsp->cmd = cmd;
+			*cmdsp->cmd_dma = cpu_to_le64(cmd);
+			cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+			cmdsp->data = *dst_q + msg_offset;
+			cmdsp->data_len = msg_len;
+			cmdsp++;
+		}
+	} else {
+		/* Type-A command to load data into buf0 */
+		cmd = 0;
+		SBA_ENC(cmd, SBA_TYPE_A, SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		SBA_ENC(cmd, msg_len,
+			SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		SBA_ENC(cmd, SBA_CMD_ZERO_ALL_BUFFERS,
+			SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		cmdsp++;
+	}
+
+	/* Type-B commands for generate P onto buf0 and Q onto buf1 */
+	for (i = 0; i < src_cnt; i++) {
+		cmd = 0;
+		SBA_ENC(cmd, SBA_TYPE_B, SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		SBA_ENC(cmd, msg_len,
+			SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = SBA_C_MDATA_PQ_VAL(raid6_gflog[scf[i]], 1, 0);
+		SBA_ENC(cmd, SBA_C_MDATA_LS(c_mdata),
+			SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		SBA_ENC(cmd, SBA_C_MDATA_MS(c_mdata),
+			SBA_C_MDATA_MS_SHIFT, SBA_C_MDATA_MS_MASK);
+		SBA_ENC(cmd, SBA_CMD_GALOIS_XOR,
+			SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = src[i] + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf0 */
+	if (dst_p) {
+		cmd = 0;
+		SBA_ENC(cmd, SBA_TYPE_A, SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		SBA_ENC(cmd, msg_len,
+			SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		SBA_ENC(cmd, 0x1, SBA_RESP_SHIFT, SBA_RESP_MASK);
+		c_mdata = SBA_C_MDATA_WRITE_VAL(0);
+		SBA_ENC(cmd, SBA_C_MDATA_LS(c_mdata),
+			SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		SBA_ENC(cmd, SBA_CMD_WRITE_BUFFER,
+			SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		if (req->sba->hw_resp_size) {
+			cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+			cmdsp->resp = req->resp_dma;
+			cmdsp->resp_len = req->sba->hw_resp_size;
+		}
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+		cmdsp->data = *dst_p + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf1 */
+	if (dst_q) {
+		cmd = 0;
+		SBA_ENC(cmd, SBA_TYPE_A, SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		SBA_ENC(cmd, msg_len,
+			SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		SBA_ENC(cmd, 0x1, SBA_RESP_SHIFT, SBA_RESP_MASK);
+		c_mdata = SBA_C_MDATA_WRITE_VAL(1);
+		SBA_ENC(cmd, SBA_C_MDATA_LS(c_mdata),
+			SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		SBA_ENC(cmd, SBA_CMD_WRITE_BUFFER,
+			SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		if (req->sba->hw_resp_size) {
+			cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+			cmdsp->resp = req->resp_dma;
+			cmdsp->resp_len = req->sba->hw_resp_size;
+		}
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+		cmdsp->data = *dst_q + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+
+	return cmdsp - cmds;
+}
+
+static struct dma_async_tx_descriptor *
+sba_prep_dma_pq(struct dma_chan *dchan, dma_addr_t *dst, dma_addr_t *src,
+		u32 src_cnt, const u8 *scf, size_t len, unsigned long flags)
+{
+	u32 i;
+	size_t dst_count, msg_len;
+	unsigned int msgs_count = 0, cmds_count, cmds_idx = 0;
+	dma_addr_t *dst_p = NULL, *dst_q = NULL;
+	dma_addr_t msg_offset = 0;
+	struct sba_device *sba = to_sba_device(dchan);
+	struct sba_request *req = NULL;
+
+	/* Sanity checks */
+	if (unlikely(len > sba->req_size))
+		return NULL;
+	if (unlikely(src_cnt > sba->max_pq_srcs))
+		return NULL;
+	for (i = 0; i < src_cnt; i++)
+		if (sba->max_pq_coefs <= raid6_gflog[scf[i]])
+			return NULL;
+
+	/* Figure-out P and Q destination addresses */
+	dst_count = 0;
+	if (!(flags & DMA_PREP_PQ_DISABLE_P))
+		dst_p = &dst[dst_count++];
+	if (!(flags & DMA_PREP_PQ_DISABLE_Q))
+		dst_q = &dst[dst_count++];
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request messages */
+	while (len) {
+		msg_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size;
+		cmds_count = sba_fillup_pq_msg(req, dmaf_continue(flags),
+				    &req->cmds[cmds_idx],
+				    &req->msgs[msgs_count],
+				    msg_offset, msg_len,
+				    dst_p, dst_q, scf, src, src_cnt);
+		msgs_count++;
+		cmds_idx += cmds_count;
+		msg_offset += msg_len;
+		len -= msg_len;
+	}
+	req->bmsg.type = BRCM_MESSAGE_BATCH;
+	req->bmsg.batch.msgs = &req->msgs[0];
+	req->bmsg.batch.msgs_queued = 0;
+	req->bmsg.batch.msgs_count = msgs_count;
+	req->bmsg.ctx = req;
+	req->bmsg.error = 0;
+	atomic_set(&req->msgs_pending_count, msgs_count);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return &req->tx;
+}
+
+/* ====== Mailbox callbacks ===== */
+
+static void sba_dma_tx_actions(struct sba_request *req)
+{
+	struct dma_async_tx_descriptor *tx = &req->tx;
+
+	WARN_ON(tx->cookie < 0);
+
+	if (tx->cookie > 0) {
+		dma_cookie_complete(tx);
+
+		/* call the callback (must not sleep or submit new
+		 * operations to this channel)
+		 */
+		if (tx->callback)
+			tx->callback(tx->callback_param);
+
+		dma_descriptor_unmap(tx);
+	}
+
+	/* run dependent operations */
+	dma_run_dependencies(tx);
+}
+
+static void sba_dma_clean(struct sba_request *req)
+{
+	/* If waiting for 'ack' then move to completed list */
+	if (!async_tx_test_ack(&req->tx))
+		sba_complete_request(req);
+	else
+		sba_free_request(req);
+}
+
+static void sba_receive_message(struct mbox_client *cl, void *msg)
+{
+	unsigned long flags;
+	struct brcm_message *m = msg;
+	struct sba_request *req = m->ctx, *req1;
+	struct sba_device *sba = req->sba;
+
+	/*  error count if message has error */
+	if (m->error < 0) {
+		dev_err(sba->dev, "%s got message with error %d",
+			dma_chan_name(&sba->dma_chan), m->error);
+	}
+
+	/* Wait for all messages to be completed */
+	if (atomic_dec_return(&req->msgs_pending_count))
+		return;
+
+	/* Update request */
+	if (req->state == SBA_REQUEST_STATE_ACTIVE) {
+		sba_dma_tx_actions(req);
+		sba_dma_clean(req);
+	} else {
+		sba_free_request(req);
+	}
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Re-check all completed request waiting for 'ack' */
+	list_for_each_entry_safe(req, req1, &sba->reqs_completed_list, node) {
+		spin_unlock_irqrestore(&sba->reqs_lock, flags);
+		sba_dma_tx_actions(req);
+		sba_dma_clean(req);
+		spin_lock_irqsave(&sba->reqs_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+
+	/* Try to submit pending request */
+	sba_issue_pending(&sba->dma_chan);
+}
+
+/* ====== Platform driver routines ===== */
+
+static int sba_prealloc_channel_resources(struct sba_device *sba)
+{
+	int i, j, p, ret = 0;
+	struct sba_request *req = NULL;
+
+	sba->resp_base = dma_alloc_coherent(sba->dma_dev.dev,
+					    sba->max_resp_pool_size,
+					    &sba->resp_dma_base, GFP_KERNEL);
+	if (!sba->resp_base)
+		return -ENOMEM;
+
+	sba->cmds_base = dma_alloc_coherent(sba->dma_dev.dev,
+					    sba->max_cmds_pool_size,
+					    &sba->cmds_dma_base, GFP_KERNEL);
+	if (!sba->cmds_base) {
+		ret = -ENOMEM;
+		goto fail_free_resp_pool;
+	}
+
+	spin_lock_init(&sba->reqs_lock);
+	sba->reqs_fence = false;
+	INIT_LIST_HEAD(&sba->reqs_alloc_list);
+	INIT_LIST_HEAD(&sba->reqs_pending_list);
+	INIT_LIST_HEAD(&sba->reqs_active_list);
+	INIT_LIST_HEAD(&sba->reqs_completed_list);
+	INIT_LIST_HEAD(&sba->reqs_aborted_list);
+	INIT_LIST_HEAD(&sba->reqs_free_list);
+
+	sba->reqs = devm_kcalloc(sba->dev, sba->max_req,
+				 sizeof(*req), GFP_KERNEL);
+	if (!sba->reqs) {
+		ret = -ENOMEM;
+		goto fail_free_cmds_pool;
+	}
+
+	for (i = 0, p = 0; i < sba->max_req; i++) {
+		req = &sba->reqs[i];
+		INIT_LIST_HEAD(&req->node);
+		req->sba = sba;
+		req->state = SBA_REQUEST_STATE_FREE;
+		req->fence = false;
+		req->resp = sba->resp_base + p;
+		req->resp_dma = sba->resp_dma_base + p;
+		p += sba->hw_resp_size;
+		req->cmds = devm_kcalloc(sba->dev, sba->max_cmd_per_req,
+					 sizeof(*req->cmds), GFP_KERNEL);
+		if (!req->cmds) {
+			ret = -ENOMEM;
+			goto fail_free_cmds_pool;
+		}
+		for (j = 0; j < sba->max_cmd_per_req; j++) {
+			req->cmds[j].cmd = 0;
+			req->cmds[j].cmd_dma = sba->cmds_base +
+				(i * sba->max_cmd_per_req + j) * sizeof(u64);
+			req->cmds[j].cmd_dma_addr = sba->cmds_dma_base +
+				(i * sba->max_cmd_per_req + j) * sizeof(u64);
+			req->cmds[j].flags = 0;
+		}
+		req->msgs = devm_kcalloc(sba->dev, sba->max_msg_per_req,
+					 sizeof(*req->msgs), GFP_KERNEL);
+		if (!req->msgs) {
+			ret = -ENOMEM;
+			goto fail_free_cmds_pool;
+		}
+		memset(&req->bmsg, 0, sizeof(req->bmsg));
+		atomic_set(&req->msgs_pending_count, 0);
+		dma_async_tx_descriptor_init(&req->tx, &sba->dma_chan);
+		req->tx.tx_submit = sba_tx_submit;
+		req->tx.phys = req->resp_dma;
+		list_add_tail(&req->node, &sba->reqs_free_list);
+	}
+
+	sba->reqs_free_count = sba->max_req;
+
+	return 0;
+
+fail_free_cmds_pool:
+	dma_free_coherent(sba->dma_dev.dev,
+			  sba->max_cmds_pool_size,
+			  sba->cmds_base, sba->cmds_dma_base);
+fail_free_resp_pool:
+	dma_free_coherent(sba->dma_dev.dev,
+			  sba->max_resp_pool_size,
+			  sba->resp_base, sba->resp_dma_base);
+	return ret;
+}
+
+static void sba_freeup_channel_resources(struct sba_device *sba)
+{
+	dmaengine_terminate_all(&sba->dma_chan);
+	dma_free_coherent(sba->dma_dev.dev, sba->max_cmds_pool_size,
+			  sba->cmds_base, sba->cmds_dma_base);
+	dma_free_coherent(sba->dma_dev.dev, sba->max_resp_pool_size,
+			  sba->resp_base, sba->resp_dma_base);
+	sba->resp_base = NULL;
+	sba->resp_dma_base = 0;
+}
+
+static int sba_async_register(struct sba_device *sba)
+{
+	int ret;
+	struct dma_device *dma_dev = &sba->dma_dev;
+
+	/* Initialize DMA channel cookie */
+	sba->dma_chan.device = dma_dev;
+	dma_cookie_init(&sba->dma_chan);
+
+	/* Initialize DMA device capability mask */
+	dma_cap_zero(dma_dev->cap_mask);
+	dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
+	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
+	dma_cap_set(DMA_PQ, dma_dev->cap_mask);
+
+	/*
+	 * Set mailbox channel device as the base device of
+	 * our dma_device because the actual memory accesses
+	 * will be done by mailbox controller
+	 */
+	dma_dev->dev = sba->mbox_dev;
+
+	/* Set base prep routines */
+	dma_dev->device_alloc_chan_resources = sba_alloc_chan_resources;
+	dma_dev->device_free_chan_resources = sba_free_chan_resources;
+	dma_dev->device_issue_pending = sba_issue_pending;
+	dma_dev->device_tx_status = sba_tx_status;
+
+	/* Set memcpy routines and capability */
+	if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask))
+		dma_dev->device_prep_dma_memcpy = sba_prep_dma_memcpy;
+
+	/* Set xor routines and capability */
+	if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
+		dma_dev->device_prep_dma_xor = sba_prep_dma_xor;
+		dma_dev->max_xor = sba->max_xor_srcs;
+	}
+
+	/* Set pq routines and capability */
+	if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
+		dma_dev->device_prep_dma_pq = sba_prep_dma_pq;
+		dma_set_maxpq(dma_dev, sba->max_pq_srcs, 0);
+		dma_set_maxpqcoef(dma_dev, sba->max_pq_coefs);
+	}
+
+	/* Initialize DMA device channel list */
+	INIT_LIST_HEAD(&dma_dev->channels);
+	list_add_tail(&sba->dma_chan.device_node, &dma_dev->channels);
+
+	/* Register with Linux async DMA framework*/
+	ret = dma_async_device_register(dma_dev);
+	if (ret) {
+		dev_err(sba->dev, "async device register error %d", ret);
+		return ret;
+	}
+
+	dev_info(sba->dev, "%s capabilities: %s%s%s\n",
+		 dma_chan_name(&sba->dma_chan),
+		 dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "memcpy " : "",
+		 dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
+		 dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "");
+
+	return 0;
+}
+
+static int sba_probe(struct platform_device *pdev)
+{
+	int i, ret = 0, mchans_count;
+	struct sba_device *sba;
+
+	/* Allocate main SBA struct */
+	sba = devm_kzalloc(&pdev->dev, sizeof(*sba), GFP_KERNEL);
+	if (!sba)
+		return -ENOMEM;
+
+	sba->dev = &pdev->dev;
+	platform_set_drvdata(pdev, sba);
+
+	/* Determine SBA version from DT compatible string */
+	if (of_device_is_compatible(sba->dev->of_node, "brcm,iproc-sba"))
+		sba->ver = SBA_VER_1;
+	else if (of_device_is_compatible(sba->dev->of_node,
+					 "brcm,iproc-sba-v2"))
+		sba->ver = SBA_VER_2;
+	else
+		return -ENODEV;
+
+	/* Derived Configuration parameters */
+	switch (sba->ver) {
+	case SBA_VER_1:
+		sba->max_req = 128;
+		sba->req_size = PAGE_SIZE;
+		sba->hw_buf_size = 4096;
+		sba->hw_resp_size = 8;
+		sba->max_pq_coefs = 6;
+		sba->max_pq_srcs = 6;
+		break;
+	case SBA_VER_2:
+		sba->max_req = 128;
+		sba->req_size = PAGE_SIZE;
+		sba->hw_buf_size = 4096;
+		sba->hw_resp_size = 8;
+		sba->max_pq_coefs = 30;
+		/*
+		 * We can support max_pq_srcs == max_pq_coefs because
+		 * we are limited by number of SBA commands that we can
+		 * fit in one message for underlying ring manager HW.
+		 */
+		sba->max_pq_srcs = 12;
+		break;
+	default:
+		return -EINVAL;
+	}
+	sba->max_msg_per_req = sba->req_size / sba->hw_buf_size;
+	if ((sba->max_msg_per_req * sba->hw_buf_size) < sba->req_size)
+		sba->max_msg_per_req++;
+	sba->max_cmd_per_msg = sba->max_pq_srcs + 3;
+	sba->max_cmd_per_req = sba->max_msg_per_req * sba->max_cmd_per_msg;
+	sba->max_xor_srcs = sba->max_cmd_per_msg - 1;
+	sba->max_resp_pool_size = sba->max_req * sba->hw_resp_size;
+	sba->max_cmds_pool_size = sba->max_req *
+				  sba->max_cmd_per_req * sizeof(u64);
+
+	/* Setup mailbox client */
+	sba->client.dev			= &pdev->dev;
+	sba->client.rx_callback		= sba_receive_message;
+	sba->client.tx_block		= false;
+	sba->client.knows_txdone	= false;
+	sba->client.tx_tout		= 0;
+
+	/* Number of channels equals number of mailbox channels */
+	ret = of_count_phandle_with_args(pdev->dev.of_node,
+					 "mboxes", "#mbox-cells");
+	if (ret <= 0)
+		return -ENODEV;
+	mchans_count = ret;
+	sba->mchans_count = 0;
+	atomic_set(&sba->mchans_current, 0);
+
+	/* Allocate mailbox channel array */
+	sba->mchans = devm_kcalloc(&pdev->dev, sba->mchans_count,
+				   sizeof(*sba->mchans), GFP_KERNEL);
+	if (!sba->mchans)
+		return -ENOMEM;
+
+	/* Request mailbox channels */
+	for (i = 0; i < mchans_count; i++) {
+		sba->mchans[i] = mbox_request_channel(&sba->client, i);
+		if (IS_ERR(sba->mchans[i])) {
+			ret = PTR_ERR(sba->mchans[i]);
+			goto fail_free_mchans;
+		}
+		sba->mchans_count++;
+	}
+
+	/* Find-out underlying mailbox device */
+	sba->mbox_dev = mbox_channel_device(sba->mchans[0]);
+	if (IS_ERR(sba->mbox_dev)) {
+		ret = PTR_ERR(sba->mbox_dev);
+		goto fail_free_mchans;
+	}
+
+	/* All mailbox channels should be of same ring manager device */
+	for (i = 1; i < mchans_count; i++) {
+		if (mbox_channel_device(sba->mchans[i]) != sba->mbox_dev) {
+			ret = -EINVAL;
+			goto fail_free_mchans;
+		}
+	}
+
+	/* Register DMA device with linux async framework */
+	ret = sba_async_register(sba);
+	if (ret)
+		goto fail_free_mchans;
+
+	/* Prealloc channel resource */
+	ret = sba_prealloc_channel_resources(sba);
+	if (ret)
+		goto fail_async_dev_unreg;
+
+	/* Print device info */
+	dev_info(sba->dev, "%s using SBAv%d and %d mailbox channels",
+		 dma_chan_name(&sba->dma_chan), sba->ver+1,
+		 sba->mchans_count);
+
+	return 0;
+
+fail_async_dev_unreg:
+	dma_async_device_unregister(&sba->dma_dev);
+fail_free_mchans:
+	for (i = 0; i < sba->mchans_count; i++)
+		mbox_free_channel(sba->mchans[i]);
+	return ret;
+}
+
+static int sba_remove(struct platform_device *pdev)
+{
+	int i;
+	struct sba_device *sba = platform_get_drvdata(pdev);
+
+	sba_freeup_channel_resources(sba);
+
+	dma_async_device_unregister(&sba->dma_dev);
+
+	for (i = 0; i < sba->mchans_count; i++)
+		mbox_free_channel(sba->mchans[i]);
+
+	return 0;
+}
+
+static const struct of_device_id sba_of_match[] = {
+	{ .compatible = "brcm,iproc-sba", },
+	{ .compatible = "brcm,iproc-sba-v2", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, sba_of_match);
+
+static struct platform_driver sba_driver = {
+	.probe = sba_probe,
+	.remove = sba_remove,
+	.driver = {
+		.name = "bcm-sba-raid",
+		.of_match_table = sba_of_match,
+	},
+};
+module_platform_driver(sba_driver);
+
+MODULE_DESCRIPTION("Broadcom SBA RAID driver");
+MODULE_AUTHOR("Anup Patel <anup.patel@broadcom.com>");
+MODULE_LICENSE("GPL v2");
-- 
2.7.4

^ permalink raw reply related

* [PATCH 4/6] async_tx: Fix DMA_PREP_FENCE usage in do_async_gen_syndrome()
From: Anup Patel @ 2017-02-02  4:47 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1486010836-25228-1-git-send-email-anup.patel@broadcom.com>

The DMA_PREP_FENCE is to be used when preparing Tx descriptor if output
of Tx descriptor is to be used by next/dependent Tx descriptor.

The DMA_PREP_FENSE will not be set correctly in do_async_gen_syndrome()
when calling dma->device_prep_dma_pq() under following conditions:
1. ASYNC_TX_FENCE not set in submit->flags
2. DMA_PREP_FENCE not set in dma_flags
3. src_cnt (= (disks - 2)) is greater than dma_maxpq(dma, dma_flags)

This patch fixes DMA_PREP_FENCE usage in do_async_gen_syndrome() taking
inspiration from do_async_xor() implementation.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
---
 crypto/async_tx/async_pq.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index 16c6526..947cf35 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -62,9 +62,6 @@ do_async_gen_syndrome(struct dma_chan *chan,
 	dma_addr_t dma_dest[2];
 	int src_off = 0;
 
-	if (submit->flags & ASYNC_TX_FENCE)
-		dma_flags |= DMA_PREP_FENCE;
-
 	while (src_cnt > 0) {
 		submit->flags = flags_orig;
 		pq_src_cnt = min(src_cnt, dma_maxpq(dma, dma_flags));
@@ -83,6 +80,8 @@ do_async_gen_syndrome(struct dma_chan *chan,
 			if (cb_fn_orig)
 				dma_flags |= DMA_PREP_INTERRUPT;
 		}
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
 
 		/* Drivers force forward progress in case they can not provide
 		 * a descriptor
-- 
2.7.4

^ permalink raw reply related

* [PATCH 3/6] async_tx: Handle DMA devices having support for fewer PQ coefficients
From: Anup Patel @ 2017-02-02  4:47 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1486010836-25228-1-git-send-email-anup.patel@broadcom.com>

The DMAENGINE framework assumes that if PQ offload is supported by a
DMA device then all 256 PQ coefficients are supported. This assumption
does not hold anymore because we now have BCM-SBA-RAID offload engine
which supports PQ offload with limited number of PQ coefficients.

This patch extends async_tx APIs to handle DMA devices with support
for fewer PQ coefficients.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
---
 crypto/async_tx/async_pq.c          |  3 +++
 crypto/async_tx/async_raid6_recov.c | 12 ++++++++++--
 include/linux/dmaengine.h           | 19 +++++++++++++++++++
 include/linux/raid/pq.h             |  3 +++
 4 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index f83de99..16c6526 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -187,6 +187,9 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 
 	BUG_ON(disks > 255 || !(P(blocks, disks) || Q(blocks, disks)));
 
+	if (device && dma_maxpqcoef(device) < src_cnt)
+		device = NULL;
+
 	if (device)
 		unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOWAIT);
 
diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c
index 8fab627..2916f95 100644
--- a/crypto/async_tx/async_raid6_recov.c
+++ b/crypto/async_tx/async_raid6_recov.c
@@ -352,6 +352,7 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 {
 	void *scribble = submit->scribble;
 	int non_zero_srcs, i;
+	struct dma_chan *chan = async_dma_find_channel(DMA_PQ);
 
 	BUG_ON(faila == failb);
 	if (failb < faila)
@@ -359,12 +360,15 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 
 	pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
 
+	if (chan && dma_maxpqcoef(chan->device) < RAID6_PQ_MAX_COEF)
+		chan = NULL;
+
 	/* if a dma resource is not available or a scribble buffer is not
 	 * available punt to the synchronous path.  In the 'dma not
 	 * available' case be sure to use the scribble buffer to
 	 * preserve the content of 'blocks' as the caller intended.
 	 */
-	if (!async_dma_find_channel(DMA_PQ) || !scribble) {
+	if (!chan || !scribble) {
 		void **ptrs = scribble ? scribble : (void **) blocks;
 
 		async_tx_quiesce(&submit->depend_tx);
@@ -432,15 +436,19 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila,
 	void *scribble = submit->scribble;
 	int good_srcs, good, i;
 	struct page *srcs[2];
+	struct dma_chan *chan = async_dma_find_channel(DMA_PQ);
 
 	pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
 
+	if (chan && dma_maxpqcoef(chan->device) < RAID6_PQ_MAX_COEF)
+		chan = NULL;
+
 	/* if a dma resource is not available or a scribble buffer is not
 	 * available punt to the synchronous path.  In the 'dma not
 	 * available' case be sure to use the scribble buffer to
 	 * preserve the content of 'blocks' as the caller intended.
 	 */
-	if (!async_dma_find_channel(DMA_PQ) || !scribble) {
+	if (!chan || !scribble) {
 		void **ptrs = scribble ? scribble : (void **) blocks;
 
 		async_tx_quiesce(&submit->depend_tx);
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index feee6ec..d938a8b 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -24,6 +24,7 @@
 #include <linux/scatterlist.h>
 #include <linux/bitmap.h>
 #include <linux/types.h>
+#include <linux/raid/pq.h>
 #include <asm/page.h>
 
 /**
@@ -668,6 +669,7 @@ struct dma_filter {
  * @cap_mask: one or more dma_capability flags
  * @max_xor: maximum number of xor sources, 0 if no capability
  * @max_pq: maximum number of PQ sources and PQ-continue capability
+ * @max_pqcoef: maximum number of PQ coefficients, 0 if all supported
  * @copy_align: alignment shift for memcpy operations
  * @xor_align: alignment shift for xor operations
  * @pq_align: alignment shift for pq operations
@@ -727,11 +729,13 @@ struct dma_device {
 	dma_cap_mask_t  cap_mask;
 	unsigned short max_xor;
 	unsigned short max_pq;
+	unsigned short max_pqcoef;
 	enum dmaengine_alignment copy_align;
 	enum dmaengine_alignment xor_align;
 	enum dmaengine_alignment pq_align;
 	enum dmaengine_alignment fill_align;
 	#define DMA_HAS_PQ_CONTINUE (1 << 15)
+	#define DMA_HAS_FEWER_PQ_COEF (1 << 15)
 
 	int dev_id;
 	struct device *dev;
@@ -1122,6 +1126,21 @@ static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags)
 	BUG();
 }
 
+static inline void dma_set_maxpqcoef(struct dma_device *dma,
+				     unsigned short max_pqcoef)
+{
+	if (max_pqcoef < RAID6_PQ_MAX_COEF) {
+		dma->max_pqcoef = max_pqcoef;
+		dma->max_pqcoef |= DMA_HAS_FEWER_PQ_COEF;
+	}
+}
+
+static inline unsigned short dma_maxpqcoef(struct dma_device *dma)
+{
+	return (dma->max_pqcoef & DMA_HAS_FEWER_PQ_COEF) ?
+		(dma->max_pqcoef & ~DMA_HAS_FEWER_PQ_COEF) : RAID6_PQ_MAX_COEF;
+}
+
 static inline size_t dmaengine_get_icg(bool inc, bool sgl, size_t icg,
 				      size_t dir_icg)
 {
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 30f9453..f3a04bb 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -15,6 +15,9 @@
 
 #ifdef __KERNEL__
 
+/* Max number of PQ coefficients */
+#define RAID6_PQ_MAX_COEF 256
+
 /* Set to 1 to use kernel-wide empty_zero_page */
 #define RAID6_USE_EMPTY_ZERO_PAGE 0
 #include <linux/blkdev.h>
-- 
2.7.4

^ permalink raw reply related

* [PATCH 2/6] lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position
From: Anup Patel @ 2017-02-02  4:47 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1486010836-25228-1-git-send-email-anup.patel@broadcom.com>

The raid6_gfexp table represents {2}^n values for 0 <= n < 256. The
Linux async_tx framework pass values from raid6_gfexp as coefficients
for each source to prep_dma_pq() callback of DMA channel with PQ
capability. This creates problem for RAID6 offload engines (such as
Broadcom SBA) which take disk position (i.e. log of {2}) instead of
multiplicative cofficients from raid6_gfexp table.

This patch adds raid6_gflog table having log-of-2 value for any given
x such that 0 <= x < 256. For any given disk coefficient x, the
corresponding disk position is given by raid6_gflog[x]. The RAID6
offload engine driver can use this newly added raid6_gflog table to
get disk position from multiplicative coefficient.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
---
 include/linux/raid/pq.h |  1 +
 lib/raid6/mktables.c    | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 4d57bba..30f9453 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -142,6 +142,7 @@ int raid6_select_algo(void);
 extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
 extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
 extern const u8 raid6_gfexp[256]      __attribute__((aligned(256)));
+extern const u8 raid6_gflog[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfinv[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfexi[256]      __attribute__((aligned(256)));
 
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index 39787db..e824d08 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -125,6 +125,26 @@ int main(int argc, char *argv[])
 	printf("EXPORT_SYMBOL(raid6_gfexp);\n");
 	printf("#endif\n");
 
+	/* Compute log-of-2 table */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gflog[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++) {
+			v = 255;
+			for (k = 0; k < 256; k++)
+				if (exptbl[k] == (i + j)) {
+					v = k;
+					break;
+				}
+			printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+		}
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gflog);\n");
+	printf("#endif\n");
+
 	/* Compute inverse table x^-1 == x^254 */
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
 	       "raid6_gfinv[256] =\n" "{\n");
-- 
2.7.4

^ permalink raw reply related

* [PATCH 1/6] mailbox: Add new API mbox_channel_device() for clients
From: Anup Patel @ 2017-02-02  4:47 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1486010836-25228-1-git-send-email-anup.patel@broadcom.com>

The remote processor can have DMAENGINE capabilities and client
can pass data to be processed via main memory. In such cases,
the client will require DMAble memory for remote processor.

This patch adds new API mbox_channel_device() which can be
used by clients to get struct device pointer of underlying
mailbox controller. This struct device pointer of mailbox
controller can be used by clients to allocate DMAble memory
for remote processor.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
---
 drivers/mailbox/mailbox.c      | 21 +++++++++++++++++++++
 include/linux/mailbox_client.h |  1 +
 2 files changed, 22 insertions(+)

diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c
index 4671f8a..d4380fc 100644
--- a/drivers/mailbox/mailbox.c
+++ b/drivers/mailbox/mailbox.c
@@ -281,6 +281,27 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg)
 EXPORT_SYMBOL_GPL(mbox_send_message);
 
 /**
+ * mbox_channel_device - Get device pointer of a mailbox channel.
+ * @chan: Mailbox channel assigned to this client.
+ *
+ * The remote processor can have DMAENGINE capabilities and client
+ * can pass data to be processed via main memory. In such cases,
+ * the client will require struct device pointer of the mailbox
+ * channel to map/unmap/allocate/free DMAble memory.
+ *
+ * Return: Pointer to the struct device of mailbox channel.
+ *	   ERR_PTR on failure.
+ */
+struct device *mbox_channel_device(struct mbox_chan *chan)
+{
+	if (!chan || !chan->cl)
+		return ERR_PTR(-EINVAL);
+
+	return chan->mbox->dev;
+}
+EXPORT_SYMBOL_GPL(mbox_channel_device);
+
+/**
  * mbox_request_channel - Request a mailbox channel.
  * @cl: Identity of the client requesting the channel.
  * @index: Index of mailbox specifier in 'mboxes' property.
diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h
index 4434871..3daffad 100644
--- a/include/linux/mailbox_client.h
+++ b/include/linux/mailbox_client.h
@@ -40,6 +40,7 @@ struct mbox_client {
 	void (*tx_done)(struct mbox_client *cl, void *mssg, int r);
 };
 
+struct device *mbox_channel_device(struct mbox_chan *chan);
 struct mbox_chan *mbox_request_channel_byname(struct mbox_client *cl,
 					      const char *name);
 struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index);
-- 
2.7.4

^ permalink raw reply related

* [PATCH 0/6] Broadcom SBA RAID support
From: Anup Patel @ 2017-02-02  4:47 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list-dY08KVG/lbpWk0Htik3J/w,
	dmaengine-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA,
	linux-raid-u79uwXL29TY76Z2rM5mHXA, Anup Patel

The Broadcom SBA RAID is a stream-based device which provides
RAID5/6 offload.

It requires a SoC specific ring manager (such as Broadcom FlexRM
ring manager) to provide ring-based programming interface. Due to
this, the Broadcom SBA RAID driver (mailbox client) implements
DMA device having one DMA channel using a set of mailbox channels
provided by Broadcom SoC specific ring manager driver (mailbox
controller).

Important limitations of Broadcom SBA RAID hardware are:
1. Requires disk position instead of disk coefficient 
2. Supports only 30 PQ disk coefficients

To address limitation #1, we have added raid_gflog table which
will help driver convert disk coefficient to disk position. To
address limitation #2, we have extended Linux Async Tx APIs to
check for available PQ coefficients before doing PQ offload.

This patchset is based on Linux-4.10-rc6 and depends on patchset
"[PATCH v4 0/2] Broadcom FlexRM ring manager support"

It is also available at sba-raid-v1 branch of
https://github.com/Broadcom/arm64-linux.git

Anup Patel (6):
  mailbox: Add new API mbox_channel_device() for clients
  lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position
  async_tx: Handle DMA devices having support for fewer PQ coefficients
  async_tx: Fix DMA_PREP_FENCE usage in do_async_gen_syndrome()
  dmaengine: Add Broadcom SBA RAID driver
  dt-bindings: Add DT bindings document for Broadcom SBA RAID driver

 .../devicetree/bindings/dma/brcm,iproc-sba.txt     |   29 +
 crypto/async_tx/async_pq.c                         |    8 +-
 crypto/async_tx/async_raid6_recov.c                |   12 +-
 drivers/dma/Kconfig                                |   13 +
 drivers/dma/Makefile                               |    1 +
 drivers/dma/bcm-sba-raid.c                         | 1309 ++++++++++++++++++++
 drivers/mailbox/mailbox.c                          |   21 +
 include/linux/dmaengine.h                          |   19 +
 include/linux/mailbox_client.h                     |    1 +
 include/linux/raid/pq.h                            |    4 +
 lib/raid6/mktables.c                               |   20 +
 11 files changed, 1432 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt
 create mode 100644 drivers/dma/bcm-sba-raid.c

-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] MD: add doc for raid5-cache
From: NeilBrown @ 2017-02-02  0:37 UTC (permalink / raw)
  To: Shaohua Li, linux-raid; +Cc: antlists, philip, songliubraving
In-Reply-To: <25051bd79d94b45c7be24ce466a8b6eb2fba66c0.1485890144.git.shli@fb.com>

[-- Attachment #1: Type: text/plain, Size: 9347 bytes --]

On Tue, Jan 31 2017, Shaohua Li wrote:

> I'm starting document of the raid5-cache feature. Please let me know
> what else we should put into the document. Of course, comments are
> welcome!
>
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  Documentation/md/raid5-cache.txt | 99 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 99 insertions(+)
>  create mode 100644 Documentation/md/raid5-cache.txt
>
> diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt
> new file mode 100644
> index 0000000..17a6279
> --- /dev/null
> +++ b/Documentation/md/raid5-cache.txt
> @@ -0,0 +1,99 @@
> +RAID5 cache
> +
> +Raid 4/5/6 could include an extra disk for data cache. The cache could be
> +in write-through or write-back mode. mdadm has a new option

"can" fits better than "could".  "could" suggests past-tense: something
that was true before but might have changed.

> +'--write-journal' to create array with cache. By default (raid array
> +starts), the cache is in write-through mode. User can switch it to
> +write-back mode by:

I think "The user" or "A user" is better than just "User".


> +
> +echo "write-back" > /sys/block/md0/md/journal_mode
> +
> +And switch it back to write-through mode by:
> +
> +echo "write-through" > /sys/block/md0/md/journal_mode
> +
> +In both modes, all writes to the array will hit cache disk first. This means
> +the cache disk must be fast and sustainable (if you use a SSD as the cache).

"(if you use a SSD as the cache)"

Are you trying to say "You should normally use an SSD or similar for the
cache", or is this really a condition: "If you use an SSD as the cache,
then ...."??


> +
> +-------------------------------------
> +write-through mode:
> +
> +This mode mainly fixes 'write hole' issue. For RAID 4/5/6 array, an
                         ^the


> +unclean shutdown could cause data in some stripes is not in consistent
                    can                             to not be in a consistent


> +state, eg, data and parity don't match. The reason is a stripe write
                                                    is that a
> +involves several raid disks and it's possible writes don't hit all raid
> +disks yet before the unclean shutdown. After an unclean shutdown, MD try
                                                                        tries
> +to 'resync' the array to put all stripes back into consistent state. In
> +the resync, any disk failure will cause real data corruption. This problem
                               could cause
The write hole often won't cause corruption, but it is a real
possibility. So "it could ..."
                                                    
                               
> +is called 'write hole'. So the 'write hole' issue occurs between unclean
> +shutdown and 'resync'. This window isn't big.

I don't think this is the best way to explain the write hole.
If the array is already degraded, there is no window at all.  A crash
of a degraded array exposes you to the chance of data corruption due to
the write hole.

>                                                 On the other hand, if one
> +disk fails, other disks could fail soon, which happens sometimes if the
> +disks are from the same vendor and manufactured in the same time. This
> +will increase the chance of 'write whole', but overall the chance isn't
> +big, so don't panic even not using cache disk.

I don't think you really need to talk about the "two drive failure" case
at all - it isn't relevant.
Just focus on "system crash while array is degraded", and mention that
if the array becomes degraded before resync completes, the write hole
still applies.

> +
> +The write-through cache will cache all data in cache disk first. Until the
> +data hits into the cache disk, the data is flushed into RAID disks. The

Drop "into".  Just "data hits the cache disk"..
Also use "After", not "Until".
I wouldn't say "hit" either - it is colloquial.

 After the data is safe on the cache disk, the data will be flushed onto
 the RAID disks.

This implies that the cache disk is not one of the RAID disks.  I do
prefer to think if it that way, but your opening statement suggest that a
RAID5 can "include" another disk for the cache.  That suggests that the
cache disk is part of the RAID... so it would be a RAID disk.

I think it is important to get this terminology right to avoid
confusion.  An array can have several RAID disks which can be
supplemented with a cache disk. (That is how you talk about them later).

> +two-step write will guarantee MD can recover correct data after unclean
> +shutdown even with disk failure. Thus the cache can close the 'write
> +hole'.
> +
> +In write-through mode, MD reports IO finish to upper layer (usually

"IO finished", or "IO completion". (same changed needed twice more below)

> +filesystems) till the data hits RAID disks, so cache disk failure doesn't

"after", not "till". (and "is safe on", rather than "hits").

> +cause data lost. Of course cache disk failure means the array is exposed

"cause data loss". or "cause data to be lost".

> +into 'write hole' again.

"expose to", not "exposed into".

> +
> +--------------------------------------
> +write-back mode:
> +
> +write-back mode fixes the 'write hole' issue too, since all write data is
> +cached in cache disk. But the main goal of 'write-back' cache is to speed up
> +write. If a write crosses all raid disks of a stripe, we call it full-stripe
> +write. For non-full-stripe write, MD must do a read-modify-write. The extra
> +read (for data in other disks) and write (for parity) introduce a lot of

The parity write is not an extra write.  The only extras are reads.
The main cause of slowdown is the need to wait for the reads before the
parity calculation can happen.  i.e. the fact that the reads are
synchronous is important.
  For non-full-stripe writes, MD must read old data before the new
  parity can be calculated.  These synchronous reads hurt write
  throughput.

maybe.


> +overhead. Some writes which are sequential but not dispatched in the same time
> +will suffer from this overhead too. write-back cache will aggregate the data
> +and flush the data to raid disks till the data becomes a full stripe write.

... flush the data to the RAID disks only after the data becomes...

> +This will completely avoid the overhead, so it's very helpful for some
> +workloads. A typical workload which does sequential write and follows fsync is
> +an example.

 "which does sequential writes followed by fsync() is an example".
 
> +
> +In write-back mode, MD reports IO finish to upper layer (usually filesystems)
> +right after the data hit cache disk. The data is flushed to raid disks later
> +after specific conditions met. So cache disk failure will cause data lost.
> +
> +--------------------------------------
> +The implementation:
> +
> +The write-through and write-back cache use the same disk format. The cache disk
> +is organized as a simple write log. The log consists of 'meta data' and 'data'
> +pairs. The meta data describes the data. It also includes checksum and sequence
> +ID for recovery identification. Data could be IO data and parity data. Data is
> +checksumed too. The checksum is stored in the meta data ahead of the data. The
> +checksum is an optimization because MD can write meta and data freely without
> +worry about the order. MD superblock has a field pointed to the valid meta data
> +of log head.
> +
> +The log implementation is pretty straightforward. The difficult part is the
> +order MD write data to cache disk and raid disks. Specifically, in

 "the order in which MD writes data to the cache disk and the RAID disks".

> +write-through mode, MD calculates parity for IO data, writes both IO data and
> +parity to the log, write the data and parity to raid disks after the data and
                      writes

> +parity is settled down in log and finally the IO is finished. Read just reads
> +from raid disks as usual.
> +
> +In write-back mode, MD writes IO data to the log and reports IO finish. The
> +data is also fully cached in memory at that time, which means read must query
> +memory cache. If some conditions are met, MD will flush the data to raid disks.
> +MD will calculate parity for the data and write parity into the log. After this
> +is finished, MD will write both data and parity into raid disks, then MD can
> +release the memory cache. The flush conditions could be stripe becomes a full
> +stripe write, free cache disk space is low or in-kernel memory cache space is
> +low.
> +
> +After an unclean shutdown, MD does recovery. MD reads all meta data and data
> +from the log. The sequence ID and checksum will help us detect corrupted meta
> +data and data. If MD finds a stripe with data and valid parities (1 parity for
> +raid4/5 and 2 for raid6), MD will write the data and parities to raid disks. If
> +parities are incompleted, they are discarded. If part of data is corrupted,
> +they are discarded too. MD then loads valid data and writes them to raid disks
> +in normal way.

Good work,
thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: drives failed during reshape, array won't even force-assemble
From: Thomas Warntjen @ 2017-02-01 18:55 UTC (permalink / raw)
  To: Phil Turmel, linux-raid
In-Reply-To: <fb9787fd-4047-cc53-a926-f9715d7faee6@turmel.org>

Holy cow, I poked it with a stick and I think I did it!

As I've wrote before after a reboot the array was there but didn't 
start, and I've noticed the same thing happend with the overlay files 
right after I created them:

# /cat /proc/mdstat
Personalities : [raid1] [raid6] [raid5] [raid4] [linear] [multipath] 
[raid0] [raid10]
md1 : inactive dm-0[8](S) dm-1[6](S) dm-7[4](S) dm-6[2](S) dm-5[0](S) 
dm-3[1](S) dm-4[5](S) dm-2[3](S)
       23429580800 blocks super 0.91

# mdadm --detail /dev/md1
/dev/md1:
         Version : 0.91
      Raid Level : raid0
   Total Devices : 8
Preferred Minor : 0
     Persistence : Superblock is persistent

           State : inactive

       New Level : raid6
      New Layout : left-symmetric
   New Chunksize : 64K

            UUID : 7a58ed4f:baf1934e:a2963c6e:a542ed71
          Events : 0.12370980

     Number   Major   Minor   RaidDevice

        -     252        0        -        /dev/dm-0
        -     252        1        -        /dev/dm-1
        -     252        2        -        /dev/dm-2
        -     252        3        -        /dev/dm-3
        -     252        4        -        /dev/dm-4
        -     252        5        -        /dev/dm-5
        -     252        6        -        /dev/dm-6
        -     252        7        -        /dev/dm-7

	
Now I tried

# mdadm --run /dev/md1
mdadm: failed to start array /dev/md1: Input/output error


and something interesting happend:

# mdadm --detail /dev/md1
/dev/md1:
         Version : 0.91
   Creation Time : Thu Sep  1 22:23:00 2011
      Raid Level : raid6
   Used Dev Size : 18446744073709551615
    Raid Devices : 7
   Total Devices : 6
Preferred Minor : 1
     Persistence : Superblock is persistent

     Update Time : Tue Jan 24 21:10:19 2017
           State : active, FAILED, Not Started
  Active Devices : 4
Working Devices : 6
  Failed Devices : 0
   Spare Devices : 2

          Layout : left-symmetric-6
      Chunk Size : 64K

      New Layout : left-symmetric

            UUID : 7a58ed4f:baf1934e:a2963c6e:a542ed71
          Events : 0.12370980

     Number   Major   Minor   RaidDevice State
        0     252        5        0      active sync   /dev/dm-5
        1     252        3        1      active sync   /dev/dm-3
        2     252        6        2      active sync   /dev/dm-6
        3     252        2        3      active sync   /dev/dm-2
        -       0        0        4      removed
        -       0        0        5      removed
        6     252        1        6      spare rebuilding   /dev/dm-1

        8     252        0        -      spare   /dev/dm-0
	
	
let's try to add the missing drives:

# mdadm --manage /dev/md1 --add /dev/mapper/sdc3
mdadm: re-added /dev/mapper/sdc3
	
# mdadm --manage /dev/md1 --add /dev/mapper/sdd3
mdadm: re-added /dev/mapper/sdd3
	
# mdadm --detail /dev/md1
detail /dev/md1
/dev/md1:
         Version : 0.91
   Creation Time : Thu Sep  1 22:23:00 2011
      Raid Level : raid6
   Used Dev Size : 18446744073709551615
    Raid Devices : 7
   Total Devices : 8
Preferred Minor : 1
     Persistence : Superblock is persistent

     Update Time : Tue Jan 24 21:10:19 2017
           State : active, degraded, Not Started
  Active Devices : 6
Working Devices : 8
  Failed Devices : 0
   Spare Devices : 2

          Layout : left-symmetric-6
      Chunk Size : 64K

      New Layout : left-symmetric

            UUID : 7a58ed4f:baf1934e:a2963c6e:a542ed71
          Events : 0.12370980

     Number   Major   Minor   RaidDevice State
        0     252        5        0      active sync   /dev/dm-5
        1     252        3        1      active sync   /dev/dm-3
        2     252        6        2      active sync   /dev/dm-6
        3     252        2        3      active sync   /dev/dm-2
        4     252        7        4      active sync   /dev/dm-7
        5     252        4        5      active sync   /dev/dm-4
        6     252        1        6      spare rebuilding   /dev/dm-1

        8     252        0        -      spare   /dev/dm-0
	

Not bad at all! But it still won't start, even with --run.  Maybe if I 
wait long enough for the rebuild to finish? But I still don't see it in 
/proc/mdstat and I don't want to wait for several days to see if it 
really rebuilds in the background.

So I poke it with a stick...

# echo "clean" > /sys/block/md1/md/array_state
-bash: echo: write error: Invalid argument	

nope

# echo "active" > /sys/block/md1/md/array_state
-bash: echo: write error: Invalid argument	

nope

# echo "readonly" > /sys/block/md1/md/array_state

wait, no error?

# cat /proc/mdstat
Personalities : [raid1] [raid6] [raid5] [raid4] [linear] [multipath] 
[raid0] [raid10]
md1 : active (read-only) raid6 dm-0[5] dm-2[4] dm-7[6] dm-6[3] dm-4[0] 
dm-1[2] dm-5[1] dm-3[8](S)
       14643488000 blocks super 0.91 level 6, 64k chunk, algorithm 18 
[7/6] [UUUUUU_]
       resync=PENDING
       bitmap: 175/175 pages [700KB], 8192KB chunk

# mdadm --detail /dev/md1
/dev/md1:
         Version : 0.91
   Creation Time : Thu Sep  1 22:23:00 2011
      Raid Level : raid6
      Array Size : 14643488000 (13965.12 GiB 14994.93 GB)
   Used Dev Size : 18446744073709551615
    Raid Devices : 7
   Total Devices : 8
Preferred Minor : 1
     Persistence : Superblock is persistent

   Intent Bitmap : Internal

     Update Time : Tue Jan 24 21:10:19 2017
           State : clean, degraded, resyncing (PENDING)
  Active Devices : 6
Working Devices : 8
  Failed Devices : 0
   Spare Devices : 2

          Layout : left-symmetric-6
      Chunk Size : 64K

      New Layout : left-symmetric

            UUID : 7a58ed4f:baf1934e:a2963c6e:a542ed71
          Events : 0.12370980

     Number   Major   Minor   RaidDevice State
        0     252        4        0      active sync   /dev/dm-4
        1     252        5        1      active sync   /dev/dm-5
        2     252        1        2      active sync   /dev/dm-1
        3     252        6        3      active sync   /dev/dm-6
        4     252        2        4      active sync   /dev/dm-2
        5     252        0        5      active sync   /dev/dm-0
        6     252        7        6      spare rebuilding   /dev/dm-7

        8     252        3        -      spare   /dev/dm-3


still no error
	
# echo "clean" > /sys/block/md1/md/array_state

# cat /proc/mdstat
Personalities : [raid1] [raid6] [raid5] [raid4] [linear] [multipath] 
[raid0] [raid10]
md1 : active raid6 raid6 dm-0[5] dm-2[4] dm-7[6] dm-6[3] dm-4[0] dm-1[2] 
dm-5[1] dm-3[8](S)
       14643488000 blocks super 0.91 level 6, 64k chunk, algorithm 18 
[7/6] [UUUUUU_]
       [==============>......]  reshape = 74.6% (2185464448/2928697600) 
finish=7719.3min speed=1603K/sec
       bitmap: 175/175 pages [700KB], 8192KB chunk
       14643488000 blocks super 0.91 level 6, 64k chunk, algorithm 18 
[7/6] [UUUUUU_]
       resync=PENDING
       bitmap: 175/175 pages [700KB], 8192KB chunk

# mdadm --detail /dev/md1
/dev/md1:
         Version : 0.91
   Creation Time : Thu Sep  1 22:23:00 2011
      Raid Level : raid6
      Array Size : 14643488000 (13965.12 GiB 14994.93 GB)
   Used Dev Size : 18446744073709551615
    Raid Devices : 7
   Total Devices : 8
Preferred Minor : 1
     Persistence : Superblock is persistent

   Intent Bitmap : Internal

     Update Time : Tue Jan 31 20:09:30 2017
           State : clean, degraded, reshaping
  Active Devices : 6
Working Devices : 8
  Failed Devices : 0
   Spare Devices : 2

          Layout : left-symmetric-6
      Chunk Size : 64K

  Reshape Status : 74% complete
      New Layout : left-symmetric

            UUID : 7a58ed4f:baf1934e:a2963c6e:a542ed71
          Events : 0.12370982

     Number   Major   Minor   RaidDevice State
        0     252        4        0      active sync   /dev/dm-4
        1     252        5        1      active sync   /dev/dm-5
        2     252        1        2      active sync   /dev/dm-1
        3     252        6        3      active sync   /dev/dm-6
        4     252        2        4      active sync   /dev/dm-2
        5     252        0        5      active sync   /dev/dm-0
        6     252        7        6      spare rebuilding   /dev/dm-7

        8     252        3        -      spare   /dev/dm-3

	
Looks good! fsck shows no errors, nothing in lost+found, so I've stopped 
the reshape (so the overlays won't fill the disk), mounted it readonly 
and backed up the more important data. That finished today, so I 
rebooted and did it for real. Reshape is finished, resync at 24% (6 
hours to go), fsck still looks good. w00t!

	

^ permalink raw reply

* Re: [PATCH 1/2] md: add bad block flag to disk state
From: Shaohua Li @ 2017-02-01 18:12 UTC (permalink / raw)
  To: Tomasz Majchrzak; +Cc: linux-raid, jes.sorensen
In-Reply-To: <20170201095352.GA24920@proton.igk.intel.com>

On Wed, Feb 01, 2017 at 10:53:52AM +0100, Tomasz Majchrzak wrote:
> On Mon, Jan 30, 2017 at 03:33:41PM -0800, Shaohua Li wrote:
> > On Tue, Jan 24, 2017 at 01:03:38PM +0100, Tomasz Majchrzak wrote:
> > > Add a new flag to report that bad blocks are present on a disk. It will
> > > allow userspace to notify the user of the problem.
> > > 
> > > Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
> > > ---
> > >  drivers/md/md.c                | 2 ++
> > >  include/uapi/linux/raid/md_p.h | 1 +
> > >  2 files changed, 3 insertions(+)
> > > 
> > > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > > index 0abb147..1a807ec 100644
> > > --- a/drivers/md/md.c
> > > +++ b/drivers/md/md.c
> > > @@ -6034,6 +6034,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
> > >  			info.state |= (1<<MD_DISK_WRITEMOSTLY);
> > >  		if (test_bit(FailFast, &rdev->flags))
> > >  			info.state |= (1<<MD_DISK_FAILFAST);
> > > +		if (rdev->badblocks.count)
> > > +			info.state |= (1<<MD_DISK_BB_PRESENT);
> > 
> > Userspace can find if a disk has badblocks by reading the bad_blocks sysfs
> > file. Why adds another interface?
> > 
> > Thanks,
> > Shaohua
> 
> Yes, indeed, it can. I have chosen to do it this way to keep it consistent
> with mdadm which uses GET_DISK_INFO ioctl to get disk information. All data
> provided in this ioctl is also available in sysfs file (rdev state), however
> ioctl is still used (legacy). The same applies for details subcommand of
> mdadm. To answer your question - yes, I could avoid new flag but it would
> make mdadm side of my improvement much more complicated.

I intended to avoid adding new user interface if possible. Not sure about this
case though. How complicated in the mdadm side if we use the bad_block sysfs
file?

Jes, how do you think from the mdadm side?

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH] MD: add doc for raid5-cache
From: Song Liu @ 2017-02-01 17:54 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-raid@vger.kernel.org, antlists@youngman.org.uk,
	philip@turmel.org, neilb@suse.com
In-Reply-To: <25051bd79d94b45c7be24ce466a8b6eb2fba66c0.1485890144.git.shli@fb.com>


> On Jan 31, 2017, at 11:18 AM, Shaohua Li <shli@fb.com> wrote:
> 
> I'm starting document of the raid5-cache feature. Please let me know
> what else we should put into the document. Of course, comments are
> welcome!
> 
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
> Documentation/md/raid5-cache.txt | 99 ++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 99 insertions(+)
> create mode 100644 Documentation/md/raid5-cache.txt
> 
> diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt
> new file mode 100644
> index 0000000..17a6279
> --- /dev/null
> +++ b/Documentation/md/raid5-cache.txt
> @@ -0,0 +1,99 @@
> +RAID5 cache
> +
> +Raid 4/5/6 could include an extra disk for data cache. The cache could be
> +in write-through or write-back mode. mdadm has a new option
> +'--write-journal' to create array with cache. By default (raid array
> +starts), the cache is in write-through mode. User can switch it to
> +write-back mode by:
> +
> +echo "write-back" > /sys/block/md0/md/journal_mode
> +
> +And switch it back to write-through mode by:
> +
> +echo "write-through" > /sys/block/md0/md/journal_mode
> +
> +In both modes, all writes to the array will hit cache disk first. This means
> +the cache disk must be fast and sustainable (if you use a SSD as the cache).
> +
> +-------------------------------------
> +write-through mode:
> +
> +This mode mainly fixes 'write hole' issue. For RAID 4/5/6 array, an
> +unclean shutdown could cause data in some stripes is not in consistent
> +state, eg, data and parity don't match. The reason is a stripe write
> +involves several raid disks and it's possible writes don't hit all raid
> +disks yet before the unclean shutdown. After an unclean shutdown, MD try
> +to 'resync' the array to put all stripes back into consistent state. In
> +the resync, any disk failure will cause real data corruption. This problem
> +is called 'write hole'. So the 'write hole' issue occurs between unclean
> +shutdown and 'resync'. This window isn't big. On the other hand, if one
> +disk fails, other disks could fail soon, which happens sometimes if the
> +disks are from the same vendor and manufactured in the same time. This
> +will increase the chance of 'write whole', but overall the chance isn't
> +big, so don't panic even not using cache disk.
> +
> +The write-through cache will cache all data in cache disk first. Until the
> +data hits into the cache disk, the data is flushed into RAID disks. The
> +two-step write will guarantee MD can recover correct data after unclean
> +shutdown even with disk failure. Thus the cache can close the 'write
> +hole'.
> +
> +In write-through mode, MD reports IO finish to upper layer (usually
> +filesystems) till the data hits RAID disks, so cache disk failure doesn't
> +cause data lost. Of course cache disk failure means the array is exposed
> +into 'write hole' again.
> +
> +--------------------------------------
> +write-back mode:
> +
> +write-back mode fixes the 'write hole' issue too, since all write data is
> +cached in cache disk. But the main goal of 'write-back' cache is to speed up
> +write. If a write crosses all raid disks of a stripe, we call it full-stripe
> +write. For non-full-stripe write, MD must do a read-modify-write. The extra
> +read (for data in other disks) and write (for parity) introduce a lot of
> +overhead. Some writes which are sequential but not dispatched in the same time
> +will suffer from this overhead too. write-back cache will aggregate the data
> +and flush the data to raid disks till the data becomes a full stripe write.
> +This will completely avoid the overhead, so it's very helpful for some
> +workloads. A typical workload which does sequential write and follows fsync is
> +an example.
> +
> +In write-back mode, MD reports IO finish to upper layer (usually filesystems)
> +right after the data hit cache disk. The data is flushed to raid disks later
> +after specific conditions met. So cache disk failure will cause data lost.
> +
> +--------------------------------------
> +The implementation:
> +
> +The write-through and write-back cache use the same disk format. The cache disk
> +is organized as a simple write log. The log consists of 'meta data' and 'data'
> +pairs. The meta data describes the data. It also includes checksum and sequence
> +ID for recovery identification. Data could be IO data and parity data. Data is
> +checksumed too. The checksum is stored in the meta data ahead of the data. The
> +checksum is an optimization because MD can write meta and data freely without
> +worry about the order. MD superblock has a field pointed to the valid meta data
> +of log head.
> +
> +The log implementation is pretty straightforward. The difficult part is the
> +order MD write data to cache disk and raid disks. Specifically, in
> +write-through mode, MD calculates parity for IO data, writes both IO data and
> +parity to the log, write the data and parity to raid disks after the data and
> +parity is settled down in log and finally the IO is finished. Read just reads
> +from raid disks as usual.
> +
> +In write-back mode, MD writes IO data to the log and reports IO finish. The
> +data is also fully cached in memory at that time, which means read must query
> +memory cache. If some conditions are met, MD will flush the data to raid disks.
> +MD will calculate parity for the data and write parity into the log. After this
> +is finished, MD will write both data and parity into raid disks, then MD can
> +release the memory cache. The flush conditions could be stripe becomes a full
> +stripe write, free cache disk space is low or in-kernel memory cache space is
> +low.
> +
> +After an unclean shutdown, MD does recovery. MD reads all meta data and data
> +from the log. The sequence ID and checksum will help us detect corrupted meta
> +data and data. If MD finds a stripe with data and valid parities (1 parity for
> +raid4/5 and 2 for raid6), MD will write the data and parities to raid disks. If
> +parities are incompleted, they are discarded. If part of data is corrupted,
> +they are discarded too. MD then loads valid data and writes them to raid disks
> +in normal way.
> -- 
> 2.9.3
> 

Looks great!

Reviewed-by: Song Liu <songliubraving@fb.com>



^ permalink raw reply

* Re: [PATCH 1/2] md: add bad block flag to disk state
From: Tomasz Majchrzak @ 2017-02-01  9:53 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, jes.sorensen
In-Reply-To: <20170130233341.l5eiars5lbgowrht@kernel.org>

On Mon, Jan 30, 2017 at 03:33:41PM -0800, Shaohua Li wrote:
> On Tue, Jan 24, 2017 at 01:03:38PM +0100, Tomasz Majchrzak wrote:
> > Add a new flag to report that bad blocks are present on a disk. It will
> > allow userspace to notify the user of the problem.
> > 
> > Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
> > ---
> >  drivers/md/md.c                | 2 ++
> >  include/uapi/linux/raid/md_p.h | 1 +
> >  2 files changed, 3 insertions(+)
> > 
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 0abb147..1a807ec 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -6034,6 +6034,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
> >  			info.state |= (1<<MD_DISK_WRITEMOSTLY);
> >  		if (test_bit(FailFast, &rdev->flags))
> >  			info.state |= (1<<MD_DISK_FAILFAST);
> > +		if (rdev->badblocks.count)
> > +			info.state |= (1<<MD_DISK_BB_PRESENT);
> 
> Userspace can find if a disk has badblocks by reading the bad_blocks sysfs
> file. Why adds another interface?
> 
> Thanks,
> Shaohua

Yes, indeed, it can. I have chosen to do it this way to keep it consistent
with mdadm which uses GET_DISK_INFO ioctl to get disk information. All data
provided in this ioctl is also available in sysfs file (rdev state), however
ioctl is still used (legacy). The same applies for details subcommand of
mdadm. To answer your question - yes, I could avoid new flag but it would
make mdadm side of my improvement much more complicated.

Tomek

> >  	} else {
> >  		info.major = info.minor = 0;
> >  		info.raid_disk = -1;
> > diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
> > index 9930f3e..b151e93 100644
> > --- a/include/uapi/linux/raid/md_p.h
> > +++ b/include/uapi/linux/raid/md_p.h
> > @@ -93,6 +93,7 @@
> >  				   * read requests will only be sent here in
> >  				   * dire need
> >  				   */
> > +#define MD_DISK_BB_PRESENT	11 /* disk has bad blocks */
> >  #define MD_DISK_JOURNAL		18 /* disk is used as the write journal in RAID-5/6 */
> >  
> >  #define MD_DISK_ROLE_SPARE	0xffff
> > -- 
> > 1.8.3.1
> > 

^ permalink raw reply

* Bitte kontaktieren Sie mich für weitere Details!
From: Miss Marbell @ 2017-02-01  8:21 UTC (permalink / raw)


Sehr geehrte Damen und Herren,

Ich brauche Ihre Unterstützung in Ihrem Land zu verlagern und zu investieren.Ich bitte Sie um Hilfe, weil ich nicht das Wissen über
Geschäft und die Regeln, die Ihr Land für eine sichere Investition führen.

Werden Sie versprechen, mit mir aufrichtig zu sein?

Bitte kontaktieren Sie mich für weitere Details!

Mit freundlichen Grüßen,
Fräulein Marbell.

^ permalink raw reply

* Re: [dm-devel] split scsi passthrough fields out of struct request V2
From: Jens Axboe @ 2017-01-31 21:58 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-block@vger.kernel.org, linux-raid@vger.kernel.org,
	snitzer@redhat.com, hch@lst.de, linux-scsi@vger.kernel.org,
	axboe@fb.com, j-nomura@ce.jp.nec.com, dm-devel@redhat.com
In-Reply-To: <1485899692.3113.9.camel@sandisk.com>

On 01/31/2017 01:55 PM, Bart Van Assche wrote:
> On Tue, 2017-01-31 at 13:34 -0800, Bart Van Assche wrote:
>> On Mon, 2017-01-30 at 17:38 -0800, Jens Axboe wrote:
>>> That's a known bug in mainline. Pull it into 4.10-rc6,
>>> or use my for-next where everything is already merged. 
>>
>> Hello Jens,
>>
>> With your for-next branch (commit c2e60b3a2602) I haven't hit any block
>> layer crashes so far. The only issue I encountered that is new is a
>> memory leak triggered by the SG-IO code. These memory leak reports
>> started to appear after I started testing the mq-deadline scheduler.
>> kmemleak reported the following call stack multiple times after my tests
>> had finished:
>>
>> unreferenced object 0xffff88041119e528 (size 192):
>>   comm "multipathd", pid 2353, jiffies 4295128020 (age 1332.440s)
>>   hex dump (first 32 bytes):
>>     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
>>     00 00 00 00 00 00 00 00 12 01 00 00 00 00 00 00  ................
>>   backtrace:
>>     [<ffffffff8165e3b5>] kmemleak_alloc+0x45/0xa0
>>     [<ffffffff811cc23d>] __kmalloc+0x15d/0x2f0
>>     [<ffffffff81310e35>] bio_alloc_bioset+0x185/0x1f0
>>     [<ffffffff813117f4>] bio_map_user_iov+0x124/0x400
>>     [<ffffffff81320b7a>] blk_rq_map_user_iov+0x11a/0x210
>>     [<ffffffff81320cbd>] blk_rq_map_user+0x4d/0x60
>>     [<ffffffff81336694>] sg_io+0x3d4/0x410
>>     [<ffffffff813369d0>] scsi_cmd_ioctl+0x300/0x490
>>     [<ffffffff81336b9d>] scsi_cmd_blk_ioctl+0x3d/0x50
>>     [<ffffffff814b4360>] sd_ioctl+0x80/0x100
>>     [<ffffffff8132ddde>] blkdev_ioctl+0x51e/0x9f0
>>     [<ffffffff8122f388>] block_ioctl+0x38/0x40
>>     [<ffffffff8120097f>] do_vfs_ioctl+0x8f/0x700
>>     [<ffffffff8120102c>] SyS_ioctl+0x3c/0x70
>>     [<ffffffff8166c4aa>] entry_SYSCALL_64_fastpath+0x18/0xad
> 
> After I repeated my test the above findings were confirmed: no memory leaks
> were reported by kmemleak after a test with I/O scheduler "none" and the
> above call stack was reported 44 times by kmemleak after a test with I/O
> scheduler "mq-deadline".

Interesting, I'll check this. Doesn't make any sense why the scheduler
would be implicated in that, given how we run completions now. But if
it complains, then something must be up.

-- 
Jens Axboe

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox