Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* mdadm raid6 sequential read slower than reading from userspace
From: Stevie Trujillo @ 2017-02-03 22:24 UTC (permalink / raw)
  To: linux-raid, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1798 bytes --]

Hello

Kernel:     Linux version 4.9.0-1-amd64 (debian-kernel@lists.debian.org)
            (gcc version 6.3.0 20161229 (Debian 6.3.0-2) ) #1 SMP Debian
            4.9.2-2 (2017-01-12)
CPU:        2xE5-2665
Memory:     256GB
Drives:     6x8TB Seagate
Controller: LSI2008
md0 : active raid6 sdb1[1] sda1[0] sdd1[3] sde1[4] sdc1[2] sdf1[5]
      31255576576 blocks super 1.2 level 6, 512k chunk, algorithm 2
[6/6] [UUUUUU] bitmap: 0/59 pages [0KB], 65536KB chunk

When I read sequentially from one of the disks I get 230-245MB/s. If I
read from all of them at the same time, the performance stays the same
(even if I bind all the dd processes to the same core).
Conclusion: I think the controller is not a bottleneck.

I first tried Debian8 with 3.16 and got 400-500MB/s when dd-ing
from /dev/md0. Upgrading to Debian9 with 4.9.2 roughly doubled my
performance:
53687091200 bytes (54 GB, 50 GiB) copied, 62.0078 s, 866 MB/s
53687091200 bytes (54 GB, 50 GiB) copied, 57.9882 s, 926 MB/s

dd uses 40% cpu and I can't find any process that uses more, so I don't
think I'm limited by CPU.

I wrote a small program that reads directly from the disks and outputs
the same data as reading from md0 would do. It's faster and has
more stable runtime than reading from md0: it finishes in 44.0 +-
0.2seconds (that is ~1150MB/s).

Is it possible to make mdadm work faster? I was hoping it could read
6x240MB/s, but maybe that's not possible. At least I think it should be
able to do 1150MB/s like userspace?
How can I find out what bottleneck? I couldn't see anything obvious
like 100% cpu usage.
I tried copying different tuning instructions I found on Google, but
they usually made negative impact if any.

I attached the program, but I'm still learning programming so it's not
very good.

--
Stevie Trujillo

[-- Attachment #2: raid6read.cc --]
[-- Type: text/x-c++src, Size: 8079 bytes --]

#include <vector>
#include <queue>
#include <thread>
#include <mutex>
#include <condition_variable>
using namespace std;
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <inttypes.h>
#include <sys/poll.h>
#include <scsi/sg.h>
#include <assert.h>
#include <err.h>

#define READ_16 0x88
#define NUM_DISKS 6
#define MAX_READAHEAD 16
#define CHUNK_SIZE (512*1024)

namespace {

struct BufferStorage
{
	int ref_count;
	unsigned char *memory;
};

struct Buffer
{
	BufferStorage *storage;
	unsigned char *buf;
};

struct Request
{
	uint64_t output_idx;
	uint64_t chunk;
};

struct Response
{
	uint64_t output_idx;
	Buffer buffer;
};

struct PendingIO
{
	BufferStorage *storage;
	uint64_t output_idx[4];
};

struct Disk
{
	int sg_fd;
	PendingIO pending_io[MAX_READAHEAD];
	unsigned char slot_i;
	unsigned char slots[MAX_READAHEAD];
	int current_request;
	vector<Request> requests;
};

struct Raid6
{
	Disk disks[6];
};

struct ThreadData
{
	mutex m;
	condition_variable cv;
	uint64_t last_idx;
	vector<Response> responses;
};

static void
sg_read(int sg_fd, void *buf, int pack_id, uint64_t lba, uint64_t len_lba)
{
	uint64_t len_bytes = 512 * len_lba;

	unsigned char cdb[16] = {};
	cdb[0] = READ_16;
	cdb[2] = (lba >> 56) & 0xff;
	cdb[3] = (lba >> 48) & 0xff;
	cdb[4] = (lba >> 40) & 0xff;
	cdb[5] = (lba >> 32) & 0xff;
	cdb[6] = (lba >> 24) & 0xff;
	cdb[7] = (lba >> 16) & 0xff;
	cdb[8] = (lba >> 8) & 0xff;
	cdb[9] = (lba >> 0) & 0xff;
	cdb[10] = (len_lba >> 24) & 0xff;
	cdb[11] = (len_lba >> 16) & 0xff;
	cdb[12] = (len_lba >> 8) & 0xff;
	cdb[13] = (len_lba >> 0) & 0xff;

	sg_io_hdr_t io_hdr;
	memset(&io_hdr, '\0', sizeof(io_hdr));
	io_hdr.interface_id = 'S'; /* SCSI Generic Interface */
	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	io_hdr.cmd_len = sizeof(cdb);
	io_hdr.cmdp = cdb;
	io_hdr.dxfer_len = len_bytes;
	io_hdr.dxferp = buf;
	io_hdr.timeout = 20000;
	io_hdr.pack_id = pack_id;

	if (write(sg_fd, &io_hdr, sizeof(io_hdr)) != sizeof(io_hdr))
		err(1, "write");
}

void queue_requests(Disk *disk)
{
	while ((size_t) disk->current_request < disk->requests.size() && disk->slot_i < MAX_READAHEAD) {
		int batch_requests = 1;
		for (; batch_requests < 4 && (size_t) disk->current_request + batch_requests < disk->requests.size(); ++batch_requests) {
			if (disk->requests[disk->current_request].chunk + batch_requests
			 != disk->requests[disk->current_request + batch_requests].chunk
			)
				break;
		}

		unsigned char slot = disk->slots[disk->slot_i++];
		struct PendingIO *pending = &disk->pending_io[slot];
		for (int i = 0; i < 4; ++i)
			pending->output_idx[i] = ~(uint64_t) 0;

		for (int i = 0; i < batch_requests; ++i)
			pending->output_idx[i] = disk->requests[disk->current_request + i].output_idx;

		uint64_t len_bytes = batch_requests * CHUNK_SIZE;
		pending->storage = new BufferStorage;
		pending->storage->ref_count = 0;
		void *buf;
		posix_memalign(&buf, 0x1000, len_bytes);
		pending->storage->memory = (unsigned char *) buf;

		const Request &r = disk->requests[disk->current_request];
		sg_read(disk->sg_fd, pending->storage->memory, slot, r.chunk * CHUNK_SIZE / 512, batch_requests * CHUNK_SIZE / 512);

		disk->current_request += batch_requests;
	}
}

void read_response(vector<Response> &responses, Disk *disk)
{
	sg_io_hdr_t io_hdr;
	memset(&io_hdr, '\0', sizeof(io_hdr));
	io_hdr.interface_id = 'S'; /* SCSI Generic Interface */
	io_hdr.pack_id = -1;

	if (read(disk->sg_fd, &io_hdr, sizeof(io_hdr)) != sizeof(io_hdr))
		err(1, "read");

	assert(io_hdr.pack_id >= 0 && io_hdr.pack_id < MAX_READAHEAD);
	PendingIO *pending = &disk->pending_io[io_hdr.pack_id];

	for (int j = 0; j < 4; ++j) {
		if (pending->output_idx[j] == ~(uint64_t) 0)
			break;

		Buffer buffer;
		buffer.storage = pending->storage;
		buffer.storage->ref_count += 1;
		buffer.buf = buffer.storage->memory + CHUNK_SIZE * j;
		responses.push_back(Response{pending->output_idx[j], buffer});
	}

	disk->slots[--disk->slot_i] = io_hdr.pack_id;
}

/* write the data we read to stdout in correct order */
void writer_function(ThreadData *td)
{
	auto cmp = [](const Response &a, const Response &b) { return a.output_idx > b.output_idx; };
	priority_queue<Response, vector<Response>, decltype(cmp)> responses(cmp);
	uint64_t current_idx = 0;

	while (current_idx < td->last_idx) {
		if (responses.empty() || responses.top().output_idx != current_idx) {
			vector<Response> tmp;
			{
				unique_lock<mutex> lk(td->m);
				td->cv.wait(lk, [=](){ return !td->responses.empty(); });
				tmp = move(td->responses);
			}

			for (const Response &r : tmp)
				responses.push(r);

			continue;
		}

		Response r = responses.top();
		responses.pop();
		unsigned char *buf = r.buffer.buf;
		size_t size = CHUNK_SIZE;

		while (size) {
			ssize_t bytes_written = write(1, buf, size);
			if (bytes_written < 0)
				err(1, "write");

			buf += bytes_written;
			size -= bytes_written;
		}

		if (--r.buffer.storage->ref_count == 0) {
			free(r.buffer.storage->memory);
			delete r.buffer.storage;
		}

		++current_idx;
	}
}

/* run all the disks from the same thread */
void run_sg_poll(Raid6 *raid, ThreadData *writer_td)
{
	for (;;) {
		struct pollfd pfds[NUM_DISKS];
		int nfds = 0;

		for (int i = 0; i < NUM_DISKS; ++i) {
			Disk *disk = &raid->disks[i];
			if (disk->sg_fd < 0)
				continue;

			queue_requests(disk);
			if ((size_t) disk->current_request == disk->requests.size() && disk->slot_i == 0) {
				close(disk->sg_fd);
				disk->sg_fd = -1;
				continue;
			}

			pfds[nfds++] = (struct pollfd) { raid->disks[i].sg_fd, POLLIN, 0 };
		}

		if (!nfds)
			break;

		int ret = poll(pfds, nfds, -1);
		if (ret <= 0)
			err(1, "poll");

		vector<Response> responses;
		for (int i = 0; i < NUM_DISKS; ++i) {
			Disk *disk = &raid->disks[i];
			if (disk->sg_fd >= 0)
				read_response(responses, disk);
		}

		{
			unique_lock<mutex> lk(writer_td->m);
			writer_td->responses.insert(writer_td->responses.end(), responses.begin(), responses.end());
		}
		writer_td->cv.notify_one();
	}
}

/* run each disk from one thread */
void run_sg_single(Raid6 *raid, Disk *disk, ThreadData *writer_td)
{
	for (;;) {
		queue_requests(disk);
		if ((size_t) disk->current_request == disk->requests.size() && disk->slot_i == 0) {
			close(disk->sg_fd);
			disk->sg_fd = -1;
			break;
		}

		vector<Response> responses;
		read_response(responses, disk);

		{
			unique_lock<mutex> lk(writer_td->m);
			writer_td->responses.insert(writer_td->responses.end(), responses.begin(), responses.end());
		}
		writer_td->cv.notify_one();
	}
}

}

int main(int argc, char **argv)
{
	if (argc != 1 + NUM_DISKS)
		errx(1, "usage: disks");

	Raid6 raid;

	for (int i = 0; i < NUM_DISKS; ++i) {
		const char *path = argv[1 + i];
		Disk *disk = &raid.disks[i];

		disk->sg_fd = open(path, O_RDWR);
		if (disk->sg_fd < 0)
			err(1, "open(%s)", path);

		disk->current_request = 0;
		disk->slot_i = 0;
		for (int i = 0; i < MAX_READAHEAD; ++i)
			disk->slots[i] = i;
	}

	uint64_t num_chunks = 102400; // 50TB

	/* precompute all the chunks we want the disks to read */
	uint64_t output_idx = 0;
	for (uint64_t chunk = 0; chunk < num_chunks; ++chunk) {
		uint64_t data_offset = 2048 * 512 / (512*1024) /* from partitioning */
		                     + 256*1024*512 / (512*1024); /* from mdadm --examine */
		/*
		 * stripe0: bcde|fa
		 * stripe1: abcd|ef
		 * stripe2: fabc|de
		 */
		int64_t stripe = chunk / 4;
		uint64_t slot = chunk % 4;
		int64_t disk_idx = 1 - stripe + slot;
		disk_idx %= 6;
		disk_idx = disk_idx + (disk_idx >> 63 & 6);
		raid.disks[disk_idx].requests.push_back(Request{output_idx++, data_offset + stripe});
	}

	ThreadData writer_td;
	writer_td.last_idx = output_idx;

	thread writer_thread(writer_function, &writer_td);

	if (0) {
		run_sg_poll(&raid, &writer_td);
	} else {
		thread threads[6];

		for (int i = 0; i < NUM_DISKS; ++i)
			threads[i] = move(thread(run_sg_single, &raid, &raid.disks[i], &writer_td));

		for (int i = 0; i < NUM_DISKS; ++i)
			threads[i].join();
	}

	writer_thread.join();
	return 0;
}

^ permalink raw reply

* Re: drives failed during reshape, array won't even force-assemble
From: Weedy @ 2017-02-04  0:52 UTC (permalink / raw)
  To: Thomas Warntjen; +Cc: Phil Turmel, Linux RAID
In-Reply-To: <ddc96793-cde0-33a7-b193-585efd74a202@warntjen.net>

On 1 February 2017 at 13:55, Thomas Warntjen <thomas@warntjen.net> wrote:
> Holy cow, I poked it with a stick and I think I did it!
>

I really hate these "wait, why the F did that fix it" solutions.
Always left feeling like you haven't learned anything after all your
work.

Congrats you have your data back :)

^ permalink raw reply

* Re: [PATCH 5/6] dmaengine: Add Broadcom SBA RAID driver
From: Vinod Koul @ 2017-02-05  6:06 UTC (permalink / raw)
  To: Anup Patel
  Cc: Rob Herring, Mark Rutland, Herbert Xu, David S . Miller,
	Jassi Brar, Dan Williams, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, bcm-kernel-feedback-list-dY08KVG/lbpWk0Htik3J/w,
	dmaengine-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA,
	linux-raid-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1486010836-25228-6-git-send-email-anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>

On Thu, Feb 02, 2017 at 10:17:15AM +0530, Anup Patel wrote:
> +config BCM_SBA_RAID
> +        tristate "Broadcom SBA RAID engine support"
> +        depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
> +        select DMA_ENGINE
> +        select DMA_ENGINE_RAID
> +	select ASYNC_TX_ENABLE_CHANNEL_SWITCH
> +	default ARCH_BCM_IPROC

whats with the funny alignement?

> +/* SBA command related defines */
> +#define SBA_TYPE_SHIFT					48
> +#define SBA_TYPE_MASK					0x3
> +#define SBA_TYPE_A					0x0
> +#define SBA_TYPE_B					0x2
> +#define SBA_TYPE_C					0x3
> +#define SBA_USER_DEF_SHIFT				32
> +#define SBA_USER_DEF_MASK				0xffff
> +#define SBA_R_MDATA_SHIFT				24
> +#define SBA_R_MDATA_MASK				0xff
> +#define SBA_C_MDATA_MS_SHIFT				18
> +#define SBA_C_MDATA_MS_MASK				0x3
> +#define SBA_INT_SHIFT					17
> +#define SBA_INT_MASK					0x1
> +#define SBA_RESP_SHIFT					16
> +#define SBA_RESP_MASK					0x1
> +#define SBA_C_MDATA_SHIFT				8
> +#define SBA_C_MDATA_MASK				0xff
> +#define SBA_CMD_SHIFT					0
> +#define SBA_CMD_MASK					0xf
> +#define SBA_CMD_ZERO_ALL_BUFFERS			0x8
> +#define SBA_CMD_LOAD_BUFFER				0x9
> +#define SBA_CMD_XOR					0xa
> +#define SBA_CMD_GALOIS_XOR				0xb
> +#define SBA_CMD_ZERO_BUFFER				0x4
> +#define SBA_CMD_WRITE_BUFFER				0xc

Try using BIT and GENMAST for hardware descriptions

> +
> +/* SBA C_MDATA helper macros */
> +#define SBA_C_MDATA_LOAD_VAL(__bnum0)		((__bnum0) & 0x3)
> +#define SBA_C_MDATA_WRITE_VAL(__bnum0)		((__bnum0) & 0x3)
> +#define SBA_C_MDATA_XOR_VAL(__bnum1, __bnum0)			\
> +			({	u32 __v = ((__bnum0) & 0x3);	\
> +				__v |= ((__bnum1) & 0x3) << 2;	\
> +				__v;				\
> +			})
> +#define SBA_C_MDATA_PQ_VAL(__dnum, __bnum1, __bnum0)		\
> +			({	u32 __v = ((__bnum0) & 0x3);	\
> +				__v |= ((__bnum1) & 0x3) << 2;	\
> +				__v |= ((__dnum) & 0x1f) << 5;	\
> +				__v;				\
> +			})

ah why are we usig complex macros, why can't these be simple functions..

> +#define SBA_C_MDATA_LS(__c_mdata_val)	((__c_mdata_val) & 0xff)
> +#define SBA_C_MDATA_MS(__c_mdata_val)	(((__c_mdata_val) >> 8) & 0x3)
> +
> +/* Driver helper macros */
> +#define to_sba_request(tx)		\
> +	container_of(tx, struct sba_request, tx)
> +#define to_sba_device(dchan)		\
> +	container_of(dchan, struct sba_device, dma_chan)
> +
> +enum sba_request_state {
> +	SBA_REQUEST_STATE_FREE = 1,
> +	SBA_REQUEST_STATE_ALLOCED = 2,
> +	SBA_REQUEST_STATE_PENDING = 3,
> +	SBA_REQUEST_STATE_ACTIVE = 4,
> +	SBA_REQUEST_STATE_COMPLETED = 5,
> +	SBA_REQUEST_STATE_ABORTED = 6,

whats up with a very funny indentation setting, we use 8 chars.

Please re-read the Documentation/process/coding-style.rst

> +static int sba_alloc_chan_resources(struct dma_chan *dchan)
> +{
> +	/*
> +	 * We only have one channel so we have pre-alloced
> +	 * channel resources. Over here we just return number
> +	 * of free request.
> +	 */
> +	return sba_free_request_count(to_sba_device(dchan));
> +}

essentially you are not doing much, so you can skip it. Its an optional
call.

> +static void sba_free_chan_resources(struct dma_chan *dchan)
> +{
> +	/*
> +	 * Channel resources are pre-alloced so we just free-up
> +	 * whatever we can so that we can re-use pre-alloced
> +	 * channel resources next time.
> +	 */
> +	sba_cleanup_inflight_requests(to_sba_device(dchan));

well this one checks for pending requests as well, which shouldn't be there
when freeing a channel, something seems not quite right here..

> +static int sba_send_mbox_request(struct sba_device *sba,
> +				 struct sba_request *req)
> +{
> +	int mchans_idx, ret = 0;
> +
> +	/* Select mailbox channel in round-robin fashion */
> +	mchans_idx = atomic_inc_return(&sba->mchans_current);
> +	mchans_idx = mchans_idx % sba->mchans_count;
> +
> +	/* Send batch message for the request */
> +	req->bmsg.batch.msgs_queued = 0;
> +	ret = mbox_send_message(sba->mchans[mchans_idx], &req->bmsg);
> +	if (ret < 0) {
> +		dev_info(sba->dev, "channel %d message %d (total %d)",
> +			 mchans_idx, req->bmsg.batch.msgs_queued,
> +			 req->bmsg.batch.msgs_count);

dev_err?

> +		dev_err(sba->dev, "send message failed with error %d", ret);
> +		return ret;
> +	}
> +	ret = req->bmsg.error;
> +	if (ret < 0) {
> +		dev_info(sba->dev,
> +			 "mbox channel %d message %d (total %d)",
> +			 mchans_idx, req->bmsg.batch.msgs_queued,
> +			 req->bmsg.batch.msgs_count);

same here

> +static dma_cookie_t sba_tx_submit(struct dma_async_tx_descriptor *tx)
> +{
> +	unsigned long flags;
> +	dma_cookie_t cookie;
> +	struct sba_request *req;
> +	struct sba_device *sba;
> +
> +	if (unlikely(!tx))
> +		return -EINVAL;
> +
> +	sba = to_sba_device(tx->chan);
> +	req = to_sba_request(tx);
> +
> +	/* Assign cookie and mark request pending */
> +	spin_lock_irqsave(&sba->reqs_lock, flags);
> +	cookie = dma_cookie_assign(tx);
> +	_sba_pending_request(sba, req);
> +	spin_unlock_irqrestore(&sba->reqs_lock, flags);
> +
> +	/* Try to submit pending request */
> +	sba_issue_pending(&sba->dma_chan);

Nope, thats wrong, caller needs to call .issue_pending for that

> +static enum dma_status sba_tx_status(struct dma_chan *dchan,
> +				     dma_cookie_t cookie,
> +				     struct dma_tx_state *txstate)
> +{
> +	int mchan_idx;
> +	enum dma_status ret;
> +	struct sba_device *sba = to_sba_device(dchan);
> +
> +	ret = dma_cookie_status(dchan, cookie, txstate);
> +	if (ret == DMA_COMPLETE)
> +		return ret;
> +
> +	for (mchan_idx = 0; mchan_idx < sba->mchans_count; mchan_idx++)
> +		mbox_client_peek_data(sba->mchans[mchan_idx]);

what is this achieving?

> +static struct dma_async_tx_descriptor *
> +sba_prep_dma_memcpy(struct dma_chan *dchan, dma_addr_t dst, dma_addr_t src,
> +		    size_t len, unsigned long flags)
> +{
> +	size_t msg_len;
> +	dma_addr_t msg_offset = 0;
> +	unsigned int msgs_count = 0, cmds_count, cmds_idx = 0;
> +	struct sba_device *sba = to_sba_device(dchan);
> +	struct sba_request *req = NULL;
> +
> +	/* Sanity checks */
> +	if (unlikely(len > sba->req_size))
> +		return NULL;

why is that an error, you can create multiple txn of max length

> +static int sba_async_register(struct sba_device *sba)
> +{
> +	int ret;
> +	struct dma_device *dma_dev = &sba->dma_dev;
> +
> +	/* Initialize DMA channel cookie */
> +	sba->dma_chan.device = dma_dev;
> +	dma_cookie_init(&sba->dma_chan);
> +
> +	/* Initialize DMA device capability mask */
> +	dma_cap_zero(dma_dev->cap_mask);
> +	dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
> +	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
> +	dma_cap_set(DMA_PQ, dma_dev->cap_mask);
> +
> +	/*
> +	 * Set mailbox channel device as the base device of
> +	 * our dma_device because the actual memory accesses
> +	 * will be done by mailbox controller
> +	 */
> +	dma_dev->dev = sba->mbox_dev;
> +
> +	/* Set base prep routines */
> +	dma_dev->device_alloc_chan_resources = sba_alloc_chan_resources;
> +	dma_dev->device_free_chan_resources = sba_free_chan_resources;
> +	dma_dev->device_issue_pending = sba_issue_pending;
> +	dma_dev->device_tx_status = sba_tx_status;

Please add terminate callback support, also add the capabilities, we need to
advertise that and use in clients

Also you can simplify bunch of code by using virt-chan support for managing
channels and descriptors

-- 
~Vinod
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH 0/4] md: use bio_clone_fast()
From: Ming Lei @ 2017-02-05  6:22 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei

Hi,

This patches replaces bio_clone() with bio_fast_clone() in
bio_clone_mddev() because:

1) bio_clone_mddev() is used in raid normal I/O and isn't in
resync I/O path, and all the direct access to bvec table in
raid happens on resync I/O only except for write behind of raid1.
Write behind is treated specially, so the replacement is safe.

2) for write behind, bio_clone() is kept, but this patchset
introduces bio_clone_bioset_partial() to just clone one specific 
bvecs range instead of whole table. Then write behind is improved
too.

Thanks,
Ming

Ming Lei (4):
  block: introduce bio_clone_bioset_partial()
  md: introduce bio_clone_slow_mddev_partial()
  md/raid1: use bio_clone_slow_mddev_partial in case of write behind
  md: fast clone bio in bio_clone_mddev()

 block/bio.c         | 61 +++++++++++++++++++++++++++++++++++++++++------------
 drivers/md/md.c     | 24 +++++++++++++++++++--
 drivers/md/md.h     |  3 +++
 drivers/md/raid1.c  | 21 +++++++++++++-----
 include/linux/bio.h | 11 ++++++++--
 5 files changed, 98 insertions(+), 22 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH 1/4] block: introduce bio_clone_bioset_partial()
From: Ming Lei @ 2017-02-05  6:22 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1486275733-7268-1-git-send-email-tom.leiming@gmail.com>

md still need bio clone(not the fast version) for behind write,
and it is more efficient to use bio_clone_bioset_partial().

The idea is simple and just copy the bvecs range specified from
parameters.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 block/bio.c         | 61 +++++++++++++++++++++++++++++++++++++++++------------
 include/linux/bio.h | 11 ++++++++--
 2 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 4b564d0c3e29..5eec5e08417f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -625,21 +625,20 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
 }
 EXPORT_SYMBOL(bio_clone_fast);
 
-/**
- * 	bio_clone_bioset - clone a bio
- * 	@bio_src: bio to clone
- *	@gfp_mask: allocation priority
- *	@bs: bio_set to allocate from
- *
- *	Clone bio. Caller will own the returned bio, but not the actual data it
- *	points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-			     struct bio_set *bs)
+static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+				      struct bio_set *bs, int offset,
+				      int size)
 {
 	struct bvec_iter iter;
 	struct bio_vec bv;
 	struct bio *bio;
+	struct bvec_iter iter_src = bio_src->bi_iter;
+
+	/* for supporting partial clone */
+	if (offset || size != bio_src->bi_iter.bi_size) {
+		bio_advance_iter(bio_src, &iter_src, offset);
+		iter_src.bi_size = size;
+	}
 
 	/*
 	 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
@@ -663,7 +662,8 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	 *    __bio_clone_fast() anyways.
 	 */
 
-	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
+	bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src,
+			       &iter_src), bs);
 	if (!bio)
 		return NULL;
 	bio->bi_bdev		= bio_src->bi_bdev;
@@ -680,7 +680,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
 		break;
 	default:
-		bio_for_each_segment(bv, bio_src, iter)
+		__bio_for_each_segment(bv, bio_src, iter, iter_src)
 			bio->bi_io_vec[bio->bi_vcnt++] = bv;
 		break;
 	}
@@ -699,9 +699,44 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 
 	return bio;
 }
+
+/**
+ * 	bio_clone_bioset - clone a bio
+ * 	@bio_src: bio to clone
+ *	@gfp_mask: allocation priority
+ *	@bs: bio_set to allocate from
+ *
+ *	Clone bio. Caller will own the returned bio, but not the actual data it
+ *	points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+			     struct bio_set *bs)
+{
+	return __bio_clone_bioset(bio_src, gfp_mask, bs, 0,
+				  bio_src->bi_iter.bi_size);
+}
 EXPORT_SYMBOL(bio_clone_bioset);
 
 /**
+ * 	bio_clone_bioset_partial - clone a partial bio
+ * 	@bio_src: bio to clone
+ *	@gfp_mask: allocation priority
+ *	@bs: bio_set to allocate from
+ *	@offset: cloned starting from the offset
+ *	@size: size for the cloned bio
+ *
+ *	Clone bio. Caller will own the returned bio, but not the actual data it
+ *	points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask,
+				     struct bio_set *bs, int offset,
+				     int size)
+{
+	return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size);
+}
+EXPORT_SYMBOL(bio_clone_bioset_partial);
+
+/**
  *	bio_add_pc_page	-	attempt to add page to bio
  *	@q: the target queue
  *	@bio: destination bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7cf8a6c70a3f..8e521194f6fc 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
-static inline unsigned bio_segments(struct bio *bio)
+static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
 {
 	unsigned segs = 0;
 	struct bio_vec bv;
@@ -205,12 +205,17 @@ static inline unsigned bio_segments(struct bio *bio)
 		break;
 	}
 
-	bio_for_each_segment(bv, bio, iter)
+	__bio_for_each_segment(bv, bio, iter, *bvec)
 		segs++;
 
 	return segs;
 }
 
+static inline unsigned bio_segments(struct bio *bio)
+{
+	return __bio_segments(bio, &bio->bi_iter);
+}
+
 /*
  * get a reference to a bio, so it won't disappear. the intended use is
  * something like:
@@ -384,6 +389,8 @@ extern void bio_put(struct bio *);
 extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
+extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t,
+					    struct bio_set *, int, int);
 
 extern struct bio_set *fs_bio_set;
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 2/4] md: introduce bio_clone_slow_mddev_partial()
From: Ming Lei @ 2017-02-05  6:22 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1486275733-7268-1-git-send-email-tom.leiming@gmail.com>

In raid1/raid10, most users of bio_clone_mddev() need bio_trim() too,
that means only part of the bio is required to be cloned, and not
necessary to clone the whole bio each time, and it is just enough
to clone the specified bvecs range.

So this patch introduces bio_clone_slow_mddev_partial() to improve
the situation, and it is named as slow because the following patch
will switch to bio_clone_fast() in bio_clone_mddev().

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.c | 16 ++++++++++++++++
 drivers/md/md.h |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4c1b82defa78..704be11355a9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -200,6 +200,22 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 }
 EXPORT_SYMBOL_GPL(bio_clone_mddev);
 
+struct bio *bio_clone_slow_mddev_partial(struct bio *bio, gfp_t gfp_mask,
+					 struct mddev *mddev, int offset,
+					 int size)
+{
+	struct bio_set *bs;
+
+	if (!mddev || !mddev->bio_set)
+		bs = fs_bio_set;
+	else
+		bs = mddev->bio_set;
+
+	return bio_clone_bioset_partial(bio, gfp_mask, bs, offset << 9,
+					size << 9);
+}
+EXPORT_SYMBOL_GPL(bio_clone_slow_mddev_partial);
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 968bbe72b237..4f4e6ded59e5 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -675,6 +675,9 @@ extern void mddev_suspend(struct mddev *mddev);
 extern void mddev_resume(struct mddev *mddev);
 extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 				   struct mddev *mddev);
+extern struct bio *bio_clone_slow_mddev_partial(struct bio *bio, gfp_t gfp_mask,
+						struct mddev *mddev, int offset,
+						int size);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   struct mddev *mddev);
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 3/4] md/raid1: use bio_clone_slow_mddev_partial in case of write behind
From: Ming Lei @ 2017-02-05  6:22 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1486275733-7268-1-git-send-email-tom.leiming@gmail.com>

Write behind need to replace pages in bio's bvecs, and we have
to clone a fresh bio with new bvec table, so use the introduced
bio_clone_slow_mddev_partial() for it.

For other bio_clone_mddev() cases, we will use fast clone since
they don't need to touch bvec table.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 830ff2b20346..e1c5639febdb 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1341,13 +1341,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 	first_clone = 1;
 	for (i = 0; i < disks; i++) {
-		struct bio *mbio;
+		struct bio *mbio = NULL;
+		int offset;
 		if (!r1_bio->bios[i])
 			continue;
 
-		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
-			 max_sectors);
+		offset = r1_bio->sector - bio->bi_iter.bi_sector;
 
 		if (first_clone) {
 			/* do behind I/O ?
@@ -1357,8 +1356,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			if (bitmap &&
 			    (atomic_read(&bitmap->behind_writes)
 			     < mddev->bitmap_info.max_write_behind) &&
-			    !waitqueue_active(&bitmap->behind_wait))
+			    !waitqueue_active(&bitmap->behind_wait)) {
+				mbio = bio_clone_slow_mddev_partial(bio,
+								    GFP_NOIO,
+								    mddev,
+								    offset,
+								    max_sectors);
 				alloc_behind_pages(mbio, r1_bio);
+			}
 
 			bitmap_startwrite(bitmap, r1_bio->sector,
 					  r1_bio->sectors,
@@ -1366,6 +1371,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 						   &r1_bio->state));
 			first_clone = 0;
 		}
+
+		if (!mbio) {
+			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			bio_trim(mbio, offset, max_sectors);
+		}
+
 		if (r1_bio->behind_bvecs) {
 			struct bio_vec *bvec;
 			int j;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 4/4] md: fast clone bio in bio_clone_mddev()
From: Ming Lei @ 2017-02-05  6:22 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1486275733-7268-1-git-send-email-tom.leiming@gmail.com>

Firstly bio_clone_mddev() is used in raid normal I/O and isn't
in resync I/O path.

Secondly all the direct access to bvec table in raid happens on
resync I/O except for write behind of raid1, in which we still
use bio_clone() for allocating new bvec table.

So this patch replaces bio_clone() with bio_clone_fast()
in bio_clone_mddev().

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 704be11355a9..7d176f025add 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -193,10 +193,14 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 			    struct mddev *mddev)
 {
+	struct bio_set *bs;
+
 	if (!mddev || !mddev->bio_set)
-		return bio_clone(bio, gfp_mask);
+		bs = fs_bio_set;
+	else
+		bs = mddev->bio_set;
 
-	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
+	return bio_clone_fast(bio, gfp_mask, bs);
 }
 EXPORT_SYMBOL_GPL(bio_clone_mddev);
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH] md: ensure md devices are freed before module is unloaded.
From: NeilBrown @ 2017-02-06  2:41 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Guoqing Jiang, linux-raid

[-- Attachment #1: Type: text/plain, Size: 1554 bytes --]

Commit: cbd199837750 ("md: Fix unfortunate interaction with evms")
change mddev_put() so that it would not destroy an md device while
->ctime was non-zero.

Unfortunately, we didn't make sure to clear ->ctime when unloading
the module, so it is possible for an md device to remain after
module unload.  An attempt to open such a device will trigger
an invalid memory reference in:
  get_gendisk -> kobj_lookup -> exact_lock -> get_disk

when tring to access disk->fops, which was in the module that has
been removed.

So ensure we clear ->ctime in md_exit(), and explain how that is useful,
as it isn't immediately obvious when looking at the code.

Fixes: cbd199837750 ("md: Fix unfortunate interaction with evms")
Tested-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 01175dac0db6..8926fb781cdc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8980,7 +8980,14 @@ static __exit void md_exit(void)

 	for_each_mddev(mddev, tmp) {
 		export_array(mddev);
+		mddev->ctime = 0;
 		mddev->hold_active = 0;
+		/* for_each_mddev() will call mddev_put() at the
+		 * end of each iteration.  As the mddev is now
+		 * fully clear, this will schedule the mddev for destruction
+		 * by a workqueue, and the destroy_workqueue() below
+		 * will wait for that to complete.
+		 */
 	}
 	destroy_workqueue(md_misc_wq);
 	destroy_workqueue(md_wq);
-- 
2.11.0

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* Re: [PATCH 3/6] async_tx: Handle DMA devices having support for fewer PQ coefficients
From: Anup Patel @ 2017-02-06  3:55 UTC (permalink / raw)
  To: Dan Williams
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine@vger.kernel.org,
	Device Tree, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-crypto, linux-raid
In-Reply-To: <CAPcyv4icg74StGOjsOpYCDb9x02rrvSrkv6O66WtbSvhOwxYnw@mail.gmail.com>

On Sat, Feb 4, 2017 at 12:12 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Fri, Feb 3, 2017 at 2:59 AM, Anup Patel <anup.patel@broadcom.com> wrote:
>>
>>
>> On Thu, Feb 2, 2017 at 11:31 AM, Dan Williams <dan.j.williams@intel.com>
>> wrote:
>>>
>>> On Wed, Feb 1, 2017 at 8:47 PM, Anup Patel <anup.patel@broadcom.com>
>>> wrote:
>>> > The DMAENGINE framework assumes that if PQ offload is supported by a
>>> > DMA device then all 256 PQ coefficients are supported. This assumption
>>> > does not hold anymore because we now have BCM-SBA-RAID offload engine
>>> > which supports PQ offload with limited number of PQ coefficients.
>>> >
>>> > This patch extends async_tx APIs to handle DMA devices with support
>>> > for fewer PQ coefficients.
>>> >
>>> > Signed-off-by: Anup Patel <anup.patel@broadcom.com>
>>> > Reviewed-by: Scott Branden <scott.branden@broadcom.com>
>>> > ---
>>> >  crypto/async_tx/async_pq.c          |  3 +++
>>> >  crypto/async_tx/async_raid6_recov.c | 12 ++++++++++--
>>> >  include/linux/dmaengine.h           | 19 +++++++++++++++++++
>>> >  include/linux/raid/pq.h             |  3 +++
>>> >  4 files changed, 35 insertions(+), 2 deletions(-)
>>>
>>> So, I hate the way async_tx does these checks on each operation, and
>>> it's ok for me to say that because it's my fault. Really it's md that
>>> should be validating engine offload capabilities once at the beginning
>>> of time. I'd rather we move in that direction than continue to pile
>>> onto a bad design.
>>
>>
>> Yes, indeed. All async_tx APIs have lot of checks and for high throughput
>> RAID offload engine these checks can add some overhead.
>>
>> I think doing checks in Linux md would be certainly better but this would
>> mean lot of changes in Linux md as well as remove checks in async_tx.
>>
>> Also, async_tx APIs should not find DMA channel on its own instead it
>> should rely on Linux md to provide DMA channel pointer as parameter.
>>
>> It's better to do checks cleanup in async_tx as separate patchset and
>> keep this patchset simple.
>
> That's been the problem with async_tx being broken like this for
> years. Once you get this "small / simple" patch upstream, that
> arguably makes async_tx a little bit worse, there is no longer any
> motivation to fix the underlying issues. If you care about the long
> term health of raid offload and are enabling new hardware support you
> should first tackle the known problems with it before adding new
> features.

Apart from the checks related issue you pointed there are other
issues with async_tx APIs such as:

1. The mechanism to do update PQ (or RAID6 update) operation
in current async_tx APIs is to call async_gen_syndrome() twice
with ASYNC_TX_PQ_XOR_DST flag set. Also, async_gen_syndrome()
will always prefer SW approach when ASYNC_TX_PQ_XOR_DST flag
is set. This means async_tx API is forcing SW approach for update
PQ operation and in-addition we require two async_gen_syndrome()
calls to achieve update PQ. This limitations of async_gen_syndrome()
reduces performance of async_tx APIs. Instead of this we should
have a dedicated async_update_pq() API which will allow RAID
offload engine drivers (such as BCM-FS4-RAID) to implement
update PQ using HW offload and this new API will fall-back to
SW approach using async_gen_syndrome() if no DMA channel
provides update PQ HW offload.

2. In our stress testing, we have observed that dma_map_page()
and dma_unmap_page() used in various async_tx APIs are the
major cause of overhead. If we directly call DMA channel callbacks
with pre-DMA-mapped pages then we get very high throughput.
The async_tx APIs should provide a way for pre-DMA-mapped
pages so that Linux MD can exploit this fact for better performance.

3. We really don't have a test module to stress/benchmark all
async_tx APIs using multi-threading and batching large number
of request in each thread. This kind of test module is very much
required for performance benchmarking and stressing high
throughput (hundreds of Gbps) RAID offload engines (such as
BCM-FS4-RAID).

From the above, we already have async_tx_test module to
address point3. We also plan to address point1 above but
this would also require changes in Linux MD to use new
async_update_pq() API.

As you can see, this patchset is not end of story of us if we
want best possible utilization of BCM-FS4-RAID.

Regards,
Anup

^ permalink raw reply

* Re: [PATCH 1/6] mailbox: Add new API mbox_channel_device() for clients
From: Anup Patel @ 2017-02-06  4:27 UTC (permalink / raw)
  To: Jassi Brar
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Dan Williams, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine, Devicetree List,
	linux-arm-kernel@lists.infradead.org, Linux Kernel Mailing List,
	linux-crypto, linux-raid
In-Reply-To: <CABb+yY1yKnFN-6=+xhZzcYrXNBe5eATthwOoHdvDrLmQzpEZ-Q@mail.gmail.com>

On Fri, Feb 3, 2017 at 5:35 PM, Jassi Brar <jassisinghbrar@gmail.com> wrote:
> On Thu, Feb 2, 2017 at 10:17 AM, Anup Patel <anup.patel@broadcom.com> wrote:
>> The remote processor can have DMAENGINE capabilities and client
>> can pass data to be processed via main memory. In such cases,
>> the client will require DMAble memory for remote processor.
>>
>> This patch adds new API mbox_channel_device() which can be
>> used by clients to get struct device pointer of underlying
>> mailbox controller. This struct device pointer of mailbox
>> controller can be used by clients to allocate DMAble memory
>> for remote processor.
>>
> IIUC, DT already provides a way for what you need.

Thanks for the suggestion. I will explore in this direction and
try to avoid this patch in next revision.

Can you please have a look at FlexRM driver which I
had submitted previously?
https://lkml.org/lkml/2017/1/5/291
https://lkml.org/lkml/2017/1/5/293

Regards,
Anup

^ permalink raw reply

* Re: [PATCH 2/4] md: introduce bio_clone_slow_mddev_partial()
From: Christoph Hellwig @ 2017-02-06  8:52 UTC (permalink / raw)
  To: Ming Lei
  Cc: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <1486275733-7268-3-git-send-email-tom.leiming@gmail.com>

> +struct bio *bio_clone_slow_mddev_partial(struct bio *bio, gfp_t gfp_mask,
> +					 struct mddev *mddev, int offset,
> +					 int size)
> +{
> +	struct bio_set *bs;
> +
> +	if (!mddev || !mddev->bio_set)
> +		bs = fs_bio_set;
> +	else
> +		bs = mddev->bio_set;
> +
> +	return bio_clone_bioset_partial(bio, gfp_mask, bs, offset << 9,
> +					size << 9);
> +}
> +EXPORT_SYMBOL_GPL(bio_clone_slow_mddev_partial);

As far as I can tell the caller always has a mddev, and an active
mddev always has a bio_set.  So let's just skip this wrapper.

^ permalink raw reply

* Re: [PATCH 4/4] md: fast clone bio in bio_clone_mddev()
From: Christoph Hellwig @ 2017-02-06  8:54 UTC (permalink / raw)
  To: Ming Lei
  Cc: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <1486275733-7268-5-git-send-email-tom.leiming@gmail.com>

On Sun, Feb 05, 2017 at 02:22:13PM +0800, Ming Lei wrote:
> Firstly bio_clone_mddev() is used in raid normal I/O and isn't
> in resync I/O path.
> 
> Secondly all the direct access to bvec table in raid happens on
> resync I/O except for write behind of raid1, in which we still
> use bio_clone() for allocating new bvec table.
> 
> So this patch replaces bio_clone() with bio_clone_fast()
> in bio_clone_mddev().

Having the _fast in the name would be really useful for the
reader.  And as far as I can tell in the callers mddev is never
NULL and neither is ->bio_set, so replacing bio_clone_mddev with
raw calls to bio_clone_fast would be my preference.

^ permalink raw reply

* Re: [md PATCH 0/5] Stop using bi_phys_segments as a counter
From: Christoph Hellwig @ 2017-02-06  8:56 UTC (permalink / raw)
  To: NeilBrown
  Cc: Shaohua Li, Konstantin Khlebnikov, Christoph Hellwig, linux-raid
In-Reply-To: <147969099621.5434.12384452255155063186.stgit@noble>

Did anything happen to this series?  Having it would be extremely
useful for the block layer moving forward.

^ permalink raw reply

* Re: [PATCH 4/4] md: fast clone bio in bio_clone_mddev()
From: Ming Lei @ 2017-02-06 10:43 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	NeilBrown
In-Reply-To: <20170206085433.GB6411@infradead.org>

On Mon, Feb 6, 2017 at 4:54 PM, Christoph Hellwig <hch@infradead.org> wrote:
> On Sun, Feb 05, 2017 at 02:22:13PM +0800, Ming Lei wrote:
>> Firstly bio_clone_mddev() is used in raid normal I/O and isn't
>> in resync I/O path.
>>
>> Secondly all the direct access to bvec table in raid happens on
>> resync I/O except for write behind of raid1, in which we still
>> use bio_clone() for allocating new bvec table.
>>
>> So this patch replaces bio_clone() with bio_clone_fast()
>> in bio_clone_mddev().
>
> Having the _fast in the name would be really useful for the
> reader.  And as far as I can tell in the callers mddev is never
> NULL and neither is ->bio_set, so replacing bio_clone_mddev with

In theory, ->bio_set still might be NULL in case of failed memory allocation,
please see md_run().

> raw calls to bio_clone_fast would be my preference.



Thanks,
Ming Lei

^ permalink raw reply

* Re: [PATCH 5/6] dmaengine: Add Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-06 12:01 UTC (permalink / raw)
  To: Vinod Koul
  Cc: Rob Herring, Mark Rutland, Herbert Xu, David S . Miller,
	Jassi Brar, Dan Williams, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine, Device Tree,
	Linux ARM Kernel, Linux Kernel, linux-crypto, linux-raid
In-Reply-To: <20170205060657.GV19244@localhost>

On Sun, Feb 5, 2017 at 11:36 AM, Vinod Koul <vinod.koul@intel.com> wrote:
> On Thu, Feb 02, 2017 at 10:17:15AM +0530, Anup Patel wrote:
>> +config BCM_SBA_RAID
>> +        tristate "Broadcom SBA RAID engine support"
>> +        depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
>> +        select DMA_ENGINE
>> +        select DMA_ENGINE_RAID
>> +     select ASYNC_TX_ENABLE_CHANNEL_SWITCH
>> +     default ARCH_BCM_IPROC
>
> whats with the funny alignement?

Sure, I will use tabs here.

>
>> +/* SBA command related defines */
>> +#define SBA_TYPE_SHIFT                                       48
>> +#define SBA_TYPE_MASK                                        0x3
>> +#define SBA_TYPE_A                                   0x0
>> +#define SBA_TYPE_B                                   0x2
>> +#define SBA_TYPE_C                                   0x3
>> +#define SBA_USER_DEF_SHIFT                           32
>> +#define SBA_USER_DEF_MASK                            0xffff
>> +#define SBA_R_MDATA_SHIFT                            24
>> +#define SBA_R_MDATA_MASK                             0xff
>> +#define SBA_C_MDATA_MS_SHIFT                         18
>> +#define SBA_C_MDATA_MS_MASK                          0x3
>> +#define SBA_INT_SHIFT                                        17
>> +#define SBA_INT_MASK                                 0x1
>> +#define SBA_RESP_SHIFT                                       16
>> +#define SBA_RESP_MASK                                        0x1
>> +#define SBA_C_MDATA_SHIFT                            8
>> +#define SBA_C_MDATA_MASK                             0xff
>> +#define SBA_CMD_SHIFT                                        0
>> +#define SBA_CMD_MASK                                 0xf
>> +#define SBA_CMD_ZERO_ALL_BUFFERS                     0x8
>> +#define SBA_CMD_LOAD_BUFFER                          0x9
>> +#define SBA_CMD_XOR                                  0xa
>> +#define SBA_CMD_GALOIS_XOR                           0xb
>> +#define SBA_CMD_ZERO_BUFFER                          0x4
>> +#define SBA_CMD_WRITE_BUFFER                         0xc
>
> Try using BIT and GENMAST for hardware descriptions

Sure, will do.

>
>> +
>> +/* SBA C_MDATA helper macros */
>> +#define SBA_C_MDATA_LOAD_VAL(__bnum0)                ((__bnum0) & 0x3)
>> +#define SBA_C_MDATA_WRITE_VAL(__bnum0)               ((__bnum0) & 0x3)
>> +#define SBA_C_MDATA_XOR_VAL(__bnum1, __bnum0)                        \
>> +                     ({      u32 __v = ((__bnum0) & 0x3);    \
>> +                             __v |= ((__bnum1) & 0x3) << 2;  \
>> +                             __v;                            \
>> +                     })
>> +#define SBA_C_MDATA_PQ_VAL(__dnum, __bnum1, __bnum0)         \
>> +                     ({      u32 __v = ((__bnum0) & 0x3);    \
>> +                             __v |= ((__bnum1) & 0x3) << 2;  \
>> +                             __v |= ((__dnum) & 0x1f) << 5;  \
>> +                             __v;                            \
>> +                     })
>
> ah why are we usig complex macros, why can't these be simple functions..

"static inline functions" seemed too complicated here because most of
these macros are two lines of c-code.

Do you still insist on using "static inline functions"?

>
>> +#define SBA_C_MDATA_LS(__c_mdata_val)        ((__c_mdata_val) & 0xff)
>> +#define SBA_C_MDATA_MS(__c_mdata_val)        (((__c_mdata_val) >> 8) & 0x3)
>> +
>> +/* Driver helper macros */
>> +#define to_sba_request(tx)           \
>> +     container_of(tx, struct sba_request, tx)
>> +#define to_sba_device(dchan)         \
>> +     container_of(dchan, struct sba_device, dma_chan)
>> +
>> +enum sba_request_state {
>> +     SBA_REQUEST_STATE_FREE = 1,
>> +     SBA_REQUEST_STATE_ALLOCED = 2,
>> +     SBA_REQUEST_STATE_PENDING = 3,
>> +     SBA_REQUEST_STATE_ACTIVE = 4,
>> +     SBA_REQUEST_STATE_COMPLETED = 5,
>> +     SBA_REQUEST_STATE_ABORTED = 6,
>
> whats up with a very funny indentation setting, we use 8 chars.
>
> Please re-read the Documentation/process/coding-style.rst

I have double checked this enum. The indentation is fine
and as-per coding style. Am I missing anything else?

>
>> +static int sba_alloc_chan_resources(struct dma_chan *dchan)
>> +{
>> +     /*
>> +      * We only have one channel so we have pre-alloced
>> +      * channel resources. Over here we just return number
>> +      * of free request.
>> +      */
>> +     return sba_free_request_count(to_sba_device(dchan));
>> +}
>
> essentially you are not doing much, so you can skip it. Its an optional
> call.

Sure, will do.

>
>> +static void sba_free_chan_resources(struct dma_chan *dchan)
>> +{
>> +     /*
>> +      * Channel resources are pre-alloced so we just free-up
>> +      * whatever we can so that we can re-use pre-alloced
>> +      * channel resources next time.
>> +      */
>> +     sba_cleanup_inflight_requests(to_sba_device(dchan));
>
> well this one checks for pending requests as well, which shouldn't be there
> when freeing a channel, something seems not quite right here..
>
>> +static int sba_send_mbox_request(struct sba_device *sba,
>> +                              struct sba_request *req)
>> +{
>> +     int mchans_idx, ret = 0;
>> +
>> +     /* Select mailbox channel in round-robin fashion */
>> +     mchans_idx = atomic_inc_return(&sba->mchans_current);
>> +     mchans_idx = mchans_idx % sba->mchans_count;
>> +
>> +     /* Send batch message for the request */
>> +     req->bmsg.batch.msgs_queued = 0;
>> +     ret = mbox_send_message(sba->mchans[mchans_idx], &req->bmsg);
>> +     if (ret < 0) {
>> +             dev_info(sba->dev, "channel %d message %d (total %d)",
>> +                      mchans_idx, req->bmsg.batch.msgs_queued,
>> +                      req->bmsg.batch.msgs_count);
>
> dev_err?

Sure, will use dev_err.

>
>> +             dev_err(sba->dev, "send message failed with error %d", ret);
>> +             return ret;
>> +     }
>> +     ret = req->bmsg.error;
>> +     if (ret < 0) {
>> +             dev_info(sba->dev,
>> +                      "mbox channel %d message %d (total %d)",
>> +                      mchans_idx, req->bmsg.batch.msgs_queued,
>> +                      req->bmsg.batch.msgs_count);
>
> same here

OK.

>
>> +static dma_cookie_t sba_tx_submit(struct dma_async_tx_descriptor *tx)
>> +{
>> +     unsigned long flags;
>> +     dma_cookie_t cookie;
>> +     struct sba_request *req;
>> +     struct sba_device *sba;
>> +
>> +     if (unlikely(!tx))
>> +             return -EINVAL;
>> +
>> +     sba = to_sba_device(tx->chan);
>> +     req = to_sba_request(tx);
>> +
>> +     /* Assign cookie and mark request pending */
>> +     spin_lock_irqsave(&sba->reqs_lock, flags);
>> +     cookie = dma_cookie_assign(tx);
>> +     _sba_pending_request(sba, req);
>> +     spin_unlock_irqrestore(&sba->reqs_lock, flags);
>> +
>> +     /* Try to submit pending request */
>> +     sba_issue_pending(&sba->dma_chan);
>
> Nope, thats wrong, caller needs to call .issue_pending for that

This was giving minor performance improvement but I will
remove this since its against API usage.

>
>> +static enum dma_status sba_tx_status(struct dma_chan *dchan,
>> +                                  dma_cookie_t cookie,
>> +                                  struct dma_tx_state *txstate)
>> +{
>> +     int mchan_idx;
>> +     enum dma_status ret;
>> +     struct sba_device *sba = to_sba_device(dchan);
>> +
>> +     ret = dma_cookie_status(dchan, cookie, txstate);
>> +     if (ret == DMA_COMPLETE)
>> +             return ret;
>> +
>> +     for (mchan_idx = 0; mchan_idx < sba->mchans_count; mchan_idx++)
>> +             mbox_client_peek_data(sba->mchans[mchan_idx]);
>
> what is this achieving?

The mbox_client_peek_data() is a hint to mailbox controller driver
to check for available messages.

This gives good performance improvement when some DMA client
code is polling using tx_status() callback.

>
>> +static struct dma_async_tx_descriptor *
>> +sba_prep_dma_memcpy(struct dma_chan *dchan, dma_addr_t dst, dma_addr_t src,
>> +                 size_t len, unsigned long flags)
>> +{
>> +     size_t msg_len;
>> +     dma_addr_t msg_offset = 0;
>> +     unsigned int msgs_count = 0, cmds_count, cmds_idx = 0;
>> +     struct sba_device *sba = to_sba_device(dchan);
>> +     struct sba_request *req = NULL;
>> +
>> +     /* Sanity checks */
>> +     if (unlikely(len > sba->req_size))
>> +             return NULL;
>
> why is that an error, you can create multiple txn of max length

Sure, I will extend driver to create multiple txn when
"len > req->size"

>
>> +static int sba_async_register(struct sba_device *sba)
>> +{
>> +     int ret;
>> +     struct dma_device *dma_dev = &sba->dma_dev;
>> +
>> +     /* Initialize DMA channel cookie */
>> +     sba->dma_chan.device = dma_dev;
>> +     dma_cookie_init(&sba->dma_chan);
>> +
>> +     /* Initialize DMA device capability mask */
>> +     dma_cap_zero(dma_dev->cap_mask);
>> +     dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
>> +     dma_cap_set(DMA_XOR, dma_dev->cap_mask);
>> +     dma_cap_set(DMA_PQ, dma_dev->cap_mask);
>> +
>> +     /*
>> +      * Set mailbox channel device as the base device of
>> +      * our dma_device because the actual memory accesses
>> +      * will be done by mailbox controller
>> +      */
>> +     dma_dev->dev = sba->mbox_dev;
>> +
>> +     /* Set base prep routines */
>> +     dma_dev->device_alloc_chan_resources = sba_alloc_chan_resources;
>> +     dma_dev->device_free_chan_resources = sba_free_chan_resources;
>> +     dma_dev->device_issue_pending = sba_issue_pending;
>> +     dma_dev->device_tx_status = sba_tx_status;
>
> Please add terminate callback support, also add the capabilities, we need to
> advertise that and use in clients

OK, I will add terminate callback.

>
> Also you can simplify bunch of code by using virt-chan support for managing
> channels and descriptors

OK, I will surely explore virt-chan.

Regards,
Anup

^ permalink raw reply

* Re: [PATCH 4/4] md: fast clone bio in bio_clone_mddev()
From: Christoph Hellwig @ 2017-02-06 14:24 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Shaohua Li, Jens Axboe,
	Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	NeilBrown
In-Reply-To: <CACVXFVNhB5i_5VJ4596ktLRnEM03GhqhZD-x+E_QwK1kZydTJg@mail.gmail.com>

On Mon, Feb 06, 2017 at 06:43:32PM +0800, Ming Lei wrote:
> In theory, ->bio_set still might be NULL in case of failed memory allocation,
> please see md_run().

And that's something that should be fixed.  Silently not having mempool
is very bad behavior.

^ permalink raw reply

* Re: [PATCH 5/6] dmaengine: Add Broadcom SBA RAID driver
From: Vinod Koul @ 2017-02-06 16:54 UTC (permalink / raw)
  To: Anup Patel
  Cc: Rob Herring, Mark Rutland, Herbert Xu, David S . Miller,
	Jassi Brar, Dan Williams, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine, Device Tree,
	Linux ARM Kernel, Linux Kernel, linux-crypto, linux-raid
In-Reply-To: <CAALAos9FxwKuF+qtqViybZ8APZNYt-PWLnTfJs64xtoyYB5NZQ@mail.gmail.com>

On Mon, Feb 06, 2017 at 05:31:15PM +0530, Anup Patel wrote:

> >> +
> >> +/* SBA C_MDATA helper macros */
> >> +#define SBA_C_MDATA_LOAD_VAL(__bnum0)                ((__bnum0) & 0x3)
> >> +#define SBA_C_MDATA_WRITE_VAL(__bnum0)               ((__bnum0) & 0x3)
> >> +#define SBA_C_MDATA_XOR_VAL(__bnum1, __bnum0)                        \
> >> +                     ({      u32 __v = ((__bnum0) & 0x3);    \
> >> +                             __v |= ((__bnum1) & 0x3) << 2;  \
> >> +                             __v;                            \
> >> +                     })
> >> +#define SBA_C_MDATA_PQ_VAL(__dnum, __bnum1, __bnum0)         \
> >> +                     ({      u32 __v = ((__bnum0) & 0x3);    \
> >> +                             __v |= ((__bnum1) & 0x3) << 2;  \
> >> +                             __v |= ((__dnum) & 0x1f) << 5;  \
> >> +                             __v;                            \
> >> +                     })
> >
> > ah why are we usig complex macros, why can't these be simple functions..
> 
> "static inline functions" seemed too complicated here because most of
> these macros are two lines of c-code.

and thats where I have an issue with this. Macros for simple things is fine
but not for couple of line of logic!

> 
> Do you still insist on using "static inline functions"?

Yes

> 
> >
> >> +#define SBA_C_MDATA_LS(__c_mdata_val)        ((__c_mdata_val) & 0xff)
> >> +#define SBA_C_MDATA_MS(__c_mdata_val)        (((__c_mdata_val) >> 8) & 0x3)
> >> +
> >> +/* Driver helper macros */
> >> +#define to_sba_request(tx)           \
> >> +     container_of(tx, struct sba_request, tx)
> >> +#define to_sba_device(dchan)         \
> >> +     container_of(dchan, struct sba_device, dma_chan)
> >> +
> >> +enum sba_request_state {
> >> +     SBA_REQUEST_STATE_FREE = 1,
> >> +     SBA_REQUEST_STATE_ALLOCED = 2,
> >> +     SBA_REQUEST_STATE_PENDING = 3,
> >> +     SBA_REQUEST_STATE_ACTIVE = 4,
> >> +     SBA_REQUEST_STATE_COMPLETED = 5,
> >> +     SBA_REQUEST_STATE_ABORTED = 6,
> >
> > whats up with a very funny indentation setting, we use 8 chars.
> >
> > Please re-read the Documentation/process/coding-style.rst
> 
> I have double checked this enum. The indentation is fine
> and as-per coding style. Am I missing anything else?

Somehow the initial indent doesnt seem to be 8 chars to me.

> >> +static enum dma_status sba_tx_status(struct dma_chan *dchan,
> >> +                                  dma_cookie_t cookie,
> >> +                                  struct dma_tx_state *txstate)
> >> +{
> >> +     int mchan_idx;
> >> +     enum dma_status ret;
> >> +     struct sba_device *sba = to_sba_device(dchan);
> >> +
> >> +     ret = dma_cookie_status(dchan, cookie, txstate);
> >> +     if (ret == DMA_COMPLETE)
> >> +             return ret;
> >> +
> >> +     for (mchan_idx = 0; mchan_idx < sba->mchans_count; mchan_idx++)
> >> +             mbox_client_peek_data(sba->mchans[mchan_idx]);
> >
> > what is this achieving?
> 
> The mbox_client_peek_data() is a hint to mailbox controller driver
> to check for available messages.
> 
> This gives good performance improvement when some DMA client
> code is polling using tx_status() callback.

Then why do it before and then check status.

-- 
~Vinod

^ permalink raw reply

* [PATCH V2] MD: add doc for raid5-cache
From: Shaohua Li @ 2017-02-06 19:23 UTC (permalink / raw)
  To: linux-raid
  Cc: antlists, philip, songliubraving, neilb, jure.erznoznik,
	rramesh2400

I'm starting document of the raid5-cache feature. Please note this is a
kernel doc instead of a mdadm manual, so I don't add the details about
how to use the feature in mdadm side. Please let me know what else we
should put into the document. Of course, comments are welcome!

Signed-off-by: Shaohua Li <shli@fb.com>
---
 Documentation/md/raid5-cache.txt | 109 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 Documentation/md/raid5-cache.txt

diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt
new file mode 100644
index 0000000..2b210f2
--- /dev/null
+++ b/Documentation/md/raid5-cache.txt
@@ -0,0 +1,109 @@
+RAID5 cache
+
+Raid 4/5/6 could include an extra disk for data cache besides normal RAID
+disks. The role of RAID disks isn't changed with the cache disk. The cache disk
+caches data to the RAID disks. The cache can be in write-through (supported
+since 4.4) or write-back mode (supported since 4.10). mdadm (supported since
+3.4) has a new option '--write-journal' to create array with cache. Please
+refer to mdadm manual for details. By default (RAID array starts), the cache is
+in write-through mode. A user can switch it to write-back mode by:
+
+echo "write-back" > /sys/block/md0/md/journal_mode
+
+And switch it back to write-through mode by:
+
+echo "write-through" > /sys/block/md0/md/journal_mode
+
+In both modes, all writes to the array will hit cache disk first. This means
+the cache disk must be fast and sustainable.
+
+-------------------------------------
+write-through mode:
+
+This mode mainly fixes the 'write hole' issue. For RAID 4/5/6 array, an unclean
+shutdown can cause data in some stripes to not be in consistent state, eg, data
+and parity don't match. The reason is that a stripe write involves several RAID
+disks and it's possible the writes don't hit all RAID disks yet before the
+unclean shutdown. We call an array degraded if it has inconsistent data. MD
+tries to resync the array to bring it back to normal state. But before the
+resync completes, any system crash will expose the chance of real data
+corruption in the RAID array. This problem is called 'write hole'.
+
+The write-through cache will cache all data on cache disk first. After the data
+is safe on the cache disk, the data will be flushed onto RAID disks. The
+two-step write will guarantee MD can recover correct data after unclean
+shutdown even the array is degraded. Thus the cache can close the 'write hole'.
+
+In write-through mode, MD reports IO completion to upper layer (usually
+filesystems) after the data is safe on RAID disks, so cache disk failure
+doesn't cause data loss. Of course cache disk failure means the array is
+exposed to 'write hole' again.
+
+In write-through mode, the cache disk isn't required to be big. Several
+hundreds megabytes are enough.
+
+--------------------------------------
+write-back mode:
+
+write-back mode fixes the 'write hole' issue too, since all write data is
+cached on cache disk. But the main goal of 'write-back' cache is to speed up
+write. If a write crosses all RAID disks of a stripe, we call it full-stripe
+write. For non-full-stripe writes, MD must read old data before the new parity
+can be calculated. These synchronous reads hurt write throughput. Some writes
+which are sequential but not dispatched in the same time will suffer from this
+overhead too. Write-back cache will aggregate the data and flush the data to
+RAID disks only after the data becomes a full stripe write. This will
+completely avoid the overhead, so it's very helpful for some workloads. A
+typical workload which does sequential write followed by fsync is an example.
+
+In write-back mode, MD reports IO completion to upper layer (usually
+filesystems) right after the data hits cache disk. The data is flushed to raid
+disks later after specific conditions met. So cache disk failure will cause
+data loss.
+
+In write-back mode, MD also caches data in memory. The memory cache includes
+the same data stored on cache disk, so a power loss doesn't cause data loss.
+The memory cache size has performance impact for the array. It's recommended
+the size is big. A user can configure the size by:
+
+echo "2048" > /sys/block/md0/md/stripe_cache_size
+
+Too small cache disk will make the write aggregation less efficient in this
+mode depending on the workloads. It's recommended to use a cache disk with at
+least several gigabytes size in write-back mode.
+
+--------------------------------------
+The implementation:
+
+The write-through and write-back cache use the same disk format. The cache disk
+is organized as a simple write log. The log consists of 'meta data' and 'data'
+pairs. The meta data describes the data. It also includes checksum and sequence
+ID for recovery identification. Data can be IO data and parity data. Data is
+checksumed too. The checksum is stored in the meta data ahead of the data. The
+checksum is an optimization because MD can write meta and data freely without
+worry about the order. MD superblock has a field pointed to the valid meta data
+of log head.
+
+The log implementation is pretty straightforward. The difficult part is the
+order in which MD writes data to cache disk and RAID disks. Specifically, in
+write-through mode, MD calculates parity for IO data, writes both IO data and
+parity to the log, writes the data and parity to RAID disks after the data and
+parity is settled down in log and finally the IO is finished. Read just reads
+from raid disks as usual.
+
+In write-back mode, MD writes IO data to the log and reports IO completion. The
+data is also fully cached in memory at that time, which means read must query
+memory cache. If some conditions are met, MD will flush the data to RAID disks.
+MD will calculate parity for the data and write parity into the log. After this
+is finished, MD will write both data and parity into RAID disks, then MD can
+release the memory cache. The flush conditions could be stripe becomes a full
+stripe write, free cache disk space is low or free in-kernel memory cache space
+is low.
+
+After an unclean shutdown, MD does recovery. MD reads all meta data and data
+from the log. The sequence ID and checksum will help us detect corrupted meta
+data and data. If MD finds a stripe with data and valid parities (1 parity for
+raid4/5 and 2 for raid6), MD will write the data and parities to RAID disks. If
+parities are incompleted, they are discarded. If part of data is corrupted,
+they are discarded too. MD then loads valid data and writes them to RAID disks
+in normal way.
-- 
2.9.3


^ permalink raw reply related

* Re: [PATCH] md: ensure md devices are freed before module is unloaded.
From: Shaohua Li @ 2017-02-06 20:03 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, Guoqing Jiang, linux-raid
In-Reply-To: <87r33cvy58.fsf@notabene.neil.brown.name>

On Mon, Feb 06, 2017 at 01:41:39PM +1100, Neil Brown wrote:
> 
> 
> Commit: cbd199837750 ("md: Fix unfortunate interaction with evms")
> change mddev_put() so that it would not destroy an md device while
> ->ctime was non-zero.
> 
> Unfortunately, we didn't make sure to clear ->ctime when unloading
> the module, so it is possible for an md device to remain after
> module unload.  An attempt to open such a device will trigger
> an invalid memory reference in:
>   get_gendisk -> kobj_lookup -> exact_lock -> get_disk
> 
> when tring to access disk->fops, which was in the module that has
> been removed.
> 
> So ensure we clear ->ctime in md_exit(), and explain how that is useful,
> as it isn't immediately obvious when looking at the code.
> 
> Fixes: cbd199837750 ("md: Fix unfortunate interaction with evms")
> Tested-by: Guoqing Jiang <gqjiang@suse.com>
> Signed-off-by: NeilBrown <neilb@suse.com>
Applied, thanks!

> ---
>  drivers/md/md.c | 7 +++++++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 01175dac0db6..8926fb781cdc 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -8980,7 +8980,14 @@ static __exit void md_exit(void)
>  
>  	for_each_mddev(mddev, tmp) {
>  		export_array(mddev);
> +		mddev->ctime = 0;
>  		mddev->hold_active = 0;
> +		/* for_each_mddev() will call mddev_put() at the
> +		 * end of each iteration.  As the mddev is now
> +		 * fully clear, this will schedule the mddev for destruction
> +		 * by a workqueue, and the destroy_workqueue() below
> +		 * will wait for that to complete.
> +		 */
>  	}
>  	destroy_workqueue(md_misc_wq);
>  	destroy_workqueue(md_wq);
> -- 
> 2.11.0
> 



^ permalink raw reply

* Re: [PATCH V2] MD: add doc for raid5-cache
From: Anthony Youngman @ 2017-02-06 21:02 UTC (permalink / raw)
  To: Shaohua Li, linux-raid
  Cc: philip, songliubraving, neilb, jure.erznoznik, rramesh2400
In-Reply-To: <cac53388fd5903a006792c0165063d63eb66079d.1486408891.git.shli@fb.com>

On 06/02/17 19:23, Shaohua Li wrote:
> I'm starting document of the raid5-cache feature. Please note this is a
> kernel doc instead of a mdadm manual, so I don't add the details about
> how to use the feature in mdadm side. Please let me know what else we
> should put into the document. Of course, comments are welcome!
>
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  Documentation/md/raid5-cache.txt | 109 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 109 insertions(+)
>  create mode 100644 Documentation/md/raid5-cache.txt

Note that the kernel documentation is moving over to a new format - 
.rst. They're using a new system called Sphinx. There was an article on 
lwn about it. I've lost the reference to lwn, but if you look at 
https://www.kernel.org/doc/html/latest/ you will see that it is a match 
to Documentation/index.rst.

So of course, converting all the md stuff has made its way onto my "to 
do" list, but I don't want to start until I've got a converted kernel 
running on my system. I'm currently running 4.4.6 and I think it came 
out with something like 4.6. Problem is, running gentoo, my system is 
well out of date because I need a helper system to get me to upgrade and 
going from KDE4 to KDE5 is likely to give me grief :-)

You're probably working with an up-to-date kernel, but it's up to you - 
do you want to create a .rst file, or submit it as .txt and let me 
convert it and go through the grief of working out how to do it :-). 
Conversion is probably rather more than just converting the one file, 
it'll need converting the directory structure too, I expect. Any 
problems, Jon Corbet's probably our go-to guy.

Cheers,
Wol

^ permalink raw reply

* Re: [md PATCH 0/5] Stop using bi_phys_segments as a counter
From: Shaohua Li @ 2017-02-06 21:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: NeilBrown, Konstantin Khlebnikov, linux-raid
In-Reply-To: <20170206085624.GA22409@lst.de>

On Mon, Feb 06, 2017 at 09:56:24AM +0100, Christoph Hellwig wrote:
> Did anything happen to this series?  Having it would be extremely
> useful for the block layer moving forward.

So I'd prefer allocating extra memory for the counter instead of playing tricky
games. I'll post some patches soon.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH 5/6] dmaengine: Add Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-07  6:02 UTC (permalink / raw)
  To: Vinod Koul
  Cc: Rob Herring, Mark Rutland, Herbert Xu, David S . Miller,
	Jassi Brar, Dan Williams, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine, Device Tree,
	Linux ARM Kernel, Linux Kernel, linux-crypto, linux-raid
In-Reply-To: <20170206165400.GD19244@localhost>

On Mon, Feb 6, 2017 at 10:24 PM, Vinod Koul <vinod.koul@intel.com> wrote:
> On Mon, Feb 06, 2017 at 05:31:15PM +0530, Anup Patel wrote:
>
>> >> +
>> >> +/* SBA C_MDATA helper macros */
>> >> +#define SBA_C_MDATA_LOAD_VAL(__bnum0)                ((__bnum0) & 0x3)
>> >> +#define SBA_C_MDATA_WRITE_VAL(__bnum0)               ((__bnum0) & 0x3)
>> >> +#define SBA_C_MDATA_XOR_VAL(__bnum1, __bnum0)                        \
>> >> +                     ({      u32 __v = ((__bnum0) & 0x3);    \
>> >> +                             __v |= ((__bnum1) & 0x3) << 2;  \
>> >> +                             __v;                            \
>> >> +                     })
>> >> +#define SBA_C_MDATA_PQ_VAL(__dnum, __bnum1, __bnum0)         \
>> >> +                     ({      u32 __v = ((__bnum0) & 0x3);    \
>> >> +                             __v |= ((__bnum1) & 0x3) << 2;  \
>> >> +                             __v |= ((__dnum) & 0x1f) << 5;  \
>> >> +                             __v;                            \
>> >> +                     })
>> >
>> > ah why are we usig complex macros, why can't these be simple functions..
>>
>> "static inline functions" seemed too complicated here because most of
>> these macros are two lines of c-code.
>
> and thats where I have an issue with this. Macros for simple things is fine
> but not for couple of line of logic!
>
>>
>> Do you still insist on using "static inline functions"?
>
> Yes

Sure, will use "static inline functions" instead these macros.

>
>>
>> >
>> >> +#define SBA_C_MDATA_LS(__c_mdata_val)        ((__c_mdata_val) & 0xff)
>> >> +#define SBA_C_MDATA_MS(__c_mdata_val)        (((__c_mdata_val) >> 8) & 0x3)
>> >> +
>> >> +/* Driver helper macros */
>> >> +#define to_sba_request(tx)           \
>> >> +     container_of(tx, struct sba_request, tx)
>> >> +#define to_sba_device(dchan)         \
>> >> +     container_of(dchan, struct sba_device, dma_chan)
>> >> +
>> >> +enum sba_request_state {
>> >> +     SBA_REQUEST_STATE_FREE = 1,
>> >> +     SBA_REQUEST_STATE_ALLOCED = 2,
>> >> +     SBA_REQUEST_STATE_PENDING = 3,
>> >> +     SBA_REQUEST_STATE_ACTIVE = 4,
>> >> +     SBA_REQUEST_STATE_COMPLETED = 5,
>> >> +     SBA_REQUEST_STATE_ABORTED = 6,
>> >
>> > whats up with a very funny indentation setting, we use 8 chars.
>> >
>> > Please re-read the Documentation/process/coding-style.rst
>>
>> I have double checked this enum. The indentation is fine
>> and as-per coding style. Am I missing anything else?
>
> Somehow the initial indent doesnt seem to be 8 chars to me.
>
>> >> +static enum dma_status sba_tx_status(struct dma_chan *dchan,
>> >> +                                  dma_cookie_t cookie,
>> >> +                                  struct dma_tx_state *txstate)
>> >> +{
>> >> +     int mchan_idx;
>> >> +     enum dma_status ret;
>> >> +     struct sba_device *sba = to_sba_device(dchan);
>> >> +
>> >> +     ret = dma_cookie_status(dchan, cookie, txstate);
>> >> +     if (ret == DMA_COMPLETE)
>> >> +             return ret;
>> >> +
>> >> +     for (mchan_idx = 0; mchan_idx < sba->mchans_count; mchan_idx++)
>> >> +             mbox_client_peek_data(sba->mchans[mchan_idx]);
>> >
>> > what is this achieving?
>>
>> The mbox_client_peek_data() is a hint to mailbox controller driver
>> to check for available messages.
>>
>> This gives good performance improvement when some DMA client
>> code is polling using tx_status() callback.
>
> Then why do it before and then check status.

If there was a work completed when mbox_client_peek_data()
is called then sba_receive_message() will be called immediately
by mailbox controller driver.

We are doing dma_cookie_complete() in sba_receive_message()
so if mbox_client_peek_data() is called before dma_cookie_status()
then dma_cookie_status() will see correct state of cookie.

Also, I explored virt-dma APIs for BCM-SBA-RAID driver. The virt-dma
implements tasklet based bottom-half for each virt-dma-channel. This
bottom-half is not required for BCM-FS4-RAID driver because its a
mailbox client driver and the mailbox controller driver already implements
bottom-half for each mailbox channel.

If we still go ahead and use virt-dma in BCM-FS4-RAID driver then we
will have two bottom-halfs in-action one in mailbox controller driver and
another in BCM-FS4-RAID driver which in-turn will add bottom-half
scheduling overhead thereby reducing performance of BCM-FS4-RAID
driver.

Regards,
Anup

^ permalink raw reply

* [PATCH v2 0/5] Broadcom SBA RAID support
From: Anup Patel @ 2017-02-07  8:16 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel

The Broadcom SBA RAID is a stream-based device which provides
RAID5/6 offload.

It requires a SoC specific ring manager (such as Broadcom FlexRM
ring manager) to provide ring-based programming interface. Due to
this, the Broadcom SBA RAID driver (mailbox client) implements
DMA device having one DMA channel using a set of mailbox channels
provided by Broadcom SoC specific ring manager driver (mailbox
controller).

Important limitations of Broadcom SBA RAID hardware are:
1. Requires disk position instead of disk coefficient 
2. Supports only 30 PQ disk coefficients

To address limitation #1, we have added raid_gflog table which
will help driver convert disk coefficient to disk position. To
address limitation #2, we have extended Linux Async Tx APIs to
check for available PQ coefficients before doing PQ offload.

This patchset is based on Linux-4.10-rc2 and depends on patchset
"[PATCH v4 0/2] Broadcom FlexRM ring manager support"

It is also available at sba-raid-v2 branch of
https://github.com/Broadcom/arm64-linux.git

Changes since v1:
 - Droped patch to add mbox_channel_device() API
 - Used GENMASK and BIT macros wherever possible in bcm-sba-raid driver
 - Replaced C_MDATA macros with static inline functions in
   bcm-sba-raid driver
 - Removed sba_alloc_chan_resources() callback in bcm-sba-raid driver
 - Used dev_err() instead of dev_info() wherever applicable
 - Removed call to sba_issue_pending() from sba_tx_submit() in
   bcm-sba-raid driver
 - Implemented SBA request chaning for handling (len > sba->req_size)
   in bcm-sba-raid driver
 - Implemented device_terminate_all() callback in bcm-sba-raid driver

Anup Patel (5):
  lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position
  async_tx: Handle DMA devices having support for fewer PQ coefficients
  async_tx: Fix DMA_PREP_FENCE usage in do_async_gen_syndrome()
  dmaengine: Add Broadcom SBA RAID driver
  dt-bindings: Add DT bindings document for Broadcom SBA RAID driver

 .../devicetree/bindings/dma/brcm,iproc-sba.txt     |   29 +
 crypto/async_tx/async_pq.c                         |    8 +-
 crypto/async_tx/async_raid6_recov.c                |   12 +-
 drivers/dma/Kconfig                                |   13 +
 drivers/dma/Makefile                               |    1 +
 drivers/dma/bcm-sba-raid.c                         | 1470 ++++++++++++++++++++
 include/linux/dmaengine.h                          |   19 +
 include/linux/raid/pq.h                            |    4 +
 lib/raid6/mktables.c                               |   20 +
 9 files changed, 1571 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt
 create mode 100644 drivers/dma/bcm-sba-raid.c

-- 
2.7.4

^ permalink raw reply

* [PATCH v2 1/5] lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position
From: Anup Patel @ 2017-02-07  8:16 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1486455406-11202-1-git-send-email-anup.patel@broadcom.com>

The raid6_gfexp table represents {2}^n values for 0 <= n < 256. The
Linux async_tx framework pass values from raid6_gfexp as coefficients
for each source to prep_dma_pq() callback of DMA channel with PQ
capability. This creates problem for RAID6 offload engines (such as
Broadcom SBA) which take disk position (i.e. log of {2}) instead of
multiplicative cofficients from raid6_gfexp table.

This patch adds raid6_gflog table having log-of-2 value for any given
x such that 0 <= x < 256. For any given disk coefficient x, the
corresponding disk position is given by raid6_gflog[x]. The RAID6
offload engine driver can use this newly added raid6_gflog table to
get disk position from multiplicative coefficient.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
---
 include/linux/raid/pq.h |  1 +
 lib/raid6/mktables.c    | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 4d57bba..30f9453 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -142,6 +142,7 @@ int raid6_select_algo(void);
 extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
 extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
 extern const u8 raid6_gfexp[256]      __attribute__((aligned(256)));
+extern const u8 raid6_gflog[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfinv[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfexi[256]      __attribute__((aligned(256)));
 
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index 39787db..e824d08 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -125,6 +125,26 @@ int main(int argc, char *argv[])
 	printf("EXPORT_SYMBOL(raid6_gfexp);\n");
 	printf("#endif\n");
 
+	/* Compute log-of-2 table */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gflog[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++) {
+			v = 255;
+			for (k = 0; k < 256; k++)
+				if (exptbl[k] == (i + j)) {
+					v = k;
+					break;
+				}
+			printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+		}
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gflog);\n");
+	printf("#endif\n");
+
 	/* Compute inverse table x^-1 == x^254 */
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
 	       "raid6_gfinv[256] =\n" "{\n");
-- 
2.7.4


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox