mdadm raid6 sequential read slower than reading from userspace

All of lore.kernel.org
 help / color / mirror / Atom feed

* mdadm raid6 sequential read slower than reading from userspace
@ 2017-02-03 22:24 Stevie Trujillo
  0 siblings, 0 replies; only message in thread
From: Stevie Trujillo @ 2017-02-03 22:24 UTC (permalink / raw)
  To: linux-raid, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1798 bytes --]

Hello

Kernel:     Linux version 4.9.0-1-amd64 (debian-kernel@lists.debian.org)
            (gcc version 6.3.0 20161229 (Debian 6.3.0-2) ) #1 SMP Debian
            4.9.2-2 (2017-01-12)
CPU:        2xE5-2665
Memory:     256GB
Drives:     6x8TB Seagate
Controller: LSI2008
md0 : active raid6 sdb1[1] sda1[0] sdd1[3] sde1[4] sdc1[2] sdf1[5]
      31255576576 blocks super 1.2 level 6, 512k chunk, algorithm 2
[6/6] [UUUUUU] bitmap: 0/59 pages [0KB], 65536KB chunk

When I read sequentially from one of the disks I get 230-245MB/s. If I
read from all of them at the same time, the performance stays the same
(even if I bind all the dd processes to the same core).
Conclusion: I think the controller is not a bottleneck.

I first tried Debian8 with 3.16 and got 400-500MB/s when dd-ing
from /dev/md0. Upgrading to Debian9 with 4.9.2 roughly doubled my
performance:
53687091200 bytes (54 GB, 50 GiB) copied, 62.0078 s, 866 MB/s
53687091200 bytes (54 GB, 50 GiB) copied, 57.9882 s, 926 MB/s

dd uses 40% cpu and I can't find any process that uses more, so I don't
think I'm limited by CPU.

I wrote a small program that reads directly from the disks and outputs
the same data as reading from md0 would do. It's faster and has
more stable runtime than reading from md0: it finishes in 44.0 +-
0.2seconds (that is ~1150MB/s).

Is it possible to make mdadm work faster? I was hoping it could read
6x240MB/s, but maybe that's not possible. At least I think it should be
able to do 1150MB/s like userspace?
How can I find out what bottleneck? I couldn't see anything obvious
like 100% cpu usage.
I tried copying different tuning instructions I found on Google, but
they usually made negative impact if any.

I attached the program, but I'm still learning programming so it's not
very good.

--
Stevie Trujillo

[-- Attachment #2: raid6read.cc --]
[-- Type: text/x-c++src, Size: 8079 bytes --]

#include <vector>
#include <queue>
#include <thread>
#include <mutex>
#include <condition_variable>
using namespace std;
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <inttypes.h>
#include <sys/poll.h>
#include <scsi/sg.h>
#include <assert.h>
#include <err.h>

#define READ_16 0x88
#define NUM_DISKS 6
#define MAX_READAHEAD 16
#define CHUNK_SIZE (512*1024)

namespace {

struct BufferStorage
{
	int ref_count;
	unsigned char *memory;
};

struct Buffer
{
	BufferStorage *storage;
	unsigned char *buf;
};

struct Request
{
	uint64_t output_idx;
	uint64_t chunk;
};

struct Response
{
	uint64_t output_idx;
	Buffer buffer;
};

struct PendingIO
{
	BufferStorage *storage;
	uint64_t output_idx[4];
};

struct Disk
{
	int sg_fd;
	PendingIO pending_io[MAX_READAHEAD];
	unsigned char slot_i;
	unsigned char slots[MAX_READAHEAD];
	int current_request;
	vector<Request> requests;
};

struct Raid6
{
	Disk disks[6];
};

struct ThreadData
{
	mutex m;
	condition_variable cv;
	uint64_t last_idx;
	vector<Response> responses;
};

static void
sg_read(int sg_fd, void *buf, int pack_id, uint64_t lba, uint64_t len_lba)
{
	uint64_t len_bytes = 512 * len_lba;

	unsigned char cdb[16] = {};
	cdb[0] = READ_16;
	cdb[2] = (lba >> 56) & 0xff;
	cdb[3] = (lba >> 48) & 0xff;
	cdb[4] = (lba >> 40) & 0xff;
	cdb[5] = (lba >> 32) & 0xff;
	cdb[6] = (lba >> 24) & 0xff;
	cdb[7] = (lba >> 16) & 0xff;
	cdb[8] = (lba >> 8) & 0xff;
	cdb[9] = (lba >> 0) & 0xff;
	cdb[10] = (len_lba >> 24) & 0xff;
	cdb[11] = (len_lba >> 16) & 0xff;
	cdb[12] = (len_lba >> 8) & 0xff;
	cdb[13] = (len_lba >> 0) & 0xff;

	sg_io_hdr_t io_hdr;
	memset(&io_hdr, '\0', sizeof(io_hdr));
	io_hdr.interface_id = 'S'; /* SCSI Generic Interface */
	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	io_hdr.cmd_len = sizeof(cdb);
	io_hdr.cmdp = cdb;
	io_hdr.dxfer_len = len_bytes;
	io_hdr.dxferp = buf;
	io_hdr.timeout = 20000;
	io_hdr.pack_id = pack_id;

	if (write(sg_fd, &io_hdr, sizeof(io_hdr)) != sizeof(io_hdr))
		err(1, "write");
}

void queue_requests(Disk *disk)
{
	while ((size_t) disk->current_request < disk->requests.size() && disk->slot_i < MAX_READAHEAD) {
		int batch_requests = 1;
		for (; batch_requests < 4 && (size_t) disk->current_request + batch_requests < disk->requests.size(); ++batch_requests) {
			if (disk->requests[disk->current_request].chunk + batch_requests
			 != disk->requests[disk->current_request + batch_requests].chunk
			)
				break;
		}

		unsigned char slot = disk->slots[disk->slot_i++];
		struct PendingIO *pending = &disk->pending_io[slot];
		for (int i = 0; i < 4; ++i)
			pending->output_idx[i] = ~(uint64_t) 0;

		for (int i = 0; i < batch_requests; ++i)
			pending->output_idx[i] = disk->requests[disk->current_request + i].output_idx;

		uint64_t len_bytes = batch_requests * CHUNK_SIZE;
		pending->storage = new BufferStorage;
		pending->storage->ref_count = 0;
		void *buf;
		posix_memalign(&buf, 0x1000, len_bytes);
		pending->storage->memory = (unsigned char *) buf;

		const Request &r = disk->requests[disk->current_request];
		sg_read(disk->sg_fd, pending->storage->memory, slot, r.chunk * CHUNK_SIZE / 512, batch_requests * CHUNK_SIZE / 512);

		disk->current_request += batch_requests;
	}
}

void read_response(vector<Response> &responses, Disk *disk)
{
	sg_io_hdr_t io_hdr;
	memset(&io_hdr, '\0', sizeof(io_hdr));
	io_hdr.interface_id = 'S'; /* SCSI Generic Interface */
	io_hdr.pack_id = -1;

	if (read(disk->sg_fd, &io_hdr, sizeof(io_hdr)) != sizeof(io_hdr))
		err(1, "read");

	assert(io_hdr.pack_id >= 0 && io_hdr.pack_id < MAX_READAHEAD);
	PendingIO *pending = &disk->pending_io[io_hdr.pack_id];

	for (int j = 0; j < 4; ++j) {
		if (pending->output_idx[j] == ~(uint64_t) 0)
			break;

		Buffer buffer;
		buffer.storage = pending->storage;
		buffer.storage->ref_count += 1;
		buffer.buf = buffer.storage->memory + CHUNK_SIZE * j;
		responses.push_back(Response{pending->output_idx[j], buffer});
	}

	disk->slots[--disk->slot_i] = io_hdr.pack_id;
}

/* write the data we read to stdout in correct order */
void writer_function(ThreadData *td)
{
	auto cmp = [](const Response &a, const Response &b) { return a.output_idx > b.output_idx; };
	priority_queue<Response, vector<Response>, decltype(cmp)> responses(cmp);
	uint64_t current_idx = 0;

	while (current_idx < td->last_idx) {
		if (responses.empty() || responses.top().output_idx != current_idx) {
			vector<Response> tmp;
			{
				unique_lock<mutex> lk(td->m);
				td->cv.wait(lk, [=](){ return !td->responses.empty(); });
				tmp = move(td->responses);
			}

			for (const Response &r : tmp)
				responses.push(r);

			continue;
		}

		Response r = responses.top();
		responses.pop();
		unsigned char *buf = r.buffer.buf;
		size_t size = CHUNK_SIZE;

		while (size) {
			ssize_t bytes_written = write(1, buf, size);
			if (bytes_written < 0)
				err(1, "write");

			buf += bytes_written;
			size -= bytes_written;
		}

		if (--r.buffer.storage->ref_count == 0) {
			free(r.buffer.storage->memory);
			delete r.buffer.storage;
		}

		++current_idx;
	}
}

/* run all the disks from the same thread */
void run_sg_poll(Raid6 *raid, ThreadData *writer_td)
{
	for (;;) {
		struct pollfd pfds[NUM_DISKS];
		int nfds = 0;

		for (int i = 0; i < NUM_DISKS; ++i) {
			Disk *disk = &raid->disks[i];
			if (disk->sg_fd < 0)
				continue;

			queue_requests(disk);
			if ((size_t) disk->current_request == disk->requests.size() && disk->slot_i == 0) {
				close(disk->sg_fd);
				disk->sg_fd = -1;
				continue;
			}

			pfds[nfds++] = (struct pollfd) { raid->disks[i].sg_fd, POLLIN, 0 };
		}

		if (!nfds)
			break;

		int ret = poll(pfds, nfds, -1);
		if (ret <= 0)
			err(1, "poll");

		vector<Response> responses;
		for (int i = 0; i < NUM_DISKS; ++i) {
			Disk *disk = &raid->disks[i];
			if (disk->sg_fd >= 0)
				read_response(responses, disk);
		}

		{
			unique_lock<mutex> lk(writer_td->m);
			writer_td->responses.insert(writer_td->responses.end(), responses.begin(), responses.end());
		}
		writer_td->cv.notify_one();
	}
}

/* run each disk from one thread */
void run_sg_single(Raid6 *raid, Disk *disk, ThreadData *writer_td)
{
	for (;;) {
		queue_requests(disk);
		if ((size_t) disk->current_request == disk->requests.size() && disk->slot_i == 0) {
			close(disk->sg_fd);
			disk->sg_fd = -1;
			break;
		}

		vector<Response> responses;
		read_response(responses, disk);

		{
			unique_lock<mutex> lk(writer_td->m);
			writer_td->responses.insert(writer_td->responses.end(), responses.begin(), responses.end());
		}
		writer_td->cv.notify_one();
	}
}

}

int main(int argc, char **argv)
{
	if (argc != 1 + NUM_DISKS)
		errx(1, "usage: disks");

	Raid6 raid;

	for (int i = 0; i < NUM_DISKS; ++i) {
		const char *path = argv[1 + i];
		Disk *disk = &raid.disks[i];

		disk->sg_fd = open(path, O_RDWR);
		if (disk->sg_fd < 0)
			err(1, "open(%s)", path);

		disk->current_request = 0;
		disk->slot_i = 0;
		for (int i = 0; i < MAX_READAHEAD; ++i)
			disk->slots[i] = i;
	}

	uint64_t num_chunks = 102400; // 50TB

	/* precompute all the chunks we want the disks to read */
	uint64_t output_idx = 0;
	for (uint64_t chunk = 0; chunk < num_chunks; ++chunk) {
		uint64_t data_offset = 2048 * 512 / (512*1024) /* from partitioning */
		                     + 256*1024*512 / (512*1024); /* from mdadm --examine */
		/*
		 * stripe0: bcde|fa
		 * stripe1: abcd|ef
		 * stripe2: fabc|de
		 */
		int64_t stripe = chunk / 4;
		uint64_t slot = chunk % 4;
		int64_t disk_idx = 1 - stripe + slot;
		disk_idx %= 6;
		disk_idx = disk_idx + (disk_idx >> 63 & 6);
		raid.disks[disk_idx].requests.push_back(Request{output_idx++, data_offset + stripe});
	}

	ThreadData writer_td;
	writer_td.last_idx = output_idx;

	thread writer_thread(writer_function, &writer_td);

	if (0) {
		run_sg_poll(&raid, &writer_td);
	} else {
		thread threads[6];

		for (int i = 0; i < NUM_DISKS; ++i)
			threads[i] = move(thread(run_sg_single, &raid, &raid.disks[i], &writer_td));

		for (int i = 0; i < NUM_DISKS; ++i)
			threads[i].join();
	}

	writer_thread.join();
	return 0;
}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2017-02-03 22:24 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-02-03 22:24 mdadm raid6 sequential read slower than reading from userspace Stevie Trujillo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.