All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/9] dm-crypt patches
@ 2014-04-05 18:04 Mikulas Patocka
  2014-04-05 18:04 ` [PATCH 1/9] dm-crypt: remove per-cpu structure Mikulas Patocka
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Mikulas Patocka @ 2014-04-05 18:04 UTC (permalink / raw)
  To: Mike Snitzer, Ondrej Kozina; +Cc: dm-devel

Hi

I fixed the bug that Ondrej found (it happened only under memory pressure, 
that's why I couldn't reproduce it), here I'm resending the dm-crypt 
patches.

Mikulas

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/9] dm-crypt: remove per-cpu structure
  2014-04-05 18:04 [PATCH 0/9] dm-crypt patches Mikulas Patocka
@ 2014-04-05 18:04 ` Mikulas Patocka
  2014-04-05 18:05   ` [PATCH 2/9] bio: use kmalloc alignment for bio slab Mikulas Patocka
  2014-04-07  1:01 ` [PATCH 0/9] dm-crypt patches Mike Snitzer
  2014-04-08 15:25 ` Ondrej Kozina
  2 siblings, 1 reply; 15+ messages in thread
From: Mikulas Patocka @ 2014-04-05 18:04 UTC (permalink / raw)
  To: Mike Snitzer, Ondrej Kozina; +Cc: dm-devel

Dm-crypt used per-cpu structures to hold pointers to ablkcipher_request.
The code assumed that the work item keeps executing on a single CPU, so it
used no synchronization when accessing this structure.

When we disable a CPU by writing zero to
/sys/devices/system/cpu/cpu*/online, the work item could be moved to
another CPU. This causes crashes in dm-crypt because the code starts using
a wrong ablkcipher_request.

This patch fixes this bug by removing the percpu definition. The structure
ablkcipher_request is accessed via a pointer from convert_context.
Consequently, if the work item is rescheduled to a different CPU, the
thread still uses the same ablkcipher_request.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org

---
 drivers/md/dm-crypt.c |   61 +++++++++-----------------------------------------
 1 file changed, 12 insertions(+), 49 deletions(-)

Index: linux-3.14-rc1/drivers/md/dm-crypt.c
===================================================================
--- linux-3.14-rc1.orig/drivers/md/dm-crypt.c	2014-02-03 19:18:23.000000000 +0100
+++ linux-3.14-rc1/drivers/md/dm-crypt.c	2014-02-03 19:21:35.000000000 +0100
@@ -19,7 +19,6 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <linux/backing-dev.h>
-#include <linux/percpu.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
@@ -43,6 +42,7 @@ struct convert_context {
 	struct bvec_iter iter_out;
 	sector_t cc_sector;
 	atomic_t cc_pending;
+	struct ablkcipher_request *req;
 };
 
 /*
@@ -111,15 +111,7 @@ struct iv_tcw_private {
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
 
 /*
- * Duplicated per-CPU state for cipher.
- */
-struct crypt_cpu {
-	struct ablkcipher_request *req;
-};
-
-/*
- * The fields in here must be read only after initialization,
- * changing state should be in crypt_cpu.
+ * The fields in here must be read only after initialization.
  */
 struct crypt_config {
 	struct dm_dev *dev;
@@ -150,12 +142,6 @@ struct crypt_config {
 	sector_t iv_offset;
 	unsigned int iv_size;
 
-	/*
-	 * Duplicated per cpu state. Access through
-	 * per_cpu_ptr() only.
-	 */
-	struct crypt_cpu __percpu *cpu;
-
 	/* ESSIV: struct crypto_cipher *essiv_tfm */
 	void *iv_private;
 	struct crypto_ablkcipher **tfms;
@@ -192,11 +178,6 @@ static void clone_init(struct dm_crypt_i
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
 
-static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
-{
-	return this_cpu_ptr(cc->cpu);
-}
-
 /*
  * Use this to access cipher attributes that are the same for each CPU.
  */
@@ -903,16 +884,15 @@ static void kcryptd_async_done(struct cr
 static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
 
-	if (!this_cc->req)
-		this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+	if (!ctx->req)
+		ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
-	ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
-	ablkcipher_request_set_callback(this_cc->req,
+	ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+	ablkcipher_request_set_callback(ctx->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-	    kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
+	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
 /*
@@ -921,7 +901,6 @@ static void crypt_alloc_req(struct crypt
 static int crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	int r;
 
 	atomic_set(&ctx->cc_pending, 1);
@@ -932,7 +911,7 @@ static int crypt_convert(struct crypt_co
 
 		atomic_inc(&ctx->cc_pending);
 
-		r = crypt_convert_block(cc, ctx, this_cc->req);
+		r = crypt_convert_block(cc, ctx, ctx->req);
 
 		switch (r) {
 		/* async */
@@ -941,7 +920,7 @@ static int crypt_convert(struct crypt_co
 			reinit_completion(&ctx->restart);
 			/* fall through*/
 		case -EINPROGRESS:
-			this_cc->req = NULL;
+			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
 
@@ -1040,6 +1019,7 @@ static struct dm_crypt_io *crypt_io_allo
 	io->sector = sector;
 	io->error = 0;
 	io->base_io = NULL;
+	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
 
 	return io;
@@ -1065,6 +1045,8 @@ static void crypt_dec_pending(struct dm_
 	if (!atomic_dec_and_test(&io->io_pending))
 		return;
 
+	if (io->ctx.req)
+		mempool_free(io->ctx.req, cc->req_pool);
 	mempool_free(io, cc->io_pool);
 
 	if (likely(!base_io))
@@ -1492,8 +1474,6 @@ static int crypt_wipe_key(struct crypt_c
 static void crypt_dtr(struct dm_target *ti)
 {
 	struct crypt_config *cc = ti->private;
-	struct crypt_cpu *cpu_cc;
-	int cpu;
 
 	ti->private = NULL;
 
@@ -1505,13 +1485,6 @@ static void crypt_dtr(struct dm_target *
 	if (cc->crypt_queue)
 		destroy_workqueue(cc->crypt_queue);
 
-	if (cc->cpu)
-		for_each_possible_cpu(cpu) {
-			cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-			if (cpu_cc->req)
-				mempool_free(cpu_cc->req, cc->req_pool);
-		}
-
 	crypt_free_tfms(cc);
 
 	if (cc->bs)
@@ -1530,9 +1503,6 @@ static void crypt_dtr(struct dm_target *
 	if (cc->dev)
 		dm_put_device(ti, cc->dev);
 
-	if (cc->cpu)
-		free_percpu(cc->cpu);
-
 	kzfree(cc->cipher);
 	kzfree(cc->cipher_string);
 
@@ -1588,13 +1558,6 @@ static int crypt_ctr_cipher(struct dm_ta
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");
 
-	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
-				 __alignof__(struct crypt_cpu));
-	if (!cc->cpu) {
-		ti->error = "Cannot allocate per cpu state";
-		goto bad_mem;
-	}
-
 	/*
 	 * For compatibility with the original dm-crypt mapping format, if
 	 * only the cipher name is supplied, use cbc-plain.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 2/9] bio: use kmalloc alignment for bio slab
  2014-04-05 18:04 ` [PATCH 1/9] dm-crypt: remove per-cpu structure Mikulas Patocka
@ 2014-04-05 18:05   ` Mikulas Patocka
  2014-04-05 18:05     ` [PATCH 3/9] dm-crypt: use per-bio data Mikulas Patocka
  0 siblings, 1 reply; 15+ messages in thread
From: Mikulas Patocka @ 2014-04-05 18:05 UTC (permalink / raw)
  To: Mike Snitzer, Ondrej Kozina; +Cc: dm-devel

Various subsystems can ask the bio subsystem to create a bio slab cache
with some free space before the bio. This free space can be used for any
purpose. Device mapper uses this feature to place some target-specific and
device-mapper specific data before the bio, so that the target-specific
data doesn't have to be allocated separatedly.

This mechanism is used in place of kmalloc, so we need that the allocated
slab have the same memory alignment as memory allocated with kmalloc.

This patch changes the function bio_find_or_create_slab so that it uses
ARCH_KMALLOC_MINALIGN alignment when creating the slab cache. This patch
is needed so that dm-crypt can use per-bio data for encryption - the
crypto subsystem assumes that these data have the same alignment as
kmallocated memory.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 fs/bio.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Index: linux-3.14-rc3/fs/bio.c
===================================================================
--- linux-3.14-rc3.orig/fs/bio.c	2014-02-23 23:53:50.000000000 +0100
+++ linux-3.14-rc3/fs/bio.c	2014-02-23 23:55:00.000000000 +0100
@@ -112,7 +112,8 @@ static struct kmem_cache *bio_find_or_cr
 	bslab = &bio_slabs[entry];
 
 	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
-	slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
+	slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
+				 SLAB_HWCACHE_ALIGN, NULL);
 	if (!slab)
 		goto out_unlock;
 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 3/9] dm-crypt: use per-bio data
  2014-04-05 18:05   ` [PATCH 2/9] bio: use kmalloc alignment for bio slab Mikulas Patocka
@ 2014-04-05 18:05     ` Mikulas Patocka
  2014-04-05 18:06       ` [PATCH 4/9] dm-crypt: use unbound workqueue for request processing Mikulas Patocka
  0 siblings, 1 reply; 15+ messages in thread
From: Mikulas Patocka @ 2014-04-05 18:05 UTC (permalink / raw)
  To: Mike Snitzer, Ondrej Kozina; +Cc: dm-devel

This patch changes dm-crypt so that it uses auxiliary data allocated with
the bio.

Dm-crypt requires two allocations per request - struct dm_crypt_io and
struct ablkcipher_request (with other data appended to it). It used
mempool for the allocation.

Some requests may require more dm_crypt_ios and ablkcipher_requests,
however most requests need just one of each of these two structures to
complete.

This patch changes it so that the first dm_crypt_io and ablkcipher_request
and allocated with the bio (using target per_bio_data_size option). If the
request needs additional values, they are allocated from the mempool.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-crypt.c |   40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

Index: linux-3.14-rc4/drivers/md/dm-crypt.c
===================================================================
--- linux-3.14-rc4.orig/drivers/md/dm-crypt.c	2014-02-27 17:48:31.000000000 +0100
+++ linux-3.14-rc4/drivers/md/dm-crypt.c	2014-02-27 17:48:31.000000000 +0100
@@ -59,7 +59,7 @@ struct dm_crypt_io {
 	int error;
 	sector_t sector;
 	struct dm_crypt_io *base_io;
-};
+} CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
 	struct convert_context *ctx;
@@ -162,6 +162,8 @@ struct crypt_config {
 	 */
 	unsigned int dmreq_start;
 
+	unsigned int per_bio_data_size;
+
 	unsigned long flags;
 	unsigned int key_size;
 	unsigned int key_parts;      /* independent parts in key buffer */
@@ -895,6 +897,14 @@ static void crypt_alloc_req(struct crypt
 	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
+static void crypt_free_req(struct crypt_config *cc,
+			   struct ablkcipher_request *req, struct bio *base_bio)
+{
+	struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+	if ((struct ablkcipher_request *)(io + 1) != req)
+		mempool_free(req, cc->req_pool);
+}
+
 /*
  * Encrypt / decrypt data from one bio to another one (can be the same one)
  */
@@ -1008,12 +1018,9 @@ static void crypt_free_buffer_pages(stru
 	}
 }
 
-static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
-					  struct bio *bio, sector_t sector)
+static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
+			  struct bio *bio, sector_t sector)
 {
-	struct dm_crypt_io *io;
-
-	io = mempool_alloc(cc->io_pool, GFP_NOIO);
 	io->cc = cc;
 	io->base_bio = bio;
 	io->sector = sector;
@@ -1021,8 +1028,6 @@ static struct dm_crypt_io *crypt_io_allo
 	io->base_io = NULL;
 	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
-
-	return io;
 }
 
 static void crypt_inc_pending(struct dm_crypt_io *io)
@@ -1046,8 +1051,9 @@ static void crypt_dec_pending(struct dm_
 		return;
 
 	if (io->ctx.req)
-		mempool_free(io->ctx.req, cc->req_pool);
-	mempool_free(io, cc->io_pool);
+		crypt_free_req(cc, io->ctx.req, base_bio);
+	if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
+		mempool_free(io, cc->io_pool);
 
 	if (likely(!base_io))
 		bio_endio(base_bio, error);
@@ -1255,8 +1261,8 @@ static void kcryptd_crypt_write_convert(
 		 * between fragments, so switch to a new dm_crypt_io structure.
 		 */
 		if (unlikely(!crypt_finished && remaining)) {
-			new_io = crypt_io_alloc(io->cc, io->base_bio,
-						sector);
+			new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
+			crypt_io_init(new_io, io->cc, io->base_bio, sector);
 			crypt_inc_pending(new_io);
 			crypt_convert_init(cc, &new_io->ctx, NULL,
 					   io->base_bio, sector);
@@ -1325,7 +1331,7 @@ static void kcryptd_async_done(struct cr
 	if (error < 0)
 		io->error = -EIO;
 
-	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
+	crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
 	if (!atomic_dec_and_test(&ctx->cc_pending))
 		return;
@@ -1728,6 +1734,10 @@ static int crypt_ctr(struct dm_target *t
 		goto bad;
 	}
 
+	cc->per_bio_data_size = ti->per_bio_data_size =
+				sizeof(struct dm_crypt_io) + cc->dmreq_start +
+				sizeof(struct dm_crypt_request) + cc->iv_size;
+
 	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
@@ -1824,7 +1834,9 @@ static int crypt_map(struct dm_target *t
 		return DM_MAPIO_REMAPPED;
 	}
 
-	io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+	io = dm_per_bio_data(bio, cc->per_bio_data_size);
+	crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+	io->ctx.req = (struct ablkcipher_request *)(io + 1);
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 4/9] dm-crypt: use unbound workqueue for request processing
  2014-04-05 18:05     ` [PATCH 3/9] dm-crypt: use per-bio data Mikulas Patocka
@ 2014-04-05 18:06       ` Mikulas Patocka
  2014-04-05 18:07         ` [PATCH 5/9] dm-crypt: don't allocate pages for a partial request Mikulas Patocka
  0 siblings, 1 reply; 15+ messages in thread
From: Mikulas Patocka @ 2014-04-05 18:06 UTC (permalink / raw)
  To: Mike Snitzer, Ondrej Kozina; +Cc: dm-devel

Use unbound workqueue so that work is automatically ballanced between
available CPUs.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-crypt.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-3.14-rc8/drivers/md/dm-crypt.c
===================================================================
--- linux-3.14-rc8.orig/drivers/md/dm-crypt.c	2014-03-25 22:57:00.000000000 +0100
+++ linux-3.14-rc8/drivers/md/dm-crypt.c	2014-03-28 17:09:14.000000000 +0100
@@ -1800,7 +1800,7 @@ static int crypt_ctr(struct dm_target *t
 	}
 
 	cc->crypt_queue = alloc_workqueue("kcryptd",
-					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";
 		goto bad;

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 5/9] dm-crypt: don't allocate pages for a partial request
  2014-04-05 18:06       ` [PATCH 4/9] dm-crypt: use unbound workqueue for request processing Mikulas Patocka
@ 2014-04-05 18:07         ` Mikulas Patocka
  2014-04-05 18:07           ` [PATCH 6/9] dm-crypt: avoid deadlock in mempools Mikulas Patocka
  0 siblings, 1 reply; 15+ messages in thread
From: Mikulas Patocka @ 2014-04-05 18:07 UTC (permalink / raw)
  To: Mike Snitzer, Ondrej Kozina; +Cc: dm-devel

This patch changes crypt_alloc_buffer so that it always allocates pages for
a full request.

This change enables further simplification and removing of one refcounts
in the next patches.

Note: the next patch is needed to fix a theoretical deadlock

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-crypt.c |  139 ++++++++++----------------------------------------
 1 file changed, 30 insertions(+), 109 deletions(-)

Index: linux-3.14/drivers/md/dm-crypt.c
===================================================================
--- linux-3.14.orig/drivers/md/dm-crypt.c	2014-04-04 20:48:50.000000000 +0200
+++ linux-3.14/drivers/md/dm-crypt.c	2014-04-04 20:57:36.000000000 +0200
@@ -58,7 +58,6 @@ struct dm_crypt_io {
 	atomic_t io_pending;
 	int error;
 	sector_t sector;
-	struct dm_crypt_io *base_io;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -172,7 +171,6 @@ struct crypt_config {
 };
 
 #define MIN_IOS        16
-#define MIN_POOL_PAGES 32
 
 static struct kmem_cache *_crypt_io_pool;
 
@@ -951,14 +949,13 @@ static int crypt_convert(struct crypt_co
 	return 0;
 }
 
+static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
+
 /*
  * Generate a new unfragmented bio with the given size
  * This should never violate the device limitations
- * May return a smaller bio when running out of pages, indicated by
- * *out_of_pages set to 1.
  */
-static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
-				      unsigned *out_of_pages)
+static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
@@ -966,41 +963,27 @@ static struct bio *crypt_alloc_buffer(st
 	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
 	unsigned i, len;
 	struct page *page;
+	struct bio_vec *bvec;
 
 	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
 	if (!clone)
 		return NULL;
 
 	clone_init(io, clone);
-	*out_of_pages = 0;
 
 	for (i = 0; i < nr_iovecs; i++) {
 		page = mempool_alloc(cc->page_pool, gfp_mask);
-		if (!page) {
-			*out_of_pages = 1;
-			break;
-		}
-
-		/*
-		 * If additional pages cannot be allocated without waiting,
-		 * return a partially-allocated bio.  The caller will then try
-		 * to allocate more bios while submitting this partial bio.
-		 */
-		gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
 
 		len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
 
-		if (!bio_add_page(clone, page, len, 0)) {
-			mempool_free(page, cc->page_pool);
-			break;
-		}
+		bvec = &clone->bi_io_vec[clone->bi_vcnt++];
+		bvec->bv_page = page;
+		bvec->bv_len = len;
+		bvec->bv_offset = 0;
 
-		size -= len;
-	}
+		clone->bi_iter.bi_size += len;
 
-	if (!clone->bi_iter.bi_size) {
-		bio_put(clone);
-		return NULL;
+		size -= len;
 	}
 
 	return clone;
@@ -1025,7 +1008,6 @@ static void crypt_io_init(struct dm_cryp
 	io->base_bio = bio;
 	io->sector = sector;
 	io->error = 0;
-	io->base_io = NULL;
 	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
 }
@@ -1038,13 +1020,11 @@ static void crypt_inc_pending(struct dm_
 /*
  * One of the bios was finished. Check for completion of
  * the whole request and correctly clean up the buffer.
- * If base_io is set, wait for the last fragment to complete.
  */
 static void crypt_dec_pending(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *base_bio = io->base_bio;
-	struct dm_crypt_io *base_io = io->base_io;
 	int error = io->error;
 
 	if (!atomic_dec_and_test(&io->io_pending))
@@ -1055,13 +1035,7 @@ static void crypt_dec_pending(struct dm_
 	if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
 		mempool_free(io, cc->io_pool);
 
-	if (likely(!base_io))
-		bio_endio(base_bio, error);
-	else {
-		if (error && !base_io->error)
-			base_io->error = error;
-		crypt_dec_pending(base_io);
-	}
+	bio_endio(base_bio, error);
 }
 
 /*
@@ -1197,10 +1171,7 @@ static void kcryptd_crypt_write_convert(
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
-	struct dm_crypt_io *new_io;
 	int crypt_finished;
-	unsigned out_of_pages = 0;
-	unsigned remaining = io->base_bio->bi_iter.bi_size;
 	sector_t sector = io->sector;
 	int r;
 
@@ -1210,80 +1181,30 @@ static void kcryptd_crypt_write_convert(
 	crypt_inc_pending(io);
 	crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);
 
-	/*
-	 * The allocated buffers can be smaller than the whole bio,
-	 * so repeat the whole process until all the data can be handled.
-	 */
-	while (remaining) {
-		clone = crypt_alloc_buffer(io, remaining, &out_of_pages);
-		if (unlikely(!clone)) {
-			io->error = -ENOMEM;
-			break;
-		}
-
-		io->ctx.bio_out = clone;
-		io->ctx.iter_out = clone->bi_iter;
-
-		remaining -= clone->bi_iter.bi_size;
-		sector += bio_sectors(clone);
-
-		crypt_inc_pending(io);
-
-		r = crypt_convert(cc, &io->ctx);
-		if (r < 0)
-			io->error = -EIO;
-
-		crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
-
-		/* Encryption was already finished, submit io now */
-		if (crypt_finished) {
-			kcryptd_crypt_write_io_submit(io, 0);
-
-			/*
-			 * If there was an error, do not try next fragments.
-			 * For async, error is processed in async handler.
-			 */
-			if (unlikely(r < 0))
-				break;
+	clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
+	if (unlikely(!clone)) {
+		io->error = -EIO;
+		goto dec;
+	}
 
-			io->sector = sector;
-		}
+	io->ctx.bio_out = clone;
+	io->ctx.iter_out = clone->bi_iter;
 
-		/*
-		 * Out of memory -> run queues
-		 * But don't wait if split was due to the io size restriction
-		 */
-		if (unlikely(out_of_pages))
-			congestion_wait(BLK_RW_ASYNC, HZ/100);
+	sector += bio_sectors(clone);
 
-		/*
-		 * With async crypto it is unsafe to share the crypto context
-		 * between fragments, so switch to a new dm_crypt_io structure.
-		 */
-		if (unlikely(!crypt_finished && remaining)) {
-			new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
-			crypt_io_init(new_io, io->cc, io->base_bio, sector);
-			crypt_inc_pending(new_io);
-			crypt_convert_init(cc, &new_io->ctx, NULL,
-					   io->base_bio, sector);
-			new_io->ctx.iter_in = io->ctx.iter_in;
-
-			/*
-			 * Fragments after the first use the base_io
-			 * pending count.
-			 */
-			if (!io->base_io)
-				new_io->base_io = io;
-			else {
-				new_io->base_io = io->base_io;
-				crypt_inc_pending(io->base_io);
-				crypt_dec_pending(io);
-			}
+	crypt_inc_pending(io);
+	r = crypt_convert(cc, &io->ctx);
+	if (r)
+		io->error = -EIO;
+	crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
 
-			io = new_io;
-		}
+	/* Encryption was already finished, submit io now */
+	if (crypt_finished) {
+		kcryptd_crypt_write_io_submit(io, 0);
+		io->sector = sector;
 	}
 
+dec:
 	crypt_dec_pending(io);
 }
 
@@ -1738,7 +1659,7 @@ static int crypt_ctr(struct dm_target *t
 				sizeof(struct dm_crypt_io) + cc->dmreq_start +
 				sizeof(struct dm_crypt_request) + cc->iv_size;
 
-	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+	cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
 		goto bad;

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 6/9] dm-crypt: avoid deadlock in mempools
  2014-04-05 18:07         ` [PATCH 5/9] dm-crypt: don't allocate pages for a partial request Mikulas Patocka
@ 2014-04-05 18:07           ` Mikulas Patocka
  2014-04-05 18:08             ` [PATCH 7/9] dm-crypt: remove io_pool Mikulas Patocka
  0 siblings, 1 reply; 15+ messages in thread
From: Mikulas Patocka @ 2014-04-05 18:07 UTC (permalink / raw)
  To: Mike Snitzer, Ondrej Kozina; +Cc: dm-devel

This patch fixes a theoretical deadlock introduced in the previous patch.

The function crypt_alloc_buffer may be called concurrently. If we allocate
from the mempool concurrently, there is a possibility of deadlock.
For example, if we have mempool of 256 pages, two processes, each wanting 256,
pages allocate from the mempool concurrently, it may deadlock in a situation
where both processes have allocated 128 pages and the mempool is exhausted.

In order to avoid this scenarios, we allocate the pages under a mutex.

In order to not degrade performance with excessive locking, we try
non-blocking allocations without a mutex first and if it fails, we fallback
to a blocking allocation with a mutex.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-crypt.c |   41 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

Index: linux-3.14/drivers/md/dm-crypt.c
===================================================================
--- linux-3.14.orig/drivers/md/dm-crypt.c	2014-04-04 20:59:46.000000000 +0200
+++ linux-3.14/drivers/md/dm-crypt.c	2014-04-04 21:04:40.000000000 +0200
@@ -124,6 +124,7 @@ struct crypt_config {
 	mempool_t *req_pool;
 	mempool_t *page_pool;
 	struct bio_set *bs;
+	struct mutex bio_alloc_lock;
 
 	struct workqueue_struct *io_queue;
 	struct workqueue_struct *crypt_queue;
@@ -954,27 +955,51 @@ static void crypt_free_buffer_pages(stru
 /*
  * Generate a new unfragmented bio with the given size
  * This should never violate the device limitations
+ *
+ * This function may be called concurrently. If we allocate from the mempool
+ * concurrently, there is a possibility of deadlock. For example, if we have
+ * mempool of 256 pages, two processes, each wanting 256, pages allocate from
+ * the mempool concurrently, it may deadlock in a situation where both processes
+ * have allocated 128 pages and the mempool is exhausted.
+ *
+ * In order to avoid this scenarios, we allocate the pages under a mutex.
+ *
+ * In order to not degrade performance with excessive locking, we try
+ * non-blocking allocations without a mutex first and if it fails, we fallback
+ * to a blocking allocation with a mutex.
  */
 static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
-	unsigned i, len;
+	gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
+	unsigned i, len, remaining_size;
 	struct page *page;
 	struct bio_vec *bvec;
 
+retry:
+	if (unlikely(gfp_mask & __GFP_WAIT))
+		mutex_lock(&cc->bio_alloc_lock);
+
 	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
 	if (!clone)
-		return NULL;
+		goto return_clone;
 
 	clone_init(io, clone);
 
+	remaining_size = size;
+
 	for (i = 0; i < nr_iovecs; i++) {
 		page = mempool_alloc(cc->page_pool, gfp_mask);
+		if (!page) {
+			crypt_free_buffer_pages(cc, clone);
+			bio_put(clone);
+			gfp_mask |= __GFP_WAIT;
+			goto retry;
+		}
 
-		len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
+		len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
 
 		bvec = &clone->bi_io_vec[clone->bi_vcnt++];
 		bvec->bv_page = page;
@@ -983,9 +1008,13 @@ static struct bio *crypt_alloc_buffer(st
 
 		clone->bi_iter.bi_size += len;
 
-		size -= len;
+		remaining_size -= len;
 	}
 
+return_clone:
+	if (unlikely(gfp_mask & __GFP_WAIT))
+		mutex_unlock(&cc->bio_alloc_lock);
+
 	return clone;
 }
 
@@ -1671,6 +1700,8 @@ static int crypt_ctr(struct dm_target *t
 		goto bad;
 	}
 
+	mutex_init(&cc->bio_alloc_lock);
+
 	ret = -EINVAL;
 	if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid iv_offset sector";

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 7/9] dm-crypt: remove io_pool
  2014-04-05 18:07           ` [PATCH 6/9] dm-crypt: avoid deadlock in mempools Mikulas Patocka
@ 2014-04-05 18:08             ` Mikulas Patocka
  2014-04-05 18:08               ` [PATCH 8/9] dm-crypt: offload writes to thread Mikulas Patocka
  0 siblings, 1 reply; 15+ messages in thread
From: Mikulas Patocka @ 2014-04-05 18:08 UTC (permalink / raw)
  To: Mike Snitzer, Ondrej Kozina; +Cc: dm-devel

Remove io_pool and _crypt_io_pool because they are unused.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-crypt.c |   21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

Index: linux-3.14/drivers/md/dm-crypt.c
===================================================================
--- linux-3.14.orig/drivers/md/dm-crypt.c	2014-04-04 21:04:40.000000000 +0200
+++ linux-3.14/drivers/md/dm-crypt.c	2014-04-04 21:05:40.000000000 +0200
@@ -120,7 +120,6 @@ struct crypt_config {
 	 * pool for per bio private data, crypto requests and
 	 * encryption requeusts/buffer pages
 	 */
-	mempool_t *io_pool;
 	mempool_t *req_pool;
 	mempool_t *page_pool;
 	struct bio_set *bs;
@@ -173,8 +172,6 @@ struct crypt_config {
 
 #define MIN_IOS        16
 
-static struct kmem_cache *_crypt_io_pool;
-
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
@@ -1061,8 +1058,6 @@ static void crypt_dec_pending(struct dm_
 
 	if (io->ctx.req)
 		crypt_free_req(cc, io->ctx.req, base_bio);
-	if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
-		mempool_free(io, cc->io_pool);
 
 	bio_endio(base_bio, error);
 }
@@ -1450,8 +1445,6 @@ static void crypt_dtr(struct dm_target *
 		mempool_destroy(cc->page_pool);
 	if (cc->req_pool)
 		mempool_destroy(cc->req_pool);
-	if (cc->io_pool)
-		mempool_destroy(cc->io_pool);
 
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
@@ -1664,19 +1657,13 @@ static int crypt_ctr(struct dm_target *t
 	if (ret < 0)
 		goto bad;
 
-	ret = -ENOMEM;
-	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
-	if (!cc->io_pool) {
-		ti->error = "Cannot allocate crypt io mempool";
-		goto bad;
-	}
-
 	cc->dmreq_start = sizeof(struct ablkcipher_request);
 	cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
 	cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
 	cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
 			   ~(crypto_tfm_ctx_alignment() - 1);
 
+	ret = -ENOMEM;
 	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
 			sizeof(struct dm_crypt_request) + cc->iv_size);
 	if (!cc->req_pool) {
@@ -1938,14 +1925,9 @@ static int __init dm_crypt_init(void)
 {
 	int r;
 
-	_crypt_io_pool = KMEM_CACHE(dm_crypt_io, 0);
-	if (!_crypt_io_pool)
-		return -ENOMEM;
-
 	r = dm_register_target(&crypt_target);
 	if (r < 0) {
 		DMERR("register failed %d", r);
-		kmem_cache_destroy(_crypt_io_pool);
 	}
 
 	return r;
@@ -1954,7 +1936,6 @@ static int __init dm_crypt_init(void)
 static void __exit dm_crypt_exit(void)
 {
 	dm_unregister_target(&crypt_target);
-	kmem_cache_destroy(_crypt_io_pool);
 }
 
 module_init(dm_crypt_init);

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 8/9] dm-crypt: offload writes to thread
  2014-04-05 18:08             ` [PATCH 7/9] dm-crypt: remove io_pool Mikulas Patocka
@ 2014-04-05 18:08               ` Mikulas Patocka
  2014-04-05 18:09                 ` [PATCH 9/9] dm-crypt: sort writes Mikulas Patocka
  0 siblings, 1 reply; 15+ messages in thread
From: Mikulas Patocka @ 2014-04-05 18:08 UTC (permalink / raw)
  To: Mike Snitzer, Ondrej Kozina; +Cc: dm-devel

Submitting write bios directly in the encryption thread caused serious
performance degradation. On multiprocessor machine encryption requests
finish in a different order than they were submitted in. Consequently, write
requests would be submitted in a different order and it could cause severe
performance degradation.

This patch moves submitting write requests to a separate thread so that
the requests can be sorted before submitting.

Sorting is implemented in the next patch.

Note: it is required that a previous patch "dm-crypt: don't allocate pages
for a partial request." is applied before applying this patch. Without
that, this patch could introduce a crash.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-crypt.c |  120 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 97 insertions(+), 23 deletions(-)

Index: linux-3.14/drivers/md/dm-crypt.c
===================================================================
--- linux-3.14.orig/drivers/md/dm-crypt.c	2014-04-04 21:05:40.000000000 +0200
+++ linux-3.14/drivers/md/dm-crypt.c	2014-04-04 21:06:22.000000000 +0200
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/backing-dev.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
@@ -58,6 +59,8 @@ struct dm_crypt_io {
 	atomic_t io_pending;
 	int error;
 	sector_t sector;
+
+	struct list_head list;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -128,6 +131,10 @@ struct crypt_config {
 	struct workqueue_struct *io_queue;
 	struct workqueue_struct *crypt_queue;
 
+	struct task_struct *write_thread;
+	wait_queue_head_t write_thread_wait;
+	struct list_head write_thread_list;
+
 	char *cipher;
 	char *cipher_string;
 
@@ -1141,37 +1148,89 @@ static int kcryptd_io_read(struct dm_cry
 	return 0;
 }
 
+static void kcryptd_io_read_work(struct work_struct *work)
+{
+	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+
+	crypt_inc_pending(io);
+	if (kcryptd_io_read(io, GFP_NOIO))
+		io->error = -ENOMEM;
+	crypt_dec_pending(io);
+}
+
+static void kcryptd_queue_read(struct dm_crypt_io *io)
+{
+	struct crypt_config *cc = io->cc;
+
+	INIT_WORK(&io->work, kcryptd_io_read_work);
+	queue_work(cc->io_queue, &io->work);
+}
+
 static void kcryptd_io_write(struct dm_crypt_io *io)
 {
 	struct bio *clone = io->ctx.bio_out;
+
 	generic_make_request(clone);
 }
 
-static void kcryptd_io(struct work_struct *work)
+static int dmcrypt_write(void *data)
 {
-	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+	struct crypt_config *cc = data;
+	while (1) {
+		struct list_head local_list;
+		struct blk_plug plug;
 
-	if (bio_data_dir(io->base_bio) == READ) {
-		crypt_inc_pending(io);
-		if (kcryptd_io_read(io, GFP_NOIO))
-			io->error = -ENOMEM;
-		crypt_dec_pending(io);
-	} else
-		kcryptd_io_write(io);
-}
+		DECLARE_WAITQUEUE(wait, current);
 
-static void kcryptd_queue_io(struct dm_crypt_io *io)
-{
-	struct crypt_config *cc = io->cc;
+		spin_lock_irq(&cc->write_thread_wait.lock);
+continue_locked:
 
-	INIT_WORK(&io->work, kcryptd_io);
-	queue_work(cc->io_queue, &io->work);
+		if (!list_empty(&cc->write_thread_list))
+			goto pop_from_list;
+
+		__set_current_state(TASK_INTERRUPTIBLE);
+		__add_wait_queue(&cc->write_thread_wait, &wait);
+
+		spin_unlock_irq(&cc->write_thread_wait.lock);
+
+		if (unlikely(kthread_should_stop())) {
+			set_task_state(current, TASK_RUNNING);
+			remove_wait_queue(&cc->write_thread_wait, &wait);
+			break;
+		}
+
+		schedule();
+
+		set_task_state(current, TASK_RUNNING);
+		spin_lock_irq(&cc->write_thread_wait.lock);
+		__remove_wait_queue(&cc->write_thread_wait, &wait);
+		goto continue_locked;
+
+pop_from_list:
+		local_list = cc->write_thread_list;
+		local_list.next->prev = &local_list;
+		local_list.prev->next = &local_list;
+		INIT_LIST_HEAD(&cc->write_thread_list);
+
+		spin_unlock_irq(&cc->write_thread_wait.lock);
+
+		blk_start_plug(&plug);
+		do {
+			struct dm_crypt_io *io = container_of(local_list.next,
+						struct dm_crypt_io, list);
+			list_del(&io->list);
+			kcryptd_io_write(io);
+		} while (!list_empty(&local_list));
+		blk_finish_plug(&plug);
+	}
+	return 0;
 }
 
-static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
+static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io)
 {
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->cc;
+	unsigned long flags;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1185,10 +1244,10 @@ static void kcryptd_crypt_write_io_submi
 
 	clone->bi_iter.bi_sector = cc->start + io->sector;
 
-	if (async)
-		kcryptd_queue_io(io);
-	else
-		generic_make_request(clone);
+	spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
+	list_add_tail(&io->list, &cc->write_thread_list);
+	wake_up_locked(&cc->write_thread_wait);
+	spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
 }
 
 static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
@@ -1224,7 +1283,7 @@ static void kcryptd_crypt_write_convert(
 
 	/* Encryption was already finished, submit io now */
 	if (crypt_finished) {
-		kcryptd_crypt_write_io_submit(io, 0);
+		kcryptd_crypt_write_io_submit(io);
 		io->sector = sector;
 	}
 
@@ -1284,7 +1343,7 @@ static void kcryptd_async_done(struct cr
 	if (bio_data_dir(io->base_bio) == READ)
 		kcryptd_crypt_read_done(io);
 	else
-		kcryptd_crypt_write_io_submit(io, 1);
+		kcryptd_crypt_write_io_submit(io);
 }
 
 static void kcryptd_crypt(struct work_struct *work)
@@ -1431,6 +1490,9 @@ static void crypt_dtr(struct dm_target *
 	if (!cc)
 		return;
 
+	if (cc->write_thread)
+		kthread_stop(cc->write_thread);
+
 	if (cc->io_queue)
 		destroy_workqueue(cc->io_queue);
 	if (cc->crypt_queue)
@@ -1745,6 +1807,18 @@ static int crypt_ctr(struct dm_target *t
 		goto bad;
 	}
 
+	init_waitqueue_head(&cc->write_thread_wait);
+	INIT_LIST_HEAD(&cc->write_thread_list);
+
+	cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
+	if (IS_ERR(cc->write_thread)) {
+		ret = PTR_ERR(cc->write_thread);
+		cc->write_thread = NULL;
+		ti->error = "Couldn't spawn write thread";
+		goto bad;
+	}
+	wake_up_process(cc->write_thread);
+
 	ti->num_flush_bios = 1;
 	ti->discard_zeroes_data_unsupported = true;
 
@@ -1779,7 +1853,7 @@ static int crypt_map(struct dm_target *t
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))
-			kcryptd_queue_io(io);
+			kcryptd_queue_read(io);
 	} else
 		kcryptd_queue_crypt(io);
 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 9/9] dm-crypt: sort writes
  2014-04-05 18:08               ` [PATCH 8/9] dm-crypt: offload writes to thread Mikulas Patocka
@ 2014-04-05 18:09                 ` Mikulas Patocka
  0 siblings, 0 replies; 15+ messages in thread
From: Mikulas Patocka @ 2014-04-05 18:09 UTC (permalink / raw)
  To: Mike Snitzer, Ondrej Kozina; +Cc: dm-devel

Write requests are sorted in a red-black tree structure and are submitted
in the sorted order.

In theory the sorting should be performed by the underlying disk scheduler,
however, in practice the disk scheduler accepts and sorts only 128 requests.
In order to sort more requests, we need to implement our own sorting.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-crypt.c |   50 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 15 deletions(-)

Index: linux-3.14/drivers/md/dm-crypt.c
===================================================================
--- linux-3.14.orig/drivers/md/dm-crypt.c	2014-04-04 21:06:22.000000000 +0200
+++ linux-3.14/drivers/md/dm-crypt.c	2014-04-04 21:06:55.000000000 +0200
@@ -22,6 +22,7 @@
 #include <linux/backing-dev.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
+#include <linux/rbtree.h>
 #include <asm/page.h>
 #include <asm/unaligned.h>
 #include <crypto/hash.h>
@@ -60,7 +61,7 @@ struct dm_crypt_io {
 	int error;
 	sector_t sector;
 
-	struct list_head list;
+	struct rb_node rb_node;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -133,7 +134,7 @@ struct crypt_config {
 
 	struct task_struct *write_thread;
 	wait_queue_head_t write_thread_wait;
-	struct list_head write_thread_list;
+	struct rb_root write_tree;
 
 	char *cipher;
 	char *cipher_string;
@@ -1177,7 +1178,7 @@ static int dmcrypt_write(void *data)
 {
 	struct crypt_config *cc = data;
 	while (1) {
-		struct list_head local_list;
+		struct rb_root write_tree;
 		struct blk_plug plug;
 
 		DECLARE_WAITQUEUE(wait, current);
@@ -1185,7 +1186,7 @@ static int dmcrypt_write(void *data)
 		spin_lock_irq(&cc->write_thread_wait.lock);
 continue_locked:
 
-		if (!list_empty(&cc->write_thread_list))
+		if (!RB_EMPTY_ROOT(&cc->write_tree))
 			goto pop_from_list;
 
 		__set_current_state(TASK_INTERRUPTIBLE);
@@ -1207,20 +1208,23 @@ continue_locked:
 		goto continue_locked;
 
 pop_from_list:
-		local_list = cc->write_thread_list;
-		local_list.next->prev = &local_list;
-		local_list.prev->next = &local_list;
-		INIT_LIST_HEAD(&cc->write_thread_list);
-
+		write_tree = cc->write_tree;
+		cc->write_tree = RB_ROOT;
 		spin_unlock_irq(&cc->write_thread_wait.lock);
 
+		BUG_ON(rb_parent(write_tree.rb_node));
+
+		/*
+		 * Note: we cannot walk the tree here with rb_next because
+		 * the structures may be freed when kcryptd_io_write is called.
+		 */
 		blk_start_plug(&plug);
 		do {
-			struct dm_crypt_io *io = container_of(local_list.next,
-						struct dm_crypt_io, list);
-			list_del(&io->list);
+			struct dm_crypt_io *io = rb_entry(rb_first(&write_tree),
+						struct dm_crypt_io, rb_node);
+			rb_erase(&io->rb_node, &write_tree);
 			kcryptd_io_write(io);
-		} while (!list_empty(&local_list));
+		} while (!RB_EMPTY_ROOT(&write_tree));
 		blk_finish_plug(&plug);
 	}
 	return 0;
@@ -1231,6 +1235,8 @@ static void kcryptd_crypt_write_io_submi
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->cc;
 	unsigned long flags;
+	sector_t sector;
+	struct rb_node **p, *parent;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1245,7 +1251,21 @@ static void kcryptd_crypt_write_io_submi
 	clone->bi_iter.bi_sector = cc->start + io->sector;
 
 	spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
-	list_add_tail(&io->list, &cc->write_thread_list);
+	p = &cc->write_tree.rb_node;
+	parent = NULL;
+	sector = io->sector;
+	while (*p) {
+		parent = *p;
+#define io_node rb_entry(parent, struct dm_crypt_io, rb_node)
+		if (sector < io_node->sector)
+			p = &io_node->rb_node.rb_left;
+		else
+			p = &io_node->rb_node.rb_right;
+#undef io_node
+	}
+	rb_link_node(&io->rb_node, parent, p);
+	rb_insert_color(&io->rb_node, &cc->write_tree);
+
 	wake_up_locked(&cc->write_thread_wait);
 	spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
 }
@@ -1808,7 +1828,7 @@ static int crypt_ctr(struct dm_target *t
 	}
 
 	init_waitqueue_head(&cc->write_thread_wait);
-	INIT_LIST_HEAD(&cc->write_thread_list);
+	cc->write_tree = RB_ROOT;
 
 	cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
 	if (IS_ERR(cc->write_thread)) {

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/9] dm-crypt patches
  2014-04-05 18:04 [PATCH 0/9] dm-crypt patches Mikulas Patocka
  2014-04-05 18:04 ` [PATCH 1/9] dm-crypt: remove per-cpu structure Mikulas Patocka
@ 2014-04-07  1:01 ` Mike Snitzer
  2014-04-08 15:25 ` Ondrej Kozina
  2 siblings, 0 replies; 15+ messages in thread
From: Mike Snitzer @ 2014-04-07  1:01 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Ondrej Kozina, Mike Snitzer, dm-devel

On Sat, Apr 05 2014 at  2:04pm -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> Hi
> 
> I fixed the bug that Ondrej found (it happened only under memory pressure, 
> that's why I couldn't reproduce it), here I'm resending the dm-crypt 
> patches.

I already staged your original patchset in linux-dm.git's 'for-next'
branch (linux-next).  So your re-post of these patches threw away patch
header edits, my Signed-off-by;s and also my cleanu-up of the rb-tree
code in patch 9.

Anyway, I folded the following fix from your new patch series into patch
5 (I saw no reason why 'remaining_size' couldn't be applied in patch 5
rather than patch 6).  All rebased patches are now staged in linux-next
(Ondrej please test, if we can verify performance I'm still open to
sending to Linus for 3.15, but must be by Thursday at the latest with no
further changes to this code):

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index c74c779..bbc1043 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -979,8 +979,9 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
 	struct bio *clone;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
-	unsigned i, len;
+	unsigned i, len, remaining_size;
 	struct page *page;
+	struct bio_vec *bvec;
 
 retry:
 	if (unlikely(gfp_mask & __GFP_WAIT))
@@ -992,6 +993,8 @@ retry:
 
 	clone_init(io, clone);
 
+	remaining_size = size;
+
 	for (i = 0; i < nr_iovecs; i++) {
 		page = mempool_alloc(cc->page_pool, gfp_mask);
 		if (!page) {
@@ -1001,18 +1004,16 @@ retry:
 			goto retry;
 		}
 
-		len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
+		len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
 
-		if (!bio_add_page(clone, page, len, 0)) {
-			DMERR("bio_add_page failed for page %d: the underlying device has stricter limits than dm-crypt target", i);
-			mempool_free(page, cc->page_pool);
-			crypt_free_buffer_pages(cc, clone);
-			bio_put(clone);
-			clone = NULL;
-			goto return_clone;
-		}
+		bvec = &clone->bi_io_vec[clone->bi_vcnt++];
+		bvec->bv_page = page;
+		bvec->bv_len = len;
+		bvec->bv_offset = 0;
+
+		clone->bi_iter.bi_size += len;
 
-		size -= len;
+		remaining_size -= len;
 	}
 
 return_clone:


Also note, I also folded this into patch 6 (BUG_ON conveys intent that
mempool_alloc should _never_ fail once __GFP_WAIT is set in
gfp_mask.. if it did we'd deadlock trying to re-acquire the mutex,
rather than BUG_ON but...):

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 56784ed..53cbfdf 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -967,10 +967,10 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
  * the mempool concurrently, it may deadlock in a situation where both processes
  * have allocated 128 pages and the mempool is exhausted.
  *
- * In order to avoid this scenarios, we allocate the pages under a mutex.
+ * In order to avoid this scenario we allocate the pages under a mutex.
  *
  * In order to not degrade performance with excessive locking, we try
- * non-blocking allocations without a mutex first and if it fails, we fallback
+ * non-blocking allocation without a mutex first and if it fails, we fallback
  * to a blocking allocation with a mutex.
  */
 static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
@@ -998,6 +998,7 @@ retry:
 	for (i = 0; i < nr_iovecs; i++) {
 		page = mempool_alloc(cc->page_pool, gfp_mask);
 		if (!page) {
+			BUG_ON(gfp_mask & __GFP_WAIT);
 			crypt_free_buffer_pages(cc, clone);
 			bio_put(clone);
 			gfp_mask |= __GFP_WAIT;

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/9] dm-crypt patches
  2014-04-05 18:04 [PATCH 0/9] dm-crypt patches Mikulas Patocka
  2014-04-05 18:04 ` [PATCH 1/9] dm-crypt: remove per-cpu structure Mikulas Patocka
  2014-04-07  1:01 ` [PATCH 0/9] dm-crypt patches Mike Snitzer
@ 2014-04-08 15:25 ` Ondrej Kozina
  2014-04-08 15:44   ` Mike Snitzer
  2014-04-08 21:29   ` Marian Csontos
  2 siblings, 2 replies; 15+ messages in thread
From: Ondrej Kozina @ 2014-04-08 15:25 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, Mike Snitzer

Hi,

I ran all tests again and with new version and everything passed even on 
raid5 back end, so it's right time to paste here some performance results:


Requirements:
------------
http://okozina.fedorapeople.org/dmcrypt-tests.tar.xz
compiled fio package

Performance tests are based on script in subdirectory 1/ in
http://okozina.fedorapeople.org/dmcrypt-tests.tar.xz

The result tables were aggregated from output of following fio command:

fio --output=log --latency-log=log --bandwidth-log=log \
--name=global --rw=$3 --size=1G --bsrange=1k-128k \
--filename=$1 \
--name=job1 --name=job2 --name=job3 --name=job4 \
--end_fsync=1

$1 - test device
$3 - read/write mode see explanation of TEST NAME column below

HW:
---
2-socket machine Intel(R) Xeon(TM) CPU 3.40GHz (4 logical CPUs, without 
HW acceleration for AES!)
4GB RAM
4x SEAGATE ST336753LC (rotational drive)

One disk was used for system data, the rest for testing

Explanation for tables below:
-----------------------------
In column TEST NAME:

- "disk_nocrypt" stands for pure device w/o any encryption in effect 
(it's single device or raid5 device using md)
- "zero" stands for encryption over dm-zero target. Basically this test 
is trying to measure the performance of dm-crypt core alone.
- "disk" is real device (same as in "disk_nocrypt") with encryption set 
up over it.

following suffixes after dash in TABLE NAME column:
(all correspond to permissible values for fio --rw option)

- read = sequential reads
- write = sequential writes
- randread = random reads
- randwrite = random writes
- rw = sequential mixed reads and writes
- randrw = random mixed reads and writes

Any mixed option above has separate line for aggregated READs and WRITEs 
operations.

For encryption I used aes-xts-plain64 cipher, but please note that the 
machine doesn't provide HW acceleration for AES algorithm.

The column AGGRB_A stands for results with vanilla 3.14
The column AGGRB_B stands for results with all Mikulas's patches applied.

The interesting results are prefixed with "zero-" and "disk-". The 
"disk_nocrypt-" results are just for reference to see real limits of HW.

The "zero-" prefixed lines doesn't require to be published multiple 
times but it can serve to demonstrate scatter of results even on idling 
machine (except for running tests).

The single device tests:

TEST NAME                 OPERATION AGGRB_A KB/s AGGRB_B KB/s DIFF A->B
------------------------- --------- ------------ ------------ ---------
disk_nocrypt-read         READ      260289       259339       -0.36 %
disk_nocrypt-write        WRITE     242950       242922       -0.01 %
disk_nocrypt-randread     READ       53095        52195       -1.70 %
disk_nocrypt-randwrite    WRITE     193665       194482        0.42 %
disk_nocrypt-rw           READ       64050        64037       -0.02 %
disk_nocrypt-rw           WRITE      64581        64567       -0.02 %
disk_nocrypt-randrw       READ       16792        17235        2.64 %
disk_nocrypt-randrw       WRITE      16796        17239        2.64 %
zero-read                 READ      187614       309748       65.10 %
zero-write                WRITE     240871       332301       37.96 %
zero-randread             READ      299024       325476        8.85 %
zero-randwrite            WRITE     108506       123805       14.10 %
zero-rw                   READ       81694       106583       30.47 %
zero-rw                   WRITE      82370       107465       30.47 %
zero-randrw               READ       60794        85574       40.76 %
zero-randrw               WRITE      60810        85597       40.76 %
disk-read                 READ      232603       238258        2.43 %
disk-write                WRITE     119649       218601       82.70 %
disk-randread             READ       52578        52249       -0.63 %
disk-randwrite            WRITE     153878       173487       12.74 %
disk-rw                   READ       61341        64629        5.36 %
disk-rw                   WRITE      61849        65164        5.36 %
disk-randrw               READ       16251        15229       -6.29 %
disk-randrw               WRITE      16255        15233       -6.29 %


md raid5 with 64KiB chunk size as basking device:

TEST NAME                 OPERATION AGGRB_A KB/s AGGRB_B KB/s DIFF A->B
------------------------- --------- ------------ ------------ ---------
disk_nocrypt-read         READ      391405       390276        -0.29 %
disk_nocrypt-write        WRITE     209736       210272         0.26 %
disk_nocrypt-randread     READ       84644        84319        -0.38 %
disk_nocrypt-randwrite    WRITE      30651        29174        -4.82 %
disk_nocrypt-rw           READ       64533        64472        -0.09 %
disk_nocrypt-rw           WRITE      65068        65005        -0.10 %
disk_nocrypt-randrw       READ       18586        18827         1.30 %
disk_nocrypt-randrw       WRITE      18591        18832         1.30 %
zero-read                 READ      190641       308042        61.58 %
zero-write                WRITE     343936       306533       -10.87 %
zero-randread             READ      298322       326007         9.28 %
zero-randwrite            WRITE     115462       122193         5.83 %
zero-rw                   READ       82092       114980        40.06 %
zero-rw                   WRITE      82771       115932        40.06 %
zero-randrw               READ       69730        90008        29.08 %
zero-randrw               WRITE      69748        90032        29.08 %
disk-read                 READ      283207       264641        -6.56 %
disk-write                WRITE     152088       176431        16.01 %
disk-randread             READ       82021        81720        -0.37 %
disk-randwrite            WRITE      20553        23880        16.19 %
disk-rw                   READ       56833        59059         3.92 %
disk-rw                   WRITE      57303        59548         3.92 %
disk-randrw               READ       15819        16726         5.73 %
disk-randrw               WRITE      15823        16730         5.73 %

md raid5 with 512KiB chunk size as basking device:

TEST NAME                 OPERATION AGGRB_A KB/s AGGRB_B KB/s DIFF A->B
------------------------- --------- ------------ ------------ ---------
disk_nocrypt-read         READ      463151       447822       -3.31 %
disk_nocrypt-write        WRITE     149476       149866        0.26 %
disk_nocrypt-randread     READ       99205        98393       -0.82 %
disk_nocrypt-randwrite    WRITE      34422        34535        0.33 %
disk_nocrypt-rw           READ       75898        77132        1.63 %
disk_nocrypt-rw           WRITE      76527        77770        1.62 %
disk_nocrypt-randrw       READ       21886        21928        0.19 %
disk_nocrypt-randrw       WRITE      21892        21934        0.19 %
zero-read                 READ      189735       305173       60.84 %
zero-write                WRITE     289402       289082       -0.11 %
zero-randread             READ      300373       317061        5.56 %
zero-randwrite            WRITE     128128       123838       -3.35 %
zero-rw                   READ       81956       114558       39.78 %
zero-rw                   WRITE      82635       115506       39.78 %
zero-randrw               READ       74406        86589       16.37 %
zero-randrw               WRITE      74426        86612       16.37 %
disk-read                 READ      333251       399153       19.78 %
disk-write                WRITE     139703       158574       13.51 %
disk-randread             READ       93669        92408       -1.35 %
disk-randwrite            WRITE      22994        26520       15.33 %
disk-rw                   READ       63573        73389       15.44 %
disk-rw                   WRITE      64099        73996       15.44 %
disk-randrw               READ       19045        19617        3.00 %
disk-randrw               WRITE      19050        19622        3.00 %

Ondrej

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/9] dm-crypt patches
  2014-04-08 15:25 ` Ondrej Kozina
@ 2014-04-08 15:44   ` Mike Snitzer
  2014-04-09  8:46     ` Ondrej Kozina
  2014-04-08 21:29   ` Marian Csontos
  1 sibling, 1 reply; 15+ messages in thread
From: Mike Snitzer @ 2014-04-08 15:44 UTC (permalink / raw)
  To: Ondrej Kozina; +Cc: device-mapper development, Mikulas Patocka

On Tue, Apr 08 2014 at 11:25am -0400,
Ondrej Kozina <okozina@redhat.com> wrote:

> Hi,
> 
> I ran all tests again and with new version and everything passed
> even on raid5 back end, so it's right time to paste here some
> performance results:

Thanks for sharing the results.  The system is a pretty low-end machine,
certainly doesn't showcase the intended configuration (improved CPU
scalability on a system with many cores).

BUT it is a useful data point to have.  Because it reflects that the
improvements are realized even on systems without many cores or AES.

> Requirements:
> ------------
> http://okozina.fedorapeople.org/dmcrypt-tests.tar.xz
> compiled fio package
> 
> Performance tests are based on script in subdirectory 1/ in
> http://okozina.fedorapeople.org/dmcrypt-tests.tar.xz
> 
> The result tables were aggregated from output of following fio command:
> 
> fio --output=log --latency-log=log --bandwidth-log=log \
> --name=global --rw=$3 --size=1G --bsrange=1k-128k \
> --filename=$1 \
> --name=job1 --name=job2 --name=job3 --name=job4 \
> --end_fsync=1
> 
> $1 - test device
> $3 - read/write mode see explanation of TEST NAME column below

How is it that you've extracted these results from the test run?  I'd be
open to rerunning the same on my testbed (1 Intel Xeon E5649 Westmere-EP
with 12 logical cores, AES, and 6 SAS devices for RAID, 1 STEC PCIe SSD)

I could also yield this testbed to you if that'd be easier than you
telling me how to extract the test results.

BTW, it seems odd to me that the tests run all the fsx and dt tests yet
only the FIO tests are what you're reporting.. would save time to just
disable the non-fio tests wouldn't it?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/9] dm-crypt patches
  2014-04-08 15:25 ` Ondrej Kozina
  2014-04-08 15:44   ` Mike Snitzer
@ 2014-04-08 21:29   ` Marian Csontos
  1 sibling, 0 replies; 15+ messages in thread
From: Marian Csontos @ 2014-04-08 21:29 UTC (permalink / raw)
  To: device-mapper development, Mikulas Patocka; +Cc: Mike Snitzer

On 04/08/2014 05:25 PM, Ondrej Kozina wrote:
> Hi,
>
> I ran all tests again and with new version and everything passed even on
> raid5 back end, so it's right time to paste here some performance results:

I have found few strange numbers there I have difficulties to understand 
and wonder what maybe the cause of these discrepancies. Could you have a 
look?

>
>
> Requirements:
> ------------
> http://okozina.fedorapeople.org/dmcrypt-tests.tar.xz
> compiled fio package
>
> Performance tests are based on script in subdirectory 1/ in
> http://okozina.fedorapeople.org/dmcrypt-tests.tar.xz
>
> The result tables were aggregated from output of following fio command:
>
> fio --output=log --latency-log=log --bandwidth-log=log \
> --name=global --rw=$3 --size=1G --bsrange=1k-128k \
> --filename=$1 \
> --name=job1 --name=job2 --name=job3 --name=job4 \
> --end_fsync=1
>
> $1 - test device
> $3 - read/write mode see explanation of TEST NAME column below
>
> HW:
> ---
> 2-socket machine Intel(R) Xeon(TM) CPU 3.40GHz (4 logical CPUs, without
> HW acceleration for AES!)
> 4GB RAM
> 4x SEAGATE ST336753LC (rotational drive)
>
> One disk was used for system data, the rest for testing
>
> Explanation for tables below:
> -----------------------------
> In column TEST NAME:
>
> - "disk_nocrypt" stands for pure device w/o any encryption in effect
> (it's single device or raid5 device using md)
> - "zero" stands for encryption over dm-zero target. Basically this test
> is trying to measure the performance of dm-crypt core alone.
> - "disk" is real device (same as in "disk_nocrypt") with encryption set
> up over it.
>
> following suffixes after dash in TABLE NAME column:
> (all correspond to permissible values for fio --rw option)
>
> - read = sequential reads
> - write = sequential writes
> - randread = random reads
> - randwrite = random writes
> - rw = sequential mixed reads and writes
> - randrw = random mixed reads and writes
>
> Any mixed option above has separate line for aggregated READs and WRITEs
> operations.
>
> For encryption I used aes-xts-plain64 cipher, but please note that the
> machine doesn't provide HW acceleration for AES algorithm.
>
> The column AGGRB_A stands for results with vanilla 3.14
> The column AGGRB_B stands for results with all Mikulas's patches applied.
>
> The interesting results are prefixed with "zero-" and "disk-". The
> "disk_nocrypt-" results are just for reference to see real limits of HW.
>
> The "zero-" prefixed lines doesn't require to be published multiple
> times but it can serve to demonstrate scatter of results even on idling
> machine (except for running tests).
>
> The single device tests:
>
> TEST NAME                 OPERATION AGGRB_A KB/s AGGRB_B KB/s DIFF A->B
> ------------------------- --------- ------------ ------------ ---------
> disk_nocrypt-read         READ      260289       259339       -0.36 %
> disk_nocrypt-write        WRITE     242950       242922       -0.01 %
> disk_nocrypt-randread     READ       53095        52195       -1.70 %
> disk_nocrypt-randwrite    WRITE     193665       194482        0.42 %
> disk_nocrypt-rw           READ       64050        64037       -0.02 %
> disk_nocrypt-rw           WRITE      64581        64567       -0.02 %
> disk_nocrypt-randrw       READ       16792        17235        2.64 %
> disk_nocrypt-randrw       WRITE      16796        17239        2.64 %
> zero-read                 READ      187614       309748       65.10 %
> zero-write                WRITE     240871       332301       37.96 %
> zero-randread             READ      299024       325476        8.85 %

                                       ^ Wow! Randread so much faster 
than sequential that I wonder if it is correct...

> zero-randwrite            WRITE     108506       123805       14.10 %
> zero-rw                   READ       81694       106583       30.47 %
> zero-rw                   WRITE      82370       107465       30.47 %
> zero-randrw               READ       60794        85574       40.76 %
> zero-randrw               WRITE      60810        85597       40.76 %
> disk-read                 READ      232603       238258        2.43 %
> disk-write                WRITE     119649       218601       82.70 %
> disk-randread             READ       52578        52249       -0.63 %
> disk-randwrite            WRITE     153878       173487       12.74 %

                                       ^ And disk randwrite faster than 
sequential. And three times faster than randread suggesting it may be 
writing to cache.

> disk-rw                   READ       61341        64629        5.36 %
> disk-rw                   WRITE      61849        65164        5.36 %
> disk-randrw               READ       16251        15229       -6.29 %
> disk-randrw               WRITE      16255        15233       -6.29 %
>
>
> md raid5 with 64KiB chunk size as basking device:
>
> TEST NAME                 OPERATION AGGRB_A KB/s AGGRB_B KB/s DIFF A->B
> ------------------------- --------- ------------ ------------ ---------
> disk_nocrypt-read         READ      391405       390276        -0.29 %
> disk_nocrypt-write        WRITE     209736       210272         0.26 %
> disk_nocrypt-randread     READ       84644        84319        -0.38 %
> disk_nocrypt-randwrite    WRITE      30651        29174        -4.82 %
> disk_nocrypt-rw           READ       64533        64472        -0.09 %
> disk_nocrypt-rw           WRITE      65068        65005        -0.10 %
> disk_nocrypt-randrw       READ       18586        18827         1.30 %
> disk_nocrypt-randrw       WRITE      18591        18832         1.30 %
> zero-read                 READ      190641       308042        61.58 %
> zero-write                WRITE     343936       306533       -10.87 %

                                       ^ Here the write is much much 
faster than reads and the -10% looks more like an error than performance 
regression

> zero-randread             READ      298322       326007         9.28 %

                                       ^ Again randread much faster than 
sequential

> zero-randwrite            WRITE     115462       122193         5.83 %
> zero-rw                   READ       82092       114980        40.06 %
> zero-rw                   WRITE      82771       115932        40.06 %
> zero-randrw               READ       69730        90008        29.08 %
> zero-randrw               WRITE      69748        90032        29.08 %
> disk-read                 READ      283207       264641        -6.56 %

                                       ^ Disk read being so faster than 
dm-zero looks odd...

> disk-write                WRITE     152088       176431        16.01 %
> disk-randread             READ       82021        81720        -0.37 %
> disk-randwrite            WRITE      20553        23880        16.19 %
> disk-rw                   READ       56833        59059         3.92 %
> disk-rw                   WRITE      57303        59548         3.92 %
> disk-randrw               READ       15819        16726         5.73 %
> disk-randrw               WRITE      15823        16730         5.73 %
>
> md raid5 with 512KiB chunk size as basking device:
>
> TEST NAME                 OPERATION AGGRB_A KB/s AGGRB_B KB/s DIFF A->B
> ------------------------- --------- ------------ ------------ ---------
> disk_nocrypt-read         READ      463151       447822       -3.31 %
> disk_nocrypt-write        WRITE     149476       149866        0.26 %
> disk_nocrypt-randread     READ       99205        98393       -0.82 %
> disk_nocrypt-randwrite    WRITE      34422        34535        0.33 %
> disk_nocrypt-rw           READ       75898        77132        1.63 %
> disk_nocrypt-rw           WRITE      76527        77770        1.62 %
> disk_nocrypt-randrw       READ       21886        21928        0.19 %
> disk_nocrypt-randrw       WRITE      21892        21934        0.19 %
> zero-read                 READ      189735       305173       60.84 %
> zero-write                WRITE     289402       289082       -0.11 %
> zero-randread             READ      300373       317061        5.56 %
> zero-randwrite            WRITE     128128       123838       -3.35 %
> zero-rw                   READ       81956       114558       39.78 %
> zero-rw                   WRITE      82635       115506       39.78 %
> zero-randrw               READ       74406        86589       16.37 %
> zero-randrw               WRITE      74426        86612       16.37 %
> disk-read                 READ      333251       399153       19.78 %

                                       ^ It looks odd - this is so much 
faster than dm-zero and I wonder whether dm-zero is not fast enough or 
the disk tests are hitting cache instead of disk and whether all the 
zero-* tests are meaningful

> disk-write                WRITE     139703       158574       13.51 %

                                                    ^ Here dm-crypt is 
faster than raw disk

> disk-randread             READ       93669        92408       -1.35 %
> disk-randwrite            WRITE      22994        26520       15.33 %
> disk-rw                   READ       63573        73389       15.44 %
> disk-rw                   WRITE      64099        73996       15.44 %
> disk-randrw               READ       19045        19617        3.00 %
> disk-randrw               WRITE      19050        19622        3.00 %
>
> Ondrej
>
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/9] dm-crypt patches
  2014-04-08 15:44   ` Mike Snitzer
@ 2014-04-09  8:46     ` Ondrej Kozina
  0 siblings, 0 replies; 15+ messages in thread
From: Ondrej Kozina @ 2014-04-09  8:46 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: device-mapper development, Mikulas Patocka

On 04/08/2014 05:44 PM, Mike Snitzer wrote:
> How is it that you've extracted these results from the test run?  I'd be
> open to rerunning the same on my testbed (1 Intel Xeon E5649 Westmere-EP
> with 12 logical cores, AES, and 6 SAS devices for RAID, 1 STEC PCIe SSD)
>
> I could also yield this testbed to you if that'd be easier than you
> telling me how to extract the test results.

Thanks Mike!

Pasting new results for loaned machine and things are getting more 
interesting (with respect to Marian's reply):

single (rotational) device:

TEST NAME                 OPERATION AGGRB_A KB/s AGGRB_B KB/s DIFF A->B
------------------------- --------- ------------ ------------ ---------
disk_nocrypt-read         READ      479129       478528        -0.13 %
disk_nocrypt-write        WRITE     225040       225718         0.30 %
disk_nocrypt-randread     READ       30150        30193         0.14 %
disk_nocrypt-randwrite    WRITE      39059        39546         1.25 %
disk_nocrypt-rw           READ       92477        94021         1.67 %
disk_nocrypt-rw           WRITE      93242        94800         1.67 %
disk_nocrypt-randrw       READ       14426        14902         3.30 %
disk_nocrypt-randrw       WRITE      14388        14862         3.29 %
zero-read                 READ      272889       354069        29.75 %
zero-write                WRITE     145721       306175       110.11 %
zero-randread             READ      482888       599366        24.12 %
zero-randwrite            WRITE     151175       346728       129.36 %
zero-rw                   READ       91280       131950        44.56 %
zero-rw                   WRITE      92036       133042        44.55 %
zero-randrw               READ       72625       142476        96.18 %
zero-randrw               WRITE      72433       142099        96.18 %
disk-read                 READ      193161       210156         8.80 %
disk-write                WRITE      33845       182790       440.08 %
disk-randread             READ       29746        29640        -0.36 %
disk-randwrite            WRITE       7783        31689       307.16 %
disk-rw                   READ       23954        69686       190.92 %
disk-rw                   WRITE      24152        70263       190.92 %
disk-randrw               READ        6770        12523        84.98 %
disk-randrw               WRITE       6753        12490        84.95 %

single PCIe SSD device:

(Staring to immense difference between sequential reads and random 
reads. Can it be there some sort of raid inside SSD card?)

TEST NAME                 OPERATION AGGRB_A KB/s AGGRB_B KB/s DIFF A->B
------------------------- --------- ------------ ------------ ---------
disk_nocrypt-read         READ      1072537      1145446        6.80 %
disk_nocrypt-write        WRITE      843924       828586       -1.82 %
disk_nocrypt-randread     READ      2091110      2108825        0.85 %
disk_nocrypt-randwrite    WRITE     1149542      1137971       -1.01 %
disk_nocrypt-rw           READ       425704       426661        0.22 %
disk_nocrypt-rw           WRITE      429228       430193        0.22 %
disk_nocrypt-randrw       READ       534206       532311       -0.35 %
disk_nocrypt-randrw       WRITE      532791       530900       -0.35 %
zero-read                 READ       271388       351871       29.66 %
zero-write                WRITE      143748       300000      108.70 %
zero-randread             READ       487264       600310       23.20 %
zero-randwrite            WRITE      148958       344308      131.14 %
zero-rw                   READ        90927       131659       44.80 %
zero-rw                   WRITE       91679       132748       44.80 %
zero-randrw               READ        71325       137468       92.73 %
zero-randrw               WRITE       71136       137104       92.74 %
disk-read                 READ       243854       269678       10.59 %
disk-write                WRITE      179773       300408       67.10 %
disk-randread             READ       518848       647679       24.83 %
disk-randwrite            WRITE      195314       348398       78.38 %
disk-rw                   READ       103381       131270       26.98 %
disk-rw                   WRITE      104237       132356       26.98 %
disk-randrw               READ        93681       145719       55.55 %
disk-randrw               WRITE       93433       145333       55.55 %

4 rotational devices forming md raid5, 64KiB chunk size:

TEST NAME                 OPERATION AGGRB_A KB/s AGGRB_B KB/s DIFF A->B
------------------------- --------- ------------ ------------ ---------
disk_nocrypt-read         READ      560436       547773        -2.26 %
disk_nocrypt-write        WRITE     360552       359039        -0.42 %
disk_nocrypt-randread     READ       62256        62118        -0.22 %
disk_nocrypt-randwrite    WRITE      51591        52289         1.35 %
disk_nocrypt-rw           READ       59680        58955        -1.21 %
disk_nocrypt-rw           WRITE      60174        59443        -1.21 %
disk_nocrypt-randrw       READ       18328        18219        -0.59 %
disk_nocrypt-randrw       WRITE      18280        18171        -0.60 %
zero-read                 READ      274712       353651        28.74 %
zero-write                WRITE     146469       300904       105.44 %
zero-randread             READ      498736       603246        20.95 %
zero-randwrite            WRITE     150131       355003       136.46 %
zero-rw                   READ       92481       132359        43.12 %
zero-rw                   WRITE      93246       133455        43.12 %
zero-randrw               READ       72278       141164        95.31 %
zero-randrw               WRITE      72086       140790        95.31 %
disk-read                 READ      177154       203054        14.62 %
disk-write                WRITE      71057       241913       240.45 %
disk-randread             READ       58757        58988         0.39 %
disk-randwrite            WRITE      14453        28966       100.42 %
disk-rw                   READ       59772        67560        13.03 %
disk-rw                   WRITE      60267        68120        13.03 %
disk-randrw               READ        9634        12815        33.02 %
disk-randrw               WRITE       9608        12781        33.02 %

4 rotational devices forming md raid5, 512KiB chunk size:

(I'll repeat this one since there's strange increase in random read 
performance on raw raid5 device, see line "disk_nocrypt-randread"):

TEST NAME                 OPERATION AGGRB_A KB/s AGGRB_B KB/s DIFF A->B
------------------------- --------- ------------ ------------ ---------
disk_nocrypt-read         READ      734811       730206        -0.63 %
disk_nocrypt-write        WRITE     264474       259773        -1.78 %
disk_nocrypt-randread     READ       64387        83218        29.25 %
disk_nocrypt-randwrite    WRITE      52671        51898        -1.47 %
disk_nocrypt-rw           READ       75785        73469        -3.06 %
disk_nocrypt-rw           WRITE      76413        74077        -3.06 %
disk_nocrypt-randrw       READ       17664        17690         0.15 %
disk_nocrypt-randrw       WRITE      17617        17643         0.15 %
zero-read                 READ      270967       354099        30.68 %
zero-write                WRITE     145277       307230       111.48 %
zero-randread             READ      478481       585805        22.43 %
zero-randwrite            WRITE     149734       349997       133.75 %
zero-rw                   READ       91742       127760        39.26 %
zero-rw                   WRITE      92501       128818        39.26 %
zero-randrw               READ       78925       145437        84.27 %
zero-randrw               WRITE      78716       145051        84.27 %
disk-read                 READ      203705       237220        16.45 %
disk-write                WRITE      64068       230051       259.07 %
disk-randread             READ       60698        77793        28.16 %
disk-randwrite            WRITE      18535        32227        73.87 %
disk-rw                   READ       49023        65775        34.17 %
disk-rw                   WRITE      49429        66320        34.17 %
disk-randrw               READ       10740        13084        21.82 %
disk-randrw               WRITE      10712        13049        21.82 %

Now I would like to work on simulating different workloads (these tests 
were simulating 4 jobs in fio on otherwise idling machine)

Ondrej

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2014-04-09  8:46 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-04-05 18:04 [PATCH 0/9] dm-crypt patches Mikulas Patocka
2014-04-05 18:04 ` [PATCH 1/9] dm-crypt: remove per-cpu structure Mikulas Patocka
2014-04-05 18:05   ` [PATCH 2/9] bio: use kmalloc alignment for bio slab Mikulas Patocka
2014-04-05 18:05     ` [PATCH 3/9] dm-crypt: use per-bio data Mikulas Patocka
2014-04-05 18:06       ` [PATCH 4/9] dm-crypt: use unbound workqueue for request processing Mikulas Patocka
2014-04-05 18:07         ` [PATCH 5/9] dm-crypt: don't allocate pages for a partial request Mikulas Patocka
2014-04-05 18:07           ` [PATCH 6/9] dm-crypt: avoid deadlock in mempools Mikulas Patocka
2014-04-05 18:08             ` [PATCH 7/9] dm-crypt: remove io_pool Mikulas Patocka
2014-04-05 18:08               ` [PATCH 8/9] dm-crypt: offload writes to thread Mikulas Patocka
2014-04-05 18:09                 ` [PATCH 9/9] dm-crypt: sort writes Mikulas Patocka
2014-04-07  1:01 ` [PATCH 0/9] dm-crypt patches Mike Snitzer
2014-04-08 15:25 ` Ondrej Kozina
2014-04-08 15:44   ` Mike Snitzer
2014-04-09  8:46     ` Ondrej Kozina
2014-04-08 21:29   ` Marian Csontos

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.