Linux RAID subsystem development
 help / color / mirror / Atom feed
* [PATCH v3 3/9] md: superblock changes for PPL
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>

Include information about PPL location and size into mdp_superblock_1
and copy it to/from rdev. Because PPL is mutually exclusive with bitmap,
put it in place of 'bitmap_offset'. Add a new flag MD_FEATURE_PPL for
'feature_map', analogically to MD_FEATURE_BITMAP_OFFSET. Add MD_HAS_PPL
to mddev->flags to indicate that PPL is enabled on an array.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/md.c                | 15 +++++++++++++++
 drivers/md/md.h                |  8 ++++++++
 drivers/md/raid0.c             |  3 ++-
 drivers/md/raid1.c             |  3 ++-
 include/uapi/linux/raid/md_p.h | 18 ++++++++++++++----
 5 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 85ac98417a08..e96f73572e23 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1566,6 +1566,12 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 	} else if (sb->bblog_offset != 0)
 		rdev->badblocks.shift = 0;
 
+	if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
+		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
+		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
+	}
+
 	if (!refdev) {
 		ret = 1;
 	} else {
@@ -1678,6 +1684,9 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 
 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
 			set_bit(MD_HAS_JOURNAL, &mddev->flags);
+
+		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL)
+			set_bit(MD_HAS_PPL, &mddev->flags);
 	} else if (mddev->pers == NULL) {
 		/* Insist of good event counter while assembling, except for
 		 * spares (which don't need an event count) */
@@ -1891,6 +1900,12 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
 		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
 
+	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+		sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
+		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
+		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
+	}
+
 	rdev_for_each(rdev2, mddev) {
 		i = rdev2->desc_nr;
 		if (test_bit(Faulty, &rdev2->flags))
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 968bbe72b237..abdb5f2ed2d3 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -122,6 +122,13 @@ struct md_rdev {
 					   * sysfs entry */
 
 	struct badblocks badblocks;
+
+	struct {
+		short offset;	/* Offset from superblock to start of PPL.
+				 * Not used by external metadata. */
+		unsigned int size;	/* Size in sectors of the PPL space */
+		sector_t sector;	/* First sector of the PPL space */
+	} ppl;
 };
 enum flag_bits {
 	Faulty,			/* device is known to have a fault */
@@ -229,6 +236,7 @@ enum mddev_flags {
 				 * supported as calls to md_error() will
 				 * never cause the array to become failed.
 				 */
+	MD_HAS_PPL,		/* The raid array has PPL feature set */
 };
 
 enum mddev_sb_flags {
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 5b3db367814a..37fc1f5185a9 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -29,7 +29,8 @@
 #define UNSUPPORTED_MDDEV_FLAGS		\
 	((1L << MD_HAS_JOURNAL) |	\
 	 (1L << MD_JOURNAL_CLEAN) |	\
-	 (1L << MD_FAILFAST_SUPPORTED))
+	 (1L << MD_FAILFAST_SUPPORTED) |\
+	 (1L << MD_HAS_PPL))
 
 static int raid0_congested(struct mddev *mddev, int bits)
 {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7b0f647bcccb..53623a31b074 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -44,7 +44,8 @@
 
 #define UNSUPPORTED_MDDEV_FLAGS		\
 	((1L << MD_HAS_JOURNAL) |	\
-	 (1L << MD_JOURNAL_CLEAN))
+	 (1L << MD_JOURNAL_CLEAN) |	\
+	 (1L << MD_HAS_PPL))
 
 /*
  * Number of guaranteed r1bios in case of extreme VM load:
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 9930f3e9040f..fe2112810c43 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -242,10 +242,18 @@ struct mdp_superblock_1 {
 
 	__le32	chunksize;	/* in 512byte sectors */
 	__le32	raid_disks;
-	__le32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
-				 * NOTE: signed, so bitmap can be before superblock
-				 * only meaningful of feature_map[0] is set.
-				 */
+	union {
+		__le32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
+					 * NOTE: signed, so bitmap can be before superblock
+					 * only meaningful of feature_map[0] is set.
+					 */
+
+		/* only meaningful when feature_map[MD_FEATURE_PPL] is set */
+		struct {
+			__le16 offset; /* sectors from start of superblock that ppl starts (signed) */
+			__le16 size; /* ppl size in sectors */
+		} ppl;
+	};
 
 	/* These are only valid with feature bit '4' */
 	__le32	new_level;	/* new level we are reshaping to		*/
@@ -318,6 +326,7 @@ struct mdp_superblock_1 {
 					     */
 #define MD_FEATURE_CLUSTERED		256 /* clustered MD */
 #define	MD_FEATURE_JOURNAL		512 /* support write cache */
+#define	MD_FEATURE_PPL			1024 /* support PPL */
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 					|MD_FEATURE_RECOVERY_OFFSET	\
 					|MD_FEATURE_RESHAPE_ACTIVE	\
@@ -328,6 +337,7 @@ struct mdp_superblock_1 {
 					|MD_FEATURE_RECOVERY_BITMAP	\
 					|MD_FEATURE_CLUSTERED		\
 					|MD_FEATURE_JOURNAL		\
+					|MD_FEATURE_PPL			\
 					)
 
 struct r5l_payload_header {
-- 
2.11.0


^ permalink raw reply related

* [PATCH v3 4/9] raid5: calculate partial parity for a stripe
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>

Attach a page for holding the partial parity data to stripe_head.
Allocate it only if mddev has the MD_HAS_PPL flag set.

Partial parity is the xor of not modified data chunks of a stripe and is
calculated as follows:

- reconstruct-write case:
  xor data from all not updated disks in a stripe

- read-modify-write case:
  xor old data and parity from all updated disks in a stripe

Implement it using the async_tx API and integrate into raid_run_ops().
It must be called when we still have access to old data, so do it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored into sh->ppl_page.

Partial parity is not meaningful for full stripe write and is not stored
in the log or used for recovery, so don't attempt to calculate it when
stripe has STRIPE_FULL_WRITE.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.h |  2 ++
 2 files changed, 100 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d1cba941951e..e1e238da32ba 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -466,6 +466,11 @@ static void shrink_buffers(struct stripe_head *sh)
 		sh->dev[i].page = NULL;
 		put_page(p);
 	}
+
+	if (sh->ppl_page) {
+		put_page(sh->ppl_page);
+		sh->ppl_page = NULL;
+	}
 }
 
 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -482,6 +487,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		sh->dev[i].page = page;
 		sh->dev[i].orig_page = page;
 	}
+
+	if (test_bit(MD_HAS_PPL, &sh->raid_conf->mddev->flags)) {
+		sh->ppl_page = alloc_page(gfp);
+		if (!sh->ppl_page)
+			return 1;
+	}
+
 	return 0;
 }
 
@@ -1977,6 +1989,55 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
+static struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+		       struct dma_async_tx_descriptor *tx)
+{
+	int disks = sh->disks;
+	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
+	int count = 0, pd_idx = sh->pd_idx, i;
+	struct async_submit_ctl submit;
+
+	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+	/*
+	 * Partial parity is the XOR of stripe data chunks that are not changed
+	 * during the write request. Depending on available data
+	 * (read-modify-write vs. reconstruct-write case) we calculate it
+	 * differently.
+	 */
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		/* rmw: xor old data and parity from updated disks */
+		for (i = disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
+				xor_srcs[count++] = dev->page;
+		}
+	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
+		/* rcw: xor data from all not updated disks */
+		for (i = disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_UPTODATE, &dev->flags))
+				xor_srcs[count++] = dev->page;
+		}
+	} else {
+		return tx;
+	}
+
+	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, tx, NULL, sh,
+			  flex_array_get(percpu->scribble, 0)
+			  + sizeof(struct page *) * (sh->disks + 2));
+
+	if (count == 1)
+		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
+				  &submit);
+	else
+		tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
+			       &submit);
+
+	return tx;
+}
+
 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
@@ -2007,6 +2068,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			async_tx_ack(tx);
 	}
 
+	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+		tx = ops_run_partial_parity(sh, percpu, tx);
+
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
 		if (level < 6)
 			tx = ops_run_prexor5(sh, percpu, tx);
@@ -3058,6 +3122,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		s->locked++;
 	}
 
+	if (level == 5 && test_bit(MD_HAS_PPL, &conf->mddev->flags) &&
+	    test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+	    !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+	    test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+		set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
 	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
 		__func__, (unsigned long long)sh->sector,
 		s->locked, s->ops_request);
@@ -3105,6 +3175,34 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
 
+	if (forwrite && test_bit(MD_HAS_PPL, &conf->mddev->flags)) {
+		/*
+		 * With PPL only writes to consecutive data chunks within a
+		 * stripe are allowed. Not really an overlap, but
+		 * wait_for_overlap can be used to handle this.
+		 */
+		sector_t sector;
+		sector_t first = 0;
+		sector_t last = 0;
+		int count = 0;
+		int i;
+
+		for (i = 0; i < sh->disks; i++) {
+			if (i != sh->pd_idx &&
+			    (i == dd_idx || sh->dev[i].towrite)) {
+				sector = sh->dev[i].sector;
+				if (count == 0 || sector < first)
+					first = sector;
+				if (sector > last)
+					last = sector;
+				count++;
+			}
+		}
+
+		if (first + conf->chunk_sectors * (count - 1) != last)
+			goto overlap;
+	}
+
 	if (!forwrite || previous)
 		clear_bit(STRIPE_BATCH_READY, &sh->state);
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 0f64a58873de..88f1e52d9daf 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -228,6 +228,7 @@ struct stripe_head {
 	struct list_head	log_list;
 	sector_t		log_start; /* first meta block on the journal */
 	struct list_head	r5c; /* for r5c_cache->stripe_in_journal */
+	struct page		*ppl_page; /* partial parity of this stripe */
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -400,6 +401,7 @@ enum {
 	STRIPE_OP_BIODRAIN,
 	STRIPE_OP_RECONSTRUCT,
 	STRIPE_OP_CHECK,
+	STRIPE_OP_PARTIAL_PARITY,
 };
 
 /*
-- 
2.11.0


^ permalink raw reply related

* [PATCH v3 5/9] raid5-ppl: Partial Parity Log write logging implementation
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>

This implements the PPL write logging functionality, using the
raid5-cache policy logic introduced in previous patches. The description
of PPL is added to the documentation. More details can be found in the
comments in raid5-ppl.c.

Put the PPL metadata structures to md_p.h because userspace tools
(mdadm) will also need to read/write PPL.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Documentation/admin-guide/md.rst |  53 ++++
 drivers/md/Makefile              |   2 +-
 drivers/md/raid5-cache.c         |  13 +-
 drivers/md/raid5-cache.h         |   8 +
 drivers/md/raid5-ppl.c           | 551 +++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c               |  15 +-
 include/uapi/linux/raid/md_p.h   |  26 ++
 7 files changed, 661 insertions(+), 7 deletions(-)
 create mode 100644 drivers/md/raid5-ppl.c

diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
index e449fb5f277c..7104ef757e73 100644
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -86,6 +86,9 @@ superblock can be autodetected and run at boot time.
 The kernel parameter ``raid=partitionable`` (or ``raid=part``) means
 that all auto-detected arrays are assembled as partitionable.
 
+
+.. _dirty_degraded_boot:
+
 Boot time assembly of degraded/dirty arrays
 -------------------------------------------
 
@@ -176,6 +179,56 @@ and its role in the array.
 Once started with RUN_ARRAY, uninitialized spares can be added with
 HOT_ADD_DISK.
 
+.. _ppl:
+
+Partial Parity Log
+------------------
+
+Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue
+addressed by PPL is that after a dirty shutdown, parity of a particular stripe
+may become inconsistent with data on other member disks. If the array is also
+in degraded state, there is no way to recalculate parity, because one of the
+disks is missing. This can lead to silent data corruption when rebuilding the
+array or using it is as degraded - data calculated from parity for array blocks
+that have not been touched by a write request during the unclean shutdown can
+be incorrect. Such condition is known as the ``RAID5 Write Hole``. Because of
+this, md by default does not allow starting a dirty degraded array, see
+:ref:`dirty_degraded_boot`.
+
+Partial parity for a write operation is the XOR of stripe data chunks not
+modified by this write. It is just enough data needed for recovering from the
+write hole. XORing partial parity with the modified chunks produces parity for
+the stripe, consistent with its state before the write operation, regardless of
+which chunk writes have completed. If one of the not modified data disks of
+this stripe is missing, this updated parity can be used to recover its
+contents. PPL recovery is also performed when starting an array after an
+unclean shutdown and all disks are available, eliminating the need to resync
+the array. Because of this, using write-intent bitmap and PPL together is not
+supported.
+
+When handling a write request PPL writes partial parity before new data and
+parity are dispatched to disks. PPL is a distributed log - it is stored on
+array member drives in the metadata area, on the parity drive of a particular
+stripe.  It does not require a dedicated journaling drive. Write performance is
+reduced by up to 30%-40% but it scales with the number of drives in the array
+and the journaling drive does not become a bottleneck or a single point of
+failure.
+
+Unlike raid5-cache, the other solution in md for closing the write hole, PPL is
+not a true journal. It does not protect from losing in-flight data, only from
+silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is
+performed for this stripe (parity is not updated). So it is possible to have
+arbitrary data in the written part of a stripe if that disk is lost. In such
+case the behavior is the same as in plain raid5.
+
+PPL is available for md version-1 metadata and external (specifically IMSM)
+metadata arrays. It can be enabled using mdadm option
+``--consistency-policy=ppl``.
+
+Currently, volatile write-back cache should be disabled on all member drives
+when using PPL. Otherwise it cannot guarantee consistency in case of power
+failure.
+
 
 MD devices in sysfs
 -------------------
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..4d48714ccc6b 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,7 +18,7 @@ dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 dm-verity-y	+= dm-verity-target.o
 md-mod-y	+= md.o bitmap.o
-raid456-y	+= raid5.o raid5-cache.o
+raid456-y	+= raid5.o raid5-cache.o raid5-ppl.o
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise 
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 6fac581804a9..7757c5137300 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -167,8 +167,8 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
 	return log->device_size > used_size + size;
 }
 
-static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
-				    enum r5l_io_unit_state state)
+void __r5l_set_io_unit_state(struct r5l_io_unit *io,
+			     enum r5l_io_unit_state state)
 {
 	if (WARN_ON(io->state >= state))
 		return;
@@ -385,7 +385,7 @@ static void r5c_finish_cache_stripe(struct stripe_head *sh)
 	}
 }
 
-static void r5l_io_run_stripes(struct r5l_io_unit *io)
+void r5l_io_run_stripes(struct r5l_io_unit *io)
 {
 	struct stripe_head *sh, *next;
 
@@ -995,7 +995,7 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log)
 				 r5c_calculate_new_cp(conf));
 }
 
-static void r5l_run_no_mem_stripe(struct r5l_log *log)
+void r5l_run_no_mem_stripe(struct r5l_log *log)
 {
 	struct stripe_head *sh;
 
@@ -1421,7 +1421,7 @@ bool r5l_log_disk_error(struct r5conf *conf)
 	if (!log)
 		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	else
-		ret = test_bit(Faulty, &log->rdev->flags);
+		ret = log->rdev && test_bit(Faulty, &log->rdev->flags);
 	rcu_read_unlock();
 	return ret;
 }
@@ -2784,6 +2784,7 @@ struct r5l_policy r5l_journal = {
 	.handle_flush_request = __r5l_handle_flush_request,
 	.quiesce = __r5l_quiesce,
 };
+extern struct r5l_policy r5l_ppl;
 
 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 {
@@ -2797,6 +2798,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
 		log->policy = &r5l_journal;
+	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+		log->policy = &r5l_ppl;
 	} else {
 		kfree(log);
 		return -EINVAL;
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index 97803f3ae0fe..6b622c2742de 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -87,6 +87,8 @@ struct r5l_log {
 
 	/* handlers for log operations */
 	struct r5l_policy *policy;
+
+	void *private;
 };
 
 /*
@@ -177,4 +179,10 @@ extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
 extern void r5c_update_on_rdev_error(struct mddev *mddev);
 extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+
+extern void __r5l_set_io_unit_state(struct r5l_io_unit *io,
+				    enum r5l_io_unit_state state);
+extern void r5l_io_run_stripes(struct r5l_io_unit *io);
+extern void r5l_run_no_mem_stripe(struct r5l_log *log);
+
 #endif /* _RAID5_CACHE_H */
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
new file mode 100644
index 000000000000..6bc246c80f6b
--- /dev/null
+++ b/drivers/md/raid5-ppl.c
@@ -0,0 +1,551 @@
+/*
+ * Partial Parity Log for closing the RAID5 write hole
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
+#include "raid5.h"
+#include "raid5-cache.h"
+
+/*
+ * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
+ * partial parity data. The header contains an array of entries
+ * (struct ppl_header_entry) which describe the logged write requests.
+ * Partial parity for the entries comes after the header, written in the same
+ * sequence as the entries:
+ *
+ * Header
+ *   entry0
+ *   ...
+ *   entryN
+ * PP data
+ *   PP for entry0
+ *   ...
+ *   PP for entryN
+ *
+ * Every entry holds a checksum of its partial parity, the header also has a
+ * checksum of the header itself. Entries for full stripes writes contain no
+ * partial parity, they only mark the stripes for which parity should be
+ * recalculated after an unclean shutdown.
+ *
+ * A write request is always logged to the PPL instance stored on the parity
+ * disk of the corresponding stripe. For each member disk there is one r5l_log
+ * used to handle logging for this disk, independently from others. They are
+ * grouped in child_logs array in struct ppl_conf, which is assigned to a
+ * common parent r5l_log. This parent log serves as a proxy and is used in
+ * raid5 personality code - it is assigned as _the_ log in r5conf->log.
+ *
+ * r5l_io_unit represents a full PPL write, meta_page contains the ppl_header.
+ * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe can
+ * be appended to the last entry if the chunks to write are the same, otherwise
+ * a new entry is added. Checksums of entries are calculated incrementally as
+ * stripes containing partial parity are being added to entries.
+ * ppl_submit_iounit() calculates the checksum of the header and submits a bio
+ * containing the meta_page (ppl_header) and partial parity pages (sh->ppl_page)
+ * for all stripes of the io_unit. When the PPL write completes, the stripes
+ * associated with the io_unit are released and raid5d starts writing their data
+ * and parity. When all stripes are written, the io_unit is freed and the next
+ * can be submitted.
+ *
+ * An io_unit is used to gather stripes until it is submitted or becomes full
+ * (if the maximum number of entries or size of PPL is reached). Another io_unit
+ * can't be submitted until the previous has completed (PPL and stripe
+ * data+parity is written). The log->running_ios list tracks all io_units of
+ * a log (for a single member disk). New io_units are added to the end of the
+ * list and the first io_unit is submitted, if it is not submitted already.
+ * The current io_unit accepting new stripes is always the last on the list.
+ */
+
+struct ppl_conf {
+	struct mddev *mddev;
+
+	/* the log assigned to r5conf->log */
+	struct r5l_log *parent_log;
+
+	/* array of child logs, one for each raid disk */
+	struct r5l_log *child_logs;
+	int count;
+
+	/* the logical block size used for data_sector in ppl_header_entry */
+	int block_size;
+};
+
+static struct r5l_io_unit *ppl_new_iounit(struct r5l_log *log,
+					  struct stripe_head *sh)
+{
+	struct r5l_io_unit *io;
+	struct ppl_header *pplhdr;
+	struct ppl_conf *ppl_conf = log->private;
+	struct r5l_log *parent_log = ppl_conf->parent_log;
+
+	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
+	if (!io)
+		return NULL;
+
+	memset(io, 0, sizeof(*io));
+	io->log = log;
+	INIT_LIST_HEAD(&io->log_sibling);
+	INIT_LIST_HEAD(&io->stripe_list);
+	io->state = IO_UNIT_RUNNING;
+
+	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
+	pplhdr = page_address(io->meta_page);
+	clear_page(pplhdr);
+	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+	pplhdr->signature = cpu_to_le32(log->uuid_checksum);
+
+	spin_lock(&parent_log->io_list_lock);
+	io->seq = ++parent_log->seq;
+	spin_unlock(&parent_log->io_list_lock);
+	pplhdr->generation = cpu_to_le64(io->seq);
+
+	return io;
+}
+
+static int ppl_log_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+	struct r5l_io_unit *io = NULL;
+	struct ppl_header *pplhdr;
+	struct ppl_header_entry *e = NULL;
+	int i;
+	sector_t data_sector = 0;
+	int data_disks = 0;
+	unsigned int entries_count;
+	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
+	struct r5conf *conf = sh->raid_conf;
+
+	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);
+
+	if (log->current_io) {
+		io = log->current_io;
+		pplhdr = page_address(io->meta_page);
+		entries_count = le32_to_cpu(pplhdr->entries_count);
+
+		/* check if current io_unit is full */
+		if (io->meta_offset >= entry_space ||
+		    entries_count == PPL_HDR_MAX_ENTRIES) {
+			pr_debug("%s: add io_unit blocked by seq: %llu\n",
+				 __func__, io->seq);
+			io = NULL;
+		}
+	}
+
+	/* add a new unit if there is none or the current is full */
+	if (!io) {
+		io = ppl_new_iounit(log, sh);
+		if (!io)
+			return -ENOMEM;
+		spin_lock_irq(&log->io_list_lock);
+		list_add_tail(&io->log_sibling, &log->running_ios);
+		spin_unlock_irq(&log->io_list_lock);
+
+		log->current_io = io;
+		pplhdr = page_address(io->meta_page);
+		entries_count = 0;
+	}
+
+	for (i = 0; i < sh->disks; i++) {
+		struct r5dev *dev = &sh->dev[i];
+		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
+			if (!data_disks || dev->sector < data_sector)
+				data_sector = dev->sector;
+			data_disks++;
+		}
+	}
+	BUG_ON(!data_disks);
+
+	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
+		 io->seq, (unsigned long long)data_sector, data_disks);
+
+	if (entries_count > 0) {
+		struct ppl_header_entry *prev =
+				&pplhdr->entries[entries_count - 1];
+		u64 data_sector_prev = le64_to_cpu(prev->data_sector);
+		u32 data_size_prev = le32_to_cpu(prev->data_size);
+		u32 pp_size_prev = le32_to_cpu(prev->pp_size);
+
+		/*
+		 * Check if we can merge with the previous entry. Must be on
+		 * the same stripe and disks. Use bit shift and logarithm
+		 * to avoid 64-bit division.
+		 */
+		if ((data_sector >> ilog2(conf->chunk_sectors) ==
+		     data_sector_prev >> ilog2(conf->chunk_sectors)) &&
+		    ((pp_size_prev == 0 &&
+		      test_bit(STRIPE_FULL_WRITE, &sh->state)) ||
+		     ((data_sector_prev + (pp_size_prev >> 9) == data_sector) &&
+		      (data_size_prev == pp_size_prev * data_disks))))
+			e = prev;
+	}
+
+	if (!e) {
+		e = &pplhdr->entries[entries_count++];
+		pplhdr->entries_count = cpu_to_le32(entries_count);
+		e->data_sector = cpu_to_le64(data_sector);
+		e->parity_disk = cpu_to_le32(sh->pd_idx);
+		e->checksum = cpu_to_le32(~0);
+	}
+
+	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);
+
+	/* don't write any PP if full stripe write */
+	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
+		le32_add_cpu(&e->pp_size, PAGE_SIZE);
+		io->meta_offset += PAGE_SIZE;
+		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
+						    page_address(sh->ppl_page),
+						    PAGE_SIZE));
+	}
+
+	list_add_tail(&sh->log_list, &io->stripe_list);
+	atomic_inc(&io->pending_stripe);
+	sh->log_io = io;
+
+	return 0;
+}
+
+static int ppl_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+	struct r5l_io_unit *io = sh->log_io;
+
+	if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
+	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
+		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+		return -EAGAIN;
+	}
+
+	mutex_lock(&log->io_mutex);
+
+	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
+		mutex_unlock(&log->io_mutex);
+		return -EAGAIN;
+	}
+
+	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	atomic_inc(&sh->count);
+
+	if (ppl_log_stripe(log, sh)) {
+		spin_lock_irq(&log->io_list_lock);
+		list_add_tail(&sh->log_list, &log->no_mem_stripes);
+		spin_unlock_irq(&log->io_list_lock);
+	}
+
+	mutex_unlock(&log->io_mutex);
+
+	return 0;
+}
+
+static void ppl_log_endio(struct bio *bio)
+{
+	struct r5l_io_unit *io = bio->bi_private;
+	struct r5l_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->private;
+	unsigned long flags;
+
+	pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+	if (bio->bi_error)
+		md_error(ppl_conf->mddev, log->rdev);
+
+	bio_put(bio);
+	mempool_free(io->meta_page, log->meta_pool);
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
+	r5l_io_run_stripes(io);
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+static void ppl_submit_iounit(struct r5l_io_unit *io)
+{
+	struct r5l_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->private;
+	struct r5conf *conf = ppl_conf->mddev->private;
+	struct ppl_header *pplhdr = page_address(io->meta_page);
+	struct bio *bio;
+	struct stripe_head *sh;
+	int i;
+	struct bio_list bios = BIO_EMPTY_LIST;
+	char b[BDEVNAME_SIZE];
+
+	bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
+	bio->bi_private = io;
+	bio->bi_end_io = ppl_log_endio;
+	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
+	bio->bi_bdev = log->rdev->bdev;
+	bio->bi_iter.bi_sector = log->rdev->ppl.sector;
+	bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
+	bio_list_add(&bios, bio);
+
+	sh = list_first_entry(&io->stripe_list, struct stripe_head, log_list);
+
+	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
+		struct ppl_header_entry *e = &pplhdr->entries[i];
+		u32 pp_size = le32_to_cpu(e->pp_size);
+		u32 data_size = le32_to_cpu(e->data_size);
+		u64 data_sector = le64_to_cpu(e->data_sector);
+		int stripes_count;
+
+		if (pp_size > 0)
+			stripes_count = pp_size >> PAGE_SHIFT;
+		else
+			stripes_count = (data_size /
+					 (conf->raid_disks -
+					  conf->max_degraded)) >> PAGE_SHIFT;
+
+		while (stripes_count--) {
+			/*
+			 * if entry without partial parity just skip its stripes
+			 * without adding pages to bio
+			 */
+			if (pp_size > 0 &&
+			    !bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
+				struct bio *prev = bio;
+
+				bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
+						       log->bs);
+				bio->bi_opf = prev->bi_opf;
+				bio->bi_bdev = prev->bi_bdev;
+				bio->bi_iter.bi_sector = bio_end_sector(prev);
+				bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
+				bio_chain(bio, prev);
+				bio_list_add(&bios, bio);
+			}
+			sh = list_next_entry(sh, log_list);
+		}
+
+		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
+			 __func__, io->seq, i, data_sector, pp_size, data_size);
+
+		e->data_sector = cpu_to_le64(data_sector >>
+					     ilog2(ppl_conf->block_size >> 9));
+		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
+	}
+
+	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
+
+	while ((bio = bio_list_pop(&bios))) {
+		pr_debug("%s: seq: %llu submit_bio() size: %u sector: %llu dev: %s\n",
+			 __func__, io->seq, bio->bi_iter.bi_size,
+			 (unsigned long long)bio->bi_iter.bi_sector,
+			 bdevname(bio->bi_bdev, b));
+		submit_bio(bio);
+	}
+}
+
+static void ppl_submit_current_io(struct r5l_log *log)
+{
+	struct r5l_io_unit *io;
+
+	spin_lock_irq(&log->io_list_lock);
+
+	io = list_first_entry_or_null(&log->running_ios,
+				      struct r5l_io_unit, log_sibling);
+
+	if (io && io->state == IO_UNIT_RUNNING) {
+		__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+		spin_unlock_irq(&log->io_list_lock);
+
+		if (io == log->current_io)
+			log->current_io = NULL;
+
+		ppl_submit_iounit(io);
+		return;
+	}
+
+	spin_unlock_irq(&log->io_list_lock);
+}
+
+static void ppl_write_stripe_run(struct r5l_log *log)
+{
+	mutex_lock(&log->io_mutex);
+	ppl_submit_current_io(log);
+	mutex_unlock(&log->io_mutex);
+}
+
+static void __ppl_stripe_write_finished(struct r5l_io_unit *io)
+{
+	struct r5l_log *log = io->log;
+	unsigned long flags;
+
+	pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+
+	list_del(&io->log_sibling);
+	mempool_free(io, log->io_pool);
+	r5l_run_no_mem_stripe(log);
+
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+static void __ppl_exit_log(struct r5l_log *log)
+{
+	struct ppl_conf *ppl_conf = log->private;
+
+	kfree(ppl_conf->child_logs);
+	kfree(ppl_conf);
+
+	mempool_destroy(log->meta_pool);
+	if (log->bs)
+		bioset_free(log->bs);
+	mempool_destroy(log->io_pool);
+	kmem_cache_destroy(log->io_kc);
+}
+
+static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf;
+	struct mddev *mddev = conf->mddev;
+	int ret = 0;
+	int i;
+
+	if (PAGE_SIZE != 4096)
+		return -EINVAL;
+
+	if (mddev->bitmap) {
+		pr_warn("md/raid:%s PPL is not compatible with bitmap.\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+
+	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
+	if (!ppl_conf)
+		return -ENOMEM;
+	log->private = ppl_conf;
+
+	spin_lock_init(&log->io_list_lock);
+
+	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
+	if (!log->io_kc) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	log->io_pool = mempool_create_slab_pool(conf->raid_disks, log->io_kc);
+	if (!log->io_pool) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	log->bs = bioset_create(conf->raid_disks, 0);
+	if (!log->bs) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	log->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
+	if (!log->meta_pool) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ppl_conf->parent_log = log;
+	ppl_conf->mddev = mddev;
+	ppl_conf->count = conf->raid_disks;
+	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct r5l_log),
+				       GFP_KERNEL);
+	if (!ppl_conf->child_logs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	if (!mddev->external) {
+		log->uuid_checksum = ~crc32c_le(~0, mddev->uuid,
+						sizeof(mddev->uuid));
+		ppl_conf->block_size = 512;
+	} else {
+		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
+	}
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct r5l_log *log_child = &ppl_conf->child_logs[i];
+		struct md_rdev *rdev = conf->disks[i].rdev;
+
+		mutex_init(&log_child->io_mutex);
+		spin_lock_init(&log_child->io_list_lock);
+		INIT_LIST_HEAD(&log_child->running_ios);
+		INIT_LIST_HEAD(&log_child->no_mem_stripes);
+
+		log_child->rdev = rdev;
+		log_child->private = log->private;
+		log_child->io_kc = log->io_kc;
+		log_child->io_pool = log->io_pool;
+		log_child->bs = log->bs;
+		log_child->meta_pool = log->meta_pool;
+		log_child->uuid_checksum = log->uuid_checksum;
+		log_child->policy = log->policy;
+
+		if (rdev) {
+			struct request_queue *q = bdev_get_queue(rdev->bdev);
+
+			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+				log->need_cache_flush = true;
+
+			if (rdev->ppl.size < (PPL_HEADER_SIZE +
+					      STRIPE_SIZE) >> 9) {
+				char b[BDEVNAME_SIZE];
+				pr_warn("md/raid:%s: PPL space too small on %s.\n",
+					mdname(mddev), bdevname(rdev->bdev, b));
+				ret = -ENOSPC;
+			}
+		}
+	}
+
+	if (ret)
+		goto err;
+
+	if (log->need_cache_flush)
+		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
+			mdname(mddev));
+
+	conf->log = log;
+
+	return 0;
+err:
+	__ppl_exit_log(log);
+	return ret;
+}
+
+static int __ppl_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+	struct ppl_conf *ppl_conf = log->private;
+
+	return ppl_write_stripe(&ppl_conf->child_logs[sh->pd_idx], sh);
+}
+
+static void __ppl_write_stripe_run(struct r5l_log *log)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	int i;
+
+	for (i = 0; i < ppl_conf->count; i++)
+		ppl_write_stripe_run(&ppl_conf->child_logs[i]);
+}
+
+struct r5l_policy r5l_ppl = {
+	.init_log = __ppl_init_log,
+	.exit_log = __ppl_exit_log,
+	.write_stripe = __ppl_write_stripe,
+	.write_stripe_run = __ppl_write_stripe_run,
+	.flush_stripe_to_raid = NULL,
+	.stripe_write_finished = __ppl_stripe_write_finished,
+	.handle_flush_request = NULL,
+	.quiesce = NULL,
+};
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e1e238da32ba..ffa19bd4ba6e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -926,7 +926,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
-	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+	if (!test_bit(STRIPE_R5C_CACHING, &sh->state) ||
+	    test_bit(MD_HAS_PPL, &conf->mddev->flags)) {
 		/* writing out phase */
 		if (s->waiting_extra_page)
 			return;
@@ -7171,6 +7172,13 @@ static int raid5_run(struct mddev *mddev)
 		BUG_ON(mddev->delta_disks != 0);
 	}
 
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+	    test_bit(MD_HAS_PPL, &mddev->flags)) {
+		pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+			mdname(mddev));
+		clear_bit(MD_HAS_PPL, &mddev->flags);
+	}
+
 	if (mddev->private == NULL)
 		conf = setup_conf(mddev);
 	else
@@ -7397,6 +7405,11 @@ static int raid5_run(struct mddev *mddev)
 			 mdname(mddev), bdevname(journal_dev->bdev, b));
 		if (r5l_init_log(conf, journal_dev))
 			goto abort;
+	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+		pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
+			 mdname(mddev));
+		if (r5l_init_log(conf, NULL))
+			goto abort;
 	}
 
 	return 0;
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index fe2112810c43..2c28711cc5f1 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -398,4 +398,30 @@ struct r5l_meta_block {
 
 #define R5LOG_VERSION 0x1
 #define R5LOG_MAGIC 0x6433c509
+
+struct ppl_header_entry {
+	__le64 data_sector;	/* Raid sector of the new data */
+	__le32 pp_size;		/* Length of partial parity */
+	__le32 data_size;	/* Length of data */
+	__le32 parity_disk;	/* Member disk containing parity */
+	__le32 checksum;	/* Checksum of this entry */
+} __attribute__ ((__packed__));
+
+#define PPL_HEADER_SIZE 4096
+#define PPL_HDR_RESERVED 512
+#define PPL_HDR_ENTRY_SPACE \
+	(PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(u32) - sizeof(u64))
+#define PPL_HDR_MAX_ENTRIES \
+	(PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
+
+struct ppl_header {
+	__u8 reserved[PPL_HDR_RESERVED];/* Reserved space */
+	__le32 signature;		/* Signature (family number of volume) */
+	__le32 padding;
+	__le64 generation;		/* Generation number of PP Header */
+	__le32 entries_count;		/* Number of entries in entry array */
+	__le32 checksum;		/* Checksum of PP Header */
+	struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
+} __attribute__ ((__packed__));
+
 #endif
-- 
2.11.0


^ permalink raw reply related

* [PATCH v3 6/9] md: add sysfs entries for PPL
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>

Add 'consistency_policy' attribute for array. It indicates how the array
maintains consistency in case of unexpected shutdown.

Add 'ppl_sector' and 'ppl_size' for rdev, which describe the location
and size of the PPL space on the device.

These attributes are writable to allow enabling PPL for external
metadata arrays and (later) to enable/disable PPL for a running array.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Documentation/admin-guide/md.rst |  32 ++++++++++-
 drivers/md/md.c                  | 115 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
index 7104ef757e73..2c153b3d798f 100644
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -329,14 +329,14 @@ All md devices contain:
      array creation it will default to 0, though starting the array as
      ``clean`` will set it much larger.
 
-   new_dev
+  new_dev
      This file can be written but not read.  The value written should
      be a block device number as major:minor.  e.g. 8:0
      This will cause that device to be attached to the array, if it is
      available.  It will then appear at md/dev-XXX (depending on the
      name of the device) and further configuration is then possible.
 
-   safe_mode_delay
+  safe_mode_delay
      When an md array has seen no write requests for a certain period
      of time, it will be marked as ``clean``.  When another write
      request arrives, the array is marked as ``dirty`` before the write
@@ -345,7 +345,7 @@ All md devices contain:
      period as a number of seconds.  The default is 200msec (0.200).
      Writing a value of 0 disables safemode.
 
-   array_state
+  array_state
      This file contains a single word which describes the current
      state of the array.  In many cases, the state can be set by
      writing the word for the desired state, however some states
@@ -454,7 +454,30 @@ All md devices contain:
      once the array becomes non-degraded, and this fact has been
      recorded in the metadata.
 
+  consistency_policy
+     This indicates how the array maintains consistency in case of unexpected
+     shutdown. It can be:
 
+     none
+       Array has no redundancy information, e.g. raid0, linear.
+
+     resync
+       Full resync is performed and all redundancy is regenerated when the
+       array is started after unclean shutdown.
+
+     bitmap
+       Resync assisted by a write-intent bitmap.
+
+     journal
+       For raid4/5/6, journal device is used to log transactions and replay
+       after unclean shutdown.
+
+     ppl
+       For raid5 only, :ref:`ppl` is used to close the write hole and eliminate
+       resync.
+
+     The accepted values when writing to this file are ``ppl`` and ``resync``,
+     used to enable and disable PPL.
 
 
 As component devices are added to an md array, they appear in the ``md``
@@ -616,6 +639,9 @@ Each directory contains:
 	adds bad blocks without acknowledging them. This is largely
 	for testing.
 
+      ppl_sector, ppl_size
+        Location and size (in sectors) of the space used for Partial Parity Log
+        on this device.
 
 
 An active md device will also contain an entry for each active device
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e96f73572e23..47ab3afe87cb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3205,6 +3205,78 @@ static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
 
+static ssize_t
+ppl_sector_show(struct md_rdev *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
+}
+
+static ssize_t
+ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+	unsigned long long sector;
+
+	if (kstrtoull(buf, 10, &sector) < 0)
+		return -EINVAL;
+	if (sector != (sector_t)sector)
+	        return -EINVAL;
+
+	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
+	    rdev->raid_disk >= 0)
+		return -EBUSY;
+
+	if (rdev->mddev->persistent) {
+		if (rdev->mddev->major_version == 0)
+			return -EINVAL;
+		if ((sector > rdev->sb_start &&
+		     sector - rdev->sb_start > S16_MAX) ||
+		    (sector < rdev->sb_start &&
+		     rdev->sb_start - sector > -S16_MIN))
+			return -EINVAL;
+		rdev->ppl.offset = sector - rdev->sb_start;
+	} else if (!rdev->mddev->external) {
+		return -EBUSY;
+	}
+	rdev->ppl.sector = sector;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_ppl_sector =
+__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
+
+static ssize_t
+ppl_size_show(struct md_rdev *rdev, char *page)
+{
+	return sprintf(page, "%u\n", rdev->ppl.size);
+}
+
+static ssize_t
+ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+	unsigned int size;
+
+	if (kstrtouint(buf, 10, &size) < 0)
+		return -EINVAL;
+
+	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
+	    rdev->raid_disk >= 0)
+		return -EBUSY;
+
+	if (rdev->mddev->persistent) {
+		if (rdev->mddev->major_version == 0)
+			return -EINVAL;
+		if (size > U16_MAX)
+			return -EINVAL;
+	} else if (!rdev->mddev->external) {
+		return -EBUSY;
+	}
+	rdev->ppl.size = size;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_ppl_size =
+__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_errors.attr,
@@ -3215,6 +3287,8 @@ static struct attribute *rdev_default_attrs[] = {
 	&rdev_recovery_start.attr,
 	&rdev_bad_blocks.attr,
 	&rdev_unack_bad_blocks.attr,
+	&rdev_ppl_sector.attr,
+	&rdev_ppl_size.attr,
 	NULL,
 };
 static ssize_t
@@ -4951,6 +5025,46 @@ static struct md_sysfs_entry md_array_size =
 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
        array_size_store);
 
+static ssize_t
+consistency_policy_show(struct mddev *mddev, char *page)
+{
+	int ret;
+
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+		ret = sprintf(page, "journal\n");
+	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+		ret = sprintf(page, "ppl\n");
+	} else if (mddev->bitmap) {
+		ret = sprintf(page, "bitmap\n");
+	} else if (mddev->pers) {
+		if (mddev->pers->sync_request)
+			ret = sprintf(page, "resync\n");
+		else
+			ret = sprintf(page, "none\n");
+	} else {
+		ret = sprintf(page, "unknown\n");
+	}
+
+	return ret;
+}
+
+static ssize_t
+consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	if (mddev->pers) {
+		return -EBUSY;
+	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
+		set_bit(MD_HAS_PPL, &mddev->flags);
+		return len;
+	} else {
+		return -EINVAL;
+	}
+}
+
+static struct md_sysfs_entry md_consistency_policy =
+__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
+       consistency_policy_store);
+
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_layout.attr,
@@ -4966,6 +5080,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_reshape_direction.attr,
 	&md_array_size.attr,
 	&max_corr_read_errors.attr,
+	&md_consistency_policy.attr,
 	NULL,
 };
 
-- 
2.11.0


^ permalink raw reply related

* [PATCH v3 7/9] raid5-ppl: load and recover the log
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>

Load the log from each disk when starting the array and recover if the
array is dirty.

The initial empty PPL is written by mdadm. When loading the log we
verify the header checksum and signature. For external metadata arrays
the signature is verified in userspace, so here we read it from the
header, verifying only if it matches on all disks, and use it later when
writing PPL.

In addition to the header checksum, each header entry also contains a
checksum of its partial parity data. If the header is valid, recovery is
performed for each entry until an invalid entry is found. If the array
is not degraded and recovery using PPL fully succeeds, there is no need
to resync the array because data and parity will be consistent, so in
this case resync will be disabled.

Due to compatibility with IMSM implementations on other systems, we
can't assume that the recovery data block size is always 4K. Writes
generated by MD raid5 don't have this issue, but when recovering PPL
written in other environments it is possible to have entries with
512-byte sector granularity. The recovery code takes this into account
and also the logical sector size of the underlying drives.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-ppl.c | 485 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c     |   5 +-
 2 files changed, 489 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 6bc246c80f6b..b21b3bfa8f36 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -16,6 +16,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/crc32c.h>
+#include <linux/async_tx.h>
 #include <linux/raid/md_p.h>
 #include "md.h"
 #include "raid5.h"
@@ -82,6 +83,10 @@ struct ppl_conf {
 
 	/* the logical block size used for data_sector in ppl_header_entry */
 	int block_size;
+
+	/* used only for recovery */
+	int recovered_entries;
+	int mismatch_count;
 };
 
 static struct r5l_io_unit *ppl_new_iounit(struct r5l_log *log,
@@ -395,6 +400,469 @@ static void __ppl_stripe_write_finished(struct r5l_io_unit *io)
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 }
 
+static void ppl_xor(int size, struct page *page1, struct page *page2,
+		    struct page *page_result)
+{
+	struct async_submit_ctl submit;
+	struct dma_async_tx_descriptor *tx;
+	struct page *xor_srcs[] = { page1, page2 };
+
+	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
+			  NULL, NULL, NULL, NULL);
+	tx = async_xor(page_result, xor_srcs, 0, 2, size, &submit);
+
+	async_tx_quiesce(&tx);
+}
+
+/*
+ * PPL recovery strategy: xor partial parity and data from all modified data
+ * disks within a stripe and write the result as the new stripe parity. If all
+ * stripe data disks are modified (full stripe write), no partial parity is
+ * available, so just xor the data disks.
+ *
+ * Recovery of a PPL entry shall occur only if all modified data disks are
+ * available and read from all of them succeeds.
+ *
+ * A PPL entry applies to a stripe, partial parity size for an entry is at most
+ * the size of the chunk. Examples of possible cases for a single entry:
+ *
+ * case 0: single data disk write:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+           +--------------------+
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
+ * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * +--------+--------+--------+           +--------------------+
+ * pp_size = data_size
+ *
+ * case 1: more than one data disk write:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+           +--------------------+
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
+ * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * +--------+--------+--------+           +--------------------+
+ * pp_size = data_size / modified_data_disks
+ *
+ * case 2: write to all data disks (also full stripe write):
+ *   data0    data1    data2                parity
+ * +--------+--------+--------+           +--------------------+
+ * | ------ | ------ | ------ |           | (no change)        |
+ * | -data- | -data- | -data- | --------> | xor all data       |
+ * | ------ | ------ | ------ | --------> | (no change)        |
+ * | ------ | ------ | ------ |           | (no change)        |
+ * +--------+--------+--------+           +--------------------+
+ * pp_size = 0
+ *
+ * The following cases are possible only in other implementations. The recovery
+ * code can handle them, but they are not generated at runtime because they can
+ * be reduced to cases 0, 1 and 2:
+ *
+ * case 3:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+ +----+    +--------------------+
+ * | ------ | -data- | -data- | | pp |    | data1 ^ data2 ^ pp |
+ * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
+ * | -data- | -data- | -data- | | -- | -> | xor all data       |
+ * | -data- | -data- | ------ | | pp |    | data0 ^ data1 ^ pp |
+ * +--------+--------+--------+ +----+    +--------------------+
+ * pp_size = chunk_size
+ *
+ * case 4:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+ +----+    +--------------------+
+ * | ------ | -data- | ------ | | pp |    | data1 ^ pp         |
+ * | ------ | ------ | ------ | | -- | -> | (no change)        |
+ * | ------ | ------ | ------ | | -- | -> | (no change)        |
+ * | -data- | ------ | ------ | | pp |    | data0 ^ pp         |
+ * +--------+--------+--------+ +----+    +--------------------+
+ * pp_size = chunk_size
+ */
+static int ppl_recover_entry(struct r5l_log *log, struct ppl_header_entry *e,
+			     sector_t ppl_sector)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	struct mddev *mddev = ppl_conf->mddev;
+	struct r5conf *conf = mddev->private;
+	int block_size = ppl_conf->block_size;
+	struct page *pages;
+	struct page *page1;
+	struct page *page2;
+	sector_t r_sector_first;
+	sector_t r_sector_last;
+	int strip_sectors;
+	int data_disks;
+	int i;
+	int ret = 0;
+	char b[BDEVNAME_SIZE];
+	unsigned int pp_size = le32_to_cpu(e->pp_size);
+	unsigned int data_size = le32_to_cpu(e->data_size);
+
+	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);
+
+	if ((pp_size >> 9) < conf->chunk_sectors) {
+		if (pp_size > 0) {
+			data_disks = data_size / pp_size;
+			strip_sectors = pp_size >> 9;
+		} else {
+			data_disks = conf->raid_disks - conf->max_degraded;
+			strip_sectors = (data_size >> 9) / data_disks;
+		}
+		r_sector_last = r_sector_first +
+				(data_disks - 1) * conf->chunk_sectors +
+				strip_sectors;
+	} else {
+		data_disks = conf->raid_disks - conf->max_degraded;
+		strip_sectors = conf->chunk_sectors;
+		r_sector_last = r_sector_first + (data_size >> 9);
+	}
+
+	pages = alloc_pages(GFP_KERNEL, 1);
+	if (!pages)
+		return -ENOMEM;
+	page1 = pages;
+	page2 = pages + 1;
+
+	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
+		 (unsigned long long)r_sector_first,
+		 (unsigned long long)r_sector_last);
+
+	/* if start and end is 4k aligned, use a 4k block */
+	if (block_size == 512 &&
+	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
+	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
+		block_size = STRIPE_SIZE;
+
+	/* iterate through blocks in strip */
+	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
+		bool update_parity = false;
+		sector_t parity_sector;
+		struct md_rdev *parity_rdev;
+		struct stripe_head sh;
+		int disk;
+		int indent = 0;
+
+		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
+		indent += 2;
+
+		memset(page_address(page1), 0, PAGE_SIZE);
+
+		/* iterate through data member disks */
+		for (disk = 0; disk < data_disks; disk++) {
+			int dd_idx;
+			struct md_rdev *rdev;
+			sector_t sector;
+			sector_t r_sector = r_sector_first + i +
+					    (disk * conf->chunk_sectors);
+
+			pr_debug("%s:%*s data member disk %d start\n",
+				 __func__, indent, "", disk);
+			indent += 2;
+
+			if (r_sector >= r_sector_last) {
+				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
+					 __func__, indent, "",
+					 (unsigned long long)r_sector);
+				indent -= 2;
+				continue;
+			}
+
+			update_parity = true;
+
+			/* map raid sector to member disk */
+			sector = raid5_compute_sector(conf, r_sector, 0,
+						      &dd_idx, NULL);
+			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
+				 __func__, indent, "",
+				 (unsigned long long)r_sector, dd_idx,
+				 (unsigned long long)sector);
+
+			rdev = conf->disks[dd_idx].rdev;
+			if (!rdev) {
+				pr_debug("%s:%*s data member disk %d missing\n",
+					 __func__, indent, "", dd_idx);
+				update_parity = false;
+				break;
+			}
+
+			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
+				 __func__, indent, "", bdevname(rdev->bdev, b),
+				 (unsigned long long)sector);
+			if (!sync_page_io(rdev, sector, block_size, page2,
+					REQ_OP_READ, 0, false)) {
+				md_error(mddev, rdev);
+				pr_debug("%s:%*s read failed!\n", __func__,
+					 indent, "");
+				ret = -EIO;
+				goto out;
+			}
+
+			ppl_xor(block_size, page1, page2, page1);
+
+			indent -= 2;
+		}
+
+		if (!update_parity)
+			continue;
+
+		if (pp_size > 0) {
+			pr_debug("%s:%*s reading pp disk sector %llu\n",
+				 __func__, indent, "",
+				 (unsigned long long)(ppl_sector + i));
+			if (!sync_page_io(log->rdev,
+					ppl_sector - log->rdev->data_offset + i,
+					block_size, page2, REQ_OP_READ, 0,
+					false)) {
+				pr_debug("%s:%*s read failed!\n", __func__,
+					 indent, "");
+				md_error(mddev, log->rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			ppl_xor(block_size, page1, page2, page1);
+		}
+
+		/* map raid sector to parity disk */
+		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
+				0, &disk, &sh);
+		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
+		parity_rdev = conf->disks[sh.pd_idx].rdev;
+
+		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
+		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
+			 __func__, indent, "",
+			 (unsigned long long)parity_sector,
+			 bdevname(parity_rdev->bdev, b));
+		if (!sync_page_io(parity_rdev, parity_sector, block_size,
+				page1, REQ_OP_WRITE, 0, false)) {
+			pr_debug("%s:%*s parity write error!\n", __func__,
+				 indent, "");
+			md_error(mddev, parity_rdev);
+			ret = -EIO;
+			goto out;
+		}
+	}
+out:
+	__free_pages(pages, 1);
+	return ret;
+}
+
+static int ppl_recover(struct r5l_log *log, struct ppl_header *pplhdr)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	struct md_rdev *rdev = log->rdev;
+	struct mddev *mddev = rdev->mddev;
+	sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
+	struct page *page;
+	int i;
+	int ret = 0;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	/* iterate through all PPL entries saved */
+	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
+		struct ppl_header_entry *e = &pplhdr->entries[i];
+		u32 pp_size = le32_to_cpu(e->pp_size);
+		sector_t sector = ppl_sector;
+		int ppl_entry_sectors = pp_size >> 9;
+		u32 crc, crc_stored;
+
+		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
+			 __func__, rdev->raid_disk, i,
+			 (unsigned long long)ppl_sector, pp_size);
+
+		crc = ~0;
+		crc_stored = le32_to_cpu(e->checksum);
+
+		/* read parial parity for this entry and calculate its checksum */
+		while (pp_size) {
+			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;
+
+			if (!sync_page_io(rdev, sector - rdev->data_offset,
+					s, page, REQ_OP_READ, 0, false)) {
+				md_error(mddev, rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			crc = crc32c_le(crc, page_address(page), s);
+
+			pp_size -= s;
+			sector += s >> 9;
+		}
+
+		crc = ~crc;
+
+		if (crc != crc_stored) {
+			/*
+			 * Don't recover this entry if the checksum does not
+			 * match, but keep going and try to recover other
+			 * entries.
+			 */
+			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
+				 __func__, crc_stored, crc);
+			ppl_conf->mismatch_count++;
+		} else {
+			ret = ppl_recover_entry(log, e, ppl_sector);
+			if (ret)
+				goto out;
+			ppl_conf->recovered_entries++;
+		}
+
+		ppl_sector += ppl_entry_sectors;
+	}
+out:
+	__free_page(page);
+	return ret;
+}
+
+static int ppl_write_empty_header(struct r5l_log *log)
+{
+	struct page *page;
+	struct ppl_header *pplhdr;
+	struct md_rdev *rdev = log->rdev;
+	int ret = 0;
+
+	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
+		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);
+
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+
+	pplhdr = page_address(page);
+	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+	pplhdr->signature = cpu_to_le32(log->uuid_checksum);
+	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
+
+	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
+			  PPL_HEADER_SIZE, page, REQ_OP_WRITE, 0, false)) {
+		md_error(rdev->mddev, rdev);
+		ret = -EIO;
+	}
+
+	__free_page(page);
+	return ret;
+}
+
+static int ppl_load_distributed(struct r5l_log *log)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	struct md_rdev *rdev = log->rdev;
+	struct mddev *mddev = rdev->mddev;
+	struct page *page;
+	struct ppl_header *pplhdr;
+	u32 crc, crc_stored;
+	u32 signature;
+	int ret = 0;
+
+	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);
+
+	/* read PPL header */
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
+			  PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
+		md_error(mddev, rdev);
+		ret = -EIO;
+		goto out;
+	}
+	pplhdr = page_address(page);
+
+	/* check header validity */
+	crc_stored = le32_to_cpu(pplhdr->checksum);
+	pplhdr->checksum = 0;
+	crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);
+
+	if (crc_stored != crc) {
+		pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
+			 __func__, crc_stored, crc);
+		ppl_conf->mismatch_count++;
+		goto out;
+	}
+
+	signature = le32_to_cpu(pplhdr->signature);
+
+	if (mddev->external) {
+		/*
+		 * For external metadata the header signature is set and
+		 * validated in userspace.
+		 */
+		log->uuid_checksum = signature;
+	} else if (log->uuid_checksum != signature) {
+		pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
+			 __func__, signature, log->uuid_checksum);
+		ppl_conf->mismatch_count++;
+		goto out;
+	}
+
+	/* attempt to recover from log if we are starting a dirty array */
+	if (!mddev->pers && mddev->recovery_cp != MaxSector)
+		ret = ppl_recover(log, pplhdr);
+out:
+	if (!ret && (le32_to_cpu(pplhdr->entries_count) > 0 ||
+		     ppl_conf->mismatch_count > 0))
+		ret = ppl_write_empty_header(log);
+
+	__free_page(page);
+
+	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
+		 __func__, ret, ppl_conf->mismatch_count,
+		 ppl_conf->recovered_entries);
+	return ret;
+}
+
+static int ppl_load(struct r5l_log *log)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	int ret = 0;
+	int i;
+	u32 *signature = NULL;
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct r5l_log *log_child = &ppl_conf->child_logs[i];
+
+		/* skip missing drive */
+		if (!log_child->rdev)
+			continue;
+
+		ret = ppl_load_distributed(log_child);
+		if (ret)
+			break;
+
+		/*
+		 * For external metadata we can't check if the signature is
+		 * correct on a single drive, but we can check if it is the same
+		 * on all drives.
+		 */
+		if (ppl_conf->mddev->external) {
+			if (!signature) {
+				signature = &log_child->uuid_checksum;
+			} else if (*signature != log_child->uuid_checksum) {
+				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
+					mdname(ppl_conf->mddev));
+				ret = -EINVAL;
+				break;
+			}
+		}
+	}
+
+	if (signature)
+		log->uuid_checksum = *signature;
+
+	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
+		 __func__, ret, ppl_conf->mismatch_count,
+		 ppl_conf->recovered_entries);
+	return ret;
+}
+
 static void __ppl_exit_log(struct r5l_log *log)
 {
 	struct ppl_conf *ppl_conf = log->private;
@@ -515,6 +983,23 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
 		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
 			mdname(mddev));
 
+	/* load and possibly recover the logs from the member disks */
+	ret = ppl_load(log);
+
+	if (ret) {
+		goto err;
+	} else if (!mddev->pers &&
+		   mddev->recovery_cp == 0 && !mddev->degraded &&
+		   ppl_conf->recovered_entries > 0 &&
+		   ppl_conf->mismatch_count == 0) {
+		/*
+		 * If we are starting a dirty array and the recovery succeeds
+		 * without any issues, set the array as clean.
+		 */
+		mddev->recovery_cp = MaxSector;
+		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+	}
+
 	conf->log = log;
 
 	return 0;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ffa19bd4ba6e..c015925710f5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7270,7 +7270,10 @@ static int raid5_run(struct mddev *mddev)
 
 	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
-		if (mddev->ok_start_degraded)
+		if (test_bit(MD_HAS_PPL, &mddev->flags))
+			pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
+				mdname(mddev));
+		else if (mddev->ok_start_degraded)
 			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
 				mdname(mddev));
 		else {
-- 
2.11.0


^ permalink raw reply related

* [PATCH v3 8/9] raid5-ppl: support disk hot add/remove with PPL
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>

Add a function to modify the log by removing an rdev when a drive fails
or adding when a spare/replacement is activated as a raid member.

Removing a disk just clears the child log rdev pointer. No new stripes
will be accepted for this child log in ppl_write_stripe() and running io
units will be processed without writing PPL to the device.

Adding a disk sets the child log rdev pointer and writes an empty PPL
header.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-cache.c |  8 ++++++++
 drivers/md/raid5-cache.h |  9 +++++++++
 drivers/md/raid5-ppl.c   | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c       | 15 +++++++++++++++
 4 files changed, 80 insertions(+)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 7757c5137300..3031c3f720a9 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2774,6 +2774,14 @@ void r5l_exit_log(struct r5l_log *log)
 	kfree(log);
 }
 
+int r5l_modify_log(struct r5l_log *log, struct md_rdev *rdev,
+		   enum r5l_modify_log_operation operation)
+{
+	if (log && log->policy->modify_log)
+		return log->policy->modify_log(log, rdev, operation);
+	return 0;
+}
+
 struct r5l_policy r5l_journal = {
 	.init_log = __r5l_init_log,
 	.exit_log = __r5l_exit_log,
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index 6b622c2742de..9e351708c247 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -138,9 +138,16 @@ enum r5l_io_unit_state {
 	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
 };
 
+enum r5l_modify_log_operation {
+	R5L_MODIFY_LOG_DISK_REMOVE,
+	R5L_MODIFY_LOG_DISK_ADD,
+};
+
 struct r5l_policy {
 	int (*init_log)(struct r5l_log *log, struct r5conf *conf);
 	void (*exit_log)(struct r5l_log *log);
+	int (*modify_log)(struct r5l_log *log, struct md_rdev *rdev,
+			  enum r5l_modify_log_operation operation);
 	int (*write_stripe)(struct r5l_log *log, struct stripe_head *sh);
 	void (*write_stripe_run)(struct r5l_log *log);
 	void (*flush_stripe_to_raid)(struct r5l_log *log);
@@ -151,6 +158,8 @@ struct r5l_policy {
 
 extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
 extern void r5l_exit_log(struct r5l_log *log);
+extern int r5l_modify_log(struct r5l_log *log, struct md_rdev *rdev,
+			  enum r5l_modify_log_operation operation);
 extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
 extern void r5l_write_stripe_run(struct r5l_log *log);
 extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index b21b3bfa8f36..87e718f5b29f 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -291,6 +291,12 @@ static void ppl_submit_iounit(struct r5l_io_unit *io)
 
 	bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
 	bio->bi_private = io;
+
+	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
+		ppl_log_endio(bio);
+		return;
+	}
+
 	bio->bi_end_io = ppl_log_endio;
 	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
 	bio->bi_bdev = log->rdev->bdev;
@@ -1008,6 +1014,47 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
 	return ret;
 }
 
+static int __ppl_modify_log(struct r5l_log *log, struct md_rdev *rdev,
+			    enum r5l_modify_log_operation operation)
+{
+	struct r5l_log *log_child;
+	struct ppl_conf *ppl_conf = log->private;
+	int ret = 0;
+	char b[BDEVNAME_SIZE];
+
+	if (!rdev)
+		return -EINVAL;
+
+	pr_debug("%s: disk: %d operation: %s dev: %s\n",
+		 __func__, rdev->raid_disk,
+		 operation == R5L_MODIFY_LOG_DISK_REMOVE ? "remove" :
+		 (operation == R5L_MODIFY_LOG_DISK_ADD ? "add" : "?"),
+		 bdevname(rdev->bdev, b));
+
+	if (rdev->raid_disk < 0)
+		return 0;
+
+	if (rdev->raid_disk >= ppl_conf->count)
+		return -ENODEV;
+
+	log_child = &ppl_conf->child_logs[rdev->raid_disk];
+
+	mutex_lock(&log_child->io_mutex);
+	if (operation == R5L_MODIFY_LOG_DISK_REMOVE) {
+		log_child->rdev = NULL;
+	} else if (operation == R5L_MODIFY_LOG_DISK_ADD) {
+		log_child->rdev = rdev;
+		if (rdev->mddev->external)
+			log_child->uuid_checksum = log->uuid_checksum;
+		ret = ppl_write_empty_header(log_child);
+	} else {
+		ret = -EINVAL;
+	}
+	mutex_unlock(&log_child->io_mutex);
+
+	return ret;
+}
+
 static int __ppl_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 {
 	struct ppl_conf *ppl_conf = log->private;
@@ -1027,6 +1074,7 @@ static void __ppl_write_stripe_run(struct r5l_log *log)
 struct r5l_policy r5l_ppl = {
 	.init_log = __ppl_init_log,
 	.exit_log = __ppl_exit_log,
+	.modify_log = __ppl_modify_log,
 	.write_stripe = __ppl_write_stripe,
 	.write_stripe_run = __ppl_write_stripe_run,
 	.flush_stripe_to_raid = NULL,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c015925710f5..a711bf940116 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7576,6 +7576,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			*rdevp = rdev;
 		}
 	}
+	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+		err = r5l_modify_log(conf->log, rdev,
+				     R5L_MODIFY_LOG_DISK_REMOVE);
+		if (err)
+			goto abort;
+	}
 	if (p->replacement) {
 		/* We must have just cleared 'rdev' */
 		p->rdev = p->replacement;
@@ -7585,6 +7591,10 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			   */
 		p->replacement = NULL;
 		clear_bit(WantReplacement, &rdev->flags);
+
+		if (test_bit(MD_HAS_PPL, &mddev->flags))
+			err = r5l_modify_log(conf->log, p->rdev,
+					     R5L_MODIFY_LOG_DISK_ADD);
 	} else
 		/* We might have just removed the Replacement as faulty-
 		 * clear the bit just in case
@@ -7648,6 +7658,11 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			if (rdev->saved_raid_disk != disk)
 				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
+
+			if (test_bit(MD_HAS_PPL, &mddev->flags))
+				err = r5l_modify_log(conf->log, rdev,
+						     R5L_MODIFY_LOG_DISK_ADD);
+
 			goto out;
 		}
 	}
-- 
2.11.0


^ permalink raw reply related

* [PATCH v3 9/9] raid5-ppl: runtime PPL enabling or disabling
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>

Allow writing to 'consistency_policy' attribute when the array is
active. Add a new function 'change_consistency_policy' to the
md_personality operations structure to handle the change in the
personality code. Values "ppl" and "resync" are accepted and
turn PPL on and off respectively.

When enabling PPL its location and size should first be set using
'ppl_sector' and 'ppl_size' attributes and a valid PPL header should be
written at this location on each member device.

Enabling or disabling PPL is performed under a suspended array.  The
raid5_reset_stripe_cache function frees the stripe cache and allocates
it again in order to allocate or free the ppl_pages for the stripes in
the stripe cache.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/md.c        | 12 +++++++---
 drivers/md/md.h        |  2 ++
 drivers/md/raid5-ppl.c | 19 +++++++++++++++-
 drivers/md/raid5.c     | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 47ab3afe87cb..efbaa2f90eb0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5051,14 +5051,20 @@ consistency_policy_show(struct mddev *mddev, char *page)
 static ssize_t
 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
 {
+	int err = 0;
+
 	if (mddev->pers) {
-		return -EBUSY;
+		if (mddev->pers->change_consistency_policy)
+			err = mddev->pers->change_consistency_policy(mddev, buf);
+		else
+			err = -EBUSY;
 	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
 		set_bit(MD_HAS_PPL, &mddev->flags);
-		return len;
 	} else {
-		return -EINVAL;
+		err = -EINVAL;
 	}
+
+	return err ? err : len;
 }
 
 static struct md_sysfs_entry md_consistency_policy =
diff --git a/drivers/md/md.h b/drivers/md/md.h
index abdb5f2ed2d3..ead1aeff9302 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -548,6 +548,8 @@ struct md_personality
 	/* congested implements bdi.congested_fn().
 	 * Will not be called while array is 'suspended' */
 	int (*congested)(struct mddev *mddev, int bits);
+	/* Changes the consistency policy of an active array. */
+	int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
 };
 
 struct md_sysfs_entry {
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 87e718f5b29f..5a8e7a56eac6 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -873,7 +873,20 @@ static void __ppl_exit_log(struct r5l_log *log)
 {
 	struct ppl_conf *ppl_conf = log->private;
 
-	kfree(ppl_conf->child_logs);
+	if (ppl_conf->child_logs) {
+		struct r5l_log *log_child;
+		int i;
+
+		for (i = 0; i < ppl_conf->count; i++) {
+			log_child = &ppl_conf->child_logs[i];
+			if (log_child->rdev) {
+				log_child->rdev->ppl.offset = 0;
+				log_child->rdev->ppl.sector = 0;
+				log_child->rdev->ppl.size = 0;
+			}
+		}
+		kfree(ppl_conf->child_logs);
+	}
 	kfree(ppl_conf);
 
 	mempool_destroy(log->meta_pool);
@@ -1004,6 +1017,10 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
 		 */
 		mddev->recovery_cp = MaxSector;
 		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+	} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
+		/* no mismatch allowed when enabling PPL for a running array */
+		ret = -EINVAL;
+		goto err;
 	}
 
 	conf->log = log;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a711bf940116..a095ac2f5d64 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -8272,6 +8272,66 @@ static void *raid6_takeover(struct mddev *mddev)
 	return setup_conf(mddev);
 }
 
+static void raid5_reset_stripe_cache(struct mddev *mddev)
+{
+	struct r5conf *conf = mddev->private;
+
+	mutex_lock(&conf->cache_size_mutex);
+	while (conf->max_nr_stripes &&
+	       drop_one_stripe(conf))
+		;
+	while (conf->min_nr_stripes > conf->max_nr_stripes &&
+	       grow_one_stripe(conf, GFP_KERNEL))
+		;
+	mutex_unlock(&conf->cache_size_mutex);
+}
+
+static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
+{
+	struct r5conf *conf;
+	int err;
+
+	err = mddev_lock(mddev);
+	if (err)
+		return err;
+	conf = mddev->private;
+	if (!conf) {
+		mddev_unlock(mddev);
+		return -ENODEV;
+	}
+
+	if (strncmp(buf, "ppl", 3) == 0 &&
+	    !test_bit(MD_HAS_PPL, &mddev->flags) &&
+	    !test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+	    !mddev->bitmap) {
+		mddev_suspend(mddev);
+		set_bit(MD_HAS_PPL, &mddev->flags);
+		err = r5l_init_log(conf, NULL);
+		if (err)
+			clear_bit(MD_HAS_PPL, &mddev->flags);
+		else
+			raid5_reset_stripe_cache(mddev);
+		mddev_resume(mddev);
+	} else if (strncmp(buf, "resync", 6) == 0 &&
+		   test_bit(MD_HAS_PPL, &mddev->flags)) {
+		mddev_suspend(mddev);
+		r5l_exit_log(conf->log);
+		conf->log = NULL;
+		clear_bit(MD_HAS_PPL, &mddev->flags);
+		raid5_reset_stripe_cache(mddev);
+		mddev_resume(mddev);
+	} else {
+		err = -EINVAL;
+	}
+
+	if (!err)
+		md_update_sb(mddev, 1);
+
+	mddev_unlock(mddev);
+
+	return err;
+}
+
 static struct md_personality raid6_personality =
 {
 	.name		= "raid6",
@@ -8317,6 +8377,7 @@ static struct md_personality raid5_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid5_takeover,
 	.congested	= raid5_congested,
+	.change_consistency_policy = raid5_change_consistency_policy,
 };
 
 static struct md_personality raid4_personality =
-- 
2.11.0


^ permalink raw reply related

* Re: drives failed during reshape, array won't even force-assemble
From: Thomas Warntjen @ 2017-01-30 19:57 UTC (permalink / raw)
  To: Phil Turmel, linux-raid
In-Reply-To: <73c909f6-5bf3-06c5-368f-2a641a3445b7@turmel.org>

Hi Phil,

thanks for your reply - sadly it's the first I got so no, I haven't 
solved it yet. Any help is still highly appreciated!

Thomas

^ permalink raw reply

* Re: [systemd-devel] Errorneous detection of degraded array
From: NeilBrown @ 2017-01-30 22:19 UTC (permalink / raw)
  To: Andrei Borzenkov
  Cc: Luke Pyzowski, systemd-devel@lists.freedesktop.org, linux-raid
In-Reply-To: <CAA91j0V92RXDky-AnD6w+Dy=M7KJVCWyssA7yHRfRqBxLTWvog@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3172 bytes --]

On Mon, Jan 30 2017, Andrei Borzenkov wrote:

> On Mon, Jan 30, 2017 at 9:36 AM, NeilBrown <neilb@suse.com> wrote:
> ...
>>>>>>
>>>>>> systemd[1]: Created slice system-mdadm\x2dlast\x2dresort.slice.
>>>>>> systemd[1]: Starting system-mdadm\x2dlast\x2dresort.slice.
>>>>>> systemd[1]: Starting Activate md array even though degraded...
>>>>>> systemd[1]: Stopped target Local File Systems.
>>>>>> systemd[1]: Stopping Local File Systems.
>>>>>> systemd[1]: Unmounting /share...
>>>>>> systemd[1]: Stopped (with error) /dev/md0.
>>>>
> ...
>>
>> The race is, I think, that one I mentioned.  If the md device is started
>> before udev tells systemd to start the timer, the Conflicts dependencies
>> goes the "wrong" way and stops the wrong thing.
>>
>
> From the logs provided it is unclear whether it is *timer* or
> *service*. If it is timer - I do not understand why it is started
> exactly 30 seconds after device apparently appears. This would match
> starting service.

My guess is that the timer is triggered immediately after the device is
started, but before it is mounted.
The Conflicts directive tries to stop the device, but is cannot stop the
device and there are no dependencies yet, so nothing happen.
After the timer fires (30 seconds later) the .service starts.  It also
has a Conflicts directory so systemd tried to stop the device again.
Now that it has been mounted, there is a dependences that can be
stopped, and the device gets unmounted.

>
> Yet another case where system logging is hopelessly unfriendly for
> troubleshooting :(
>
>> It would be nice to be able to reliably stop the timer when the device
>> starts, without risking having the device get stopped when the timer
>> starts, but I don't think we can reliably do that.
>>
>
> Well, let's wait until we can get some more information about what happens.
>
>> Changing the
>>   Conflicts=sys-devices-virtual-block-%i.device
>> lines to
>>   ConditionPathExists=/sys/devices/virtual/block/%i
>> might make the problem go away, without any negative consequences.
>>
>
> Ugly, but yes, may be this is the only way using current systemd.
>
>> The primary purpose of having the 'Conflicts' directives was so that
>> systemd wouldn't log
>>   Starting Activate md array even though degraded
>> after the array was successfully started.
>
> This looks like cosmetic problem. What will happen if last resort
> service is started when array is fully assembled? Will it do any harm?

Yes, it could be seen as cosmetic, but cosmetic issues can be important
too.  Confusing messages in logs can be harmful.

In all likely cases, running the last-resort service won't cause any
harm.
If, during the 30 seconds, the array is started, then deliberately
stopped, then partially assembled again, then when the last-resort
service finally starts it might do the wrong thing.
So it would be cleanest if the timer was killed as soon as the device
is started.  But I don't think there is a practical concern.

I guess I could make a udev rule that fires when the array started, and
that runs "systemctl stop mdadm-last-resort@md0.timer"

NeilBrown


>
>> Hopefully it won't do that when the Condition fails.
>>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* RE: [systemd-devel] Errorneous detection of degraded array
From: Luke Pyzowski @ 2017-01-30 22:41 UTC (permalink / raw)
  To: 'Andrei Borzenkov',
	'systemd-devel@lists.freedesktop.org',
	linux-raid@vger.kernel.org
In-Reply-To: <4504399b-4d6f-a18a-d64a-e46ecd8efa46@gmail.com>

> Does
>   systemctl  list-dependencies  sys-devices-virtual-block-md0.device
> report anything interesting?  I get
>
> sys-devices-virtual-block-md0.device
> ● └─mdmonitor.service

Nothing interesting, the same output as you have above.



> Could you try run with systemd.log_level=debug on kernel command line and upload journal again. We can only hope that it will not skew timings enough but it may prove my hypothesis.

I've uploaded the full debug logs to: https://gist.github.com/Kryai/8273322c8a61347e2300e476c70b4d05
In around 20 reboots, the error appeared only twice, certainly with debug enabled it is more rare, but it does still occur, but to your correct guess, debug logging does affect the exhibition of the race condition.

Reminder of key things in the log:
# cat /etc/systemd/system/mdadm-last-resort@.timer 
[Unit]
Description=Timer to wait for more drives before activating degraded array.
DefaultDependencies=no
Conflicts=sys-devices-virtual-block-%i.device

[Timer]
OnActiveSec=30



# cat /etc/systemd/system/share.mount 
[Unit]
Description=Mount /share RAID partition explicitly
Before=nfs-server.service

[Mount]
What=/dev/disk/by-uuid/2b9114be-3d5a-41d7-8d4b-e5047d223129
Where=/share
Type=ext4
Options=defaults
TimeoutSec=120

[Install]
WantedBy=multi-user.target


Again, if any more information is needed please let me know I'll provide it.


Many thanks,
Luke Pyzowski

^ permalink raw reply

* Re: [PATCH 1/2] md: add bad block flag to disk state
From: Shaohua Li @ 2017-01-30 23:33 UTC (permalink / raw)
  To: Tomasz Majchrzak; +Cc: linux-raid, jes.sorensen
In-Reply-To: <1485259419-2308-2-git-send-email-tomasz.majchrzak@intel.com>

On Tue, Jan 24, 2017 at 01:03:38PM +0100, Tomasz Majchrzak wrote:
> Add a new flag to report that bad blocks are present on a disk. It will
> allow userspace to notify the user of the problem.
> 
> Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
> ---
>  drivers/md/md.c                | 2 ++
>  include/uapi/linux/raid/md_p.h | 1 +
>  2 files changed, 3 insertions(+)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 0abb147..1a807ec 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -6034,6 +6034,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
>  			info.state |= (1<<MD_DISK_WRITEMOSTLY);
>  		if (test_bit(FailFast, &rdev->flags))
>  			info.state |= (1<<MD_DISK_FAILFAST);
> +		if (rdev->badblocks.count)
> +			info.state |= (1<<MD_DISK_BB_PRESENT);

Userspace can find if a disk has badblocks by reading the bad_blocks sysfs
file. Why adds another interface?

Thanks,
Shaohua

>  	} else {
>  		info.major = info.minor = 0;
>  		info.raid_disk = -1;
> diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
> index 9930f3e..b151e93 100644
> --- a/include/uapi/linux/raid/md_p.h
> +++ b/include/uapi/linux/raid/md_p.h
> @@ -93,6 +93,7 @@
>  				   * read requests will only be sent here in
>  				   * dire need
>  				   */
> +#define MD_DISK_BB_PRESENT	11 /* disk has bad blocks */
>  #define MD_DISK_JOURNAL		18 /* disk is used as the write journal in RAID-5/6 */
>  
>  #define MD_DISK_ROLE_SPARE	0xffff
> -- 
> 1.8.3.1
> 

^ permalink raw reply

* Re: [PATCH v1] md/r5cache: improve journal device efficiency
From: Shaohua Li @ 2017-01-31  0:11 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
	liuzhengyuan, liuyun01, jsorensen
In-Reply-To: <20170124220823.1481119-1-songliubraving@fb.com>

On Tue, Jan 24, 2017 at 02:08:23PM -0800, Song Liu wrote:
> It is important to be able to flush all stripes in raid5-cache.
> Therefore, we need reserve some space on the journal device for
> these flushes. If flush operation includes pending writes to the
> stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
> for the flush out. This reduces the efficiency of journal space.
> If we exclude these pending writes from flush operation, we only
> need (conf->max_degraded + 1) pages per stripe.
> 
> With this patch, when log space is critical (R5C_LOG_CRITICAL=1),
> pending writes will be excluded from stripe flush out. Therefore,
> we can reduce reserved space for flush out and thus improve journal
> device efficiency.

Applied, thanks! 
> - * To improve this, we will need writing-out phase to be able to NOT include
> - * pending writes, which will reduce the requirement to
> - * (conf->max_degraded + 1) pages per stripe in cache.
> + * In cache flush, the stripe goes through 1 and then 2. For a stripe that
> + * already passed 1, flushing it requires at most (conf->raid_disks + 1)
                                                      ^ I changed it to conf->max_degraded
> + * pages of journal space. For stripes that has not passed 1, flushing it
> + * requires (conf->max_degraded + 1) pages of journal space. There are at
                ^ I changed it to conf->raid_disks


> + * most (conf->group_cnt + 1) stripe that passed 1. So total journal space
> + * required to flush all cached stripes (in pages) is:
> + *
> + *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
> + *     (group_cnt + 1) * (raid_disks + 1)
> + * or
> + *     (stripe_in_journal_count) * (max_degraded + 1) +
> + *     (group_cnt + 1) * (raid_disks - max_degraded)
>   */
>  static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
>  {
> @@ -408,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
>  	if (!r5c_is_writeback(log))
>  		return 0;
>  
> -	return BLOCK_SECTORS * (conf->raid_disks + 1) *
> -		atomic_read(&log->stripe_in_journal_count);
> +	return BLOCK_SECTORS *
> +		((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
> +		 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
>  }

^ permalink raw reply

* Re: drives failed during reshape, array won't even force-assemble
From: Phil Turmel @ 2017-01-31  0:29 UTC (permalink / raw)
  To: Thomas Warntjen, linux-raid
In-Reply-To: <d49d7786-1174-0e96-e857-f2b2b6144c1b@warntjen.net>

On 01/30/2017 02:57 PM, Thomas Warntjen wrote:
> Hi Phil,
> 
> thanks for your reply - sadly it's the first I got so no, I haven't
> solved it yet. Any help is still highly appreciated!

Ok.

I'm a bit surprised forced assembly didn't work.  Please provide fresh
mdadm --examine output for all member devices (untrimmed), plus the
output from "ls -l /dev/disk/by-id/ata-*".

That'll help.  Please paste inline and turn off line wrap, so it all
comes through neatly.

Phil


^ permalink raw reply

* Re: [dm-devel] split scsi passthrough fields out of struct request V2
From: Bart Van Assche @ 2017-01-31  1:12 UTC (permalink / raw)
  To: hch@lst.de, axboe@fb.com
  Cc: linux-scsi@vger.kernel.org, linux-raid@vger.kernel.org,
	dm-devel@redhat.com, linux-block@vger.kernel.org,
	snitzer@redhat.com, j-nomura@ce.jp.nec.com
In-Reply-To: <2c696943-2a44-4f36-f0f8-0bebceb95a4a@fb.com>

On Fri, 2017-01-27 at 09:56 -0700, Jens Axboe wrote:
> On 01/27/2017 09:52 AM, Bart Van Assche wrote:
> > [  215.724452] general protection fault: 0000 [#1] SMP
> > [  215.725060] Call Trace:
> > [  215.725086]  scsi_disk_put+0x2d/0x40
> > [  215.725110]  sd_release+0x3d/0xb0
> > [  215.725137]  __blkdev_put+0x29e/0x360
> > [  215.725163]  blkdev_put+0x49/0x170
> > [  215.725192]  dm_put_table_device+0x58/0xc0 [dm_mod]
> > [  215.725219]  dm_put_device+0x70/0xc0 [dm_mod]
> > [  215.725269]  free_priority_group+0x92/0xc0 [dm_multipath]
> > [  215.725295]  free_multipath+0x70/0xc0 [dm_multipath]
> > [  215.725320]  multipath_dtr+0x19/0x20 [dm_multipath]
> > [  215.725348]  dm_table_destroy+0x67/0x120 [dm_mod]
> > [  215.725379]  dev_suspend+0xde/0x240 [dm_mod]
> > [  215.725434]  ctl_ioctl+0x1f5/0x520 [dm_mod]
> > [  215.725489]  dm_ctl_ioctl+0xe/0x20 [dm_mod]
> > [  215.725515]  do_vfs_ioctl+0x8f/0x700
> > [  215.725589]  SyS_ioctl+0x3c/0x70
> > [  215.725614]  entry_SYSCALL_64_fastpath+0x18/0xad
> > 
> 
> I have no idea what this is, I haven't messed with life time or devices
> or queues at all in that branch.

Hello Jens,

Running the srp-test software against kernel 4.9.6 and kernel 4.10-rc5
went fine. With your for-4.11/block branch (commit 400f73b23f457a) however
I just ran into the following:

[  214.555527] ------------[ cut here ]------------
[  214.555565] WARNING: CPU: 5 PID: 13201 at kernel/locking/lockdep.c:3514 lock_release+0x346/0x480
[  214.555588] DEBUG_LOCKS_WARN_ON(depth <= 0)
[  214.555824] CPU: 5 PID: 13201 Comm: fio Not tainted 4.10.0-rc3-dbg+ #1
[  214.555846] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014
[  214.555867] Call Trace:
[  214.555889]  dump_stack+0x68/0x93
[  214.555911]  __warn+0xc6/0xe0
[  214.555953]  warn_slowpath_fmt+0x4a/0x50
[  214.555973]  lock_release+0x346/0x480
[  214.556021]  aio_write+0x106/0x140
[  214.556067]  do_io_submit+0x37d/0x900
[  214.556108]  SyS_io_submit+0xb/0x10
[  214.556131]  entry_SYSCALL_64_fastpath+0x18/0xad

I will continue to try to figure out what is causing this behavior.

Bart.

^ permalink raw reply

* Re: [dm-devel] split scsi passthrough fields out of struct request V2
From: Jens Axboe @ 2017-01-31  1:38 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: hch@lst.de, axboe@fb.com, linux-scsi@vger.kernel.org,
	linux-raid@vger.kernel.org, dm-devel@redhat.com,
	linux-block@vger.kernel.org, snitzer@redhat.com,
	j-nomura@ce.jp.nec.com
In-Reply-To: <1485825148.2669.18.camel@sandisk.com>



> On Jan 30, 2017, at 5:12 PM, Bart Van Assche <Bart.VanAssche@sandisk.com> wrote:
> 
>> On Fri, 2017-01-27 at 09:56 -0700, Jens Axboe wrote:
>>> On 01/27/2017 09:52 AM, Bart Van Assche wrote:
>>> [  215.724452] general protection fault: 0000 [#1] SMP
>>> [  215.725060] Call Trace:
>>> [  215.725086]  scsi_disk_put+0x2d/0x40
>>> [  215.725110]  sd_release+0x3d/0xb0
>>> [  215.725137]  __blkdev_put+0x29e/0x360
>>> [  215.725163]  blkdev_put+0x49/0x170
>>> [  215.725192]  dm_put_table_device+0x58/0xc0 [dm_mod]
>>> [  215.725219]  dm_put_device+0x70/0xc0 [dm_mod]
>>> [  215.725269]  free_priority_group+0x92/0xc0 [dm_multipath]
>>> [  215.725295]  free_multipath+0x70/0xc0 [dm_multipath]
>>> [  215.725320]  multipath_dtr+0x19/0x20 [dm_multipath]
>>> [  215.725348]  dm_table_destroy+0x67/0x120 [dm_mod]
>>> [  215.725379]  dev_suspend+0xde/0x240 [dm_mod]
>>> [  215.725434]  ctl_ioctl+0x1f5/0x520 [dm_mod]
>>> [  215.725489]  dm_ctl_ioctl+0xe/0x20 [dm_mod]
>>> [  215.725515]  do_vfs_ioctl+0x8f/0x700
>>> [  215.725589]  SyS_ioctl+0x3c/0x70
>>> [  215.725614]  entry_SYSCALL_64_fastpath+0x18/0xad
>>> 
>> 
>> I have no idea what this is, I haven't messed with life time or devices
>> or queues at all in that branch.
> 
> Hello Jens,
> 
> Running the srp-test software against kernel 4.9.6 and kernel 4.10-rc5
> went fine. With your for-4.11/block branch (commit 400f73b23f457a) however
> I just ran into the following:
> 
> [  214.555527] ------------[ cut here ]------------
> [  214.555565] WARNING: CPU: 5 PID: 13201 at kernel/locking/lockdep.c:3514 lock_release+0x346/0x480
> [  214.555588] DEBUG_LOCKS_WARN_ON(depth <= 0)
> [  214.555824] CPU: 5 PID: 13201 Comm: fio Not tainted 4.10.0-rc3-dbg+ #1
> [  214.555846] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014
> [  214.555867] Call Trace:
> [  214.555889]  dump_stack+0x68/0x93
> [  214.555911]  __warn+0xc6/0xe0
> [  214.555953]  warn_slowpath_fmt+0x4a/0x50
> [  214.555973]  lock_release+0x346/0x480
> [  214.556021]  aio_write+0x106/0x140
> [  214.556067]  do_io_submit+0x37d/0x900
> [  214.556108]  SyS_io_submit+0xb/0x10
> [  214.556131]  entry_SYSCALL_64_fastpath+0x18/0xad
> 
> I will continue to try to figure out what is causing this behavior.

That's a known bug in mainline. Pull it into 4.10-rc6,
or use my for-next where everything is already merged. 



^ permalink raw reply

* Re: split scsi passthrough fields out of struct request V2
From: Jens Axboe @ 2017-01-31  4:13 UTC (permalink / raw)
  To: Jens Axboe, Bart Van Assche
  Cc: linux-block@vger.kernel.org, linux-scsi@vger.kernel.org,
	snitzer@redhat.com, linux-raid@vger.kernel.org,
	dm-devel@redhat.com, j-nomura@ce.jp.nec.com, hch@lst.de
In-Reply-To: <4D024E85-CDE7-4FB0-B8CA-F2B8C86CCFCB@kernel.dk>

On 01/30/2017 05:38 PM, Jens Axboe wrote:
> 
> 
>> On Jan 30, 2017, at 5:12 PM, Bart Van Assche <Bart.VanAssche@sandisk.com> wrote:
>>
>>> On Fri, 2017-01-27 at 09:56 -0700, Jens Axboe wrote:
>>>> On 01/27/2017 09:52 AM, Bart Van Assche wrote:
>>>> [  215.724452] general protection fault: 0000 [#1] SMP
>>>> [  215.725060] Call Trace:
>>>> [  215.725086]  scsi_disk_put+0x2d/0x40
>>>> [  215.725110]  sd_release+0x3d/0xb0
>>>> [  215.725137]  __blkdev_put+0x29e/0x360
>>>> [  215.725163]  blkdev_put+0x49/0x170
>>>> [  215.725192]  dm_put_table_device+0x58/0xc0 [dm_mod]
>>>> [  215.725219]  dm_put_device+0x70/0xc0 [dm_mod]
>>>> [  215.725269]  free_priority_group+0x92/0xc0 [dm_multipath]
>>>> [  215.725295]  free_multipath+0x70/0xc0 [dm_multipath]
>>>> [  215.725320]  multipath_dtr+0x19/0x20 [dm_multipath]
>>>> [  215.725348]  dm_table_destroy+0x67/0x120 [dm_mod]
>>>> [  215.725379]  dev_suspend+0xde/0x240 [dm_mod]
>>>> [  215.725434]  ctl_ioctl+0x1f5/0x520 [dm_mod]
>>>> [  215.725489]  dm_ctl_ioctl+0xe/0x20 [dm_mod]
>>>> [  215.725515]  do_vfs_ioctl+0x8f/0x700
>>>> [  215.725589]  SyS_ioctl+0x3c/0x70
>>>> [  215.725614]  entry_SYSCALL_64_fastpath+0x18/0xad
>>>>
>>>
>>> I have no idea what this is, I haven't messed with life time or devices
>>> or queues at all in that branch.
>>
>> Hello Jens,
>>
>> Running the srp-test software against kernel 4.9.6 and kernel 4.10-rc5
>> went fine. With your for-4.11/block branch (commit 400f73b23f457a) however
>> I just ran into the following:
>>
>> [  214.555527] ------------[ cut here ]------------
>> [  214.555565] WARNING: CPU: 5 PID: 13201 at kernel/locking/lockdep.c:3514 lock_release+0x346/0x480
>> [  214.555588] DEBUG_LOCKS_WARN_ON(depth <= 0)
>> [  214.555824] CPU: 5 PID: 13201 Comm: fio Not tainted 4.10.0-rc3-dbg+ #1
>> [  214.555846] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014
>> [  214.555867] Call Trace:
>> [  214.555889]  dump_stack+0x68/0x93
>> [  214.555911]  __warn+0xc6/0xe0
>> [  214.555953]  warn_slowpath_fmt+0x4a/0x50
>> [  214.555973]  lock_release+0x346/0x480
>> [  214.556021]  aio_write+0x106/0x140
>> [  214.556067]  do_io_submit+0x37d/0x900
>> [  214.556108]  SyS_io_submit+0xb/0x10
>> [  214.556131]  entry_SYSCALL_64_fastpath+0x18/0xad
>>
>> I will continue to try to figure out what is causing this behavior.
> 
> That's a known bug in mainline. Pull it into 4.10-rc6,
> or use my for-next where everything is already merged.

Since I'm not on the phone anymore, this is the commit that was
merged after my for-4.11/block was forked, which fixes this issue:

commit a12f1ae61c489076a9aeb90bddca7722bf330df3
Author: Shaohua Li <shli@fb.com>
Date:   Tue Dec 13 12:09:56 2016 -0800

    aio: fix lock dep warning

So you can just pull that in, if you want, or do what I suggested above.

-- 
Jens Axboe

^ permalink raw reply

* [PATCH v3] DM: dm-inplace-compress: inplace compressed DM target
From: Ram Pai @ 2017-01-31  7:42 UTC (permalink / raw)
  To: dm-devel, linux-doc, linux-kernel, linux-raid
  Cc: hbabu, shli, snitzer, agk, corbet

This  patch   provides   a   generic     device-mapper   compression  device.
Originally written by Shaohua Li.
https://www.redhat.com/archives/dm-devel/2013-December/msg00143.html

I have optimized and hardened the code.

Testing:
-------
This compression block device  is  tested in the following scenarios
a) backing a ext4 filesystem 
b) backing swap
Its stress tested on PPC64 and x86 system.

Version v1:
	Comments from Alasdair have been incorporated.
	https://www.redhat.com/archives/dm-devel/2013-December/msg00144.html

Version v2:
	All patches are merged into a single patch.
	Major code re-arrangement.
	Data and metablocks allocated based on the length of the device
	map rather than the size of the backing device.
        Size   of   each entry   in  the bitmap array is explicitly set
	 to 32bits.
	Attempt  to  reuse  the  provided  bio  buffer  space   instead
	 of allocating a new one.

Version v3:
	Fixed  sector  alignment  bugs  exposed  while   testing on x86.
	Explicitly set the maximum request size  to 128K.  Without which
	range  locking  failed,  causing  I/Os  to  stamp   each  other.
	Fixed an occasional data corruption  caused by wrong size of the
	compression buffer.
	Added  a   parameter   while  creation  of  the   block  device, 
	to  not  sleep   during  memory  allocations. This can be useful
       	if the device is used as a swap device.

Your comments to improve the code is very much appreciated.

Ram Pai (1):
  From: Shaohua Li <shli@kernel.org>

 .../device-mapper/dm-inplace-compress.txt          |  155 ++
 drivers/md/Kconfig                                 |    6 +
 drivers/md/Makefile                                |    2 +
 drivers/md/dm-inplace-compress.c                   | 2214 ++++++++++++++++++++
 drivers/md/dm-inplace-compress.h                   |  187 ++
 5 files changed, 2564 insertions(+)
 create mode 100644 Documentation/device-mapper/dm-inplace-compress.txt
 create mode 100644 drivers/md/dm-inplace-compress.c
 create mode 100644 drivers/md/dm-inplace-compress.h

-- 
1.8.3.1

^ permalink raw reply

* [PATCH v3 1/1] DM: inplace compressed DM target
From: Ram Pai @ 2017-01-31  7:42 UTC (permalink / raw)
  To: dm-devel, linux-doc, linux-kernel, linux-raid
  Cc: hbabu, shli, snitzer, agk, corbet
In-Reply-To: <1485848533-27778-1-git-send-email-linuxram@us.ibm.com>

This is a simple DM target supporting inplace compression. Its best
suited for SSD. The underlying disk must support 512B sector size.
The target only supports 4k sector size.

Disk layout:
|super|...meta...|..data...|

Store unit is 4k (a block). Super is 1 block, which stores meta  and
data size and compression algorithm. Meta is a bitmap. For each data
 block, there are 5 bits meta.

Data:

Data of   a block is compressed. Compressed  data  is round up to 512B,
which is the payload. In disk, payload is  stored at  the beginning of
logical sector  of the block. Let's look  at an example.  Say we store
data to block A, which  is in sector  B(A*8), its orginal  size is 4k,
compressed size is  1500.    Compressed     data (CD)  will  use three
sectors (512B). The three  sectors  are the  payload. Payload  will be
stored at sector B.

---------------------------------------------------
... | CD1 | CD2 | CD3 |   |   |   |   |    | ...
---------------------------------------------------
    ^B    ^B+1  ^B+2                  ^B+7 ^B+8

For this block, we will not use sector B+3 to B+7 (a hole). We use four
meta  bits  to  present payload  size. The compressed size (1500) isn't
stored in meta directly. Instead, we  store  it  at  the last 32bits of
payload. In this  example, we store it at the  end  of  sector  B+2. If
compressed size + sizeof(32bits)  crosses a   sector, payload size will
increase one sector.  If payload  uses 8 sectors, we store uncompressed
data directly.

If IO size is bigger than one block, we can store the data as an extent.
Data of the  whole extent will compressed and stored in the similar way
like above.  The first  block of the extent is the head, all others are
the tail.  If extent is 1 block,  the  block  is head. We have 1 bit of
meta to present if a  block  is  head  or  tail. If 4 meta bits of head
block can't  store  extent payload size, we will borrow tail block meta
bits to  store  payload  size.   Max  allowd extent size is 128k, so we
don't compress/decompress too big size data.

Meta:
Modifying   data   will modify meta too. Meta will be written(flush) to
disk   depending   on   meta   write   policy. We support writeback and
writethrough mode.  In  writeback mode, meta will be written to disk in
an interval or a  FLUSH  request.  In  writethrough mode, data and meta
data will be written to disk together.

Advantages:

1. Simple. Since  we  store  compressed  data  in-place,  we don't need
   complicated disk data management.
2. Efficient. For  each  4k, we only need 5 bits meta. 1T data will use
less than 200M meta, so we  can  load  all meta into memory. And actual
compression size is in payload. So   if  IO doesn't need RMW and we use
writeback meta flush, we don't  need  extra IO for meta.

Disadvantages:

1. hole. Since we   store  compressed data in-place, there are a lot of
   holes (in above  example,  B+3 - B+7) Hole can impact IO, because we
   can't do IO merge.

2. 1:1 size. Compression  doesn't  change disk  size. If disk is 1T, we
   can only store 1T data even we do compression.

But this is for SSD only. Generally SSD firmware has a FTL layer to map
disk  sectors  to flash nand. High end SSD firmware has filesystem-like
FTL.

1. hole. Disk has a lot of holes, but SSD FTL   can   still  store data
   contiguous in nand. Even if we can't do IO   merge in  OS layer, SSD
   firmware can do it.

2. 1:1 size. On one side, we write compressed data to SSD, which means
   less  data is  written to SSD. This will be very helpful to improve
   SSD garbage collection, and  so write speed and life cycle. So even
   this is a problem, the target  is still helpful. On the other side,
   advanced SSD FTL can easily do thin provision. For example, if nand
   is   1T   and   we   let   SSD   report   it   as   2T,   and   use
   the  SSD  as  compressed target. In such SSD, we don't have the 1:1
   size issue.

So even if   SSD   FTL   cannot   map   non-contiguous disk sectors to
contiguous nand, the compression target can still function well.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: Ram Pai  <linuxram@us.ibm.com>
---
 .../device-mapper/dm-inplace-compress.txt          |  155 ++
 drivers/md/Kconfig                                 |    6 +
 drivers/md/Makefile                                |    2 +
 drivers/md/dm-inplace-compress.c                   | 2214 ++++++++++++++++++++
 drivers/md/dm-inplace-compress.h                   |  187 ++
 5 files changed, 2564 insertions(+)
 create mode 100644 Documentation/device-mapper/dm-inplace-compress.txt
 create mode 100644 drivers/md/dm-inplace-compress.c
 create mode 100644 drivers/md/dm-inplace-compress.h

diff --git a/Documentation/device-mapper/dm-inplace-compress.txt b/Documentation/device-mapper/dm-inplace-compress.txt
new file mode 100644
index 0000000..1835369
--- /dev/null
+++ b/Documentation/device-mapper/dm-inplace-compress.txt
@@ -0,0 +1,155 @@
+Device-Mapper's "inplace-compress" target provides inplace compression of block
+devices using the kernel compression API.
+
+Parameters: <device path> \
+	[ <#opt_params writethough> ]
+	[ <#opt_params <writeback> <meta_commit_delay> ]
+	[ <#opt_params compressor> <type> ]
+	[ <#opt_params critical>  ]
+
+
+<writethrough>
+    Write data and metadata together.
+
+<writeback> <meta_commit_delay>
+    Write metadata every 'meta_commit_delay' interval.
+
+<device path>
+    This is the device that is going to be used as backend and contains the
+    compressed data.  You can specify it as a path like /dev/xxx or a device
+    number <major>:<minor>.
+
+<compressor> <type>
+    Choose the compressor algorithm. 'lzo' and '842'
+    compressors are supported.
+
+<critical>
+    Block device used in critical path.
+
+Example scripts
+===============
+
+create a inplace-compress block device using lzo compression. Write metadata
+and data together.
+[[
+#!/bin/sh
+# Create a inplace-compress device using dmsetup
+device=$1  #your backing storage eg: /dev/sdc1
+size=80000 #size of your new compressed block device
+dmsetup create comp1 --table "0 $size inplacecompress $device
+		writethrough compressor lzo"
+]]
+
+
+create a inplace-compress block device using nx-842 hardware compression. Write
+metadata periodially every 5sec.
+
+[[
+#!/bin/sh
+# Create a inplace-compress device using dmsetup
+device=$1  #your backing storage eg: /dev/sdc1
+size=80000 #size of your new compressed block device
+dmsetup create comp1 --table "0 $size inplacecompress $device
+		writeback 5 compressor 842"
+]]
+
+
+Create a inplace-compress block device. Device is used in critical path;
+ex: swap device.
+
+[[
+#!/bin/sh
+# Create a inplace-compress device using dmsetup
+device=$1  #your backing storage eg: /dev/sdc1
+size=80000 #size of your new compressed block device
+dmsetup create comp1 --table "0 $size inplacecompress $device critical"
+]]
+
+Description
+===========
+    This is a simple DM target supporting inplace compression. Its best suited for
+    SSD. The underlying disk must support 512B sector size, the target only
+    supports 4k sector size.
+
+    Disk layout:
+    |super|...meta...|..data...|
+
+    Store unit is 4k (a block). Super is 1 block, which stores meta and data
+    size and compression algorithm. Meta is a bitmap. For each data block,
+    there are 5 bits meta.
+
+    Data:
+
+    Data of a block is compressed. Compressed data is round up to 512B, which
+    is the payload. In disk, payload is stored at the beginning of logical
+    sector of the block. Let's look at an example. Say we store data to block
+    A, which is in sector B(A*8), its orginal size is 4k, compressed size is
+    1500. Compressed data (CD) will use 3 sectors (512B). The 3 sectors are the
+    payload. Payload will be stored at sector B.
+
+    ---------------------------------------------------
+    ... | CD1 | CD2 | CD3 |   |   |   |   |    | ...
+    ---------------------------------------------------
+        ^B    ^B+1  ^B+2                  ^B+7 ^B+8
+
+    For this block, we will not use sector B+3 to B+7 (a hole). We use 4 meta
+    bits to present payload size. The compressed size (1500) isn't stored in
+    meta directly. Instead, we store it at the last 32bits of payload. In this
+    example, we store it at the end of sector B+2. If compressed size +
+    sizeof(32bits) crosses a sector, payload size will increase one sector. If
+    payload uses 8 sectors, we store uncompressed data directly.
+
+    If IO size is bigger than one block, we can store the data as an extent.
+    Data of the whole extent will compressed and stored in the similar way like
+    above.  The first block of the extent is the head, all others are the tail.
+    If extent is 1 block, the block is head. We have 1 bit of meta to present
+    if a block is head or tail. If 4 meta bits of head block can't store extent
+    payload size, we will borrow tail block meta bits to store payload size.
+    Max allowd extent size is 128k, so we don't compress/decompress too big
+    size data.
+
+    Meta:
+    Modifying data will modify meta too. Meta will be written(flush) to disk
+    depending on meta write policy. We support writeback and writethrough mode.
+    In writeback mode, meta will be written to disk in an interval or a FLUSH
+    request.  In writethrough mode, data and meta data will be written to disk
+    together.
+
+    Advantages:
+
+    1. Simple. Since we store compressed data in-place, we don't need complicated
+    disk data management.
+    2. Efficient. For each 4k, we only need 5 bits meta. 1T data will use less than
+    200M meta, so we can load all meta into memory. And actual compression size is
+    in payload. So if IO doesn't need RMW and we use writeback meta flush, we don't
+    need extra IO for meta.
+
+    Disadvantages:
+
+    1. hole. Since we store compressed data in-place, there are a lot of holes
+    (in above example, B+3 - B+7) Hole can impact IO, because we can't do IO
+    merge.
+
+    2. 1:1 size. Compression doesn't change disk size. If disk is 1T, we can
+    only store 1T data even we do compression.
+
+    But this is for SSD only. Generally SSD firmware has a FTL layer to map
+    disk sectors to flash nand. High end SSD firmware has filesystem-like FTL.
+
+    1. hole. Disk has a lot of holes, but SSD FTL can still store data continuous
+    in nand. Even if we can't do IO merge in OS layer, SSD firmware can do it.
+
+    2. 1:1 size. On one side, we write compressed data to SSD, which means less
+    data is written to SSD. This will be very helpful to improve SSD garbage
+    collection, and so write speed and life cycle. So even this is a problem, the
+    target is still helpful. On the other side, advanced SSD FTL can easily do thin
+    provision. For example, if nand is 1T and we let SSD report it as 2T, and use
+    the SSD as compressed target. In such SSD, we don't have the 1:1 size issue.
+
+    So even if SSD FTL cannot map non-continuous disk sectors to continuous nand,
+    the compression target can still function well.
+
+
+Author:
+	Shaohua Li <shli@fusionio.com>
+	Ram Pai <linuxram@us.ibm.com>
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b7767da..2eece2a 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -508,4 +508,10 @@ config DM_LOG_WRITES
 
 	  If unsure, say N.
 
+config DM_INPLACE_COMPRESS
+	tristate "Inplace Compression target"
+	depends on BLK_DEV_DM
+	---help---
+	  Allow volume managers to compress data for SSD.
+
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1a..4525482 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -59,6 +59,8 @@ obj-$(CONFIG_DM_CACHE_SMQ)	+= dm-cache-smq.o
 obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
+obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
+obj-$(CONFIG_DM_INPLACE_COMPRESS)	+= dm-inplace-compress.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/dm-inplace-compress.c b/drivers/md/dm-inplace-compress.c
new file mode 100644
index 0000000..ba12f04
--- /dev/null
+++ b/drivers/md/dm-inplace-compress.c
@@ -0,0 +1,2214 @@
+/*
+ *  device mapper compression block device.
+ *
+ *  Released under GPL v2.
+ *
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/crypto.h>
+#include <linux/lzo.h>
+#include <linux/kthread.h>
+#include <linux/page-flags.h>
+#include <linux/completion.h>
+#include <linux/vmalloc.h>
+#include "dm-inplace-compress.h"
+
+#define DM_MSG_PREFIX "dm-inplace-compress"
+
+
+static const struct kernel_param_ops dm_icomp_alloc_param_ops = {
+	.set    = param_set_ulong,
+	.get    = param_get_ulong,
+};
+
+static atomic64_t dm_icomp_total_alloc_size;
+#define DMCP_ALLOC(s) {atomic64_add(s, &dm_icomp_total_alloc_size); }
+#define DMCP_FREE_ALLOC(s) {atomic64_sub(s, &dm_icomp_total_alloc_size); }
+module_param_cb(dm_icomp_total_alloc_size, &dm_icomp_alloc_param_ops,
+				&dm_icomp_total_alloc_size, 0644);
+
+static atomic64_t dm_icomp_total_bio_save;
+#define DMCP_ALLOC_SAVE(s) {atomic64_add(s, &dm_icomp_total_bio_save); }
+module_param_cb(dm_icomp_total_bio_save, &dm_icomp_alloc_param_ops,
+				&dm_icomp_total_bio_save, 0644);
+
+
+static struct kmem_cache *dm_icomp_req_cachep;
+static struct kmem_cache *dm_icomp_io_range_cachep;
+static struct kmem_cache *dm_icomp_meta_io_cachep;
+
+static struct dm_icomp_io_worker dm_icomp_io_workers[NR_CPUS];
+static struct workqueue_struct *dm_icomp_wq;
+
+/*
+ *****************************************************
+ * compressor selection logic
+ *****************************************************
+ */
+static struct dm_icomp_compressor_data compressors[] = {
+	[DMCP_COMP_ALG_LZO] = {
+		.name = "lzo",
+		.can_handle_overflow = false,
+		.comp_len = lzo_comp_len,
+		.max_comp_len = lzo_max_comp_len,
+	},
+	[DMCP_COMP_ALG_842] = {
+		.name = "842",
+		.can_handle_overflow = true,
+		.comp_len = nx842_comp_len,
+		.max_comp_len = nx842_max_comp_len,
+	},
+};
+
+static int default_compressor = DMCP_COMP_ALG_LZO;
+#define DMCP_ALGO_LENGTH 9
+static char dm_icomp_algorithm[DMCP_ALGO_LENGTH] = "lzo";
+static struct kparam_string dm_icomp_compressor_kparam = {
+	.string =	dm_icomp_algorithm,
+	.maxlen =	sizeof(dm_icomp_algorithm),
+};
+static int dm_icomp_compressor_param_set(const char *,
+		const struct kernel_param *);
+static struct kernel_param_ops dm_icomp_compressor_param_ops = {
+	.set =	dm_icomp_compressor_param_set,
+	.get =	param_get_string,
+};
+module_param_cb(compress_algorithm, &dm_icomp_compressor_param_ops,
+		&dm_icomp_compressor_kparam, 0644);
+
+
+
+static int get_comp_id(const char *s)
+{
+	int r, val_len;
+
+	if (!crypto_has_comp(s, 0, 0))
+		return -1;
+
+	for (r = 0; r < ARRAY_SIZE(compressors); r++) {
+		val_len = strlen(compressors[r].name);
+		if (!strncmp(s, compressors[r].name, val_len))
+			return r;
+	}
+	return -1;
+}
+
+static const char *get_comp_name(int id)
+{
+	if (id < 0 || id > ARRAY_SIZE(compressors))
+		return NULL;
+	return compressors[id].name;
+}
+
+static void set_default_compressor(int index)
+{
+	default_compressor = index;
+	strlcpy(dm_icomp_algorithm, compressors[index].name,
+			sizeof(dm_icomp_algorithm));
+	DMINFO("compressor  is %s", dm_icomp_algorithm);
+}
+
+static inline int get_default_compressor(void)
+{
+	return default_compressor;
+}
+
+static int select_default_compressor(void)
+{
+	int r;
+	int arr_size = ARRAY_SIZE(compressors);
+
+	for (r = 0; r < arr_size; r++)
+		if (crypto_has_comp(compressors[r].name, 0, 0))
+			break;
+	if (r >= arr_size) {
+		DMWARN("No crypto compressors are supported");
+		return -EINVAL;
+	}
+	set_default_compressor(r);
+	return 0;
+}
+
+static int dm_icomp_compressor_param_set(const char *val,
+		const struct kernel_param *kp)
+{
+	int ret;
+	char str[kp->str->maxlen], *s;
+	int val_len = strlen(val)+1;
+
+	strlcpy(str, val, val_len);
+	s = strim(str);
+	ret = get_comp_id(s);
+	if (ret < 0) {
+		DMWARN("Compressor %s not supported", s);
+		return -1;
+	}
+	set_default_compressor(ret);
+	return 0;
+}
+
+static void free_compressor(struct dm_icomp_info *info)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		if (info->tfm[i]) {
+			crypto_free_comp(info->tfm[i]);
+			info->tfm[i] = NULL;
+		}
+	}
+}
+
+static int alloc_compressor(struct dm_icomp_info *info)
+{
+	int i;
+	const char *alg_name = get_comp_name(info->comp_alg);
+
+	for_each_possible_cpu(i) {
+		info->tfm[i] = crypto_alloc_comp(
+			alg_name, 0, 0);
+		if (IS_ERR(info->tfm[i]))
+			goto err;
+	}
+	return 0;
+
+err:
+	free_compressor(info);
+	return -ENOMEM;
+}
+
+/**** END compressor select logic ****/
+
+
+/*****  metadata logic ***************/
+/*
+ * return the meta data bits corresponding to a block
+ * @block_index : the index of the block
+ */
+static u8 dm_icomp_get_meta(struct dm_icomp_info *info, u64 block_index)
+{
+	u64 first_bit = block_index * DMCP_META_BITS;
+	int bits, offset;
+	u32 data;
+	u8  ret = 0;
+
+	offset = first_bit & (DMCP_BITS_PER_ENTRY-1);
+	bits = min_t(u32, DMCP_META_BITS, DMCP_BITS_PER_ENTRY - offset);
+
+	data = (u32)info->meta_bitmap[first_bit >> DMCP_META_BITS];
+	ret = (data >> offset) & ((1 << bits) - 1);
+
+	if (bits < DMCP_META_BITS) {
+		data = info->meta_bitmap[(first_bit >> DMCP_META_BITS) + 1];
+		bits = DMCP_META_BITS - bits;
+		ret |= (data & ((1 << bits) - 1)) << (DMCP_META_BITS - bits);
+	}
+	return ret;
+}
+
+
+static void dm_icomp_mark_page(struct dm_icomp_info *info, u32 *addr,
+				bool dirty_meta)
+{
+	struct page *page;
+
+	page = vmalloc_to_page(addr);
+	if (!page)
+		return;
+	if (dirty_meta)
+		SetPageDirty(page);
+	else
+		ClearPageDirty(page);
+}
+
+/*
+ * set the meta data bits corresponding to a block
+ * @block_index : the index of the block
+ * @meta        : the meta data bits.
+ */
+static void dm_icomp_set_meta(struct dm_icomp_info *info, u64 block_index,
+		u8 meta, bool dirty_meta)
+{
+	u64 first_bit = block_index * DMCP_META_BITS;
+	int bits, offset;
+	u32 data;
+
+	offset = first_bit & (DMCP_BITS_PER_ENTRY-1);
+	bits = min_t(u32, DMCP_META_BITS, DMCP_BITS_PER_ENTRY - offset);
+
+
+	data = (u32)info->meta_bitmap[first_bit >> DMCP_META_BITS];
+	data &= ~(((1 << bits) - 1) << offset);
+	data |= (meta & ((1 << bits) - 1)) << offset;
+	info->meta_bitmap[first_bit >> DMCP_META_BITS] = (u32)data;
+
+	if (info->write_mode == DMCP_WRITE_BACK)
+		dm_icomp_mark_page(info,
+			&info->meta_bitmap[first_bit >> DMCP_META_BITS],
+			dirty_meta);
+
+	if (bits < DMCP_META_BITS) {
+		meta >>= bits;
+		data = (u32)
+			info->meta_bitmap[(first_bit >> DMCP_META_BITS) + 1];
+		bits = DMCP_META_BITS - bits;
+		data = (data >> bits) << bits;
+		data |= meta & ((1 << bits) - 1);
+		info->meta_bitmap[(first_bit >> DMCP_META_BITS) + 1] =
+				(u32)data;
+
+		if (info->write_mode == DMCP_WRITE_BACK)
+			dm_icomp_mark_page(info,
+			&info->meta_bitmap[(first_bit >> DMCP_META_BITS) + 1],
+			dirty_meta);
+	}
+}
+
+
+/*
+ * set the meta data bits corresponding to an extent
+ * @block : the index of the block
+ * @logical_blocks: the number of blocks in the extent
+ * @sectors: the number of sectors holding the compressed
+ *		data
+ */
+static void dm_icomp_set_extent(struct dm_icomp_req *req, u64 block,
+	u16 logical_blocks, sector_t data_sectors)
+{
+	int i;
+	u8 data;
+
+	for (i = 0; i < logical_blocks; i++) {
+		data = min_t(sector_t, data_sectors, 8);
+		data_sectors -= data;
+		if (i != 0)
+			data |= DMCP_TAIL_MASK;
+		/* For FUA, we write out meta data directly */
+		dm_icomp_set_meta(req->info, block + i, data,
+					!(req->bio->bi_opf & REQ_FUA));
+	}
+}
+
+/*
+ * get the meta data bits corresponding to an extent
+ * @block_index : the index of the block
+ * @logical_blocks: return the number of blocks in the extent
+ * @sectors: return the number of sectors holding the compressed
+ *		data
+ */
+static void dm_icomp_get_extent(struct dm_icomp_info *info, u64 block_index,
+	u64 *first_block_index, u16 *logical_sectors, u16 *data_sectors)
+{
+	u8 data;
+
+	data = dm_icomp_get_meta(info, block_index);
+	while (data & DMCP_TAIL_MASK) {
+		block_index--;
+		data = dm_icomp_get_meta(info, block_index);
+	}
+	*first_block_index = block_index;
+	*logical_sectors = DMCP_BYTES_TO_SECTOR(DMCP_BLOCK_SIZE);
+	*data_sectors = data & DMCP_LENGTH_MASK;
+	block_index++;
+	while (block_index < info->data_blocks) {
+		data = dm_icomp_get_meta(info, block_index);
+		if (!(data & DMCP_TAIL_MASK))
+			break;
+		*logical_sectors += DMCP_BYTES_TO_SECTOR(DMCP_BLOCK_SIZE);
+		*data_sectors += data & DMCP_LENGTH_MASK;
+		block_index++;
+	}
+}
+
+/*
+ * return the super block
+ */
+static int dm_icomp_access_super(struct dm_icomp_info *info, void *addr,
+		int op, int flag)
+{
+	struct dm_io_region region;
+	struct dm_io_request req;
+	unsigned long io_error = 0;
+	int ret;
+
+	region.bdev = info->dev->bdev;
+	region.sector = 0;
+	region.count = DMCP_BYTES_TO_SECTOR(DMCP_BLOCK_SIZE);
+
+	req.bi_op = op;
+	req.bi_op_flags = flag;
+	req.mem.type = DM_IO_KMEM;
+	req.mem.offset = 0;
+	req.mem.ptr.addr = addr;
+	req.notify.fn = NULL;
+	req.client = info->io_client;
+
+	ret = dm_io(&req, 1, &region, &io_error);
+	if (ret || io_error)
+		return -EIO;
+	return 0;
+}
+
+static void dm_icomp_meta_io_done(unsigned long error, void *context)
+{
+	struct dm_icomp_meta_io *meta_io = context;
+
+	meta_io->fn(meta_io->data, error);
+	kmem_cache_free(dm_icomp_meta_io_cachep, meta_io);
+}
+
+/*
+ * write meta data to the meta blocks in the backing store.
+ */
+static int dm_icomp_write_meta(struct dm_icomp_info *info, u64 start_page,
+	u64 end_page, void *data,
+	void (*fn)(void *data, unsigned long error), int rw, int flags)
+{
+	struct dm_icomp_meta_io *meta_io;
+	sector_t sector, last_sector, last_meta_sector = info->data_start-1;
+
+	WARN_ON(end_page > info->meta_bitmap_pages);
+
+	sector = DMCP_META_START_SECTOR + (start_page << (PAGE_SHIFT - 9));
+	WARN_ON(sector > last_meta_sector);
+	if (sector > last_meta_sector) {
+		fn(data, -EINVAL);
+		return -EINVAL;
+	}
+	last_sector = sector + ((end_page - start_page) << (PAGE_SHIFT - 9));
+	if (last_sector > last_meta_sector)
+		last_sector = last_meta_sector;
+
+
+	meta_io = kmem_cache_alloc(dm_icomp_meta_io_cachep, GFP_NOIO);
+	if (!meta_io) {
+		fn(data, -ENOMEM);
+		return -ENOMEM;
+	}
+	meta_io->data = data;
+	meta_io->fn = fn;
+
+	meta_io->io_region.bdev = info->dev->bdev;
+
+
+	meta_io->io_region.sector = sector;
+	meta_io->io_region.count = last_sector - sector + 1;
+	atomic64_add(DMCP_SECTOR_TO_BYTES(meta_io->io_region.count),
+				&info->meta_write_size);
+
+	meta_io->io_req.bi_op = rw;
+	meta_io->io_req.bi_op_flags = flags;
+	meta_io->io_req.mem.type = DM_IO_VMA;
+	meta_io->io_req.mem.offset = 0;
+	meta_io->io_req.mem.ptr.addr = ((char *)(info->meta_bitmap)) +
+						(start_page << PAGE_SHIFT);
+	meta_io->io_req.notify.fn = dm_icomp_meta_io_done;
+	meta_io->io_req.notify.context = meta_io;
+	meta_io->io_req.client = info->io_client;
+
+	dm_io(&meta_io->io_req, 1, &meta_io->io_region, NULL);
+	return 0;
+}
+
+struct writeback_flush_data {
+	struct completion complete;
+	atomic_t cnt;
+};
+
+static void writeback_flush_io_done(void *data, unsigned long error)
+{
+	struct writeback_flush_data *wb = data;
+
+	if (atomic_dec_return(&wb->cnt))
+		return;
+	complete(&wb->complete);
+}
+
+static void dm_icomp_flush_dirty_meta(struct dm_icomp_info *info,
+			struct writeback_flush_data *data)
+{
+	struct page *page;
+	u64 start = 0, index;
+	u32 pending = 0, cnt = 0;
+	bool dirty;
+	struct blk_plug plug;
+
+	blk_start_plug(&plug);
+	for (index = 0; index < info->meta_bitmap_pages; index++, cnt++) {
+		if (cnt == 256) {
+			cnt = 0;
+			cond_resched();
+		}
+
+		page = vmalloc_to_page((char *)(info->meta_bitmap) +
+					(index << PAGE_SHIFT));
+		if (!page)
+			DMWARN("Uable to find page for block=%llu", index);
+		dirty = TestClearPageDirty(page);
+
+		if (pending == 0 && dirty) {
+			start = index;
+			pending++;
+			continue;
+		} else if (pending == 0)
+			continue;
+		else if (pending > 0 && dirty) {
+			pending++;
+			continue;
+		}
+
+		/* pending > 0 && !dirty */
+		atomic_inc(&data->cnt);
+		dm_icomp_write_meta(info, start, start + pending, data,
+			writeback_flush_io_done, REQ_OP_WRITE, WRITE);
+		pending = 0;
+	}
+
+	if (pending > 0) {
+		atomic_inc(&data->cnt);
+		dm_icomp_write_meta(info, start, start + pending, data,
+			writeback_flush_io_done, REQ_OP_WRITE, WRITE);
+	}
+	blkdev_issue_flush(info->dev->bdev, GFP_NOIO, NULL);
+	blk_finish_plug(&plug);
+}
+
+static int dm_icomp_meta_writeback_thread(void *data)
+{
+	struct dm_icomp_info *info = data;
+	struct writeback_flush_data wb;
+
+	atomic_set(&wb.cnt, 1);
+	init_completion(&wb.complete);
+
+	while (!kthread_should_stop()) {
+		schedule_timeout_interruptible(
+			msecs_to_jiffies(info->writeback_delay * 1000));
+		dm_icomp_flush_dirty_meta(info, &wb);
+	}
+
+	dm_icomp_flush_dirty_meta(info, &wb);
+
+	writeback_flush_io_done(&wb, 0);
+	wait_for_completion(&wb.complete);
+	return 0;
+}
+
+static int dm_icomp_init_meta(struct dm_icomp_info *info, bool new)
+{
+	struct dm_io_region region;
+	struct dm_io_request req;
+	unsigned long io_error = 0;
+	struct blk_plug plug;
+	int ret;
+	ssize_t len = DIV_ROUND_UP_ULL(info->meta_bitmap_bits,
+			DMCP_BITS_PER_ENTRY);
+
+	len *= (DMCP_BITS_PER_ENTRY >> 3);
+
+	region.bdev = info->dev->bdev;
+	region.sector = DMCP_META_START_SECTOR;
+	region.count = DMCP_BYTES_TO_SECTOR(round_up(len,
+				DMCP_SECTOR_SIZE));
+
+	req.mem.type = DM_IO_VMA;
+	req.mem.offset = 0;
+	req.mem.ptr.addr = info->meta_bitmap;
+	req.notify.fn = NULL;
+	req.client = info->io_client;
+
+	blk_start_plug(&plug);
+	if (new) {
+		memset(info->meta_bitmap, 0, len);
+		req.bi_op = REQ_OP_WRITE;
+		req.bi_op_flags = REQ_FUA;
+		ret = dm_io(&req, 1, &region, &io_error);
+	} else {
+		req.bi_op = REQ_OP_READ;
+		req.bi_op_flags = READ;
+		ret = dm_io(&req, 1, &region, &io_error);
+	}
+	blk_finish_plug(&plug);
+
+	if (ret || io_error) {
+		info->ti->error = "Access metadata error";
+		return -EIO;
+	}
+
+	if (info->write_mode == DMCP_WRITE_BACK) {
+		info->writeback_tsk = kthread_run(
+			dm_icomp_meta_writeback_thread,
+			info, "dm_icomp_writeback");
+		if (!info->writeback_tsk) {
+			info->ti->error = "Create writeback thread error";
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/***** END metadata logic *****/
+
+
+#define SET_REQ_STAGE(req, value) (req->stage = value)
+#define GET_REQ_STAGE(req) req->stage
+
+
+static void print_max_sectors_possible(struct dm_icomp_info *info)
+{
+	u64 total_blocks, data_blocks, meta_blocks;
+	u64 no_pairs, pair_blocks, rem;
+
+	/* superblock takes away one block */
+	total_blocks = DMCP_BYTES_TO_BLOCK(i_size_read(
+				info->dev->bdev->bd_inode)) - 1;
+
+	/* number of datablocks representable by one metadata block. */
+	data_blocks  = (DMCP_BLOCK_SIZE * 8) /  DMCP_META_BITS;
+	meta_blocks  = 1;
+
+	/* how many such pairing can we make ? */
+	pair_blocks  = data_blocks + meta_blocks;
+	no_pairs     = total_blocks / pair_blocks;
+
+	/*
+	 * these many datablocks and these many ..
+	 * metadatablocks will support each other.
+	 */
+	data_blocks  *= no_pairs;
+	meta_blocks  *= no_pairs;
+
+	rem = total_blocks % pair_blocks;
+	if (rem) {
+		/* we have some remaining blocks.
+		 * give one to meta and remaining to data.
+		 */
+		meta_blocks++;
+		data_blocks += (rem - 1);
+	}
+
+	DMINFO(" This device can accommodate at most %llu sector ",
+		DMCP_BLOCK_TO_SECTOR(data_blocks));
+}
+
+
+/*
+ * create a new super block and initialize its contents.
+ */
+static int dm_icomp_read_or_create_super(struct dm_icomp_info *info)
+{
+	void *addr, *bitmap_addr;
+	struct dm_icomp_super_block *super;
+	u64 total_blocks, data_blocks, meta_blocks;
+	bool new_super = false;
+	int ret;
+	ssize_t len;
+
+	info->total_sector = DMCP_BYTES_TO_SECTOR(
+			i_size_read(info->dev->bdev->bd_inode));
+	total_blocks = DMCP_SECTOR_TO_BLOCK(info->total_sector) - 1;
+
+	data_blocks =  DMCP_SECTOR_TO_BLOCK(info->ti->len);
+	meta_blocks =  ((data_blocks * DMCP_META_BITS) +
+			((DMCP_BLOCK_SIZE * 8) - 1)) / (DMCP_BLOCK_SIZE * 8);
+
+	info->data_blocks = data_blocks;
+	info->data_start = DMCP_BLOCK_TO_SECTOR(1 + meta_blocks);
+
+	DMINFO(
+	"data_start=%u data_blocks=%llu metablocks=%llu total_blocks=%llu",
+		(unsigned int)info->data_start, info->data_blocks,
+		meta_blocks, total_blocks);
+
+	if (DMCP_BLOCK_TO_SECTOR(data_blocks + meta_blocks + 1)
+			>= info->total_sector) {
+		print_max_sectors_possible(info);
+		info->ti->error =
+			"Insufficient sectors to satisfy requested size";
+		return -ENOMEM;
+	}
+
+	addr = kzalloc(DMCP_BLOCK_SIZE+DMCP_SECTOR_SIZE, GFP_KERNEL);
+	if (!addr) {
+		info->ti->error = "Cannot allocate super";
+		return -ENOMEM;
+	}
+
+	super = PTR_ALIGN(addr, DMCP_SECTOR_SIZE);
+	ret = dm_icomp_access_super(info, super, REQ_OP_READ, REQ_FUA);
+	if (ret)
+		goto out;
+
+	if (le64_to_cpu(super->magic) == DMCP_SUPER_MAGIC) {
+
+		const char *alg_name;
+
+		if (le64_to_cpu(super->meta_blocks) != meta_blocks ||
+		    le64_to_cpu(super->data_blocks) != data_blocks) {
+			info->ti->error = "Super is invalid";
+			ret = -EINVAL;
+			goto out;
+		}
+
+		alg_name = get_comp_name(super->comp_alg);
+		if (!crypto_has_comp(alg_name, 0, 0)) {
+			info->ti->error =
+				"Compressor algorithm doesn't support";
+			ret = -EINVAL;
+			goto out;
+		}
+		info->comp_alg = super->comp_alg;
+
+	} else {
+		super->magic = cpu_to_le64(DMCP_SUPER_MAGIC);
+		super->meta_blocks = cpu_to_le64(meta_blocks);
+		super->data_blocks = cpu_to_le64(data_blocks);
+		super->comp_alg = info->comp_alg;
+		ret = dm_icomp_access_super(info, super, REQ_OP_WRITE,
+				REQ_FUA);
+		if (ret) {
+			info->ti->error = "Access super fails";
+			goto out;
+		}
+		new_super = true;
+	}
+
+	if (alloc_compressor(info)) {
+		info->ti->error = "Cannot allocate compressor";
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	info->meta_bitmap_bits = data_blocks * DMCP_META_BITS;
+	len = DIV_ROUND_UP_ULL(info->meta_bitmap_bits, DMCP_BITS_PER_ENTRY);
+	len *= (DMCP_BITS_PER_ENTRY >> 3);
+	info->meta_bitmap_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	bitmap_addr = vzalloc((info->meta_bitmap_pages * PAGE_SIZE) +
+				DMCP_SECTOR_SIZE);
+	if (!bitmap_addr) {
+		info->ti->error = "Cannot allocate bitmap";
+		ret = -ENOMEM;
+		goto bitmap_err;
+	}
+	info->meta_bitmap = PTR_ALIGN(bitmap_addr, DMCP_SECTOR_SIZE);
+
+	ret = dm_icomp_init_meta(info, new_super);
+	if (ret)
+		goto meta_err;
+
+	return 0;
+meta_err:
+	vfree(bitmap_addr);
+bitmap_err:
+	free_compressor(info);
+out:
+	kfree(addr);
+	return ret;
+}
+
+/*
+ * <dev> [ <writethough>/<writeback> <meta_commit_delay> ]
+ *	 [ <compressor> <type> ]
+ */
+static int dm_icomp_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct dm_icomp_info *info;
+	char mode[15];
+	int par = 0;
+	int ret, i;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		ti->error = "dm-inplace-compress: Cannot allocate context";
+		return -ENOMEM;
+	}
+	info->ti = ti;
+	info->comp_alg = get_default_compressor();
+	info->critical = false;
+	while (++par < argc) {
+		if (sscanf(argv[par], "%s", mode) != 1) {
+			ti->error = "Invalid argument";
+			ret = -EINVAL;
+			goto err_para;
+		}
+
+		if (strcmp(mode, "writeback") == 0) {
+			info->write_mode = DMCP_WRITE_BACK;
+			if (kstrtouint(argv[++par], 10,
+				 &info->writeback_delay)) {
+				ti->error = "Invalid argument";
+				ret = -EINVAL;
+				goto err_para;
+			}
+		} else if (strcmp(mode, "writethrough") == 0) {
+			info->write_mode = DMCP_WRITE_THROUGH;
+		} else if (strcmp(mode, "critical") == 0) {
+			info->critical = true;
+		} else if (strcmp(mode, "compressor") == 0) {
+			if (sscanf(argv[++par], "%s", mode) != 1) {
+				ti->error = "Invalid argument";
+				ret = -EINVAL;
+				goto err_para;
+			}
+			ret = get_comp_id(mode);
+			if (ret >= 0) {
+				DMINFO("compressor  is %s", mode);
+				info->comp_alg = ret;
+			} else {
+				ti->error = "Unsupported compressor";
+				ret = -EINVAL;
+				goto err_para;
+			}
+		}
+	}
+
+	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+							&info->dev)) {
+		ti->error = "Can't get device";
+		ret = -EINVAL;
+		goto err_para;
+	}
+
+	info->io_client = dm_io_client_create();
+	if (!info->io_client) {
+		ti->error = "Can't create io client";
+		ret = -EINVAL;
+		goto err_ioclient;
+	}
+
+	if (bdev_logical_block_size(info->dev->bdev) != 512) {
+		ti->error = "Can't logical block size too big";
+		ret = -EINVAL;
+		goto err_blocksize;
+	}
+
+	if (dm_set_target_max_io_len(ti, DMCP_BYTES_TO_SECTOR(DMCP_MAX_SIZE))) {
+		ti->error = "Failed to configure device ";
+		ret = -EINVAL;
+		goto err_blocksize;
+	}
+
+	if (dm_icomp_read_or_create_super(info)) {
+		ret = -EINVAL;
+		goto err_blocksize;
+	}
+
+	for (i = 0; i < BITMAP_HASH_LEN; i++) {
+		info->bitmap_locks[i].io_running = 0;
+		spin_lock_init(&info->bitmap_locks[i].wait_lock);
+		INIT_LIST_HEAD(&info->bitmap_locks[i].wait_list);
+	}
+
+	atomic64_set(&info->compressed_write_size, 0);
+	atomic64_set(&info->uncompressed_write_size, 0);
+	atomic64_set(&info->meta_write_size, 0);
+	atomic64_set(&dm_icomp_total_alloc_size, 0);
+	atomic64_set(&dm_icomp_total_bio_save, 0);
+
+	ti->num_flush_bios = 1;
+	ti->private = info;
+	return 0;
+
+err_blocksize:
+	dm_io_client_destroy(info->io_client);
+err_ioclient:
+	dm_put_device(ti, info->dev);
+err_para:
+	kfree(info);
+	return ret;
+}
+
+static void dm_icomp_dtr(struct dm_target *ti)
+{
+	struct dm_icomp_info *info = ti->private;
+
+	if (info->write_mode == DMCP_WRITE_BACK)
+		kthread_stop(info->writeback_tsk);
+	free_compressor(info);
+	vfree(info->meta_bitmap);
+	dm_io_client_destroy(info->io_client);
+	dm_put_device(ti, info->dev);
+	kfree(info);
+}
+
+/*
+ * return the range lock to this block.
+ */
+static struct dm_icomp_hash_lock *dm_icomp_block_hash_lock(
+		struct dm_icomp_info *info, u64 block_index)
+{
+	return &info->bitmap_locks[(block_index >> BITMAP_HASH_SHIFT) &
+			BITMAP_HASH_MASK];
+}
+
+/*
+ * unlock the io range correspondingg to this block.
+ */
+static struct dm_icomp_hash_lock *dm_icomp_trylock_block(
+		struct dm_icomp_info *info,
+		struct dm_icomp_req *req, u64 block_index)
+{
+	struct dm_icomp_hash_lock *hash_lock;
+
+	hash_lock = dm_icomp_block_hash_lock(req->info, block_index);
+
+	spin_lock_irq(&hash_lock->wait_lock);
+	if (!hash_lock->io_running) {
+		hash_lock->io_running = 1;
+		spin_unlock_irq(&hash_lock->wait_lock);
+		return hash_lock;
+	}
+	list_add_tail(&req->sibling, &hash_lock->wait_list);
+	spin_unlock_irq(&hash_lock->wait_lock);
+	return NULL;
+}
+
+static void dm_icomp_queue_req_list(struct dm_icomp_info *info,
+	 struct list_head *list);
+
+static void dm_icomp_unlock_block(struct dm_icomp_info *info,
+	struct dm_icomp_req *req, struct dm_icomp_hash_lock *hash_lock)
+{
+	LIST_HEAD(pending_list);
+	unsigned long flags;
+
+	spin_lock_irqsave(&hash_lock->wait_lock, flags);
+	/* wakeup all pending reqs to avoid live lock */
+	list_splice_init(&hash_lock->wait_list, &pending_list);
+	hash_lock->io_running = 0;
+	spin_unlock_irqrestore(&hash_lock->wait_lock, flags);
+
+	dm_icomp_queue_req_list(info, &pending_list);
+}
+
+/*
+ * lock all the range locks corresponding to this io request.
+ */
+static int dm_icomp_lock_req_range(struct dm_icomp_req *req)
+{
+	u64 block_index, first_block_index;
+	u64 first_lock_block, second_lock_block;
+	u16 logical_sectors, data_sectors;
+
+	block_index = DMCP_SECTOR_TO_BLOCK(req->bio->bi_iter.bi_sector);
+	req->locks[0] = dm_icomp_trylock_block(req->info, req, block_index);
+	if (!req->locks[0])
+		return 0;
+	dm_icomp_get_extent(req->info, block_index, &first_block_index,
+				&logical_sectors, &data_sectors);
+	if (dm_icomp_block_hash_lock(req->info, first_block_index) !=
+						req->locks[0]) {
+		dm_icomp_unlock_block(req->info, req, req->locks[0]);
+		first_lock_block = first_block_index;
+		second_lock_block = block_index;
+		goto two_locks;
+	}
+
+	block_index = DMCP_SECTOR_TO_BLOCK(bio_end_sector(req->bio) - 1);
+	dm_icomp_get_extent(req->info, block_index, &first_block_index,
+				&logical_sectors, &data_sectors);
+	first_block_index += DMCP_SECTOR_TO_BLOCK(logical_sectors);
+	if (dm_icomp_block_hash_lock(req->info, first_block_index) !=
+						req->locks[0]) {
+		second_lock_block = first_block_index;
+		goto second_lock;
+	}
+	req->locked_locks = 1;
+	return 1;
+
+two_locks:
+	req->locks[0] = dm_icomp_trylock_block(req->info, req,
+		first_lock_block);
+	if (!req->locks[0])
+		return 0;
+second_lock:
+	req->locks[1] = dm_icomp_trylock_block(req->info, req,
+				second_lock_block);
+	if (!req->locks[1]) {
+		dm_icomp_unlock_block(req->info, req, req->locks[0]);
+		return 0;
+	}
+	/* Don't need check if meta is changed */
+	req->locked_locks = 2;
+	return 1;
+}
+
+
+
+/*
+ * unlock all the range locks corresponding to this io request.
+ */
+static void dm_icomp_unlock_req_range(struct dm_icomp_req *req)
+{
+	int i;
+
+	for (i = req->locked_locks - 1; i >= 0; i--)
+		dm_icomp_unlock_block(req->info, req, req->locks[i]);
+}
+
+static void dm_icomp_queue_req(struct dm_icomp_info *info,
+		struct dm_icomp_req *req)
+{
+	unsigned long flags;
+	struct dm_icomp_io_worker *worker = &dm_icomp_io_workers[req->cpu];
+
+	spin_lock_irqsave(&worker->lock, flags);
+	list_add_tail(&req->sibling, &worker->pending);
+	spin_unlock_irqrestore(&worker->lock, flags);
+
+	queue_work_on(req->cpu, dm_icomp_wq, &worker->work);
+}
+
+static void dm_icomp_queue_req_list(struct dm_icomp_info *info,
+		struct list_head *list)
+{
+	struct dm_icomp_req *req;
+
+	while (!list_empty(list)) {
+		req = list_first_entry(list, struct dm_icomp_req, sibling);
+		list_del_init(&req->sibling);
+		dm_icomp_queue_req(info, req);
+	}
+}
+
+static void dm_icomp_get_req(struct dm_icomp_req *req)
+{
+	atomic_inc(&req->io_pending);
+}
+
+static inline int get_alloc_flag(struct dm_icomp_info *info)
+{
+	/*
+	 * Use GFP_ATOMIC allocations if the device
+	 * is used on the critical path
+	 */
+	return info->critical ? GFP_ATOMIC : GFP_NOIO;
+}
+
+static void *dm_icomp_kmalloc(size_t size, int alloc_flag)
+{
+	void *addr = kmalloc(size, alloc_flag);
+
+	if (!addr)
+		return NULL;
+	DMCP_ALLOC(size);
+	return addr;
+}
+
+static void *dm_icomp_krealloc(void *ptr, size_t size,
+		size_t origsize, int alloc_flag)
+{
+	void *addr = krealloc(ptr, size, alloc_flag);
+
+	if (!addr)
+		return NULL;
+	DMCP_FREE_ALLOC(origsize);
+	DMCP_ALLOC(size);
+	return addr;
+}
+
+static int dm_icomp_alloc_compbuffer(struct dm_icomp_io_range *io, int size)
+{
+	int alloc_len = size + DMCP_SECTOR_SIZE;
+	void *addr = dm_icomp_kmalloc(alloc_len,
+			get_alloc_flag(io->req->info));
+
+	if (!addr)
+		return 1;
+
+	io->comp_real_data = addr;
+	io->comp_kmap	= false;
+	io->comp_len	= size;
+
+	/*
+	 * comp_data is used to read and write from storage.
+	 * So align it.
+	 */
+	io->comp_data   = io->io_req.mem.ptr.addr
+			= PTR_ALIGN(addr, DMCP_SECTOR_SIZE);
+
+	return 0;
+}
+
+static int dm_icomp_realloc_compbuffer(struct dm_icomp_io_range *io, int size)
+{
+	void *addr = dm_icomp_krealloc(io->comp_real_data,
+			size+DMCP_SECTOR_SIZE, io->comp_len,
+				get_alloc_flag(io->req->info));
+	if (!addr)
+		return 1;
+
+	io->comp_real_data = addr;
+	io->comp_kmap	   = false;
+	io->comp_data      = io->io_req.mem.ptr.addr = PTR_ALIGN(addr,
+				DMCP_SECTOR_SIZE);
+	io->comp_len	   = size;
+	return 0;
+}
+
+static void dm_icomp_kfree(void *addr, unsigned int size)
+{
+	kfree(addr);
+	DMCP_FREE_ALLOC(size);
+}
+
+static void dm_icomp_release_decomp_buffer(struct dm_icomp_io_range *io)
+{
+	if (!io->decomp_data)
+		return;
+
+	if (io->decomp_kmap)
+		kunmap(io->decomp_real_data);
+	else
+		dm_icomp_kfree(io->decomp_real_data, io->decomp_len);
+
+	io->decomp_data = io->decomp_real_data = NULL;
+	io->decomp_len  = 0;
+	io->decomp_kmap = false;
+}
+
+static void dm_icomp_release_comp_buffer(struct dm_icomp_io_range *io)
+{
+	if (!io->comp_data)
+		return;
+
+	if (io->comp_kmap)
+		kunmap(io->comp_real_data);
+	else
+		dm_icomp_kfree(io->comp_real_data, io->comp_len);
+
+	io->comp_real_data = io->comp_data = NULL;
+	io->comp_len = 0;
+	io->comp_kmap = false;
+}
+
+static void dm_icomp_free_io_range(struct dm_icomp_io_range *io)
+{
+	dm_icomp_release_decomp_buffer(io);
+	dm_icomp_release_comp_buffer(io);
+	kmem_cache_free(dm_icomp_io_range_cachep, io);
+}
+
+static void dm_icomp_put_req(struct dm_icomp_req *req)
+{
+	struct dm_icomp_io_range *io;
+
+	if (atomic_dec_return(&req->io_pending))
+		return;
+
+	if (GET_REQ_STAGE(req) == STAGE_INIT) /* waiting for locking */
+		return;
+
+	if (GET_REQ_STAGE(req) == STAGE_READ_DECOMP ||
+	    GET_REQ_STAGE(req) == STAGE_WRITE_COMP)
+		SET_REQ_STAGE(req, STAGE_DONE);
+
+	if (!!!req->result && GET_REQ_STAGE(req) != STAGE_DONE) {
+		dm_icomp_queue_req(req->info, req);
+		return;
+	}
+
+	while (!list_empty(&req->all_io)) {
+		io = list_entry(req->all_io.next,
+			struct dm_icomp_io_range, next);
+		list_del(&io->next);
+		dm_icomp_free_io_range(io);
+	}
+
+	dm_icomp_unlock_req_range(req);
+
+	req->bio->bi_error = req->result;
+
+	bio_endio(req->bio);
+	kmem_cache_free(dm_icomp_req_cachep, req);
+}
+
+static void dm_icomp_bio_copy(struct bio *bio, off_t bio_off, void *buf,
+		ssize_t len, bool to_buf)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	off_t buf_off = 0;
+	ssize_t size;
+	void *addr;
+
+	WARN_ON(bio_off + len > DMCP_SECTOR_TO_BYTES(bio_sectors(bio)));
+
+	bio_for_each_segment(bv, bio, iter) {
+		int length = bv.bv_len;
+
+		if (bio_off > length) {
+			bio_off -= length;
+			continue;
+		}
+		addr = kmap_atomic(bv.bv_page);
+		size = min_t(ssize_t, len, length - bio_off);
+		if (to_buf)
+			memcpy(buf + buf_off, addr + bio_off + bv.bv_offset,
+			size);
+		else
+			memcpy(addr + bio_off + bv.bv_offset, buf + buf_off,
+			size);
+		kunmap_atomic(addr);
+		bio_off = 0;
+		buf_off += size;
+
+		if (len <= size)
+			break;
+
+		len -= size;
+	}
+}
+
+static void dm_icomp_io_range_done(unsigned long error, void *context)
+{
+	struct dm_icomp_io_range *io = context;
+
+	if (error)
+		io->req->result = error;
+
+	dm_icomp_put_req(io->req);
+}
+
+static inline int dm_icomp_compressor_len(struct dm_icomp_info *info, int len)
+{
+	if (compressors[info->comp_alg].comp_len)
+		return compressors[info->comp_alg].comp_len(len);
+	return len;
+}
+
+static inline bool dm_icomp_can_handle_overflow(struct dm_icomp_info *info)
+{
+	return compressors[info->comp_alg].can_handle_overflow;
+}
+
+static inline int dm_icomp_compressor_maxlen(struct dm_icomp_info *info,
+		int len)
+{
+	if (compressors[info->comp_alg].max_comp_len)
+		return compressors[info->comp_alg].max_comp_len(len);
+	return len;
+}
+
+/*
+ * caller should set region.sector, region.count. bi_rw. IO always to/from
+ * comp_data
+ */
+static struct dm_icomp_io_range *dm_icomp_create_io_range(
+		struct dm_icomp_req *req)
+{
+	struct dm_icomp_io_range *io;
+
+	io = kmem_cache_alloc(dm_icomp_io_range_cachep, GFP_NOIO);
+	if (!io)
+		return NULL;
+
+	io->io_req.notify.fn = dm_icomp_io_range_done;
+	io->io_req.notify.context = io;
+	io->io_req.client = req->info->io_client;
+	io->io_req.mem.type = DM_IO_KMEM;
+	io->io_req.mem.offset = 0;
+
+	io->io_region.bdev = req->info->dev->bdev;
+	io->req = req;
+
+	io->comp_data = io->comp_real_data =
+			io->decomp_data = io->decomp_real_data = NULL;
+
+	io->data_bytes = io->comp_len =
+			io->decomp_len = io->logical_bytes = 0;
+
+	io->comp_kmap = io->decomp_kmap = false;
+	return io;
+}
+
+
+/*
+ * return an address, within the bio. The address corresponds to
+ * the requested offset 'bio_off' and is contiguous of size 'len'
+ */
+static void *get_addr(struct bio *bio,  int len, u64 bio_off, u64 *offset)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	void *addr;
+
+	bio_for_each_segment(bv, bio, iter) {
+		int length = bv.bv_len;
+
+		if (bio_off > length) {
+			bio_off -= length;
+			continue;
+		}
+		addr = bv.bv_page;
+		if (bv.bv_offset + bio_off + len >= length) {
+			*offset = bv.bv_offset + bio_off;
+			return kmap(addr);
+		}
+		break;
+	}
+	return NULL;
+}
+
+
+/*
+ * create a io range for tracking  predominantly a read request.
+ * @req		: the read request
+ * @comp_len	: allocation size of the compress buffer
+ * @decomp_len	: allocation size of the decompress buffer
+ * @actual_comp_len : real size of the compress data
+ * @bio_off	: offset within the bio read buffer this request corresponds to.
+ *		try to reuse and read into the bio buffer. -1 means don't reuse.
+ */
+static struct dm_icomp_io_range *dm_icomp_create_io_read_range(
+		struct dm_icomp_req *req, int comp_len, int decomp_len,
+		long bio_off, int actual_comp_len)
+{
+	struct bio *bio = req->bio;
+	void *addr = NULL;
+	struct dm_icomp_io_range *io = dm_icomp_create_io_range(req);
+	u64 offset;
+
+	if (!io)
+		return NULL;
+
+	WARN_ON(comp_len % DMCP_SECTOR_SIZE);
+
+	/* try reusing the bio if possible */
+	if (bio_off >= 0) {
+		addr = get_addr(bio, comp_len, (u64)bio_off, &offset);
+		if (addr) {
+			io->comp_real_data =  addr;
+			io->comp_data = io->io_req.mem.ptr.addr = addr + offset;
+			io->comp_kmap = true;
+			io->comp_len  = comp_len;
+		}
+	}
+
+	if (!addr && dm_icomp_alloc_compbuffer(io, comp_len)) {
+		kmem_cache_free(dm_icomp_io_range_cachep, io);
+		return NULL;
+	}
+
+	io->data_bytes	= actual_comp_len;  /* NOTE, this value can change */
+
+	/*
+	 * note requested length for decompress buffer. Do not allocate it yet.
+	 * Value once set is final.
+	 */
+	io->logical_bytes = decomp_len;
+
+	return io;
+}
+
+/*
+ *  ensure that the io range has all its buffers; of the correct size,
+ *  allocated.
+ */
+static int dm_icomp_update_io_read_range(struct dm_icomp_io_range *io)
+{
+	WARN_ON(!io->comp_data);
+	WARN_ON(io->decomp_data || io->decomp_len);
+	io->decomp_data = dm_icomp_kmalloc(io->logical_bytes,
+				get_alloc_flag(io->req->info));
+	if (!io->decomp_data)
+		return 1;
+	io->decomp_real_data = io->decomp_data;
+	io->decomp_len = io->logical_bytes;
+	io->decomp_kmap = false;
+	return 0;
+}
+
+/*
+ *  resize the comp buffer to its largest possible size.
+ */
+static int dm_icomp_mod_to_max_io_range(struct dm_icomp_info *info,
+			 struct dm_icomp_io_range *io)
+{
+	unsigned int maxlen = dm_icomp_compressor_maxlen(info, io->decomp_len);
+
+	WARN_ON(maxlen > io->logical_bytes);
+
+	if (io->comp_kmap) {
+		WARN_ON(io->comp_kmap);
+		kunmap(io->comp_real_data);
+		io->comp_kmap = false;
+		io->comp_real_data = io->comp_data = NULL;
+	}
+
+	if (dm_icomp_realloc_compbuffer(io, maxlen)) {
+		io->comp_len = 0;
+		return -ENOSPC;
+	}
+	io->comp_len = maxlen;
+	return 0;
+}
+
+/*
+ * create a io range for tracking a write request.
+ * @req		: the write request
+ * @count	: size of the write in sectors.
+ * @offset	: offset within the bio read buffer this request correspond to.
+ */
+static struct dm_icomp_io_range *dm_icomp_create_io_write_range(
+	struct dm_icomp_req *req, sector_t offset, sector_t count)
+{
+	struct bio *bio = req->bio;
+	int size  = DMCP_SECTOR_TO_BYTES(count);
+	u64 of;
+	int comp_len = dm_icomp_compressor_len(req->info, size);
+	void *addr;
+	struct dm_icomp_io_range *io = dm_icomp_create_io_range(req);
+
+	if (!io)
+		return NULL;
+
+	WARN_ON(io->comp_data);
+
+	if (dm_icomp_alloc_compbuffer(io, comp_len)) {
+		kmem_cache_free(dm_icomp_io_range_cachep, io);
+		return NULL;
+	}
+
+	/* we donot know the size of the compress segment yet. */
+	io->data_bytes = 0;
+
+
+	WARN_ON(io->decomp_data);
+
+	io->decomp_kmap = false;
+
+	/* try reusing the bio buffer for decomp data. */
+	addr = get_addr(bio, size, DMCP_SECTOR_TO_BYTES(offset), &of);
+	if (addr)
+		io->decomp_kmap = true;
+	else
+		addr  = dm_icomp_kmalloc(size,
+				get_alloc_flag(req->info));
+
+	if (!addr) {
+		dm_icomp_kfree(io->comp_data, comp_len);
+		kmem_cache_free(dm_icomp_io_range_cachep, io);
+		return NULL;
+	}
+
+	io->logical_bytes = io->decomp_len = size;
+
+	if (io->decomp_kmap) {
+		io->decomp_real_data = addr;
+		io->decomp_data = addr + of;
+		DMCP_ALLOC_SAVE(size);
+	} else {
+		io->decomp_data = io->decomp_real_data = addr;
+		dm_icomp_bio_copy(req->bio, DMCP_SECTOR_TO_BYTES(offset),
+			io->decomp_data, size, true);
+	}
+
+	return io;
+}
+
+static unsigned int round_to_next_sector(unsigned int val)
+{
+	unsigned int c = round_up(val, DMCP_SECTOR_SIZE);
+
+	if ((c - val) < 2*sizeof(u32))
+		c += DMCP_SECTOR_SIZE;
+	return c;
+}
+
+/*
+ * compress and store the data in compress buffer.
+ * return value:
+ * < 0 : error
+ * == 0 : ok
+ * == 1 : ok, but comp/decomp is skipped
+ * Compressed data size is roundup of 512, which makes the payload.
+ * We store the actual compressed len in the last u32 of the payload.
+ * If there is no free space, we add 512 to the payload size.
+ */
+static int dm_icomp_io_range_compress(struct dm_icomp_info *info,
+		struct dm_icomp_io_range *io, unsigned int *comp_len)
+{
+	unsigned int actual_comp_len = io->comp_len;
+	u32 *addr;
+	struct crypto_comp *tfm =  info->tfm[get_cpu()];
+	unsigned int decomp_len = io->logical_bytes;
+	int ret;
+
+	actual_comp_len = io->comp_len;
+	ret = crypto_comp_compress(tfm, io->decomp_data, decomp_len,
+		io->comp_data, &actual_comp_len);
+
+	if (ret || round_to_next_sector(actual_comp_len) > io->comp_len) {
+		ret = dm_icomp_mod_to_max_io_range(info, io);
+		if (!ret) {
+			actual_comp_len = io->comp_len;
+			ret = crypto_comp_compress(tfm, io->decomp_data,
+				decomp_len, io->comp_data,
+				&actual_comp_len);
+		}
+	}
+
+	put_cpu();
+
+	atomic64_add(decomp_len, &info->uncompressed_write_size);
+	io->data_bytes = *comp_len = round_to_next_sector(actual_comp_len);
+	if (ret || decomp_len < *comp_len) {
+		*comp_len = decomp_len;
+		memcpy(io->comp_data, io->decomp_data, *comp_len);
+		atomic64_add(*comp_len, &info->compressed_write_size);
+	} else {
+		atomic64_add(*comp_len, &info->compressed_write_size);
+		addr = (u32 *)((char *)io->comp_data + *comp_len);
+		addr--;
+		*addr = cpu_to_le32(actual_comp_len);
+		addr--;
+		*addr = cpu_to_le32(DMCP_COMPRESS_MAGIC);
+	}
+
+	return 0;
+}
+
+/*
+ * decompress and store the data in decompress buffer.
+ * return value:
+ * < 0 : error
+ * == 0 : ok
+ */
+static int dm_icomp_io_range_decompress(struct dm_icomp_info *info,
+		struct dm_icomp_io_range *io, unsigned int *decomp_len)
+{
+	struct crypto_comp *tfm;
+	u32 *addr;
+	int ret;
+	int comp_len = io->data_bytes;
+
+	WARN_ON(!io->data_bytes);
+
+	if (comp_len == io->logical_bytes) {
+		memcpy(io->decomp_data, io->comp_data, comp_len);
+		*decomp_len = comp_len;
+		return 0;
+	}
+
+	WARN_ON(io->comp_data != io->io_req.mem.ptr.addr);
+
+	addr = (u32 *)((char *)(io->comp_data) + comp_len);
+	addr--;
+	comp_len = le32_to_cpu(*addr);
+	addr--;
+
+	if (le32_to_cpu(*addr) == DMCP_COMPRESS_MAGIC) {
+		tfm = info->tfm[get_cpu()];
+		*decomp_len = io->logical_bytes;
+		ret = crypto_comp_decompress(tfm, io->comp_data, comp_len,
+			io->decomp_data, decomp_len);
+		WARN_ON(*decomp_len != io->decomp_len);
+		put_cpu();
+		if (ret)
+			return -EINVAL;
+		return 0;
+	}
+
+	DMWARN("Decompress Error ");
+	return -1;
+}
+
+/*
+ *  fill the bio with the corresponding decompressed data.
+ */
+static void dm_icomp_handle_read_decomp(struct dm_icomp_req *req)
+{
+	struct dm_icomp_io_range *io;
+	off_t bio_off = 0;
+	int ret;
+	sector_t bio_len  = DMCP_SECTOR_TO_BYTES(bio_sectors(req->bio));
+
+	SET_REQ_STAGE(req, STAGE_READ_DECOMP);
+
+	if (req->result)
+		return;
+
+	list_for_each_entry(io, &req->all_io, next) {
+		ssize_t dst_off = 0, src_off = 0, len;
+		unsigned int decomp_len;
+
+		io->io_region.sector -= req->info->data_start;
+
+		if (io->io_region.sector >=
+				req->bio->bi_iter.bi_sector)
+			dst_off = DMCP_SECTOR_TO_BYTES(
+				io->io_region.sector -
+				req->bio->bi_iter.bi_sector);
+		else
+			src_off = DMCP_SECTOR_TO_BYTES(
+				req->bio->bi_iter.bi_sector -
+				io->io_region.sector);
+
+		if (dm_icomp_update_io_read_range(io)) {
+			req->result = -EIO;
+			return;
+		}
+
+		/* Do decomp here */
+		ret = dm_icomp_io_range_decompress(req->info, io, &decomp_len);
+		if (ret < 0) {
+			dm_icomp_release_decomp_buffer(io);
+			dm_icomp_release_comp_buffer(io);
+			req->result = -EIO;
+			return;
+		}
+
+		len = min_t(ssize_t,
+			max_t(ssize_t, decomp_len - src_off, 0),
+			max_t(ssize_t, bio_len - dst_off, 0));
+
+		dm_icomp_bio_copy(req->bio, dst_off,
+			   io->decomp_data + src_off, len, false);
+
+		/* io range in all_io list is ordered for read IO */
+		while (bio_off < dst_off) {
+			ssize_t size = min_t(ssize_t, PAGE_SIZE,
+					dst_off - bio_off);
+			dm_icomp_bio_copy(req->bio, bio_off, empty_zero_page,
+					size, false);
+			bio_off += size;
+		}
+
+		bio_off = dst_off + len;
+		dm_icomp_release_decomp_buffer(io);
+		dm_icomp_release_comp_buffer(io);
+	}
+
+	while (bio_off < bio_len) {
+		ssize_t size = min_t(ssize_t, PAGE_SIZE, (bio_len - bio_off));
+
+		dm_icomp_bio_copy(req->bio, bio_off, empty_zero_page,
+			size, false);
+		bio_off += size;
+	}
+}
+
+
+/*
+ * read an extent
+ * @req        : the read request
+ * @block      : the block to be read
+ * @logical_sectors   : no of sectors occupied by the decompressed data
+ * @data_sectors      : no of sectors occupied by the compressed data
+ * @may_resize : the compress data size may change during its life.
+ */
+static void dm_icomp_read_one_extent(struct dm_icomp_req *req, u64 block,
+	u16 logical_sectors, u16 data_sectors, bool may_resize)
+{
+	struct dm_icomp_io_range *io;
+	long bio_off = 0, comp_len;
+	int actual_comp_len = DMCP_SECTOR_TO_BYTES(data_sectors);
+	int actual_decomp_len = DMCP_SECTOR_TO_BYTES(logical_sectors);
+
+	comp_len = actual_comp_len;
+	if (may_resize && !dm_icomp_can_handle_overflow(req->info))
+		comp_len = dm_icomp_compressor_maxlen(req->info,
+				actual_decomp_len);
+
+	bio_off	 =  (may_resize) ? -1 :
+			 DMCP_BLOCK_TO_SECTOR(block) -
+				req->bio->bi_iter.bi_sector;
+
+	io = dm_icomp_create_io_read_range(req, comp_len,
+		actual_decomp_len,
+		bio_off,
+		actual_comp_len);
+	if (!io) {
+		req->result = -EIO;
+		return;
+	}
+
+	dm_icomp_get_req(req);
+	list_add_tail(&io->next, &req->all_io);
+
+	io->io_region.sector = DMCP_BLOCK_TO_SECTOR(block) +
+				req->info->data_start;
+	io->io_region.count = data_sectors;
+	io->io_req.mem.ptr.addr = io->comp_data;
+	io->io_req.mem.type = DM_IO_KMEM;
+	io->io_req.mem.offset = 0;
+	io->io_req.bi_op = REQ_OP_READ;
+	io->io_req.bi_op_flags = (req->bio->bi_opf & REQ_FUA);
+
+	WARN_ON((io->io_region.sector + io->io_region.count)
+		>= req->info->total_sector);
+
+	dm_io(&io->io_req, 1, &io->io_region, NULL);
+}
+
+
+/*
+ * read the data corresponding to this request.
+ * @req   : the request.
+ * @reuse : the read data may be modified. So plan accordingly.
+ */
+static void dm_icomp_handle_read_existing(struct dm_icomp_req *req, bool reuse)
+{
+	u64 block_index, first_block_index;
+	u16 logical_sectors, data_sectors;
+
+	SET_REQ_STAGE(req, STAGE_READ_EXISTING);
+
+	block_index = DMCP_SECTOR_TO_BLOCK(req->bio->bi_iter.bi_sector);
+
+	while (!!!req->result &&
+		(block_index <= DMCP_SECTOR_TO_BLOCK(
+				bio_end_sector(req->bio)-1)) &&
+		(block_index < req->info->data_blocks)) {
+
+		dm_icomp_get_extent(req->info, block_index, &first_block_index,
+			&logical_sectors, &data_sectors);
+
+		if (data_sectors)
+			dm_icomp_read_one_extent(req, first_block_index,
+				logical_sectors, data_sectors, reuse);
+
+		block_index = first_block_index +
+				DMCP_SECTOR_TO_BLOCK(logical_sectors);
+	}
+}
+
+/*
+ * read existing data
+ */
+static void dm_icomp_handle_read_read_existing(struct dm_icomp_req *req)
+{
+	dm_icomp_handle_read_existing(req, false);
+
+	if (req->result)
+		return;
+
+	/* A shortcut if all data is in already */
+	if (list_empty(&req->all_io))
+		dm_icomp_handle_read_decomp(req);
+}
+
+static void dm_icomp_handle_read_request(struct dm_icomp_req *req)
+{
+	dm_icomp_get_req(req);
+
+	if (GET_REQ_STAGE(req) == STAGE_INIT) {
+		if (!dm_icomp_lock_req_range(req)) {
+			dm_icomp_put_req(req);
+			return;
+		}
+		dm_icomp_handle_read_read_existing(req);
+	} else if (GET_REQ_STAGE(req) == STAGE_READ_EXISTING) {
+		dm_icomp_handle_read_decomp(req);
+	}
+
+	dm_icomp_put_req(req);
+}
+
+static void dm_icomp_write_meta_done(void *context, unsigned long error)
+{
+	struct dm_icomp_req *req = context;
+
+	dm_icomp_put_req(req);
+}
+
+static u64 dm_icomp_block_meta_page_index(u64 block, bool end)
+{
+	u64 bits = block * DMCP_META_BITS - !!end;
+	/*
+	 * >> 5; 32 bits per entry
+	 * << 2; each entry is 4 bytes
+	 * >> PAGE_SHIFT; PAGE_SHIFT pages
+	 */
+	return bits >> (5 - 2 + PAGE_SHIFT);
+}
+
+
+/*
+ * write compressed data to the backing storage.
+ * @io : io range
+ * @sector_start : the sector on backing storage to which the
+ *	compressed data needs to be written.
+ * @meta_start: the page index of the bits corresponding to
+ * @meta_end  : start and end blocks.
+ */
+static int dm_icomp_compress_write(struct dm_icomp_io_range *io,
+		sector_t sector_start, u64 *meta_start, u64 *meta_end)
+{
+	struct dm_icomp_req *req = io->req;
+	sector_t count = DMCP_BYTES_TO_SECTOR(io->decomp_len);
+	unsigned int comp_len, ret;
+	u64 page_index;
+
+	/* comp_data must be able to accommadate a larger compress buffer */
+	ret = dm_icomp_io_range_compress(req->info, io, &comp_len);
+	if (ret < 0) {
+		req->result = -EIO;
+		return -EIO;
+	}
+	WARN_ON(comp_len > io->comp_len);
+
+	dm_icomp_get_req(req);
+
+	io->io_req.bi_op = REQ_OP_WRITE;
+	io->io_req.bi_op_flags = (req->bio->bi_opf & REQ_FUA);
+	io->io_req.mem.ptr.addr = io->comp_data;
+	io->io_req.mem.type = DM_IO_KMEM;
+	io->io_req.mem.offset = 0;
+	io->io_region.count = DMCP_BYTES_TO_SECTOR(comp_len);
+	io->io_region.sector = sector_start + req->info->data_start;
+
+	dm_icomp_release_decomp_buffer(io);
+
+
+	WARN_ON((io->io_region.sector + io->io_region.count)
+			>= req->info->total_sector);
+
+	dm_io(&io->io_req, 1, &io->io_region, NULL);
+
+	/* update the meta data bits */
+	dm_icomp_set_extent(req, DMCP_SECTOR_TO_BLOCK(sector_start),
+		DMCP_SECTOR_TO_BLOCK(count), DMCP_BYTES_TO_SECTOR(comp_len));
+
+	page_index = dm_icomp_block_meta_page_index(
+		DMCP_SECTOR_TO_BLOCK(sector_start), false);
+	if (*meta_start > page_index)
+		*meta_start = page_index;
+
+	page_index = dm_icomp_block_meta_page_index(
+			DMCP_SECTOR_TO_BLOCK(sector_start + count), true);
+	if (*meta_end < page_index)
+		*meta_end = page_index;
+	return 0;
+}
+
+/*
+ * modify and write compressed data to the backing storage.
+ * @io : io range
+ * @meta_start: the page index of the bits corresponding to
+ * @meta_end  : start and end blocks.
+ */
+static int dm_icomp_handle_write_modify(struct dm_icomp_io_range *io,
+	u64 *meta_start, u64 *meta_end)
+{
+	struct dm_icomp_req *req = io->req;
+	sector_t bio_start, bio_end, buf_start, buf_end, overlap;
+	off_t bio_off, buf_off;
+	int ret;
+	unsigned int decomp_len;
+
+	io->io_region.sector -= req->info->data_start;
+
+	/* decompress original data */
+	if (dm_icomp_update_io_read_range(io)) {
+		req->result = -EIO;
+		return -EIO;
+	}
+
+	ret = dm_icomp_io_range_decompress(req->info, io, &decomp_len);
+	if (ret < 0) {
+		req->result = -EINVAL;
+		return -EIO;
+	}
+
+	bio_start = req->bio->bi_iter.bi_sector;
+	bio_end = bio_end_sector(req->bio) - 1;
+
+	buf_start = io->io_region.sector;
+	buf_end = buf_start + DMCP_BYTES_TO_SECTOR(decomp_len) - 1;
+
+	/* if no overlap, nothing to do. Just return */
+	if (bio_start >= buf_end || bio_end <= buf_start)
+		return 0;
+
+	bio_off = (buf_start > bio_start) ?  (buf_start - bio_start) : 0;
+	buf_off = (bio_start > buf_start) ?  (bio_start - buf_start) : 0;
+
+	/*
+	 * overlap = sizeof(block1) + sizeof(block2) - sizeof(left_side_shift) -
+	 *		sizeof(right_side_shift)  / 2  +  1
+	 */
+	overlap  =  (((bio_end - bio_start) + (buf_end - buf_start) -
+		abs(buf_end - bio_end) - abs(buf_start - bio_start)) >> 1) + 1;
+
+
+	dm_icomp_bio_copy(req->bio, DMCP_SECTOR_TO_BYTES(bio_off),
+		   io->decomp_data + DMCP_SECTOR_TO_BYTES(buf_off),
+		   DMCP_SECTOR_TO_BYTES(overlap), true);
+
+	return dm_icomp_compress_write(io, io->io_region.sector,
+			meta_start, meta_end);
+}
+
+
+/*
+ * create and write new extents. Each extent is not more than
+ * 256 sectors.
+ * @req : the request
+ * @sec_start: the start sector of the request
+ * @total  : the total sectors
+ * @list  : collect each 256 sector size io request in this list
+ * @meta_start: the page index of the bits corresponding to
+ * @meta_end  : start and end blocks.
+ *
+ */
+static void dm_icomp_handle_write_create(struct dm_icomp_req *req,
+	sector_t sec_start, sector_t total,
+	struct list_head *list, u64 *meta_start, u64 *meta_end)
+{
+	struct dm_icomp_io_range *io;
+	sector_t count, offset = 0;
+	int ret;
+
+	while (total) {
+
+		/* max i/o is 128kbytes i.e 256 sectors */
+		count = min_t(sector_t, total, 256);
+		io = dm_icomp_create_io_write_range(req, offset, count);
+		if (!io) {
+			req->result = -EIO;
+			return;
+		}
+
+		ret = dm_icomp_compress_write(io, sec_start, meta_start,
+			meta_end);
+		if (ret) {
+			dm_icomp_free_io_range(io);
+			return;
+		}
+
+
+		list_add_tail(&io->next, list);
+		total -= count;
+		sec_start += count;
+		offset += count;
+
+	}
+}
+
+/*
+ *  handle the write request.
+ */
+static void dm_icomp_handle_write_comp(struct dm_icomp_req *req)
+{
+	struct dm_icomp_io_range *io;
+	sector_t io_start, req_start, req_end;
+	u64 meta_start = -1L, meta_end = 0;
+	LIST_HEAD(newlist);
+
+	SET_REQ_STAGE(req, STAGE_WRITE_COMP);
+
+	if (req->result)
+		return;
+
+	req_start = req->bio->bi_iter.bi_sector;
+	list_for_each_entry(io, &req->all_io, next) {
+
+		io_start = io->io_region.sector - req->info->data_start;
+
+		if (req_start < io_start) {
+			/* fill the gap */
+			dm_icomp_handle_write_create(req, req_start,
+				(io_start - req_start), &newlist,
+				&meta_start, &meta_end);
+		}
+
+		dm_icomp_handle_write_modify(io, &meta_start, &meta_end);
+
+		req_start = io_start + DMCP_BYTES_TO_SECTOR(io->logical_bytes);
+	}
+
+	req_end =  bio_end_sector(req->bio);
+	if (req_start < req_end) {
+		/* fill the gap */
+		dm_icomp_handle_write_create(req, req_start,
+			 req_end-req_start, &newlist, &meta_start,
+			&meta_end);
+	}
+
+	list_splice_tail(&newlist, &req->all_io);
+
+	if (req->info->write_mode == DMCP_WRITE_THROUGH ||
+				(req->bio->bi_opf & REQ_FUA)) {
+		if (meta_start == -1)
+			return;
+		dm_icomp_get_req(req);
+		dm_icomp_write_meta(req->info, meta_start,
+			meta_end+1, req,
+			dm_icomp_write_meta_done,
+			REQ_OP_WRITE, req->bio->bi_opf);
+	}
+}
+
+/*
+ *  read the data, modify and write it back to the backing store.
+ */
+static void dm_icomp_handle_write_read_existing(struct dm_icomp_req *req)
+{
+	dm_icomp_handle_read_existing(req, true);
+	if (req->result)
+		return;
+
+	if (list_empty(&req->all_io))
+		dm_icomp_handle_write_comp(req);
+}
+
+static void dm_icomp_handle_write_request(struct dm_icomp_req *req)
+{
+	dm_icomp_get_req(req);
+
+	if (GET_REQ_STAGE(req) == STAGE_INIT) {
+		if (!dm_icomp_lock_req_range(req)) {
+			dm_icomp_put_req(req);
+			return;
+		}
+		dm_icomp_handle_write_read_existing(req);
+	} else if (GET_REQ_STAGE(req) == STAGE_READ_EXISTING) {
+		dm_icomp_handle_write_comp(req);
+	}
+
+	dm_icomp_put_req(req);
+}
+
+/* For writeback mode */
+static void dm_icomp_handle_flush_request(struct dm_icomp_req *req)
+{
+	struct writeback_flush_data wb;
+
+	atomic_set(&wb.cnt, 1);
+	init_completion(&wb.complete);
+
+	dm_icomp_flush_dirty_meta(req->info, &wb);
+
+	writeback_flush_io_done(&wb, 0);
+	wait_for_completion(&wb.complete);
+
+	req->bio->bi_error = 0;
+	bio_endio(req->bio);
+	kmem_cache_free(dm_icomp_req_cachep, req);
+}
+
+static void dm_icomp_handle_request(struct dm_icomp_req *req)
+{
+	if (req->bio->bi_opf & REQ_PREFLUSH)
+		dm_icomp_handle_flush_request(req);
+	else if (op_is_write(bio_op(req->bio)))
+		dm_icomp_handle_write_request(req);
+	else
+		dm_icomp_handle_read_request(req);
+}
+
+static void dm_icomp_do_request_work(struct work_struct *work)
+{
+	struct dm_icomp_io_worker *worker = container_of(work,
+				struct dm_icomp_io_worker, work);
+	LIST_HEAD(list);
+	struct dm_icomp_req *req;
+	struct blk_plug plug;
+	bool repeat;
+
+	blk_start_plug(&plug);
+again:
+	spin_lock_irq(&worker->lock);
+	list_splice_init(&worker->pending, &list);
+	spin_unlock_irq(&worker->lock);
+
+	repeat = !list_empty(&list);
+	while (!list_empty(&list)) {
+		req = list_first_entry(&list, struct dm_icomp_req, sibling);
+		list_del(&req->sibling);
+
+		schedule();
+		dm_icomp_handle_request(req);
+	}
+	if (repeat)
+		goto again;
+	blk_finish_plug(&plug);
+}
+
+static bool valid_request(struct bio *bio, struct dm_icomp_info *info)
+{
+	sector_t dev_end	=  info->ti->len;
+	sector_t req_end	=  bio_end_sector(bio) - 1;
+
+	return (req_end <= dev_end);
+}
+
+static int dm_icomp_map(struct dm_target *ti, struct bio *bio)
+{
+	struct dm_icomp_info *info = ti->private;
+	struct dm_icomp_req *req;
+
+	if ((bio->bi_opf & REQ_PREFLUSH) &&
+			info->write_mode == DMCP_WRITE_THROUGH) {
+		bio->bi_bdev = info->dev->bdev;
+		return DM_MAPIO_REMAPPED;
+	}
+
+
+	req = kmem_cache_alloc(dm_icomp_req_cachep, GFP_NOIO);
+	if (!req)
+		return -EIO;
+
+	req->bio = bio;
+	if (!(bio->bi_opf & REQ_PREFLUSH) && !valid_request(bio, info)) {
+		req->bio = bio;
+		req->bio->bi_error = -EINVAL;
+		bio_endio(req->bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	req->info = info;
+	atomic_set(&req->io_pending, 0);
+	INIT_LIST_HEAD(&req->all_io);
+	req->result = 0;
+	SET_REQ_STAGE(req, STAGE_INIT);
+	req->locked_locks = 0;
+
+	req->cpu = raw_smp_processor_id();
+	dm_icomp_queue_req(info, req);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+static void dm_icomp_status(struct dm_target *ti, status_type_t type,
+	  unsigned int status_flags, char *result, unsigned int maxlen)
+{
+	struct dm_icomp_info *info = ti->private;
+	unsigned int sz = 0;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("%lu %lu %lu",
+			atomic64_read(&info->uncompressed_write_size),
+			atomic64_read(&info->compressed_write_size),
+			atomic64_read(&info->meta_write_size));
+		break;
+	case STATUSTYPE_TABLE:
+		if (info->write_mode == DMCP_WRITE_BACK)
+			DMEMIT("%s %s:%d %s:%s %s:%d", info->dev->name,
+				"writeback", info->writeback_delay,
+				"compressor", compressors[info->comp_alg].name,
+				"critical", info->critical);
+		else
+			DMEMIT("%s %s %s:%s %s:%d", info->dev->name,
+				"writethrough",
+				"compressor", compressors[info->comp_alg].name,
+				"critical", info->critical);
+		break;
+	}
+}
+
+static int dm_icomp_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct dm_icomp_info *info = ti->private;
+
+	return fn(ti, info->dev, info->data_start,
+		DMCP_BLOCK_TO_SECTOR(info->data_blocks), data);
+}
+
+static void dm_icomp_io_hints(struct dm_target *ti,
+			    struct queue_limits *limits)
+{
+	/* No blk_limits_logical_block_size */
+	limits->logical_block_size = limits->physical_block_size =
+		limits->io_min = DMCP_BLOCK_SIZE;
+	limits->max_sectors = limits->max_hw_sectors =
+		DMCP_BYTES_TO_SECTOR(DMCP_MAX_SIZE);
+}
+
+static struct target_type dm_icomp_target = {
+	.name   = "inplacecompress",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr    = dm_icomp_ctr,
+	.dtr    = dm_icomp_dtr,
+	.map    = dm_icomp_map,
+	.status = dm_icomp_status,
+	.iterate_devices = dm_icomp_iterate_devices,
+	.io_hints = dm_icomp_io_hints,
+};
+
+static int __init dm_icomp_init(void)
+{
+	int r;
+
+	if (select_default_compressor())
+		return -EINVAL;
+
+	r = -ENOMEM;
+	dm_icomp_req_cachep = kmem_cache_create("dm_icomp_requests",
+		sizeof(struct dm_icomp_req), 0, 0, NULL);
+	if (!dm_icomp_req_cachep) {
+		DMWARN("Can't create request cache");
+		goto err;
+	}
+
+	dm_icomp_io_range_cachep = kmem_cache_create("dm_icomp_io_range",
+		sizeof(struct dm_icomp_io_range), 0, 0, NULL);
+	if (!dm_icomp_io_range_cachep) {
+		DMWARN("Can't create io_range cache");
+		goto err;
+	}
+
+	dm_icomp_meta_io_cachep = kmem_cache_create("dm_icomp_meta_io",
+		sizeof(struct dm_icomp_meta_io), 0, 0, NULL);
+	if (!dm_icomp_meta_io_cachep) {
+		DMWARN("Can't create meta_io cache");
+		goto err;
+	}
+
+	dm_icomp_wq = alloc_workqueue("dm_icomp_io",
+		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
+	if (!dm_icomp_wq) {
+		DMWARN("Can't create io workqueue");
+		goto err;
+	}
+
+	r = dm_register_target(&dm_icomp_target);
+	if (r < 0) {
+		DMWARN("target registration failed");
+		goto err;
+	}
+
+	for_each_possible_cpu(r) {
+		INIT_LIST_HEAD(&dm_icomp_io_workers[r].pending);
+		spin_lock_init(&dm_icomp_io_workers[r].lock);
+		INIT_WORK(&dm_icomp_io_workers[r].work,
+				dm_icomp_do_request_work);
+	}
+	return 0;
+err:
+	kmem_cache_destroy(dm_icomp_req_cachep);
+	kmem_cache_destroy(dm_icomp_io_range_cachep);
+	kmem_cache_destroy(dm_icomp_meta_io_cachep);
+	if (dm_icomp_wq)
+		destroy_workqueue(dm_icomp_wq);
+
+	return r;
+}
+
+static void __exit dm_icomp_exit(void)
+{
+	dm_unregister_target(&dm_icomp_target);
+	kmem_cache_destroy(dm_icomp_req_cachep);
+	kmem_cache_destroy(dm_icomp_io_range_cachep);
+	kmem_cache_destroy(dm_icomp_meta_io_cachep);
+	destroy_workqueue(dm_icomp_wq);
+}
+
+module_init(dm_icomp_init);
+module_exit(dm_icomp_exit);
+
+MODULE_AUTHOR("Shaohua Li <shli@kernel.org>");
+MODULE_DESCRIPTION(DM_NAME " target with data inplace-compression");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-inplace-compress.h b/drivers/md/dm-inplace-compress.h
new file mode 100644
index 0000000..757799a
--- /dev/null
+++ b/drivers/md/dm-inplace-compress.h
@@ -0,0 +1,187 @@
+#ifndef __DM_INPLACE_COMPRESS_H__
+#define __DM_INPLACE_COMPRESS_H__
+#include <linux/types.h>
+
+#define DMCP_SUPER_MAGIC 0x106526c206506c09
+#define DMCP_COMPRESS_MAGIC 0xfaceecaf
+struct dm_icomp_super_block {
+	__le64 magic;
+	__le64 meta_blocks;
+	__le64 data_blocks;
+	u8 comp_alg;
+} __packed;
+
+#define DMCP_COMP_ALG_LZO 1
+#define DMCP_COMP_ALG_842 0
+
+#ifdef __KERNEL__
+/*
+ * Minium logical size of this target is 4096 byte, which is a block.
+ * Data of a block is compressed. Compressed data is round up to 512B, which is
+ * the payload. For each block, we have 5 bits meta data. bit 0 - 3 stands
+ * payload length(0 - 8 sectors). If compressed payload length is 8 sectors, we
+ * just store uncompressed data. Actual compressed data length is stored at the
+ * last 32 bits of payload if data is compressed. In disk, payload is stored at
+ * the beginning of logical sector of the block. If IO size is bigger than one
+ * block, we store the whole data as an extent. Bit 4 stands tail for an
+ * extent. Max allowed extent size is 128k.
+ */
+#define DMCP_BLOCK_SHIFT	12
+#define DMCP_BLOCK_SIZE		(1 << DMCP_BLOCK_SHIFT)
+#define DMCP_SECTOR_SHIFT	SECTOR_SHIFT
+#define DMCP_SECTOR_SIZE	(1 << SECTOR_SHIFT)
+#define DMCP_BLOCK_SECTOR_SHIFT (DMCP_BLOCK_SHIFT - DMCP_SECTOR_SHIFT)
+#define DMCP_BLOCK_TO_SECTOR(b) ((b) << DMCP_BLOCK_SECTOR_SHIFT)
+#define DMCP_SECTOR_TO_BLOCK(s) ((s) >> DMCP_BLOCK_SECTOR_SHIFT)
+#define DMCP_SECTOR_TO_BYTES(s) ((s) << DMCP_SECTOR_SHIFT)
+#define DMCP_BYTES_TO_SECTOR(b) ((b) >> DMCP_SECTOR_SHIFT)
+#define DMCP_BYTES_TO_BLOCK(b)	((b) >> DMCP_BLOCK_SHIFT)
+
+#define DMCP_MIN_SIZE	DMCP_BLOCK_SIZE
+#define DMCP_MAX_SIZE	(128 * 2 * DMCP_SECTOR_SIZE) /* 128k */
+
+#define DMCP_BITS_PER_ENTRY	32
+#define DMCP_META_BITS		5
+#define DMCP_LENGTH_BITS	4
+#define DMCP_TAIL_MASK		(1 << DMCP_LENGTH_BITS)
+#define DMCP_LENGTH_MASK	(DMCP_TAIL_MASK - 1)
+
+#define DMCP_META_START_SECTOR (DMCP_BLOCK_SIZE >> DMCP_SECTOR_SHIFT)
+
+enum DMCP_WRITE_MODE {
+	DMCP_WRITE_BACK,
+	DMCP_WRITE_THROUGH,
+};
+
+/*
+ * a lock spans 128 blocks i.e 512kbytes.
+ * max I/O is 128K, which can at-most span two locks.
+ */
+#define BITMAP_HASH_SHIFT 7
+#define BITMAP_HASH_LEN (1<<6)
+#define BITMAP_HASH_MASK (BITMAP_HASH_LEN - 1)
+struct dm_icomp_hash_lock {
+	int io_running;
+	spinlock_t wait_lock;
+	struct list_head wait_list;
+};
+
+struct dm_icomp_info {
+	struct dm_target *ti;
+	struct dm_dev *dev;
+
+	int comp_alg;
+	bool critical;
+	struct crypto_comp *tfm[NR_CPUS];
+
+	sector_t total_sector;	/* total sectors in the backing store */
+	sector_t data_start;
+	u64 data_blocks;
+	u64 no_of_sectors;
+
+	u32 *meta_bitmap;
+	u64 meta_bitmap_bits;
+	u64 meta_bitmap_pages;
+	struct dm_icomp_hash_lock bitmap_locks[BITMAP_HASH_LEN];
+
+	enum DMCP_WRITE_MODE write_mode;
+	unsigned int writeback_delay; /* second */
+	struct task_struct *writeback_tsk;
+	struct dm_io_client *io_client;
+
+	atomic64_t compressed_write_size;
+	atomic64_t uncompressed_write_size;
+	atomic64_t meta_write_size;
+};
+
+struct dm_icomp_meta_io {
+	struct dm_io_request io_req;
+	struct dm_io_region io_region;
+	void *data;
+	void (*fn)(void *data, unsigned long error);
+};
+
+struct dm_icomp_io_range {
+	struct dm_io_request io_req;
+	struct dm_io_region io_region;
+	bool decomp_kmap;	     /* Is the decomp_data kmapped'? */
+	void *decomp_data;
+	void *decomp_real_data;      /* holds the actual start of the buffer */
+	unsigned int decomp_len;     /* actual allocated/mapped length */
+	unsigned int logical_bytes;  /* decompressed size of the extent */
+	bool comp_kmap;		     /* Is the comp_data kmapped'? */
+	void *comp_data;
+	void *comp_real_data;	     /* holds the actual start of the buffer */
+	unsigned int comp_len;	     /* actual allocated/mapped length */
+	unsigned int data_bytes;     /* compressed size of the extent */
+	struct list_head next;
+	struct dm_icomp_req *req;
+};
+
+enum DMCP_REQ_STAGE {
+	STAGE_INIT,
+	STAGE_READ_EXISTING,
+	STAGE_READ_DECOMP,
+	STAGE_WRITE_COMP,
+	STAGE_DONE,
+};
+
+struct dm_icomp_req {
+	struct bio *bio;
+	struct dm_icomp_info *info;
+	struct list_head sibling;
+	struct list_head all_io;
+	atomic_t io_pending;
+	enum DMCP_REQ_STAGE stage;
+	struct dm_icomp_hash_lock *locks[2];
+	int locked_locks;
+	int result;
+	int cpu;
+	struct work_struct work;
+};
+
+struct dm_icomp_io_worker {
+	struct list_head pending;
+	spinlock_t lock;
+	struct work_struct work;
+};
+
+struct dm_icomp_compressor_data {
+	char *name;
+	bool can_handle_overflow;
+	int (*comp_len)(int comp_len);
+	int (*max_comp_len)(int comp_len);
+};
+
+static inline int lzo_comp_len(int comp_len)
+{
+	/* lzo compression overshoots the comp buffer
+	 * if the buffer size is insufficient.
+	 * Once that bug is fixed we can return half
+	 * the length.
+	 *
+	 * return lzo1x_worst_compress(comp_len) >> 1;
+	 *
+	 * For now its the full length.
+	 */
+	return lzo1x_worst_compress(comp_len);
+}
+
+static inline int lzo_max_comp_len(int comp_len)
+{
+	return lzo1x_worst_compress(comp_len);
+}
+
+static inline int nx842_comp_len(int comp_len)
+{
+	return (comp_len >> 1);
+}
+
+static inline int nx842_max_comp_len(int comp_len)
+{
+	return comp_len;
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* __DM_INPLACE_COMPRESS_H__ */
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH v3 1/1] DM: inplace compressed DM target
From: kbuild test robot @ 2017-01-31 10:32 UTC (permalink / raw)
  To: Ram Pai
  Cc: kbuild-all, dm-devel, linux-doc, linux-kernel, linux-raid, agk,
	snitzer, corbet, shli, hbabu
In-Reply-To: <1485848533-27778-2-git-send-email-linuxram@us.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 4195 bytes --]

Hi Ram,

[auto build test WARNING on dm/for-next]
[also build test WARNING on v4.10-rc6 next-20170130]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Ram-Pai/DM-inplace-compressed-DM-target/20170131-154811
base:   https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git for-next
config: cris-allyesconfig (attached as .config)
compiler: cris-linux-gcc (GCC) 6.2.0
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=cris 

All warnings (new ones prefixed by >>):

   drivers/md/dm-inplace-compress.c: In function 'dm_icomp_handle_read_decomp':
>> drivers/md/dm-inplace-compress.c:1580:41: warning: passing argument 3 of 'dm_icomp_bio_copy' makes pointer from integer without a cast [-Wint-conversion]
       dm_icomp_bio_copy(req->bio, bio_off, empty_zero_page,
                                            ^~~~~~~~~~~~~~~
   drivers/md/dm-inplace-compress.c:1133:13: note: expected 'void *' but argument is of type 'long unsigned int'
    static void dm_icomp_bio_copy(struct bio *bio, off_t bio_off, void *buf,
                ^~~~~~~~~~~~~~~~~
   drivers/md/dm-inplace-compress.c:1593:40: warning: passing argument 3 of 'dm_icomp_bio_copy' makes pointer from integer without a cast [-Wint-conversion]
      dm_icomp_bio_copy(req->bio, bio_off, empty_zero_page,
                                           ^~~~~~~~~~~~~~~
   drivers/md/dm-inplace-compress.c:1133:13: note: expected 'void *' but argument is of type 'long unsigned int'
    static void dm_icomp_bio_copy(struct bio *bio, off_t bio_off, void *buf,
                ^~~~~~~~~~~~~~~~~
   In file included from drivers/md/dm-inplace-compress.c:12:0:
   drivers/md/dm-inplace-compress.c: In function 'dm_icomp_status':
   drivers/md/dm-inplace-compress.c:2090:10: warning: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'long long int' [-Wformat=]
      DMEMIT("%lu %lu %lu",
             ^
   include/linux/device-mapper.h:577:46: note: in definition of macro 'DMEMIT'
         0 : scnprintf(result + sz, maxlen - sz, x))
                                                 ^
   drivers/md/dm-inplace-compress.c:2090:10: warning: format '%lu' expects argument of type 'long unsigned int', but argument 5 has type 'long long int' [-Wformat=]
      DMEMIT("%lu %lu %lu",
             ^
   include/linux/device-mapper.h:577:46: note: in definition of macro 'DMEMIT'
         0 : scnprintf(result + sz, maxlen - sz, x))
                                                 ^
   drivers/md/dm-inplace-compress.c:2090:10: warning: format '%lu' expects argument of type 'long unsigned int', but argument 6 has type 'long long int' [-Wformat=]
      DMEMIT("%lu %lu %lu",
             ^
   include/linux/device-mapper.h:577:46: note: in definition of macro 'DMEMIT'
         0 : scnprintf(result + sz, maxlen - sz, x))
                                                 ^

vim +/dm_icomp_bio_copy +1580 drivers/md/dm-inplace-compress.c

  1564				dm_icomp_release_comp_buffer(io);
  1565				req->result = -EIO;
  1566				return;
  1567			}
  1568	
  1569			len = min_t(ssize_t,
  1570				max_t(ssize_t, decomp_len - src_off, 0),
  1571				max_t(ssize_t, bio_len - dst_off, 0));
  1572	
  1573			dm_icomp_bio_copy(req->bio, dst_off,
  1574				   io->decomp_data + src_off, len, false);
  1575	
  1576			/* io range in all_io list is ordered for read IO */
  1577			while (bio_off < dst_off) {
  1578				ssize_t size = min_t(ssize_t, PAGE_SIZE,
  1579						dst_off - bio_off);
> 1580				dm_icomp_bio_copy(req->bio, bio_off, empty_zero_page,
  1581						size, false);
  1582				bio_off += size;
  1583			}
  1584	
  1585			bio_off = dst_off + len;
  1586			dm_icomp_release_decomp_buffer(io);
  1587			dm_icomp_release_comp_buffer(io);
  1588		}

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 40207 bytes --]

^ permalink raw reply

* Re: [PATCH v3 1/1] DM: inplace compressed DM target
From: kbuild test robot @ 2017-01-31 11:46 UTC (permalink / raw)
  To: Ram Pai
  Cc: kbuild-all, dm-devel, linux-doc, linux-kernel, linux-raid, agk,
	snitzer, corbet, shli, hbabu
In-Reply-To: <1485848533-27778-2-git-send-email-linuxram@us.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 1195 bytes --]

Hi Ram,

[auto build test ERROR on dm/for-next]
[also build test ERROR on v4.10-rc6]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Ram-Pai/DM-inplace-compressed-DM-target/20170131-154811
base:   https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git for-next
config: m68k-allmodconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 4.9.0
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=m68k 

All errors (new ones prefixed by >>):

   WARNING: modpost: missing MODULE_LICENSE() in drivers/media/dvb-frontends/gp8psk-fe.o
   see include/linux/module.h for more information
>> ERROR: "__umoddi3" [drivers/md/dm-inplace-compress.ko] undefined!
>> ERROR: "__udivdi3" [drivers/md/dm-inplace-compress.ko] undefined!

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 38248 bytes --]

^ permalink raw reply

* [PATCH] MD: add doc for raid5-cache
From: Shaohua Li @ 2017-01-31 19:18 UTC (permalink / raw)
  To: linux-raid; +Cc: antlists, philip, songliubraving, neilb

I'm starting document of the raid5-cache feature. Please let me know
what else we should put into the document. Of course, comments are
welcome!

Signed-off-by: Shaohua Li <shli@fb.com>
---
 Documentation/md/raid5-cache.txt | 99 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 Documentation/md/raid5-cache.txt

diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt
new file mode 100644
index 0000000..17a6279
--- /dev/null
+++ b/Documentation/md/raid5-cache.txt
@@ -0,0 +1,99 @@
+RAID5 cache
+
+Raid 4/5/6 could include an extra disk for data cache. The cache could be
+in write-through or write-back mode. mdadm has a new option
+'--write-journal' to create array with cache. By default (raid array
+starts), the cache is in write-through mode. User can switch it to
+write-back mode by:
+
+echo "write-back" > /sys/block/md0/md/journal_mode
+
+And switch it back to write-through mode by:
+
+echo "write-through" > /sys/block/md0/md/journal_mode
+
+In both modes, all writes to the array will hit cache disk first. This means
+the cache disk must be fast and sustainable (if you use a SSD as the cache).
+
+-------------------------------------
+write-through mode:
+
+This mode mainly fixes 'write hole' issue. For RAID 4/5/6 array, an
+unclean shutdown could cause data in some stripes is not in consistent
+state, eg, data and parity don't match. The reason is a stripe write
+involves several raid disks and it's possible writes don't hit all raid
+disks yet before the unclean shutdown. After an unclean shutdown, MD try
+to 'resync' the array to put all stripes back into consistent state. In
+the resync, any disk failure will cause real data corruption. This problem
+is called 'write hole'. So the 'write hole' issue occurs between unclean
+shutdown and 'resync'. This window isn't big. On the other hand, if one
+disk fails, other disks could fail soon, which happens sometimes if the
+disks are from the same vendor and manufactured in the same time. This
+will increase the chance of 'write whole', but overall the chance isn't
+big, so don't panic even not using cache disk.
+
+The write-through cache will cache all data in cache disk first. Until the
+data hits into the cache disk, the data is flushed into RAID disks. The
+two-step write will guarantee MD can recover correct data after unclean
+shutdown even with disk failure. Thus the cache can close the 'write
+hole'.
+
+In write-through mode, MD reports IO finish to upper layer (usually
+filesystems) till the data hits RAID disks, so cache disk failure doesn't
+cause data lost. Of course cache disk failure means the array is exposed
+into 'write hole' again.
+
+--------------------------------------
+write-back mode:
+
+write-back mode fixes the 'write hole' issue too, since all write data is
+cached in cache disk. But the main goal of 'write-back' cache is to speed up
+write. If a write crosses all raid disks of a stripe, we call it full-stripe
+write. For non-full-stripe write, MD must do a read-modify-write. The extra
+read (for data in other disks) and write (for parity) introduce a lot of
+overhead. Some writes which are sequential but not dispatched in the same time
+will suffer from this overhead too. write-back cache will aggregate the data
+and flush the data to raid disks till the data becomes a full stripe write.
+This will completely avoid the overhead, so it's very helpful for some
+workloads. A typical workload which does sequential write and follows fsync is
+an example.
+
+In write-back mode, MD reports IO finish to upper layer (usually filesystems)
+right after the data hit cache disk. The data is flushed to raid disks later
+after specific conditions met. So cache disk failure will cause data lost.
+
+--------------------------------------
+The implementation:
+
+The write-through and write-back cache use the same disk format. The cache disk
+is organized as a simple write log. The log consists of 'meta data' and 'data'
+pairs. The meta data describes the data. It also includes checksum and sequence
+ID for recovery identification. Data could be IO data and parity data. Data is
+checksumed too. The checksum is stored in the meta data ahead of the data. The
+checksum is an optimization because MD can write meta and data freely without
+worry about the order. MD superblock has a field pointed to the valid meta data
+of log head.
+
+The log implementation is pretty straightforward. The difficult part is the
+order MD write data to cache disk and raid disks. Specifically, in
+write-through mode, MD calculates parity for IO data, writes both IO data and
+parity to the log, write the data and parity to raid disks after the data and
+parity is settled down in log and finally the IO is finished. Read just reads
+from raid disks as usual.
+
+In write-back mode, MD writes IO data to the log and reports IO finish. The
+data is also fully cached in memory at that time, which means read must query
+memory cache. If some conditions are met, MD will flush the data to raid disks.
+MD will calculate parity for the data and write parity into the log. After this
+is finished, MD will write both data and parity into raid disks, then MD can
+release the memory cache. The flush conditions could be stripe becomes a full
+stripe write, free cache disk space is low or in-kernel memory cache space is
+low.
+
+After an unclean shutdown, MD does recovery. MD reads all meta data and data
+from the log. The sequence ID and checksum will help us detect corrupted meta
+data and data. If MD finds a stripe with data and valid parities (1 parity for
+raid4/5 and 2 for raid6), MD will write the data and parities to raid disks. If
+parities are incompleted, they are discarded. If part of data is corrupted,
+they are discarded too. MD then loads valid data and writes them to raid disks
+in normal way.
-- 
2.9.3


^ permalink raw reply related

* Re: [systemd-devel] Errorneous detection of degraded array
From: Andrei Borzenkov @ 2017-01-31 20:17 UTC (permalink / raw)
  To: NeilBrown; +Cc: Luke Pyzowski, systemd-devel@lists.freedesktop.org, linux-raid
In-Reply-To: <8760kwry0r.fsf@notabene.neil.brown.name>


[-- Attachment #1.1: Type: text/plain, Size: 4222 bytes --]

31.01.2017 01:19, NeilBrown пишет:
> On Mon, Jan 30 2017, Andrei Borzenkov wrote:
> 
>> On Mon, Jan 30, 2017 at 9:36 AM, NeilBrown <neilb@suse.com> wrote:
>> ...
>>>>>>>
>>>>>>> systemd[1]: Created slice system-mdadm\x2dlast\x2dresort.slice.
>>>>>>> systemd[1]: Starting system-mdadm\x2dlast\x2dresort.slice.
>>>>>>> systemd[1]: Starting Activate md array even though degraded...
>>>>>>> systemd[1]: Stopped target Local File Systems.
>>>>>>> systemd[1]: Stopping Local File Systems.
>>>>>>> systemd[1]: Unmounting /share...
>>>>>>> systemd[1]: Stopped (with error) /dev/md0.
>>>>>
>> ...
>>>
>>> The race is, I think, that one I mentioned.  If the md device is started
>>> before udev tells systemd to start the timer, the Conflicts dependencies
>>> goes the "wrong" way and stops the wrong thing.
>>>
>>
>> From the logs provided it is unclear whether it is *timer* or
>> *service*. If it is timer - I do not understand why it is started
>> exactly 30 seconds after device apparently appears. This would match
>> starting service.
> 
> My guess is that the timer is triggered immediately after the device is
> started, but before it is mounted.
> The Conflicts directive tries to stop the device, but is cannot stop the
> device and there are no dependencies yet, so nothing happen.
> After the timer fires (30 seconds later) the .service starts.  It also
> has a Conflicts directory so systemd tried to stop the device again.
> Now that it has been mounted, there is a dependences that can be
> stopped, and the device gets unmounted.
> 
>>
>> Yet another case where system logging is hopelessly unfriendly for
>> troubleshooting :(
>>
>>> It would be nice to be able to reliably stop the timer when the device
>>> starts, without risking having the device get stopped when the timer
>>> starts, but I don't think we can reliably do that.
>>>
>>
>> Well, let's wait until we can get some more information about what happens.
>>

Not much more, but we at least have confirmed that it was indeed last
resort service which was fired off by last resort timer. Unfortunately
no trace of timer itself.

>>> Changing the
>>>   Conflicts=sys-devices-virtual-block-%i.device
>>> lines to
>>>   ConditionPathExists=/sys/devices/virtual/block/%i
>>> might make the problem go away, without any negative consequences.
>>>
>>
>> Ugly, but yes, may be this is the only way using current systemd.
>>

This won't work. sysfs node appears as soon as the very first array
member is found and array is still inactive, while what we need is
condition "array is active".

Conflicts line works because array is not announced to systemd
(SYSTEMD_READY) until it is active. Which in turn is derived from the
content of md/array_state.

>>> The primary purpose of having the 'Conflicts' directives was so that
>>> systemd wouldn't log
>>>   Starting Activate md array even though degraded
>>> after the array was successfully started.
>>

Yes, I understand it.

>> This looks like cosmetic problem. What will happen if last resort
>> service is started when array is fully assembled? Will it do any harm?
> 
> Yes, it could be seen as cosmetic, but cosmetic issues can be important
> too.  Confusing messages in logs can be harmful.
> 
> In all likely cases, running the last-resort service won't cause any
> harm.
> If, during the 30 seconds, the array is started, then deliberately
> stopped, then partially assembled again, then when the last-resort
> service finally starts it might do the wrong thing.
> So it would be cleanest if the timer was killed as soon as the device
> is started.  But I don't think there is a practical concern.
> 
> I guess I could make a udev rule that fires when the array started, and
> that runs "systemctl stop mdadm-last-resort@md0.timer"
> 


Well ... what we really need is unidirectional dependency. Actually the
way Conflicts is used *is* unidirectional anyway - nobody seriously
expects that starting foo.service will stop currently running
shutdown.target. But that is semantic we have currently.

But this probably will do to mitigate this issue until something more
generic can be implemented.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: [dm-devel] split scsi passthrough fields out of struct request V2
From: Bart Van Assche @ 2017-01-31 21:35 UTC (permalink / raw)
  To: axboe@kernel.dk
  Cc: linux-block@vger.kernel.org, linux-raid@vger.kernel.org,
	snitzer@redhat.com, hch@lst.de, linux-scsi@vger.kernel.org,
	axboe@fb.com, j-nomura@ce.jp.nec.com, dm-devel@redhat.com
In-Reply-To: <4D024E85-CDE7-4FB0-B8CA-F2B8C86CCFCB@kernel.dk>

On Mon, 2017-01-30 at 17:38 -0800, Jens Axboe wrote:
> That's a known bug in mainline. Pull it into 4.10-rc6,
> or use my for-next where everything is already merged. 

Hello Jens,

With your for-next branch (commit c2e60b3a2602) I haven't hit any block
layer crashes so far. The only issue I encountered that is new is a
memory leak triggered by the SG-IO code. These memory leak reports
started to appear after I started testing the mq-deadline scheduler.
kmemleak reported the following call stack multiple times after my tests
had finished:

unreferenced object 0xffff88041119e528 (size 192):
  comm "multipathd", pid 2353, jiffies 4295128020 (age 1332.440s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 12 01 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff8165e3b5>] kmemleak_alloc+0x45/0xa0
    [<ffffffff811cc23d>] __kmalloc+0x15d/0x2f0
    [<ffffffff81310e35>] bio_alloc_bioset+0x185/0x1f0
    [<ffffffff813117f4>] bio_map_user_iov+0x124/0x400
    [<ffffffff81320b7a>] blk_rq_map_user_iov+0x11a/0x210
    [<ffffffff81320cbd>] blk_rq_map_user+0x4d/0x60
    [<ffffffff81336694>] sg_io+0x3d4/0x410
    [<ffffffff813369d0>] scsi_cmd_ioctl+0x300/0x490
    [<ffffffff81336b9d>] scsi_cmd_blk_ioctl+0x3d/0x50
    [<ffffffff814b4360>] sd_ioctl+0x80/0x100
    [<ffffffff8132ddde>] blkdev_ioctl+0x51e/0x9f0
    [<ffffffff8122f388>] block_ioctl+0x38/0x40
    [<ffffffff8120097f>] do_vfs_ioctl+0x8f/0x700
    [<ffffffff8120102c>] SyS_ioctl+0x3c/0x70
    [<ffffffff8166c4aa>] entry_SYSCALL_64_fastpath+0x18/0xad

Bart.

^ permalink raw reply

* Re: [dm-devel] split scsi passthrough fields out of struct request V2
From: Bart Van Assche @ 2017-01-31 21:55 UTC (permalink / raw)
  To: axboe@kernel.dk
  Cc: linux-block@vger.kernel.org, linux-raid@vger.kernel.org,
	snitzer@redhat.com, hch@lst.de, linux-scsi@vger.kernel.org,
	axboe@fb.com, j-nomura@ce.jp.nec.com, dm-devel@redhat.com
In-Reply-To: <1485898487.3113.7.camel@sandisk.com>

On Tue, 2017-01-31 at 13:34 -0800, Bart Van Assche wrote:
> On Mon, 2017-01-30 at 17:38 -0800, Jens Axboe wrote:
> > That's a known bug in mainline. Pull it into 4.10-rc6,
> > or use my for-next where everything is already merged. 
> 
> Hello Jens,
> 
> With your for-next branch (commit c2e60b3a2602) I haven't hit any block
> layer crashes so far. The only issue I encountered that is new is a
> memory leak triggered by the SG-IO code. These memory leak reports
> started to appear after I started testing the mq-deadline scheduler.
> kmemleak reported the following call stack multiple times after my tests
> had finished:
> 
> unreferenced object 0xffff88041119e528 (size 192):
>   comm "multipathd", pid 2353, jiffies 4295128020 (age 1332.440s)
>   hex dump (first 32 bytes):
>     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
>     00 00 00 00 00 00 00 00 12 01 00 00 00 00 00 00  ................
>   backtrace:
>     [<ffffffff8165e3b5>] kmemleak_alloc+0x45/0xa0
>     [<ffffffff811cc23d>] __kmalloc+0x15d/0x2f0
>     [<ffffffff81310e35>] bio_alloc_bioset+0x185/0x1f0
>     [<ffffffff813117f4>] bio_map_user_iov+0x124/0x400
>     [<ffffffff81320b7a>] blk_rq_map_user_iov+0x11a/0x210
>     [<ffffffff81320cbd>] blk_rq_map_user+0x4d/0x60
>     [<ffffffff81336694>] sg_io+0x3d4/0x410
>     [<ffffffff813369d0>] scsi_cmd_ioctl+0x300/0x490
>     [<ffffffff81336b9d>] scsi_cmd_blk_ioctl+0x3d/0x50
>     [<ffffffff814b4360>] sd_ioctl+0x80/0x100
>     [<ffffffff8132ddde>] blkdev_ioctl+0x51e/0x9f0
>     [<ffffffff8122f388>] block_ioctl+0x38/0x40
>     [<ffffffff8120097f>] do_vfs_ioctl+0x8f/0x700
>     [<ffffffff8120102c>] SyS_ioctl+0x3c/0x70
>     [<ffffffff8166c4aa>] entry_SYSCALL_64_fastpath+0x18/0xad

After I repeated my test the above findings were confirmed: no memory leaks
were reported by kmemleak after a test with I/O scheduler "none" and the
above call stack was reported 44 times by kmemleak after a test with I/O
scheduler "mq-deadline".

Bart.

^ permalink raw reply

* Re: [dm-devel] split scsi passthrough fields out of struct request V2
From: Jens Axboe @ 2017-01-31 21:58 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-block@vger.kernel.org, linux-raid@vger.kernel.org,
	snitzer@redhat.com, hch@lst.de, linux-scsi@vger.kernel.org,
	axboe@fb.com, j-nomura@ce.jp.nec.com, dm-devel@redhat.com
In-Reply-To: <1485899692.3113.9.camel@sandisk.com>

On 01/31/2017 01:55 PM, Bart Van Assche wrote:
> On Tue, 2017-01-31 at 13:34 -0800, Bart Van Assche wrote:
>> On Mon, 2017-01-30 at 17:38 -0800, Jens Axboe wrote:
>>> That's a known bug in mainline. Pull it into 4.10-rc6,
>>> or use my for-next where everything is already merged. 
>>
>> Hello Jens,
>>
>> With your for-next branch (commit c2e60b3a2602) I haven't hit any block
>> layer crashes so far. The only issue I encountered that is new is a
>> memory leak triggered by the SG-IO code. These memory leak reports
>> started to appear after I started testing the mq-deadline scheduler.
>> kmemleak reported the following call stack multiple times after my tests
>> had finished:
>>
>> unreferenced object 0xffff88041119e528 (size 192):
>>   comm "multipathd", pid 2353, jiffies 4295128020 (age 1332.440s)
>>   hex dump (first 32 bytes):
>>     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
>>     00 00 00 00 00 00 00 00 12 01 00 00 00 00 00 00  ................
>>   backtrace:
>>     [<ffffffff8165e3b5>] kmemleak_alloc+0x45/0xa0
>>     [<ffffffff811cc23d>] __kmalloc+0x15d/0x2f0
>>     [<ffffffff81310e35>] bio_alloc_bioset+0x185/0x1f0
>>     [<ffffffff813117f4>] bio_map_user_iov+0x124/0x400
>>     [<ffffffff81320b7a>] blk_rq_map_user_iov+0x11a/0x210
>>     [<ffffffff81320cbd>] blk_rq_map_user+0x4d/0x60
>>     [<ffffffff81336694>] sg_io+0x3d4/0x410
>>     [<ffffffff813369d0>] scsi_cmd_ioctl+0x300/0x490
>>     [<ffffffff81336b9d>] scsi_cmd_blk_ioctl+0x3d/0x50
>>     [<ffffffff814b4360>] sd_ioctl+0x80/0x100
>>     [<ffffffff8132ddde>] blkdev_ioctl+0x51e/0x9f0
>>     [<ffffffff8122f388>] block_ioctl+0x38/0x40
>>     [<ffffffff8120097f>] do_vfs_ioctl+0x8f/0x700
>>     [<ffffffff8120102c>] SyS_ioctl+0x3c/0x70
>>     [<ffffffff8166c4aa>] entry_SYSCALL_64_fastpath+0x18/0xad
> 
> After I repeated my test the above findings were confirmed: no memory leaks
> were reported by kmemleak after a test with I/O scheduler "none" and the
> above call stack was reported 44 times by kmemleak after a test with I/O
> scheduler "mq-deadline".

Interesting, I'll check this. Doesn't make any sense why the scheduler
would be implicated in that, given how we run completions now. But if
it complains, then something must be up.

-- 
Jens Axboe

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox