Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [md PATCH 05/14] md/raid5: use bio_inc_remaining() instead of repurposing bi_phys_segments as a counter
From: NeilBrown @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148721992248.7521.17160361058957519076.stgit@noble>

md/raid5 needs to keep track of how many stripe_heads are processing a
bio so that it can delay calling bio_endio() until all stripe_heads
have completed.  It currently uses 16 bits of ->bi_phys_segments for
this purpose.

16 bits is only enough for 256M requests, and it is possible for a
single bio to be larger than this, which causes problems.  Also, the
bio struct contains a larger counter, __bi_remaining, which has a
purpose very similar to the purpose of our counter.  So stop using
->bi_phys_segments, and instead use __bi_remaining.

This means we don't need to initialize the counter, as our caller
initializes it to '1'.  It also means we can call bio_endio() directly
as it tests this counter internally.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5-cache.c |    3 +--
 drivers/md/raid5.c       |   50 +++++++++++-----------------------------------
 drivers/md/raid5.h       |   17 +---------------
 3 files changed, 14 insertions(+), 56 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 1b972a172800..5c282ae71cbd 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -265,8 +265,7 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
 	       dev->sector + STRIPE_SECTORS) {
 		wbi2 = r5_next_bio(wbi, dev->sector);
 		md_write_end(conf->mddev);
-		if (!raid5_dec_bi_active_stripes(wbi))
-			bio_endio(wbi);
+		bio_endio(wbi);
 		wbi = wbi2;
 	}
 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2fbf939c1a15..905abf081acf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1187,8 +1187,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
-				if (!raid5_dec_bi_active_stripes(rbi))
-					bio_endio(rbi);
+				bio_endio(rbi);
 				rbi = rbi2;
 			}
 		}
@@ -3027,14 +3026,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		(unsigned long long)bi->bi_iter.bi_sector,
 		(unsigned long long)sh->sector);
 
-	/*
-	 * If several bio share a stripe. The bio bi_phys_segments acts as a
-	 * reference count to avoid race. The reference count should already be
-	 * increased before this function is called (for example, in
-	 * raid5_make_request()), so other bio sharing this stripe will not free the
-	 * stripe. If a stripe is owned by one stripe, the stripe lock will
-	 * protect it.
-	 */
 	spin_lock_irq(&sh->stripe_lock);
 	/* Don't allow new IO added to stripes in batch list */
 	if (sh->batch_head)
@@ -3060,7 +3051,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip)
 		bi->bi_next = *bip;
 	*bip = bi;
-	raid5_inc_bi_active_stripes(bi);
+	bio_inc_remaining(bi);
 	md_write_start(conf->mddev, bi);
 
 	if (forwrite) {
@@ -3185,8 +3176,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
-			if (!raid5_dec_bi_active_stripes(bi))
-				bio_endio(bi);
+			bio_endio(bi);
 			bi = nextbi;
 		}
 		if (bitmap_end)
@@ -3208,8 +3198,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
-			if (!raid5_dec_bi_active_stripes(bi))
-				bio_endio(bi);
+			bio_endio(bi);
 			bi = bi2;
 		}
 
@@ -3234,8 +3223,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 					r5_next_bio(bi, sh->dev[i].sector);
 
 				bi->bi_error = -EIO;
-				if (!raid5_dec_bi_active_stripes(bi))
-					bio_endio(bi);
+				bio_endio(bi);
 				bi = nextbi;
 			}
 		}
@@ -3567,8 +3555,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					md_write_end(conf->mddev);
-					if (!raid5_dec_bi_active_stripes(wbi))
-						bio_endio(wbi);
+					bio_endio(wbi);
 					wbi = wbi2;
 				}
 				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -4913,7 +4900,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
 		 * this sets the active strip count to 1 and the processed
 		 * strip count to zero (upper 8 bits)
 		 */
-		raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+		raid5_set_bi_processed_stripes(bi, 0);
 	}
 
 	return bi;
@@ -5228,7 +5215,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	struct r5conf *conf = mddev->private;
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
-	int remaining;
 	int stripe_sectors;
 
 	if (mddev->reshape_position != MaxSector)
@@ -5239,7 +5225,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
 
 	bi->bi_next = NULL;
-	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
 	md_write_start(mddev, bi);
 
 	stripe_sectors = conf->chunk_sectors *
@@ -5286,7 +5271,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 				continue;
 			sh->dev[d].towrite = bi;
 			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
-			raid5_inc_bi_active_stripes(bi);
+			bio_inc_remaining(bi);
 			md_write_start(mddev, bi);
 			sh->overwrite_disks++;
 		}
@@ -5311,10 +5296,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	}
 
 	md_write_end(mddev);
-	remaining = raid5_dec_bi_active_stripes(bi);
-	if (remaining == 0) {
-		bio_endio(bi);
-	}
+	bio_endio(bi);
 }
 
 static void raid5_make_request(struct mddev *mddev, struct bio * bi)
@@ -5325,7 +5307,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
-	int remaining;
 	DEFINE_WAIT(w);
 	bool do_prepare;
 	bool do_flush = false;
@@ -5368,7 +5349,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
-	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 	md_write_start(mddev, bi);
 
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
@@ -5506,10 +5486,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 
 	if (rw == WRITE)
 		md_write_end(mddev);
-	remaining = raid5_dec_bi_active_stripes(bi);
-	if (remaining == 0) {
-		bio_endio(bi);
-	}
+	bio_endio(bi);
 }
 
 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
@@ -5874,7 +5851,6 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	int dd_idx;
 	sector_t sector, logical_sector, last_sector;
 	int scnt = 0;
-	int remaining;
 	int handled = 0;
 
 	logical_sector = raid_bio->bi_iter.bi_sector &
@@ -5913,10 +5889,8 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		raid5_release_stripe(sh);
 		handled++;
 	}
-	remaining = raid5_dec_bi_active_stripes(raid_bio);
-	if (remaining == 0) {
-		bio_endio(raid_bio);
-	}
+	bio_endio(raid_bio);
+
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_quiescent);
 	return handled;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9051631cb7f0..3018a33693ab 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -481,8 +481,7 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 }
 
 /*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ * We maintain a count of processed stripes in the upper 16 bits
  */
 static inline int raid5_bi_processed_stripes(struct bio *bio)
 {
@@ -491,20 +490,6 @@ static inline int raid5_bi_processed_stripes(struct bio *bio)
 	return (atomic_read(segments) >> 16) & 0xffff;
 }
 
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	atomic_inc(segments);
-}
-
 static inline void raid5_set_bi_processed_stripes(struct bio *bio,
 	unsigned int cnt)
 {



^ permalink raw reply related

* [md PATCH 06/14] md/raid5: remove over-loading of ->bi_phys_segments.
From: NeilBrown @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148721992248.7521.17160361058957519076.stgit@noble>

When a read request, which bypassed the cache, fails, we need to retry
it through the cache.
This involves attaching it to a sequence of stripe_heads, and it may not
be possible to get all the stripe_heads we need at once.
We do what we can, and record how far we got in ->bi_phys_segments so
we can pick up again later.

There is only ever one bio which may have a non-zero offset stored in
->bi_phys_segments, the one that is either active in the single thread
which calls retry_aligned_read(), or is in conf->retry_read_aligned
waiting for retry_aligned_read() to be called again.

So we only need to store one offset value.  This can be in a local
variable passed between remove_bio_from_retry() and
retry_aligned_read(), or in the r5conf structure next to the
->retry_read_aligned pointer.

Storing it there allows the last usage of ->bi_phys_segments to be
removed from md/raid5.c.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c |   24 ++++++++++++------------
 drivers/md/raid5.h |   30 +-----------------------------
 2 files changed, 13 insertions(+), 41 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 905abf081acf..f93e8fddbb23 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4883,12 +4883,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
 	md_wakeup_thread(conf->mddev->thread);
 }
 
-static struct bio *remove_bio_from_retry(struct r5conf *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf,
+					 unsigned int *offset)
 {
 	struct bio *bi;
 
 	bi = conf->retry_read_aligned;
 	if (bi) {
+		*offset = conf->retry_read_offset;
 		conf->retry_read_aligned = NULL;
 		return bi;
 	}
@@ -4896,11 +4898,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
 	if(bi) {
 		conf->retry_read_aligned_list = bi->bi_next;
 		bi->bi_next = NULL;
-		/*
-		 * this sets the active strip count to 1 and the processed
-		 * strip count to zero (upper 8 bits)
-		 */
-		raid5_set_bi_processed_stripes(bi, 0);
+		*offset = 0;
 	}
 
 	return bi;
@@ -5835,7 +5833,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
 	return STRIPE_SECTORS;
 }
 
-static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
+			       unsigned int offset)
 {
 	/* We may not be able to submit a whole bio at once as there
 	 * may not be enough stripe_heads available.
@@ -5864,7 +5863,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		     sector += STRIPE_SECTORS,
 		     scnt++) {
 
-		if (scnt < raid5_bi_processed_stripes(raid_bio))
+		if (scnt < offset)
 			/* already done this stripe */
 			continue;
 
@@ -5872,15 +5871,15 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
-			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
+			conf->retry_read_offset = scnt;
 			return handled;
 		}
 
 		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
 			raid5_release_stripe(sh);
-			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
+			conf->retry_read_offset = scnt;
 			return handled;
 		}
 
@@ -6003,6 +6002,7 @@ static void raid5d(struct md_thread *thread)
 	while (1) {
 		struct bio *bio;
 		int batch_size, released;
+		unsigned int offset;
 
 		released = release_stripe_list(conf, conf->temp_inactive_list);
 		if (released)
@@ -6020,10 +6020,10 @@ static void raid5d(struct md_thread *thread)
 		}
 		raid5_activate_delayed(conf);
 
-		while ((bio = remove_bio_from_retry(conf))) {
+		while ((bio = remove_bio_from_retry(conf, &offset))) {
 			int ok;
 			spin_unlock_irq(&conf->device_lock);
-			ok = retry_aligned_read(conf, bio);
+			ok = retry_aligned_read(conf, bio, offset);
 			spin_lock_irq(&conf->device_lock);
 			if (!ok)
 				break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3018a33693ab..a4ef02176afb 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -480,35 +480,6 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 		return NULL;
 }
 
-/*
- * We maintain a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
-	unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	int old, new;
-
-	do {
-		old = atomic_read(segments);
-		new = (old & 0xffff) | (cnt << 16);
-	} while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	atomic_set(segments, cnt);
-}
-
 /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
  * This is because we sometimes take all the spinlocks
  * and creating that much locking depth can cause
@@ -596,6 +567,7 @@ struct r5conf {
 	struct list_head	delayed_list; /* stripes that have plugged requests */
 	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
 	struct bio		*retry_read_aligned; /* currently retrying aligned bios   */
+	unsigned int		retry_read_offset; /* sector offset into retry_read_aligned */
 	struct bio		*retry_read_aligned_list; /* aligned bios retry list  */
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 	atomic_t		active_aligned_reads;



^ permalink raw reply related

* [md PATCH 08/14] md/raid1, raid10: move rXbio accounting closer to allocation.
From: NeilBrown @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148721992248.7521.17160361058957519076.stgit@noble>

When raid1 or raid10 need find they will need to allocate
a new r1bio/r10bio, in order to work around a known bad block,
the account for the allocation well before the allocation
is made.  This separation makes the correctness less obvious and
requires comments.

The accounting needs to be a little before: before the first rXbio is
submitted, but that is all.

So move the accounting down to where it makes more sense.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid1.c  |   23 ++++++++++-------------
 drivers/md/raid10.c |   22 +++++++++-------------
 2 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7b0f647bcccb..c1d0675880fb 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1326,18 +1326,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		goto retry_write;
 	}
 
-	if (max_sectors < r1_bio->sectors) {
-		/* We are splitting this write into multiple parts, so
-		 * we need to prepare for allocating another r1_bio.
-		 */
+	if (max_sectors < r1_bio->sectors)
 		r1_bio->sectors = max_sectors;
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
-	}
+
 	sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
 
 	atomic_set(&r1_bio->remaining, 1);
@@ -1426,10 +1417,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 * as it could result in the bio being freed.
 	 */
 	if (sectors_handled < bio_sectors(bio)) {
-		r1_bio_write_done(r1_bio);
-		/* We need another r1_bio.  It has already been counted
+		/* We need another r1_bio, which must be accounted
 		 * in bio->bi_phys_segments
 		 */
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+		r1_bio_write_done(r1_bio);
 		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 		r1_bio->master_bio = bio;
 		r1_bio->sectors = bio_sectors(bio) - sectors_handled;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1920756828df..9258cbe233bb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1383,18 +1383,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		goto retry_write;
 	}
 
-	if (max_sectors < r10_bio->sectors) {
-		/* We are splitting this into multiple parts, so
-		 * we need to prepare for allocating another r10_bio.
-		 */
+	if (max_sectors < r10_bio->sectors)
 		r10_bio->sectors = max_sectors;
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
-	}
 	sectors_handled = r10_bio->sector + max_sectors -
 		bio->bi_iter.bi_sector;
 
@@ -1491,10 +1481,16 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	 */
 
 	if (sectors_handled < bio_sectors(bio)) {
-		one_write_done(r10_bio);
-		/* We need another r10_bio.  It has already been counted
+		/* We need another r10_bio and it needs to be counted
 		 * in bio->bi_phys_segments.
 		 */
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+		one_write_done(r10_bio);
 		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
 		r10_bio->master_bio = bio;



^ permalink raw reply related

* [md PATCH 07/14] Revert "md/raid5: limit request size according to implementation limits"
From: NeilBrown @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148721992248.7521.17160361058957519076.stgit@noble>

This reverts commit e8d7c33232e5fdfa761c3416539bc5b4acd12db5.

Now that raid5 doesn't abuse bi_phys_segments any more, we no longer
need to impose these limits.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c |    9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f93e8fddbb23..c96fca2c6a98 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7100,15 +7100,6 @@ static int raid5_run(struct mddev *mddev)
 			stripe = (stripe | (stripe-1)) + 1;
 		mddev->queue->limits.discard_alignment = stripe;
 		mddev->queue->limits.discard_granularity = stripe;
-
-		/*
-		 * We use 16-bit counter of active stripes in bi_phys_segments
-		 * (minus one for over-loaded initialization)
-		 */
-		blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
-		blk_queue_max_discard_sectors(mddev->queue,
-					      0xfffe * STRIPE_SECTORS);
-
 		/*
 		 * unaligned part of discard request will be ignored, so can't
 		 * guarantee discard_zeroes_data



^ permalink raw reply related

* [md PATCH 10/14] md/raid1: stop using bi_phys_segment
From: NeilBrown @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148721992248.7521.17160361058957519076.stgit@noble>

Change to use bio->__bi_remaining to count number of r1bio attached
to a bio.
See precious raid10 patch for more details.

inc_pending is a little more complicated in raid1 as we need to adjust
next_window_requests or current_window_requests.

The wait_event() call if start_next_window is no longer needed as each
r1_bio is completed separately.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid1.c |   99 ++++++++++++++++++----------------------------------
 1 file changed, 34 insertions(+), 65 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c1d0675880fb..5a57111c7bc9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -241,36 +241,19 @@ static void reschedule_retry(struct r1bio *r1_bio)
 static void call_bio_endio(struct r1bio *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
-	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
 	sector_t start_next_window = r1_bio->start_next_window;
 	sector_t bi_sector = bio->bi_iter.bi_sector;
 
-	if (bio->bi_phys_segments) {
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		bio->bi_phys_segments--;
-		done = (bio->bi_phys_segments == 0);
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		/*
-		 * make_request() might be waiting for
-		 * bi_phys_segments to decrease
-		 */
-		wake_up(&conf->wait_barrier);
-	} else
-		done = 1;
-
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
 		bio->bi_error = -EIO;
 
-	if (done) {
-		bio_endio(bio);
-		/*
-		 * Wake up any possible resync thread that waits for the device
-		 * to go idle.
-		 */
-		allow_barrier(conf, start_next_window, bi_sector);
-	}
+	bio_endio(bio);
+	/*
+	 * Wake up any possible resync thread that waits for the device
+	 * to go idle.
+	 */
+	allow_barrier(conf, start_next_window, bi_sector);
 }
 
 static void raid_end_bio_io(struct r1bio *r1_bio)
@@ -923,6 +906,26 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
 	return sector;
 }
 
+static void inc_pending(struct r1conf *conf, sector_t start_next_window,
+			sector_t bi_sector)
+{
+	/* The current request requires multiple r1_bio, so
+	 * we need to increment the pending count, and the corresponding
+	 * window count.
+	 */
+	spin_lock(&conf->resync_lock);
+	conf->nr_pending++;
+	if (start_next_window == conf->start_next_window) {
+		if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+		    <= bi_sector)
+			conf->next_window_requests++;
+		else
+			conf->current_window_requests++;
+	} else
+		conf->current_window_requests++;
+	spin_unlock(&conf->resync_lock);
+}
+
 static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
 			  sector_t bi_sector)
 {
@@ -1137,12 +1140,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 		sectors_handled = (r1_bio->sector + max_sectors
 				   - bio->bi_iter.bi_sector);
 		r1_bio->sectors = max_sectors;
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
+		bio_inc_remaining(bio);
 
 		/*
 		 * Cannot call generic_make_request directly as that will be
@@ -1304,7 +1302,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	if (unlikely(blocked_rdev)) {
 		/* Wait for this device to become unblocked */
 		int j;
-		sector_t old = start_next_window;
 
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
@@ -1314,15 +1311,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
 		start_next_window = wait_barrier(conf, bio);
-		/*
-		 * We must make sure the multi r1bios of bio have
-		 * the same value of bi_phys_segments
-		 */
-		if (bio->bi_phys_segments && old &&
-		    old != start_next_window)
-			/* Wait for the former r1bio(s) to complete */
-			wait_event(conf->wait_barrier,
-				   bio->bi_phys_segments == 1);
 		goto retry_write;
 	}
 
@@ -1417,22 +1405,18 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 * as it could result in the bio being freed.
 	 */
 	if (sectors_handled < bio_sectors(bio)) {
-		/* We need another r1_bio, which must be accounted
-		 * in bio->bi_phys_segments
-		 */
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
+		/* We need another r1_bio, which must be counted */
+		sector_t sect = bio->bi_iter.bi_sector + sectors_handled;
+
+		inc_pending(conf, start_next_window, sect);
+		bio_inc_remaining(bio);
 		r1_bio_write_done(r1_bio);
 		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 		r1_bio->master_bio = bio;
 		r1_bio->sectors = bio_sectors(bio) - sectors_handled;
 		r1_bio->state = 0;
 		r1_bio->mddev = mddev;
-		r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+		r1_bio->sector = sect;
 		goto retry_write;
 	}
 
@@ -1460,16 +1444,6 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_iter.bi_sector;
 
-	/*
-	 * We might need to issue multiple reads to different devices if there
-	 * are bad blocks around, so we keep track of the number of reads in
-	 * bio->bi_phys_segments.  If this is 0, there is only one r1_bio and
-	 * no locking will be needed when requests complete.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
-
 	if (bio_data_dir(bio) == READ)
 		raid1_read_request(mddev, bio, r1_bio);
 	else
@@ -2434,12 +2408,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 			int sectors_handled = (r1_bio->sector + max_sectors
 					       - mbio->bi_iter.bi_sector);
 			r1_bio->sectors = max_sectors;
-			spin_lock_irq(&conf->device_lock);
-			if (mbio->bi_phys_segments == 0)
-				mbio->bi_phys_segments = 2;
-			else
-				mbio->bi_phys_segments++;
-			spin_unlock_irq(&conf->device_lock);
+			bio_inc_remaining(mbio);
 			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
 					      bio, bio_dev, bio_sector);
 			generic_make_request(bio);



^ permalink raw reply related

* [md PATCH 09/14] md/raid10: stop using bi_phys_segments
From: NeilBrown @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148721992248.7521.17160361058957519076.stgit@noble>

raid10 currently repurposes bi_phys_segments on each
incoming bio to count how many r10bio was used to encode the
request.

We need to know when the number of attached r10bio reaches
zero to:
1/ call bio_endio() when all IO on the bio is finished
2/ decrement ->nr_pending so that resync IO can proceed.

Now that the bio has its own __bi_remaining counter, that
can be used instead. We can call bio_inc_remaining to
increment the counter and call bio_endio() every time an
r10bio completes, rather than only when bi_phys_segments
reaches zero.

This addresses point 1, but not point 2.  bio_endio()
doesn't (and cannot) report when the last r10bio has
finished, so a different approach is needed.

So: instead of counting bios in ->nr_pending, count r10bios.
i.e. every time we attach a bio, increment nr_pending.
Every time an r10bio completes, decrement nr_pending.

Normally we only increment nr_pending after first checking
that ->barrier is zero, or some other non-trivial tests and
possible waiting.  When attaching multiple r10bios to a bio,
we only need the tests and the waiting once.  After the
first increment, subsequent increments can happen
unconditionally as they are really all part of the one
request.

So introduce inc_pending() which can be used when we know
that nr_pending is already elevated.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid10.c |   76 +++++++++++++++++----------------------------------
 1 file changed, 25 insertions(+), 51 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9258cbe233bb..6b4d8643c574 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -301,27 +301,18 @@ static void reschedule_retry(struct r10bio *r10_bio)
 static void raid_end_bio_io(struct r10bio *r10_bio)
 {
 	struct bio *bio = r10_bio->master_bio;
-	int done;
 	struct r10conf *conf = r10_bio->mddev->private;
 
-	if (bio->bi_phys_segments) {
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		bio->bi_phys_segments--;
-		done = (bio->bi_phys_segments == 0);
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-	} else
-		done = 1;
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 		bio->bi_error = -EIO;
-	if (done) {
-		bio_endio(bio);
-		/*
-		 * Wake up any possible resync thread that waits for the device
-		 * to go idle.
-		 */
-		allow_barrier(conf);
-	}
+
+	/*
+	 * Wake up any possible resync thread that waits for the device
+	 * to go idle.
+	 */
+	allow_barrier(conf);
+	bio_endio(bio);
+
 	free_r10bio(r10_bio);
 }
 
@@ -984,6 +975,15 @@ static void wait_barrier(struct r10conf *conf)
 	spin_unlock_irq(&conf->resync_lock);
 }
 
+static void inc_pending(struct r10conf *conf)
+{
+	/* The current request requires multiple r10_bio, so
+	 * we need to increment the pending count.
+	 */
+	WARN_ON(!atomic_read(&conf->nr_pending));
+	atomic_inc(&conf->nr_pending);
+}
+
 static void allow_barrier(struct r10conf *conf)
 {
 	if ((atomic_dec_and_test(&conf->nr_pending)) ||
@@ -1161,12 +1161,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		sectors_handled = (r10_bio->sector + max_sectors
 				   - bio->bi_iter.bi_sector);
 		r10_bio->sectors = max_sectors;
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
+		inc_pending(conf);
+		bio_inc_remaining(bio);
 		/*
 		 * Cannot call generic_make_request directly as that will be
 		 * queued in __generic_make_request and subsequent
@@ -1261,9 +1257,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	 * on which we have seen a write error, we want to avoid
 	 * writing to those blocks.  This potentially requires several
 	 * writes to write around the bad blocks.  Each set of writes
-	 * gets its own r10_bio with a set of bios attached.  The number
-	 * of r10_bios is recored in bio->bi_phys_segments just as with
-	 * the read case.
+	 * gets its own r10_bio with a set of bios attached.
 	 */
 
 	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
@@ -1481,15 +1475,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	 */
 
 	if (sectors_handled < bio_sectors(bio)) {
-		/* We need another r10_bio and it needs to be counted
-		 * in bio->bi_phys_segments.
-		 */
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
+		/* We need another r10_bio and it needs to be counted */
+		inc_pending(conf);
+		bio_inc_remaining(bio);
 		one_write_done(r10_bio);
 		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
@@ -1518,16 +1506,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	r10_bio->sector = bio->bi_iter.bi_sector;
 	r10_bio->state = 0;
 
-	/*
-	 * We might need to issue multiple reads to different devices if there
-	 * are bad blocks around, so we keep track of the number of reads in
-	 * bio->bi_phys_segments.  If this is 0, there is only one r10_bio and
-	 * no locking will be needed when the request completes.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
-
 	if (bio_data_dir(bio) == READ)
 		raid10_read_request(mddev, bio, r10_bio);
 	else
@@ -2662,12 +2640,8 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 			r10_bio->sector + max_sectors
 			- mbio->bi_iter.bi_sector;
 		r10_bio->sectors = max_sectors;
-		spin_lock_irq(&conf->device_lock);
-		if (mbio->bi_phys_segments == 0)
-			mbio->bi_phys_segments = 2;
-		else
-			mbio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
+		bio_inc_remaining(mbio);
+		inc_pending(conf);
 		generic_make_request(bio);
 
 		r10_bio = mempool_alloc(conf->r10bio_pool,



^ permalink raw reply related

* [md PATCH 11/14] md/raid5: don't test ->writes_pending in raid5_remove_disk
From: NeilBrown @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148721992248.7521.17160361058957519076.stgit@noble>

This test on ->writes_pending cannot be safe as the counter
can be incremented at any moment and cannot be locked against.

Change it to test conf->active_stripes.  This allows read requests
to interfere so it might make removal of the journal device
harder, but it is safer.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c96fca2c6a98..4de03479cdce 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7279,10 +7279,14 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * we can't wait pending write here, as this is called in
 		 * raid5d, wait will deadlock.
 		 */
-		if (atomic_read(&mddev->writes_pending))
+		lock_all_device_hash_locks_irq(conf);
+		if (atomic_read(&conf->active_stripes)) {
+			unlock_all_device_hash_locks_irq(conf);
 			return -EBUSY;
+		}
 		log = conf->log;
 		conf->log = NULL;
+		unlock_all_device_hash_locks_irq(conf);
 		synchronize_rcu();
 		r5l_exit_log(log);
 		return 0;



^ permalink raw reply related

* [md PATCH 12/14] md: factor out set_in_sync()
From: NeilBrown @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148721992248.7521.17160361058957519076.stgit@noble>

Three separate places in md.c check if the number of active
writes is zero and, if so, sets mddev->in_sync.

There are a few differences, but there shouldn't be:
- it is always appropriate to notify the change in
  sysfs_state, and there is no need to do this outside a
  spin-locked region.
- we never need to check ->recovery_cp.  The state of resync
  is not relevant for whether there are any pending writes
  or not (which is what ->in_sync reports).

So create set_in_sync() which does the correct tests and
makes the correct changes, and call this in all three
places.

Any behaviour changes here a minor and cosmetic.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c |   57 ++++++++++++++++++++++---------------------------------
 1 file changed, 23 insertions(+), 34 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 01175dac0db6..1d5fdc4a659d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2250,6 +2250,24 @@ static void export_array(struct mddev *mddev)
 	mddev->major_version = 0;
 }
 
+static bool set_in_sync(struct mddev *mddev)
+{
+	bool ret = false;
+
+	WARN_ON_ONCE(!spin_is_locked(&mddev->lock));
+	if (atomic_read(&mddev->writes_pending) == 0) {
+		if (mddev->in_sync == 0) {
+			mddev->in_sync = 1;
+			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+			sysfs_notify_dirent_safe(mddev->sysfs_state);
+		}
+		ret = true;
+	}
+	if (mddev->safemode == 1)
+		mddev->safemode = 0;
+	return ret;
+}
+
 static void sync_sbs(struct mddev *mddev, int nospares)
 {
 	/* Update each superblock (in-memory image), but
@@ -3948,7 +3966,7 @@ static int restart_array(struct mddev *mddev);
 static ssize_t
 array_state_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	int err;
+	int err = 0;
 	enum array_state st = match_word(buf, array_states);
 
 	if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
@@ -3961,18 +3979,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
 			md_wakeup_thread(mddev->thread);
 			wake_up(&mddev->sb_wait);
-			err = 0;
 		} else /* st == clean */ {
 			restart_array(mddev);
-			if (atomic_read(&mddev->writes_pending) == 0) {
-				if (mddev->in_sync == 0) {
-					mddev->in_sync = 1;
-					if (mddev->safemode == 1)
-						mddev->safemode = 0;
-					set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
-				}
-				err = 0;
-			} else
+			if (!set_in_sync(mddev))
 				err = -EBUSY;
 		}
 		if (!err)
@@ -4030,15 +4039,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 			if (err)
 				break;
 			spin_lock(&mddev->lock);
-			if (atomic_read(&mddev->writes_pending) == 0) {
-				if (mddev->in_sync == 0) {
-					mddev->in_sync = 1;
-					if (mddev->safemode == 1)
-						mddev->safemode = 0;
-					set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
-				}
-				err = 0;
-			} else
+			if (!set_in_sync(mddev))
 				err = -EBUSY;
 			spin_unlock(&mddev->lock);
 		} else
@@ -8450,22 +8451,10 @@ void md_check_recovery(struct mddev *mddev)
 				md_reload_sb(mddev, mddev->good_device_nr);
 		}
 
-		if (!mddev->external) {
-			int did_change = 0;
+		if (!mddev->external && !mddev->in_sync) {
 			spin_lock(&mddev->lock);
-			if (mddev->safemode &&
-			    !atomic_read(&mddev->writes_pending) &&
-			    !mddev->in_sync &&
-			    mddev->recovery_cp == MaxSector) {
-				mddev->in_sync = 1;
-				did_change = 1;
-				set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
-			}
-			if (mddev->safemode == 1)
-				mddev->safemode = 0;
+			set_in_sync(mddev);
 			spin_unlock(&mddev->lock);
-			if (did_change)
-				sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
 
 		if (mddev->sb_flags)



^ permalink raw reply related

* [md PATCH 13/14] md: close a race with setting mddev->in_sync
From: NeilBrown @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148721992248.7521.17160361058957519076.stgit@noble>

If ->in_sync is being set just as md_write_start() is being called,
it is possible that set_in_sync() won't see the elevated
->writes_pending, and md_write_start() won't see the set ->in_sync.

To close this race, re-test ->writes_pending after setting ->in_sync,
and add memory barriers to ensure the increment of ->writes_pending
will be seen by the time of this second test, or the new ->in_sync
will be seen by md_write_start().

Add a spinlock to array_state_show() to ensure this temporary
instability is never visible from userspace.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c |    7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1d5fdc4a659d..f856c95ee7d5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2258,6 +2258,10 @@ static bool set_in_sync(struct mddev *mddev)
 	if (atomic_read(&mddev->writes_pending) == 0) {
 		if (mddev->in_sync == 0) {
 			mddev->in_sync = 1;
+			smp_mb();
+			if (atomic_read(&mddev->writes_pending))
+				/* lost a race with md_write_start() */
+				mddev->in_sync = 0;
 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
@@ -3938,6 +3942,7 @@ array_state_show(struct mddev *mddev, char *page)
 			st = read_auto;
 			break;
 		case 0:
+			spin_lock(&mddev->lock);
 			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 				st = write_pending;
 			else if (mddev->in_sync)
@@ -3946,6 +3951,7 @@ array_state_show(struct mddev *mddev, char *page)
 				st = active_idle;
 			else
 				st = active;
+			spin_unlock(&mddev->lock);
 		}
 	else {
 		if (list_empty(&mddev->disks) &&
@@ -7769,6 +7775,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 		did_change = 1;
 	}
 	atomic_inc(&mddev->writes_pending);
+	smp_mb(); /* Match smp_mb in set_in_sync() */
 	if (mddev->safemode == 1)
 		mddev->safemode = 0;
 	if (mddev->in_sync) {



^ permalink raw reply related

* [md PATCH 14/14] MD: use per-cpu counter for writes_pending
From: NeilBrown @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148721992248.7521.17160361058957519076.stgit@noble>

The 'writes_pending' counter is used to determine when the
array is stable so that it can be marked in the superblock
as "Clean".  Consequently it needs to be updated frequently
but only checked for zero occasionally.  Recent changes to
raid5 cause the count to be updated even more often - once
per 4K rather than once per bio.  This provided
justification for making the updates more efficient.

So we replace the atomic counter with a per-cpu array of
'long' counters. Incrementing and decrementing is normally
much cheaper, testing for zero is more expensive.

To meaningfully be able to test for zero we need to be able
to block further updates.  This is done by forcing the
"increment" step to take a spinlock in the rare case that
another thread is checking if the count is zero.  This is
done using a new field: "checkers".  "checkers" is the
number of threads that are currently checking whether the
count is zero.  It is usually 0, occasionally 1, and it is
not impossible that it could be higher, though this would be
rare.

If, within an rcu_read_locked section, checkers is seen to
be zero, then the local-cpu counter can be incremented
freely.  If checkers is not zero, mddev->lock must be taken
before the increment is allowed.  A decrement is always
allowed.

To test for zero, a thread must increment "checkers", call
synchronize_rcu(), then take mddev->lock.  Once this is done
no new increments can happen.  A thread may choose to
perform a quick test-for-zero by summing all the counters
without holding a lock.  If this is non-zero, the the total
count is non-zero, or was non-zero very recently, so it is
safe to assume that it isn't zero.  If the quick check does
report a zero sum, then it is worth performing the locking
protocol.

When the counter is decremented, it is no longer possible to
immediately test if the result is zero
(atmic_dec_and_test()).  We don't even really want to
perform the "quick" tests as that sums over all cpus and is
work that will most often bring no benefit.

In the "safemode==2" case, when we want to mark the array as
"clean" immediately when there are no writes, we perform the
quick test anyway, and possibly wake the md thread to do the
full test.  "safemode==2" is only used during shutdown so
the cost is not problematic.

When safemode!=2 we always set the timer, rather than only
when the counter reaches zero.

If mod_timer() is called to set the timeout to the value it
already has, mod_timer() has low overhead with no atomic
operations.  So at worst it will have a noticeable cost once
per jiffie.  To further reduce the otherhead, we round the
requests delay to a multiple of ->safemode_delay.  This
might increase the delay until the timer fires a little, but
will reduce the overhead of calling mod_timer()
significantly.  If lots of requests are completing, the
timer will be updated every 200 milliseconds (by default)
and never fire.  When it does eventually fire, it will
schedule the md thread to perform the full test for
writes_pending==0, and this is quite likely to find '0'.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c |  107 +++++++++++++++++++++++++++++++++++++++++++++++--------
 drivers/md/md.h |    3 +-
 2 files changed, 93 insertions(+), 17 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f856c95ee7d5..47bbd22e2865 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -64,6 +64,8 @@
 #include <linux/raid/md_p.h>
 #include <linux/raid/md_u.h>
 #include <linux/slab.h>
+#include <linux/percpu.h>
+
 #include <trace/events/block.h>
 #include "md.h"
 #include "bitmap.h"
@@ -2250,21 +2252,52 @@ static void export_array(struct mddev *mddev)
 	mddev->major_version = 0;
 }

+/*
+ * The percpu writes_pending counters are linked with ->checkers and
+ * ->lock.  If ->writes_pending can always be decremented without a
+ * lock.  It can only be incremented without a lock if ->checkers is 0
+ * and the test+incr happen in a rcu_readlocked region.
+ * ->checkers can only be changed under ->lock protection.
+ * To determine if ->writes_pending is totally zero, a quick sum without
+ * locks can be performed. If this is non-zero, then the result is final.
+ * Otherwise ->checkers must be incremented and synchronize_rcu() called.
+ * Then a sum calculated under ->lock, and the result is final until the
+ * ->checkers is decremented, or the lock is dropped.
+ *
+ */
+
+static bool __writes_pending(struct mddev *mddev)
+{
+	long cnt = 0;
+	int i;
+
+	for_each_possible_cpu(i)
+		cnt += *per_cpu_ptr(mddev->writes_pending_percpu, i);
+	return cnt != 0;
+}
+
 static bool set_in_sync(struct mddev *mddev)
 {
 	bool ret = false;

 	WARN_ON_ONCE(!spin_is_locked(&mddev->lock));
-	if (atomic_read(&mddev->writes_pending) == 0) {
-		if (mddev->in_sync == 0) {
+	if (!__writes_pending(mddev) && !mddev->in_sync) {
+		mddev->checkers++;
+		spin_unlock(&mddev->lock);
+		synchronize_rcu();
+		spin_lock(&mddev->lock);
+		if (!mddev->in_sync &&
+		    !__writes_pending(mddev)) {
 			mddev->in_sync = 1;
+			/*
+			 * Ensure in_sync is visible before ->checkers
+			 * is decremented
+			 */
 			smp_mb();
-			if (atomic_read(&mddev->writes_pending))
-				/* lost a race with md_write_start() */
-				mddev->in_sync = 0;
 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
+		mddev->checkers--;
 		ret = true;
 	}
 	if (mddev->safemode == 1)
@@ -2272,6 +2305,29 @@ static bool set_in_sync(struct mddev *mddev)
 	return ret;
 }

+static void inc_writes_pending(struct mddev *mddev)
+{
+	rcu_read_lock();
+	if (mddev->checkers == 0) {
+		__this_cpu_inc(*mddev->writes_pending_percpu);
+		rcu_read_unlock();
+		return;
+	}
+	rcu_read_unlock();
+	/* Need that spinlock */
+	spin_lock(&mddev->lock);
+	this_cpu_inc(*mddev->writes_pending_percpu);
+	spin_unlock(&mddev->lock);
+}
+
+static void zero_writes_pending(struct mddev *mddev)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		*per_cpu_ptr(mddev->writes_pending_percpu, i) = 0;
+}
+
 static void sync_sbs(struct mddev *mddev, int nospares)
 {
 	/* Update each superblock (in-memory image), but
@@ -5000,6 +5056,8 @@ static void md_free(struct kobject *ko)
 		del_gendisk(mddev->gendisk);
 		put_disk(mddev->gendisk);
 	}
+	if (mddev->writes_pending_percpu)
+		free_percpu(mddev->writes_pending_percpu);

 	kfree(mddev);
 }
@@ -5076,6 +5134,13 @@ static int md_alloc(dev_t dev, char *name)
 	blk_queue_make_request(mddev->queue, md_make_request);
 	blk_set_stacking_limits(&mddev->queue->limits);

+	mddev->writes_pending_percpu = alloc_percpu(long);
+	if (!mddev->writes_pending_percpu) {
+		blk_cleanup_queue(mddev->queue);
+		mddev->queue = NULL;
+		goto abort;
+	}
+
 	disk = alloc_disk(1 << shift);
 	if (!disk) {
 		blk_cleanup_queue(mddev->queue);
@@ -5159,7 +5224,7 @@ static void md_safemode_timeout(unsigned long data)
 {
 	struct mddev *mddev = (struct mddev *) data;

-	if (!atomic_read(&mddev->writes_pending)) {
+	if (!__writes_pending(mddev)) {
 		mddev->safemode = 1;
 		if (mddev->external)
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -5365,7 +5430,7 @@ int md_run(struct mddev *mddev)
 	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
 		mddev->ro = 0;

-	atomic_set(&mddev->writes_pending,0);
+	zero_writes_pending(mddev);
 	atomic_set(&mddev->max_corr_read_errors,
 		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
 	mddev->safemode = 0;
@@ -7774,11 +7839,13 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 		md_wakeup_thread(mddev->sync_thread);
 		did_change = 1;
 	}
-	atomic_inc(&mddev->writes_pending);
+	rcu_read_lock();
+	inc_writes_pending(mddev);
 	smp_mb(); /* Match smp_mb in set_in_sync() */
 	if (mddev->safemode == 1)
 		mddev->safemode = 0;
-	if (mddev->in_sync) {
+	if (mddev->in_sync || mddev->checkers) {
+		rcu_read_unlock();
 		spin_lock(&mddev->lock);
 		if (mddev->in_sync) {
 			mddev->in_sync = 0;
@@ -7788,7 +7855,9 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 			did_change = 1;
 		}
 		spin_unlock(&mddev->lock);
-	}
+	} else
+		rcu_read_unlock();
+
 	if (did_change)
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	wait_event(mddev->sb_wait,
@@ -7798,12 +7867,18 @@ EXPORT_SYMBOL(md_write_start);

 void md_write_end(struct mddev *mddev)
 {
-	if (atomic_dec_and_test(&mddev->writes_pending)) {
-		if (mddev->safemode == 2)
+	this_cpu_dec(*mddev->writes_pending_percpu);
+	if (mddev->safemode == 2) {
+		if (!__writes_pending(mddev))
 			md_wakeup_thread(mddev->thread);
-		else if (mddev->safemode_delay)
-			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
-	}
+	} else if (mddev->safemode_delay)
+		/* The roundup() ensure this only performs locking once
+		 * every ->safemode_delay jiffies
+		 */
+		mod_timer(&mddev->safemode_timer,
+			  roundup(jiffies, mddev->safemode_delay) +
+			  mddev->safemode_delay);
+
 }
 EXPORT_SYMBOL(md_write_end);

@@ -8406,7 +8481,7 @@ void md_check_recovery(struct mddev *mddev)
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		test_bit(MD_RELOAD_SB, &mddev->flags) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
-		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
+		(mddev->safemode == 2 && !__writes_pending(mddev)
 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
 		))
 		return;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2a514036a83d..7e41f882d33d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -404,7 +404,8 @@ struct mddev {
 							 */
 	unsigned int			safemode_delay;
 	struct timer_list		safemode_timer;
-	atomic_t			writes_pending;
+	long				*writes_pending_percpu;
+	int				checkers;	/* # of threads checking writes_pending */
 	struct request_queue		*queue;	/* for plugging ... */

 	struct bitmap			*bitmap; /* the bitmap for the device */

^ permalink raw reply related

* Re: [PATCH] async_tx: deprecate broken support for channel switching
From: Vinod Koul @ 2017-02-16  4:39 UTC (permalink / raw)
  To: Dan Williams
  Cc: Thomas Petazzoni, Anup Patel, Rameshwar Prasad Sahu, Russell King,
	linux-kernel, linux-raid, dmaengine, Anatolij Gustschin,
	Saeed Bishara
In-Reply-To: <148721292971.19343.10618932473668163270.stgit@dwillia2-desk3.amr.corp.intel.com>

On Wed, Feb 15, 2017 at 06:42:09PM -0800, Dan Williams wrote:
> Back in 2011, Russell pointed out that the "async_tx channel switch"
> capability was violating expectations of the dma mapping api [1]. At the
> time the existing uses were reviewed as still usable, but that longer
> term we needed a rework of the raid offload implementation. While some
> of the framework for a fixed implementation was introduced in 2012 [2],
> the wider rewrite never materialized.
> 
> There continues to be interest in raid offload with new dma/raid engine
> drivers being submitted. Those drivers must not build on top of the
> broken channel switching capability.
> 
> Prevent async_tx from using an offload engine if the channel switching
> capability is enabled. This still allows the engine to be used for other
> purposes, but the broken way async_tx uses these engines for raid will
> be disabled. For configurations where this causes a performance
> regression the only solution is to start the work of eliminating the
> async_tx api and moving channel management into the raid code directly
> where it can manage marshalling an operation stream between multiple dma
> channels.

Applied, thanks

-- 
~Vinod

^ permalink raw reply

* [PATCH v5 0/4] Broadcom SBA RAID support
From: Anup Patel @ 2017-02-16  6:48 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel

The Broadcom SBA RAID is a stream-based device which provides
RAID5/6 offload.

It requires a SoC specific ring manager (such as Broadcom FlexRM
ring manager) to provide ring-based programming interface. Due to
this, the Broadcom SBA RAID driver (mailbox client) implements
DMA device having one DMA channel using a set of mailbox channels
provided by Broadcom SoC specific ring manager driver (mailbox
controller).

The Broadcom SBA RAID hardware requires PQ disk position instead
of PQ disk coefficient. To address this, we have added raid_gflog
table which will help driver to convert PQ disk coefficient to PQ
disk position.

This patchset is based on Linux-4.10-rc2 and depends on patchset
"[PATCH v4 0/2] Broadcom FlexRM ring manager support"

It is also available at sba-raid-v5 branch of
https://github.com/Broadcom/arm64-linux.git

Changes since v4:
 - Removed dependency of bcm-sba-raid driver on kconfig opton
   ASYNC_TX_ENABLE_CHANNEL_SWITCH
 - Select kconfig options ASYNC_TX_DISABLE_XOR_VAL_DMA and
   ASYNC_TX_DISABLE_PQ_VAL_DMA for bcm-sba-raid driver
 - Implemented device_prep_dma_interrupt() using dummy 8-byte
   copy operation so that the dma_async_device_register() can
   set DMA_ASYNC_TX capability for the DMA device provided
   by bcm-sba-raid driver

Changes since v3:
 - Replaced SBA_ENC() with sba_cmd_enc() inline function
 - Use list_first_entry_or_null() wherever possible
 - Remove unwanted brances around loops wherever possible
 - Use lockdep_assert_held() where required

Changes since v2:
 - Droped patch to handle DMA devices having support for fewer
   PQ coefficients in Linux Async Tx
 - Added work-around in bcm-sba-raid driver to handle unsupported
   PQ coefficients using multiple SBA requests

Changes since v1:
 - Droped patch to add mbox_channel_device() API
 - Used GENMASK and BIT macros wherever possible in bcm-sba-raid driver
 - Replaced C_MDATA macros with static inline functions in
   bcm-sba-raid driver
 - Removed sba_alloc_chan_resources() callback in bcm-sba-raid driver
 - Used dev_err() instead of dev_info() wherever applicable
 - Removed call to sba_issue_pending() from sba_tx_submit() in
   bcm-sba-raid driver
 - Implemented SBA request chaning for handling (len > sba->req_size)
   in bcm-sba-raid driver
 - Implemented device_terminate_all() callback in bcm-sba-raid driver

Anup Patel (4):
  lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position
  async_tx: Fix DMA_PREP_FENCE usage in do_async_gen_syndrome()
  dmaengine: Add Broadcom SBA RAID driver
  dt-bindings: Add DT bindings document for Broadcom SBA RAID driver

 .../devicetree/bindings/dma/brcm,iproc-sba.txt     |   29 +
 crypto/async_tx/async_pq.c                         |    5 +-
 drivers/dma/Kconfig                                |   14 +
 drivers/dma/Makefile                               |    1 +
 drivers/dma/bcm-sba-raid.c                         | 1785 ++++++++++++++++++++
 include/linux/raid/pq.h                            |    1 +
 lib/raid6/mktables.c                               |   20 +
 7 files changed, 1852 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt
 create mode 100644 drivers/dma/bcm-sba-raid.c

-- 
2.7.4

^ permalink raw reply

* [PATCH v5 1/4] lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position
From: Anup Patel @ 2017-02-16  6:48 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1487227695-965-1-git-send-email-anup.patel@broadcom.com>

The raid6_gfexp table represents {2}^n values for 0 <= n < 256. The
Linux async_tx framework pass values from raid6_gfexp as coefficients
for each source to prep_dma_pq() callback of DMA channel with PQ
capability. This creates problem for RAID6 offload engines (such as
Broadcom SBA) which take disk position (i.e. log of {2}) instead of
multiplicative cofficients from raid6_gfexp table.

This patch adds raid6_gflog table having log-of-2 value for any given
x such that 0 <= x < 256. For any given disk coefficient x, the
corresponding disk position is given by raid6_gflog[x]. The RAID6
offload engine driver can use this newly added raid6_gflog table to
get disk position from multiplicative coefficient.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
---
 include/linux/raid/pq.h |  1 +
 lib/raid6/mktables.c    | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 4d57bba..30f9453 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -142,6 +142,7 @@ int raid6_select_algo(void);
 extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
 extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
 extern const u8 raid6_gfexp[256]      __attribute__((aligned(256)));
+extern const u8 raid6_gflog[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfinv[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfexi[256]      __attribute__((aligned(256)));
 
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index 39787db..e824d08 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -125,6 +125,26 @@ int main(int argc, char *argv[])
 	printf("EXPORT_SYMBOL(raid6_gfexp);\n");
 	printf("#endif\n");
 
+	/* Compute log-of-2 table */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gflog[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++) {
+			v = 255;
+			for (k = 0; k < 256; k++)
+				if (exptbl[k] == (i + j)) {
+					v = k;
+					break;
+				}
+			printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+		}
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gflog);\n");
+	printf("#endif\n");
+
 	/* Compute inverse table x^-1 == x^254 */
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
 	       "raid6_gfinv[256] =\n" "{\n");
-- 
2.7.4

^ permalink raw reply related

* [PATCH v5 2/4] async_tx: Fix DMA_PREP_FENCE usage in do_async_gen_syndrome()
From: Anup Patel @ 2017-02-16  6:48 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1487227695-965-1-git-send-email-anup.patel@broadcom.com>

The DMA_PREP_FENCE is to be used when preparing Tx descriptor if output
of Tx descriptor is to be used by next/dependent Tx descriptor.

The DMA_PREP_FENSE will not be set correctly in do_async_gen_syndrome()
when calling dma->device_prep_dma_pq() under following conditions:
1. ASYNC_TX_FENCE not set in submit->flags
2. DMA_PREP_FENCE not set in dma_flags
3. src_cnt (= (disks - 2)) is greater than dma_maxpq(dma, dma_flags)

This patch fixes DMA_PREP_FENCE usage in do_async_gen_syndrome() taking
inspiration from do_async_xor() implementation.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
---
 crypto/async_tx/async_pq.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index f83de99..56bd612 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -62,9 +62,6 @@ do_async_gen_syndrome(struct dma_chan *chan,
 	dma_addr_t dma_dest[2];
 	int src_off = 0;
 
-	if (submit->flags & ASYNC_TX_FENCE)
-		dma_flags |= DMA_PREP_FENCE;
-
 	while (src_cnt > 0) {
 		submit->flags = flags_orig;
 		pq_src_cnt = min(src_cnt, dma_maxpq(dma, dma_flags));
@@ -83,6 +80,8 @@ do_async_gen_syndrome(struct dma_chan *chan,
 			if (cb_fn_orig)
 				dma_flags |= DMA_PREP_INTERRUPT;
 		}
+		if (submit->flags & ASYNC_TX_FENCE)
+			dma_flags |= DMA_PREP_FENCE;
 
 		/* Drivers force forward progress in case they can not provide
 		 * a descriptor
-- 
2.7.4

^ permalink raw reply related

* [PATCH v5 3/4] dmaengine: Add Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-16  6:48 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1487227695-965-1-git-send-email-anup.patel@broadcom.com>

The Broadcom stream buffer accelerator (SBA) provides offloading
capabilities for RAID operations. This SBA offload engine is
accessible via Broadcom SoC specific ring manager.

This patch adds Broadcom SBA RAID driver which provides one
DMA device with RAID capabilities using one or more Broadcom
SoC specific ring manager channels. The SBA RAID driver in its
current shape implements memcpy, xor, and pq operations.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
---
 drivers/dma/Kconfig        |   14 +
 drivers/dma/Makefile       |    1 +
 drivers/dma/bcm-sba-raid.c | 1785 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1800 insertions(+)
 create mode 100644 drivers/dma/bcm-sba-raid.c

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 263495d..3d23597 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -99,6 +99,20 @@ config AXI_DMAC
 	  controller is often used in Analog Device's reference designs for FPGA
 	  platforms.
 
+config BCM_SBA_RAID
+	tristate "Broadcom SBA RAID engine support"
+	depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
+	select DMA_ENGINE
+	select DMA_ENGINE_RAID
+	select ASYNC_TX_DISABLE_XOR_VAL_DMA
+	select ASYNC_TX_DISABLE_PQ_VAL_DMA
+	default ARCH_BCM_IPROC
+	help
+	  Enable support for Broadcom SBA RAID Engine. The SBA RAID
+	  engine is available on most of the Broadcom iProc SoCs. It
+	  has the capability to offload memcpy, xor and pq computation
+	  for raid5/6.
+
 config COH901318
 	bool "ST-Ericsson COH901318 DMA support"
 	select DMA_ENGINE
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index a4fa336..ba96bdd 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_AMCC_PPC440SPE_ADMA) += ppc4xx/
 obj-$(CONFIG_AT_HDMAC) += at_hdmac.o
 obj-$(CONFIG_AT_XDMAC) += at_xdmac.o
 obj-$(CONFIG_AXI_DMAC) += dma-axi-dmac.o
+obj-$(CONFIG_BCM_SBA_RAID) += bcm-sba-raid.o
 obj-$(CONFIG_COH901318) += coh901318.o coh901318_lli.o
 obj-$(CONFIG_DMA_BCM2835) += bcm2835-dma.o
 obj-$(CONFIG_DMA_JZ4740) += dma-jz4740.o
diff --git a/drivers/dma/bcm-sba-raid.c b/drivers/dma/bcm-sba-raid.c
new file mode 100644
index 0000000..d6b927b
--- /dev/null
+++ b/drivers/dma/bcm-sba-raid.c
@@ -0,0 +1,1785 @@
+/*
+ * Copyright (C) 2017 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Broadcom SBA RAID Driver
+ *
+ * The Broadcom stream buffer accelerator (SBA) provides offloading
+ * capabilities for RAID operations. The SBA offload engine is accessible
+ * via Broadcom SoC specific ring manager. Two or more offload engines
+ * can share same Broadcom SoC specific ring manager due to this Broadcom
+ * SoC specific ring manager driver is implemented as a mailbox controller
+ * driver and offload engine drivers are implemented as mallbox clients.
+ *
+ * Typically, Broadcom SoC specific ring manager will implement larger
+ * number of hardware rings over one or more SBA hardware devices. By
+ * design, the internal buffer size of SBA hardware device is limited
+ * but all offload operations supported by SBA can be broken down into
+ * multiple small size requests and executed parallely on multiple SBA
+ * hardware devices for achieving high through-put.
+ *
+ * The Broadcom SBA RAID driver does not require any register programming
+ * except submitting request to SBA hardware device via mailbox channels.
+ * This driver implements a DMA device with one DMA channel using a set
+ * of mailbox channels provided by Broadcom SoC specific ring manager
+ * driver. To exploit parallelism (as described above), all DMA request
+ * coming to SBA RAID DMA channel are broken down to smaller requests
+ * and submitted to multiple mailbox channels in round-robin fashion.
+ * For having more SBA DMA channels, we can create more SBA device nodes
+ * in Broadcom SoC specific DTS based on number of hardware rings supported
+ * by Broadcom SoC ring manager.
+ */
+
+#include <linux/bitops.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+#include <linux/list.h>
+#include <linux/mailbox_client.h>
+#include <linux/mailbox/brcm-message.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/slab.h>
+#include <linux/raid/pq.h>
+
+#include "dmaengine.h"
+
+/* SBA command related defines */
+#define SBA_TYPE_SHIFT					48
+#define SBA_TYPE_MASK					GENMASK(1, 0)
+#define SBA_TYPE_A					0x0
+#define SBA_TYPE_B					0x2
+#define SBA_TYPE_C					0x3
+#define SBA_USER_DEF_SHIFT				32
+#define SBA_USER_DEF_MASK				GENMASK(15, 0)
+#define SBA_R_MDATA_SHIFT				24
+#define SBA_R_MDATA_MASK				GENMASK(7, 0)
+#define SBA_C_MDATA_MS_SHIFT				18
+#define SBA_C_MDATA_MS_MASK				GENMASK(1, 0)
+#define SBA_INT_SHIFT					17
+#define SBA_INT_MASK					BIT(0)
+#define SBA_RESP_SHIFT					16
+#define SBA_RESP_MASK					BIT(0)
+#define SBA_C_MDATA_SHIFT				8
+#define SBA_C_MDATA_MASK				GENMASK(7, 0)
+#define SBA_C_MDATA_BNUMx_SHIFT(__bnum)			(2 * (__bnum))
+#define SBA_C_MDATA_BNUMx_MASK				GENMASK(1, 0)
+#define SBA_C_MDATA_DNUM_SHIFT				5
+#define SBA_C_MDATA_DNUM_MASK				GENMASK(4, 0)
+#define SBA_C_MDATA_LS(__v)				((__v) & 0xff)
+#define SBA_C_MDATA_MS(__v)				(((__v) >> 8) & 0x3)
+#define SBA_CMD_SHIFT					0
+#define SBA_CMD_MASK					GENMASK(3, 0)
+#define SBA_CMD_ZERO_BUFFER				0x4
+#define SBA_CMD_ZERO_ALL_BUFFERS			0x8
+#define SBA_CMD_LOAD_BUFFER				0x9
+#define SBA_CMD_XOR					0xa
+#define SBA_CMD_GALOIS_XOR				0xb
+#define SBA_CMD_WRITE_BUFFER				0xc
+#define SBA_CMD_GALOIS					0xe
+
+/* Driver helper macros */
+#define to_sba_request(tx)		\
+	container_of(tx, struct sba_request, tx)
+#define to_sba_device(dchan)		\
+	container_of(dchan, struct sba_device, dma_chan)
+
+enum sba_request_state {
+	SBA_REQUEST_STATE_FREE = 1,
+	SBA_REQUEST_STATE_ALLOCED = 2,
+	SBA_REQUEST_STATE_PENDING = 3,
+	SBA_REQUEST_STATE_ACTIVE = 4,
+	SBA_REQUEST_STATE_RECEIVED = 5,
+	SBA_REQUEST_STATE_COMPLETED = 6,
+	SBA_REQUEST_STATE_ABORTED = 7,
+};
+
+struct sba_request {
+	/* Global state */
+	struct list_head node;
+	struct sba_device *sba;
+	enum sba_request_state state;
+	bool fence;
+	/* Chained requests management */
+	struct sba_request *first;
+	struct list_head next;
+	unsigned int next_count;
+	atomic_t next_pending_count;
+	/* BRCM message data */
+	void *resp;
+	dma_addr_t resp_dma;
+	struct brcm_sba_command *cmds;
+	struct brcm_message msg;
+	struct dma_async_tx_descriptor tx;
+};
+
+enum sba_version {
+	SBA_VER_1 = 0,
+	SBA_VER_2
+};
+
+struct sba_device {
+	/* Underlying device */
+	struct device *dev;
+	/* DT configuration parameters */
+	enum sba_version ver;
+	/* Derived configuration parameters */
+	u32 max_req;
+	u32 hw_buf_size;
+	u32 hw_resp_size;
+	u32 max_pq_coefs;
+	u32 max_pq_srcs;
+	u32 max_cmd_per_req;
+	u32 max_xor_srcs;
+	u32 max_resp_pool_size;
+	u32 max_cmds_pool_size;
+	/* Maibox client and Mailbox channels */
+	struct mbox_client client;
+	int mchans_count;
+	atomic_t mchans_current;
+	struct mbox_chan **mchans;
+	struct device *mbox_dev;
+	/* DMA device and DMA channel */
+	struct dma_device dma_dev;
+	struct dma_chan dma_chan;
+	/* DMA channel resources */
+	void *resp_base;
+	dma_addr_t resp_dma_base;
+	void *cmds_base;
+	dma_addr_t cmds_dma_base;
+	spinlock_t reqs_lock;
+	struct sba_request *reqs;
+	bool reqs_fence;
+	struct list_head reqs_alloc_list;
+	struct list_head reqs_pending_list;
+	struct list_head reqs_active_list;
+	struct list_head reqs_received_list;
+	struct list_head reqs_completed_list;
+	struct list_head reqs_aborted_list;
+	struct list_head reqs_free_list;
+	int reqs_free_count;
+};
+
+/* ====== SBA command helper routines ===== */
+
+static inline u64 __pure sba_cmd_enc(u64 cmd, u32 val, u32 shift, u32 mask)
+{
+	cmd &= ~((u64)mask << shift);
+	cmd |= ((u64)(val & mask) << shift);
+	return cmd;
+}
+
+static inline u32 __pure sba_cmd_load_c_mdata(u32 b0)
+{
+	return b0 & SBA_C_MDATA_BNUMx_MASK;
+}
+
+static inline u32 __pure sba_cmd_write_c_mdata(u32 b0)
+{
+	return b0 & SBA_C_MDATA_BNUMx_MASK;
+}
+
+static inline u32 __pure sba_cmd_xor_c_mdata(u32 b1, u32 b0)
+{
+	return (b0 & SBA_C_MDATA_BNUMx_MASK) |
+	       ((b1 & SBA_C_MDATA_BNUMx_MASK) << SBA_C_MDATA_BNUMx_SHIFT(1));
+}
+
+static inline u32 __pure sba_cmd_pq_c_mdata(u32 d, u32 b1, u32 b0)
+{
+	return (b0 & SBA_C_MDATA_BNUMx_MASK) |
+	       ((b1 & SBA_C_MDATA_BNUMx_MASK) << SBA_C_MDATA_BNUMx_SHIFT(1)) |
+	       ((d & SBA_C_MDATA_DNUM_MASK) << SBA_C_MDATA_DNUM_SHIFT);
+}
+
+/* ====== Channel resource management routines ===== */
+
+static struct sba_request *sba_alloc_request(struct sba_device *sba)
+{
+	unsigned long flags;
+	struct sba_request *req = NULL;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	req = list_first_entry_or_null(&sba->reqs_free_list,
+				       struct sba_request, node);
+	if (req) {
+		list_move_tail(&req->node, &sba->reqs_alloc_list);
+		req->state = SBA_REQUEST_STATE_ALLOCED;
+		req->fence = false;
+		req->first = req;
+		INIT_LIST_HEAD(&req->next);
+		req->next_count = 1;
+		atomic_set(&req->next_pending_count, 1);
+
+		sba->reqs_free_count--;
+
+		dma_async_tx_descriptor_init(&req->tx, &sba->dma_chan);
+	}
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+
+	return req;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static void _sba_pending_request(struct sba_device *sba,
+				 struct sba_request *req)
+{
+	lockdep_assert_held(&sba->reqs_lock);
+	req->state = SBA_REQUEST_STATE_PENDING;
+	list_move_tail(&req->node, &sba->reqs_pending_list);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static bool _sba_active_request(struct sba_device *sba,
+				struct sba_request *req)
+{
+	lockdep_assert_held(&sba->reqs_lock);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+	if (sba->reqs_fence)
+		return false;
+	req->state = SBA_REQUEST_STATE_ACTIVE;
+	list_move_tail(&req->node, &sba->reqs_active_list);
+	if (req->fence)
+		sba->reqs_fence = true;
+	return true;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static void _sba_abort_request(struct sba_device *sba,
+			       struct sba_request *req)
+{
+	lockdep_assert_held(&sba->reqs_lock);
+	req->state = SBA_REQUEST_STATE_ABORTED;
+	list_move_tail(&req->node, &sba->reqs_aborted_list);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static void _sba_free_request(struct sba_device *sba,
+			      struct sba_request *req)
+{
+	lockdep_assert_held(&sba->reqs_lock);
+	req->state = SBA_REQUEST_STATE_FREE;
+	list_move_tail(&req->node, &sba->reqs_free_list);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+	sba->reqs_free_count++;
+}
+
+static void sba_received_request(struct sba_request *req)
+{
+	unsigned long flags;
+	struct sba_device *sba = req->sba;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+	req->state = SBA_REQUEST_STATE_RECEIVED;
+	list_move_tail(&req->node, &sba->reqs_received_list);
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_complete_chained_requests(struct sba_request *req)
+{
+	unsigned long flags;
+	struct sba_request *nreq;
+	struct sba_device *sba = req->sba;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	req->state = SBA_REQUEST_STATE_COMPLETED;
+	list_move_tail(&req->node, &sba->reqs_completed_list);
+	list_for_each_entry(nreq, &req->next, next) {
+		nreq->state = SBA_REQUEST_STATE_COMPLETED;
+		list_move_tail(&nreq->node, &sba->reqs_completed_list);
+	}
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_free_chained_requests(struct sba_request *req)
+{
+	unsigned long flags;
+	struct sba_request *nreq;
+	struct sba_device *sba = req->sba;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	_sba_free_request(sba, req);
+	list_for_each_entry(nreq, &req->next, next)
+		_sba_free_request(sba, nreq);
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_chain_request(struct sba_request *first,
+			      struct sba_request *req)
+{
+	unsigned long flags;
+	struct sba_device *sba = req->sba;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	list_add_tail(&req->next, &first->next);
+	req->first = first;
+	first->next_count++;
+	atomic_set(&first->next_pending_count, first->next_count);
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_cleanup_nonpending_requests(struct sba_device *sba)
+{
+	unsigned long flags;
+	struct sba_request *req, *req1;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Freeup all alloced request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_alloc_list, node)
+		_sba_free_request(sba, req);
+
+	/* Freeup all received request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_received_list, node)
+		_sba_free_request(sba, req);
+
+	/* Freeup all completed request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_completed_list, node)
+		_sba_free_request(sba, req);
+
+	/* Set all active requests as aborted */
+	list_for_each_entry_safe(req, req1, &sba->reqs_active_list, node)
+		_sba_abort_request(sba, req);
+
+	/*
+	 * Note: We expect that aborted request will be eventually
+	 * freed by sba_receive_message()
+	 */
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_cleanup_pending_requests(struct sba_device *sba)
+{
+	unsigned long flags;
+	struct sba_request *req, *req1;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Freeup all pending request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_pending_list, node)
+		_sba_free_request(sba, req);
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+/* ====== DMAENGINE callbacks ===== */
+
+static void sba_free_chan_resources(struct dma_chan *dchan)
+{
+	/*
+	 * Channel resources are pre-alloced so we just free-up
+	 * whatever we can so that we can re-use pre-alloced
+	 * channel resources next time.
+	 */
+	sba_cleanup_nonpending_requests(to_sba_device(dchan));
+}
+
+static int sba_device_terminate_all(struct dma_chan *dchan)
+{
+	/* Cleanup all pending requests */
+	sba_cleanup_pending_requests(to_sba_device(dchan));
+
+	return 0;
+}
+
+static int sba_send_mbox_request(struct sba_device *sba,
+				 struct sba_request *req)
+{
+	int mchans_idx, ret = 0;
+
+	/* Select mailbox channel in round-robin fashion */
+	mchans_idx = atomic_inc_return(&sba->mchans_current);
+	mchans_idx = mchans_idx % sba->mchans_count;
+
+	/* Send message for the request */
+	req->msg.error = 0;
+	ret = mbox_send_message(sba->mchans[mchans_idx], &req->msg);
+	if (ret < 0) {
+		dev_err(sba->dev, "send message failed with error %d", ret);
+		return ret;
+	}
+	ret = req->msg.error;
+	if (ret < 0) {
+		dev_err(sba->dev, "message error %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void sba_issue_pending(struct dma_chan *dchan)
+{
+	int ret;
+	unsigned long flags;
+	struct sba_request *req, *req1;
+	struct sba_device *sba = to_sba_device(dchan);
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Process all pending request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_pending_list, node) {
+		/* Try to make request active */
+		if (!_sba_active_request(sba, req))
+			break;
+
+		/* Send request to mailbox channel */
+		spin_unlock_irqrestore(&sba->reqs_lock, flags);
+		ret = sba_send_mbox_request(sba, req);
+		spin_lock_irqsave(&sba->reqs_lock, flags);
+
+		/* If something went wrong then keep request pending */
+		if (ret < 0) {
+			_sba_pending_request(sba, req);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static dma_cookie_t sba_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	unsigned long flags;
+	dma_cookie_t cookie;
+	struct sba_device *sba;
+	struct sba_request *req, *nreq;
+
+	if (unlikely(!tx))
+		return -EINVAL;
+
+	sba = to_sba_device(tx->chan);
+	req = to_sba_request(tx);
+
+	/* Assign cookie and mark all chained requests pending */
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+	cookie = dma_cookie_assign(tx);
+	_sba_pending_request(sba, req);
+	list_for_each_entry(nreq, &req->next, next)
+		_sba_pending_request(sba, nreq);
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+
+	return cookie;
+}
+
+static enum dma_status sba_tx_status(struct dma_chan *dchan,
+				     dma_cookie_t cookie,
+				     struct dma_tx_state *txstate)
+{
+	int mchan_idx;
+	enum dma_status ret;
+	struct sba_device *sba = to_sba_device(dchan);
+
+	for (mchan_idx = 0; mchan_idx < sba->mchans_count; mchan_idx++)
+		mbox_client_peek_data(sba->mchans[mchan_idx]);
+
+	ret = dma_cookie_status(dchan, cookie, txstate);
+	if (ret == DMA_COMPLETE)
+		return ret;
+
+	return dma_cookie_status(dchan, cookie, txstate);
+}
+
+static void sba_fillup_interrupt_msg(struct sba_request *req,
+				     struct brcm_sba_command *cmds,
+				     struct brcm_message *msg)
+{
+	u64 cmd;
+	u32 c_mdata;
+	struct brcm_sba_command *cmdsp = cmds;
+
+	/* Type-B command to load dummy data into buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, req->sba->hw_resp_size,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	c_mdata = sba_cmd_load_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+	cmdsp->data = req->resp_dma;
+	cmdsp->data_len = req->sba->hw_resp_size;
+	cmdsp++;
+
+	/* Type-A command to write buf0 to dummy location */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, req->sba->hw_resp_size,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, 0x1,
+			  SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = sba_cmd_write_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = req->resp_dma;
+	cmdsp->data_len = req->sba->hw_resp_size;
+	cmdsp++;
+
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+}
+
+static struct dma_async_tx_descriptor *
+sba_prep_dma_interrupt(struct dma_chan *dchan, unsigned long flags)
+{
+	struct sba_request *req = NULL;
+	struct sba_device *sba = to_sba_device(dchan);
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+
+	/*
+	 * Force fence so that no requests are submitted
+	 * until DMA callback for this request is invoked.
+	 */
+	req->fence = true;
+
+	/* Fillup request message */
+	sba_fillup_interrupt_msg(req, req->cmds, &req->msg);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return (req) ? &req->tx : NULL;
+}
+
+static void sba_fillup_memcpy_msg(struct sba_request *req,
+				  struct brcm_sba_command *cmds,
+				  struct brcm_message *msg,
+				  dma_addr_t msg_offset, size_t msg_len,
+				  dma_addr_t dst, dma_addr_t src)
+{
+	u64 cmd;
+	u32 c_mdata;
+	struct brcm_sba_command *cmdsp = cmds;
+
+	/* Type-B command to load data into buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	c_mdata = sba_cmd_load_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+	cmdsp->data = src + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Type-A command to write buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, 0x1,
+			  SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = sba_cmd_write_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = dst + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+}
+
+static struct sba_request *
+sba_prep_dma_memcpy_req(struct sba_device *sba,
+			dma_addr_t off, dma_addr_t dst, dma_addr_t src,
+			size_t len, unsigned long flags)
+{
+	struct sba_request *req = NULL;
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request message */
+	sba_fillup_memcpy_msg(req, req->cmds, &req->msg,
+			      off, len, dst, src);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return req;
+}
+
+static struct dma_async_tx_descriptor *
+sba_prep_dma_memcpy(struct dma_chan *dchan, dma_addr_t dst, dma_addr_t src,
+		    size_t len, unsigned long flags)
+{
+	size_t req_len;
+	dma_addr_t off = 0;
+	struct sba_device *sba = to_sba_device(dchan);
+	struct sba_request *first = NULL, *req;
+
+	/* Create chained requests where each request is upto hw_buf_size */
+	while (len) {
+		req_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size;
+
+		req = sba_prep_dma_memcpy_req(sba, off, dst, src,
+					      req_len, flags);
+		if (!req) {
+			if (first)
+				sba_free_chained_requests(first);
+			return NULL;
+		}
+
+		if (first)
+			sba_chain_request(first, req);
+		else
+			first = req;
+
+		off += req_len;
+		len -= req_len;
+	}
+
+	return (first) ? &first->tx : NULL;
+}
+
+static void sba_fillup_xor_msg(struct sba_request *req,
+				struct brcm_sba_command *cmds,
+				struct brcm_message *msg,
+				dma_addr_t msg_offset, size_t msg_len,
+				dma_addr_t dst, dma_addr_t *src, u32 src_cnt)
+{
+	u64 cmd;
+	u32 c_mdata;
+	unsigned int i;
+	struct brcm_sba_command *cmdsp = cmds;
+
+	/* Type-B command to load data into buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	c_mdata = sba_cmd_load_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+	cmdsp->data = src[0] + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Type-B commands to xor data with buf0 and put it back in buf0 */
+	for (i = 1; i < src_cnt; i++) {
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_xor_c_mdata(0, 0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_XOR,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = src[i] + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, 0x1,
+			  SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = sba_cmd_write_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = dst + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+}
+
+struct sba_request *
+sba_prep_dma_xor_req(struct sba_device *sba,
+		     dma_addr_t off, dma_addr_t dst, dma_addr_t *src,
+		     u32 src_cnt, size_t len, unsigned long flags)
+{
+	struct sba_request *req = NULL;
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request message */
+	sba_fillup_xor_msg(req, req->cmds, &req->msg,
+			   off, len, dst, src, src_cnt);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return req;
+}
+
+static struct dma_async_tx_descriptor *
+sba_prep_dma_xor(struct dma_chan *dchan, dma_addr_t dst, dma_addr_t *src,
+		 u32 src_cnt, size_t len, unsigned long flags)
+{
+	size_t req_len;
+	dma_addr_t off = 0;
+	struct sba_device *sba = to_sba_device(dchan);
+	struct sba_request *first = NULL, *req;
+
+	/* Sanity checks */
+	if (unlikely(src_cnt > sba->max_xor_srcs))
+		return NULL;
+
+	/* Create chained requests where each request is upto hw_buf_size */
+	while (len) {
+		req_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size;
+
+		req = sba_prep_dma_xor_req(sba, off, dst, src, src_cnt,
+					   req_len, flags);
+		if (!req) {
+			if (first)
+				sba_free_chained_requests(first);
+			return NULL;
+		}
+
+		if (first)
+			sba_chain_request(first, req);
+		else
+			first = req;
+
+		off += req_len;
+		len -= req_len;
+	}
+
+	return (first) ? &first->tx : NULL;
+}
+
+static void sba_fillup_pq_msg(struct sba_request *req,
+				bool pq_continue,
+				struct brcm_sba_command *cmds,
+				struct brcm_message *msg,
+				dma_addr_t msg_offset, size_t msg_len,
+				dma_addr_t *dst_p, dma_addr_t *dst_q,
+				const u8 *scf, dma_addr_t *src, u32 src_cnt)
+{
+	u64 cmd;
+	u32 c_mdata;
+	unsigned int i;
+	struct brcm_sba_command *cmdsp = cmds;
+
+	if (pq_continue) {
+		/* Type-B command to load old P into buf0 */
+		if (dst_p) {
+			cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+			cmd = sba_cmd_enc(cmd, msg_len,
+				SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+			c_mdata = sba_cmd_load_c_mdata(0);
+			cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+			cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+				SBA_CMD_SHIFT, SBA_CMD_MASK);
+			cmdsp->cmd = cmd;
+			*cmdsp->cmd_dma = cpu_to_le64(cmd);
+			cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+			cmdsp->data = *dst_p + msg_offset;
+			cmdsp->data_len = msg_len;
+			cmdsp++;
+		}
+
+		/* Type-B command to load old Q into buf1 */
+		if (dst_q) {
+			cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+			cmd = sba_cmd_enc(cmd, msg_len,
+				SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+			c_mdata = sba_cmd_load_c_mdata(1);
+			cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+			cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+				SBA_CMD_SHIFT, SBA_CMD_MASK);
+			cmdsp->cmd = cmd;
+			*cmdsp->cmd_dma = cpu_to_le64(cmd);
+			cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+			cmdsp->data = *dst_q + msg_offset;
+			cmdsp->data_len = msg_len;
+			cmdsp++;
+		}
+	} else {
+		/* Type-A command to zero all buffers */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_ZERO_ALL_BUFFERS,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		cmdsp++;
+	}
+
+	/* Type-B commands for generate P onto buf0 and Q onto buf1 */
+	for (i = 0; i < src_cnt; i++) {
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_pq_c_mdata(raid6_gflog[scf[i]], 1, 0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_MS(c_mdata),
+				  SBA_C_MDATA_MS_SHIFT, SBA_C_MDATA_MS_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_GALOIS_XOR,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = src[i] + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf0 */
+	if (dst_p) {
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		cmd = sba_cmd_enc(cmd, 0x1,
+				  SBA_RESP_SHIFT, SBA_RESP_MASK);
+		c_mdata = sba_cmd_write_c_mdata(0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		if (req->sba->hw_resp_size) {
+			cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+			cmdsp->resp = req->resp_dma;
+			cmdsp->resp_len = req->sba->hw_resp_size;
+		}
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+		cmdsp->data = *dst_p + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf1 */
+	if (dst_q) {
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		cmd = sba_cmd_enc(cmd, 0x1,
+				  SBA_RESP_SHIFT, SBA_RESP_MASK);
+		c_mdata = sba_cmd_write_c_mdata(1);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		if (req->sba->hw_resp_size) {
+			cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+			cmdsp->resp = req->resp_dma;
+			cmdsp->resp_len = req->sba->hw_resp_size;
+		}
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+		cmdsp->data = *dst_q + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+}
+
+struct sba_request *
+sba_prep_dma_pq_req(struct sba_device *sba, dma_addr_t off,
+		    dma_addr_t *dst_p, dma_addr_t *dst_q, dma_addr_t *src,
+		    u32 src_cnt, const u8 *scf, size_t len, unsigned long flags)
+{
+	struct sba_request *req = NULL;
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request messages */
+	sba_fillup_pq_msg(req, dmaf_continue(flags),
+			  req->cmds, &req->msg,
+			  off, len, dst_p, dst_q, scf, src, src_cnt);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return req;
+}
+
+static void sba_fillup_pq_single_msg(struct sba_request *req,
+				bool pq_continue,
+				struct brcm_sba_command *cmds,
+				struct brcm_message *msg,
+				dma_addr_t msg_offset, size_t msg_len,
+				dma_addr_t *dst_p, dma_addr_t *dst_q,
+				dma_addr_t src, u8 scf)
+{
+	u64 cmd;
+	u32 c_mdata;
+	u8 pos, dpos = raid6_gflog[scf];
+	struct brcm_sba_command *cmdsp = cmds;
+
+	if (!dst_p)
+		goto skip_p;
+
+	if (pq_continue) {
+		/* Type-B command to load old P into buf0 */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_load_c_mdata(0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = *dst_p + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+
+		/*
+		 * Type-B commands to xor data with buf0 and put it
+		 * back in buf0
+		 */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_xor_c_mdata(0, 0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_XOR,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = src + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	} else {
+		/* Type-B command to load old P into buf0 */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_load_c_mdata(0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = src + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, 0x1,
+			  SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = sba_cmd_write_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = *dst_p + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+skip_p:
+	if (!dst_q)
+		goto skip_q;
+
+	/* Type-A command to zero all buffers */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_ZERO_ALL_BUFFERS,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	cmdsp++;
+
+	if (dpos == 255)
+		goto skip_q_computation;
+	pos = (dpos < req->sba->max_pq_coefs) ?
+		dpos : (req->sba->max_pq_coefs - 1);
+
+	/*
+	 * Type-B command to generate initial Q from data
+	 * and store output into buf0
+	 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	c_mdata = sba_cmd_pq_c_mdata(pos, 0, 0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_MS(c_mdata),
+			  SBA_C_MDATA_MS_SHIFT, SBA_C_MDATA_MS_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_GALOIS,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+	cmdsp->data = src + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	dpos -= pos;
+
+	/* Multiple Type-A command to generate final Q */
+	while (dpos) {
+		pos = (dpos < req->sba->max_pq_coefs) ?
+			dpos : (req->sba->max_pq_coefs - 1);
+
+		/*
+		 * Type-A command to generate Q with buf0 and
+		 * buf1 store result in buf0
+		 */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_pq_c_mdata(pos, 0, 1);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_MS(c_mdata),
+				  SBA_C_MDATA_MS_SHIFT, SBA_C_MDATA_MS_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_GALOIS,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		cmdsp++;
+
+		dpos -= pos;
+	}
+
+skip_q_computation:
+	if (pq_continue) {
+		/*
+		 * Type-B command to XOR previous output with
+		 * buf0 and write it into buf0
+		 */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_xor_c_mdata(0, 0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_XOR,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = *dst_q + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, 0x1,
+			  SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = sba_cmd_write_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = *dst_q + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+skip_q:
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+}
+
+struct sba_request *
+sba_prep_dma_pq_single_req(struct sba_device *sba, dma_addr_t off,
+			   dma_addr_t *dst_p, dma_addr_t *dst_q,
+			   dma_addr_t src, u8 scf, size_t len,
+			   unsigned long flags)
+{
+	struct sba_request *req = NULL;
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request messages */
+	sba_fillup_pq_single_msg(req,  dmaf_continue(flags),
+				 req->cmds, &req->msg, off, len,
+				 dst_p, dst_q, src, scf);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return req;
+}
+
+static struct dma_async_tx_descriptor *
+sba_prep_dma_pq(struct dma_chan *dchan, dma_addr_t *dst, dma_addr_t *src,
+		u32 src_cnt, const u8 *scf, size_t len, unsigned long flags)
+{
+	u32 i, dst_q_index;
+	size_t req_len;
+	bool slow = false;
+	dma_addr_t off = 0;
+	dma_addr_t *dst_p = NULL, *dst_q = NULL;
+	struct sba_device *sba = to_sba_device(dchan);
+	struct sba_request *first = NULL, *req;
+
+	/* Sanity checks */
+	if (unlikely(src_cnt > sba->max_pq_srcs))
+		return NULL;
+	for (i = 0; i < src_cnt; i++)
+		if (sba->max_pq_coefs <= raid6_gflog[scf[i]])
+			slow = true;
+
+	/* Figure-out P and Q destination addresses */
+	if (!(flags & DMA_PREP_PQ_DISABLE_P))
+		dst_p = &dst[0];
+	if (!(flags & DMA_PREP_PQ_DISABLE_Q))
+		dst_q = &dst[1];
+
+	/* Create chained requests where each request is upto hw_buf_size */
+	while (len) {
+		req_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size;
+
+		if (slow) {
+			dst_q_index = src_cnt;
+
+			if (dst_q) {
+				for (i = 0; i < src_cnt; i++) {
+					if (*dst_q == src[i]) {
+						dst_q_index = i;
+						break;
+					}
+				}
+			}
+
+			if (dst_q_index < src_cnt) {
+				i = dst_q_index;
+				req = sba_prep_dma_pq_single_req(sba,
+					off, dst_p, dst_q, src[i], scf[i],
+					req_len, flags | DMA_PREP_FENCE);
+				if (!req)
+					goto fail;
+
+				if (first)
+					sba_chain_request(first, req);
+				else
+					first = req;
+
+				flags |= DMA_PREP_CONTINUE;
+			}
+
+			for (i = 0; i < src_cnt; i++) {
+				if (dst_q_index == i)
+					continue;
+
+				req = sba_prep_dma_pq_single_req(sba,
+					off, dst_p, dst_q, src[i], scf[i],
+					req_len, flags | DMA_PREP_FENCE);
+				if (!req)
+					goto fail;
+
+				if (first)
+					sba_chain_request(first, req);
+				else
+					first = req;
+
+				flags |= DMA_PREP_CONTINUE;
+			}
+		} else {
+			req = sba_prep_dma_pq_req(sba, off,
+						  dst_p, dst_q, src, src_cnt,
+						  scf, req_len, flags);
+			if (!req)
+				goto fail;
+
+			if (first)
+				sba_chain_request(first, req);
+			else
+				first = req;
+		}
+
+		off += req_len;
+		len -= req_len;
+	}
+
+	return (first) ? &first->tx : NULL;
+
+fail:
+	if (first)
+		sba_free_chained_requests(first);
+	return NULL;
+}
+
+/* ====== Mailbox callbacks ===== */
+
+static void sba_dma_tx_actions(struct sba_request *req)
+{
+	struct dma_async_tx_descriptor *tx = &req->tx;
+
+	WARN_ON(tx->cookie < 0);
+
+	if (tx->cookie > 0) {
+		dma_cookie_complete(tx);
+
+		/*
+		 * Call the callback (must not sleep or submit new
+		 * operations to this channel)
+		 */
+		if (tx->callback)
+			tx->callback(tx->callback_param);
+
+		dma_descriptor_unmap(tx);
+	}
+
+	/* Run dependent operations */
+	dma_run_dependencies(tx);
+
+	/* If waiting for 'ack' then move to completed list */
+	if (!async_tx_test_ack(&req->tx))
+		sba_complete_chained_requests(req);
+	else
+		sba_free_chained_requests(req);
+}
+
+static void sba_receive_message(struct mbox_client *cl, void *msg)
+{
+	unsigned long flags;
+	struct brcm_message *m = msg;
+	struct sba_request *req = m->ctx, *req1;
+	struct sba_device *sba = req->sba;
+
+	/* Error count if message has error */
+	if (m->error < 0)
+		dev_err(sba->dev, "%s got message with error %d",
+			dma_chan_name(&sba->dma_chan), m->error);
+
+	/* Mark request as received */
+	sba_received_request(req);
+
+	/* Wait for all chained requests to be completed */
+	if (atomic_dec_return(&req->first->next_pending_count))
+		goto done;
+
+	/* Point to first request */
+	req = req->first;
+
+	/* Update request */
+	if (req->state == SBA_REQUEST_STATE_RECEIVED)
+		sba_dma_tx_actions(req);
+	else
+		sba_free_chained_requests(req);
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Re-check all completed request waiting for 'ack' */
+	list_for_each_entry_safe(req, req1, &sba->reqs_completed_list, node) {
+		spin_unlock_irqrestore(&sba->reqs_lock, flags);
+		sba_dma_tx_actions(req);
+		spin_lock_irqsave(&sba->reqs_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+
+done:
+	/* Try to submit pending request */
+	sba_issue_pending(&sba->dma_chan);
+}
+
+/* ====== Platform driver routines ===== */
+
+static int sba_prealloc_channel_resources(struct sba_device *sba)
+{
+	int i, j, p, ret = 0;
+	struct sba_request *req = NULL;
+
+	sba->resp_base = dma_alloc_coherent(sba->dma_dev.dev,
+					    sba->max_resp_pool_size,
+					    &sba->resp_dma_base, GFP_KERNEL);
+	if (!sba->resp_base)
+		return -ENOMEM;
+
+	sba->cmds_base = dma_alloc_coherent(sba->dma_dev.dev,
+					    sba->max_cmds_pool_size,
+					    &sba->cmds_dma_base, GFP_KERNEL);
+	if (!sba->cmds_base) {
+		ret = -ENOMEM;
+		goto fail_free_resp_pool;
+	}
+
+	spin_lock_init(&sba->reqs_lock);
+	sba->reqs_fence = false;
+	INIT_LIST_HEAD(&sba->reqs_alloc_list);
+	INIT_LIST_HEAD(&sba->reqs_pending_list);
+	INIT_LIST_HEAD(&sba->reqs_active_list);
+	INIT_LIST_HEAD(&sba->reqs_received_list);
+	INIT_LIST_HEAD(&sba->reqs_completed_list);
+	INIT_LIST_HEAD(&sba->reqs_aborted_list);
+	INIT_LIST_HEAD(&sba->reqs_free_list);
+
+	sba->reqs = devm_kcalloc(sba->dev, sba->max_req,
+				 sizeof(*req), GFP_KERNEL);
+	if (!sba->reqs) {
+		ret = -ENOMEM;
+		goto fail_free_cmds_pool;
+	}
+
+	for (i = 0, p = 0; i < sba->max_req; i++) {
+		req = &sba->reqs[i];
+		INIT_LIST_HEAD(&req->node);
+		req->sba = sba;
+		req->state = SBA_REQUEST_STATE_FREE;
+		INIT_LIST_HEAD(&req->next);
+		req->next_count = 1;
+		atomic_set(&req->next_pending_count, 0);
+		req->fence = false;
+		req->resp = sba->resp_base + p;
+		req->resp_dma = sba->resp_dma_base + p;
+		p += sba->hw_resp_size;
+		req->cmds = devm_kcalloc(sba->dev, sba->max_cmd_per_req,
+					 sizeof(*req->cmds), GFP_KERNEL);
+		if (!req->cmds) {
+			ret = -ENOMEM;
+			goto fail_free_cmds_pool;
+		}
+		for (j = 0; j < sba->max_cmd_per_req; j++) {
+			req->cmds[j].cmd = 0;
+			req->cmds[j].cmd_dma = sba->cmds_base +
+				(i * sba->max_cmd_per_req + j) * sizeof(u64);
+			req->cmds[j].cmd_dma_addr = sba->cmds_dma_base +
+				(i * sba->max_cmd_per_req + j) * sizeof(u64);
+			req->cmds[j].flags = 0;
+		}
+		memset(&req->msg, 0, sizeof(req->msg));
+		dma_async_tx_descriptor_init(&req->tx, &sba->dma_chan);
+		req->tx.tx_submit = sba_tx_submit;
+		req->tx.phys = req->resp_dma;
+		list_add_tail(&req->node, &sba->reqs_free_list);
+	}
+
+	sba->reqs_free_count = sba->max_req;
+
+	return 0;
+
+fail_free_cmds_pool:
+	dma_free_coherent(sba->dma_dev.dev,
+			  sba->max_cmds_pool_size,
+			  sba->cmds_base, sba->cmds_dma_base);
+fail_free_resp_pool:
+	dma_free_coherent(sba->dma_dev.dev,
+			  sba->max_resp_pool_size,
+			  sba->resp_base, sba->resp_dma_base);
+	return ret;
+}
+
+static void sba_freeup_channel_resources(struct sba_device *sba)
+{
+	dmaengine_terminate_all(&sba->dma_chan);
+	dma_free_coherent(sba->dma_dev.dev, sba->max_cmds_pool_size,
+			  sba->cmds_base, sba->cmds_dma_base);
+	dma_free_coherent(sba->dma_dev.dev, sba->max_resp_pool_size,
+			  sba->resp_base, sba->resp_dma_base);
+	sba->resp_base = NULL;
+	sba->resp_dma_base = 0;
+}
+
+static int sba_async_register(struct sba_device *sba)
+{
+	int ret;
+	struct dma_device *dma_dev = &sba->dma_dev;
+
+	/* Initialize DMA channel cookie */
+	sba->dma_chan.device = dma_dev;
+	dma_cookie_init(&sba->dma_chan);
+
+	/* Initialize DMA device capability mask */
+	dma_cap_zero(dma_dev->cap_mask);
+	dma_cap_set(DMA_INTERRUPT, dma_dev->cap_mask);
+	dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
+	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
+	dma_cap_set(DMA_PQ, dma_dev->cap_mask);
+
+	/*
+	 * Set mailbox channel device as the base device of
+	 * our dma_device because the actual memory accesses
+	 * will be done by mailbox controller
+	 */
+	dma_dev->dev = sba->mbox_dev;
+
+	/* Set base prep routines */
+	dma_dev->device_free_chan_resources = sba_free_chan_resources;
+	dma_dev->device_terminate_all = sba_device_terminate_all;
+	dma_dev->device_issue_pending = sba_issue_pending;
+	dma_dev->device_tx_status = sba_tx_status;
+
+	/* Set interrupt routine */
+	if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask))
+		dma_dev->device_prep_dma_interrupt = sba_prep_dma_interrupt;
+
+	/* Set memcpy routine */
+	if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask))
+		dma_dev->device_prep_dma_memcpy = sba_prep_dma_memcpy;
+
+	/* Set xor routine and capability */
+	if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
+		dma_dev->device_prep_dma_xor = sba_prep_dma_xor;
+		dma_dev->max_xor = sba->max_xor_srcs;
+	}
+
+	/* Set pq routine and capability */
+	if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
+		dma_dev->device_prep_dma_pq = sba_prep_dma_pq;
+		dma_set_maxpq(dma_dev, sba->max_pq_srcs, 0);
+	}
+
+	/* Initialize DMA device channel list */
+	INIT_LIST_HEAD(&dma_dev->channels);
+	list_add_tail(&sba->dma_chan.device_node, &dma_dev->channels);
+
+	/* Register with Linux async DMA framework*/
+	ret = dma_async_device_register(dma_dev);
+	if (ret) {
+		dev_err(sba->dev, "async device register error %d", ret);
+		return ret;
+	}
+
+	dev_info(sba->dev, "%s capabilities: %s%s%s%s\n",
+	dma_chan_name(&sba->dma_chan),
+	dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "interrupt " : "",
+	dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "memcpy " : "",
+	dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
+	dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "");
+
+	return 0;
+}
+
+static int sba_probe(struct platform_device *pdev)
+{
+	int i, ret = 0, mchans_count;
+	struct sba_device *sba;
+	struct platform_device *mbox_pdev;
+	struct of_phandle_args args;
+
+	/* Allocate main SBA struct */
+	sba = devm_kzalloc(&pdev->dev, sizeof(*sba), GFP_KERNEL);
+	if (!sba)
+		return -ENOMEM;
+
+	sba->dev = &pdev->dev;
+	platform_set_drvdata(pdev, sba);
+
+	/* Determine SBA version from DT compatible string */
+	if (of_device_is_compatible(sba->dev->of_node, "brcm,iproc-sba"))
+		sba->ver = SBA_VER_1;
+	else if (of_device_is_compatible(sba->dev->of_node,
+					 "brcm,iproc-sba-v2"))
+		sba->ver = SBA_VER_2;
+	else
+		return -ENODEV;
+
+	/* Derived Configuration parameters */
+	switch (sba->ver) {
+	case SBA_VER_1:
+		sba->max_req = 1024;
+		sba->hw_buf_size = 4096;
+		sba->hw_resp_size = 8;
+		sba->max_pq_coefs = 6;
+		sba->max_pq_srcs = 6;
+		break;
+	case SBA_VER_2:
+		sba->max_req = 1024;
+		sba->hw_buf_size = 4096;
+		sba->hw_resp_size = 8;
+		sba->max_pq_coefs = 30;
+		/*
+		 * We can support max_pq_srcs == max_pq_coefs because
+		 * we are limited by number of SBA commands that we can
+		 * fit in one message for underlying ring manager HW.
+		 */
+		sba->max_pq_srcs = 12;
+		break;
+	default:
+		return -EINVAL;
+	}
+	sba->max_cmd_per_req = sba->max_pq_srcs + 3;
+	sba->max_xor_srcs = sba->max_cmd_per_req - 1;
+	sba->max_resp_pool_size = sba->max_req * sba->hw_resp_size;
+	sba->max_cmds_pool_size = sba->max_req *
+				  sba->max_cmd_per_req * sizeof(u64);
+
+	/* Setup mailbox client */
+	sba->client.dev			= &pdev->dev;
+	sba->client.rx_callback		= sba_receive_message;
+	sba->client.tx_block		= false;
+	sba->client.knows_txdone	= false;
+	sba->client.tx_tout		= 0;
+
+	/* Number of channels equals number of mailbox channels */
+	ret = of_count_phandle_with_args(pdev->dev.of_node,
+					 "mboxes", "#mbox-cells");
+	if (ret <= 0)
+		return -ENODEV;
+	mchans_count = ret;
+	sba->mchans_count = 0;
+	atomic_set(&sba->mchans_current, 0);
+
+	/* Allocate mailbox channel array */
+	sba->mchans = devm_kcalloc(&pdev->dev, sba->mchans_count,
+				   sizeof(*sba->mchans), GFP_KERNEL);
+	if (!sba->mchans)
+		return -ENOMEM;
+
+	/* Request mailbox channels */
+	for (i = 0; i < mchans_count; i++) {
+		sba->mchans[i] = mbox_request_channel(&sba->client, i);
+		if (IS_ERR(sba->mchans[i])) {
+			ret = PTR_ERR(sba->mchans[i]);
+			goto fail_free_mchans;
+		}
+		sba->mchans_count++;
+	}
+
+	/* Find-out underlying mailbox device */
+	ret = of_parse_phandle_with_args(pdev->dev.of_node,
+					 "mboxes", "#mbox-cells", 0, &args);
+	if (ret)
+		goto fail_free_mchans;
+	mbox_pdev = of_find_device_by_node(args.np);
+	of_node_put(args.np);
+	if (!mbox_pdev) {
+		ret = -ENODEV;
+		goto fail_free_mchans;
+	}
+	sba->mbox_dev = &mbox_pdev->dev;
+
+	/* All mailbox channels should be of same ring manager device */
+	for (i = 1; i < mchans_count; i++) {
+		ret = of_parse_phandle_with_args(pdev->dev.of_node,
+					 "mboxes", "#mbox-cells", i, &args);
+		if (ret)
+			goto fail_free_mchans;
+		mbox_pdev = of_find_device_by_node(args.np);
+		of_node_put(args.np);
+		if (sba->mbox_dev != &mbox_pdev->dev) {
+			ret = -EINVAL;
+			goto fail_free_mchans;
+		}
+	}
+
+	/* Register DMA device with linux async framework */
+	ret = sba_async_register(sba);
+	if (ret)
+		goto fail_free_mchans;
+
+	/* Prealloc channel resource */
+	ret = sba_prealloc_channel_resources(sba);
+	if (ret)
+		goto fail_async_dev_unreg;
+
+	/* Print device info */
+	dev_info(sba->dev, "%s using SBAv%d and %d mailbox channels",
+		 dma_chan_name(&sba->dma_chan), sba->ver+1,
+		 sba->mchans_count);
+
+	return 0;
+
+fail_async_dev_unreg:
+	dma_async_device_unregister(&sba->dma_dev);
+fail_free_mchans:
+	for (i = 0; i < sba->mchans_count; i++)
+		mbox_free_channel(sba->mchans[i]);
+	return ret;
+}
+
+static int sba_remove(struct platform_device *pdev)
+{
+	int i;
+	struct sba_device *sba = platform_get_drvdata(pdev);
+
+	sba_freeup_channel_resources(sba);
+
+	dma_async_device_unregister(&sba->dma_dev);
+
+	for (i = 0; i < sba->mchans_count; i++)
+		mbox_free_channel(sba->mchans[i]);
+
+	return 0;
+}
+
+static const struct of_device_id sba_of_match[] = {
+	{ .compatible = "brcm,iproc-sba", },
+	{ .compatible = "brcm,iproc-sba-v2", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, sba_of_match);
+
+static struct platform_driver sba_driver = {
+	.probe = sba_probe,
+	.remove = sba_remove,
+	.driver = {
+		.name = "bcm-sba-raid",
+		.of_match_table = sba_of_match,
+	},
+};
+module_platform_driver(sba_driver);
+
+MODULE_DESCRIPTION("Broadcom SBA RAID driver");
+MODULE_AUTHOR("Anup Patel <anup.patel@broadcom.com>");
+MODULE_LICENSE("GPL v2");
-- 
2.7.4

^ permalink raw reply related

* [PATCH v5 4/4] dt-bindings: Add DT bindings document for Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-16  6:48 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1487227695-965-1-git-send-email-anup.patel@broadcom.com>

This patch adds the DT bindings document for newly added Broadcom
SBA RAID driver.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
---
 .../devicetree/bindings/dma/brcm,iproc-sba.txt     | 29 ++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt

diff --git a/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt b/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt
new file mode 100644
index 0000000..092913a
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt
@@ -0,0 +1,29 @@
+* Broadcom SBA RAID engine
+
+Required properties:
+- compatible: Should be one of the following
+	      "brcm,iproc-sba"
+	      "brcm,iproc-sba-v2"
+  The "brcm,iproc-sba" has support for only 6 PQ coefficients
+  The "brcm,iproc-sba-v2" has support for only 30 PQ coefficients
+- mboxes: List of phandle and mailbox channel specifiers
+
+Example:
+
+raid_mbox: mbox@67400000 {
+	...
+	#mbox-cells = <3>;
+	...
+};
+
+raid0 {
+	compatible = "brcm,iproc-sba-v2";
+	mboxes = <&raid_mbox 0 0x1 0xffff>,
+		 <&raid_mbox 1 0x1 0xffff>,
+		 <&raid_mbox 2 0x1 0xffff>,
+		 <&raid_mbox 3 0x1 0xffff>,
+		 <&raid_mbox 4 0x1 0xffff>,
+		 <&raid_mbox 5 0x1 0xffff>,
+		 <&raid_mbox 6 0x1 0xffff>,
+		 <&raid_mbox 7 0x1 0xffff>;
+};
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: NeilBrown @ 2017-02-16  7:04 UTC (permalink / raw)
  To: linux-raid; +Cc: Coly Li, Shaohua Li, Johannes Thumshirn, Guoqing Jiang
In-Reply-To: <1487176523-109075-1-git-send-email-colyli@suse.de>

[-- Attachment #1: Type: text/plain, Size: 10650 bytes --]

On Thu, Feb 16 2017, colyli@suse.de wrote:

> 'Commit 79ef3a8aa1cb ("raid1: Rewrite the implementation of iobarrier.")'
> introduces a sliding resync window for raid1 I/O barrier, this idea limits
> I/O barriers to happen only inside a slidingresync window, for regular
> I/Os out of this resync window they don't need to wait for barrier any
> more. On large raid1 device, it helps a lot to improve parallel writing
> I/O throughput when there are background resync I/Os performing at
> same time.
>
> The idea of sliding resync widow is awesome, but code complexity is a
> challenge. Sliding resync window requires several veriables to work

variables

> collectively, this is complexed and very hard to make it work correctly.
> Just grep "Fixes: 79ef3a8aa1" in kernel git log, there are 8 more patches
> to fix the original resync window patch. This is not the end, any further
> related modification may easily introduce more regreassion.
>
> Therefore I decide to implement a much simpler raid1 I/O barrier, by
> removing resync window code, I believe life will be much easier.
>
> The brief idea of the simpler barrier is,
>  - Do not maintain a logbal unique resync window

global

>  - Use multiple hash buckets to reduce I/O barrier conflictions, regular

conflicts

>    I/O only has to wait for a resync I/O when both them have same barrier
>    bucket index, vice versa.
>  - I/O barrier can be recuded to an acceptable number if there are enought

reduced
enough

>    barrier buckets
>
> Here I explain how the barrier buckets are designed,
>  - BARRIER_UNIT_SECTOR_SIZE
>    The whole LBA address space of a raid1 device is divided into multiple
>    barrier units, by the size of BARRIER_UNIT_SECTOR_SIZE.
>    Bio request won't go across border of barrier unit size, that means

requests

>    maximum bio size is BARRIER_UNIT_SECTOR_SIZE<<9 (64MB) in bytes.
>    For random I/O 64MB is large enough for both read and write requests,
>    for sequential I/O considering underlying block layer may merge them
>    into larger requests, 64MB is still good enough.
>    Neil also points out that for resync operation, "we want the resync to
>    move from region to region fairly quickly so that the slowness caused
>    by having to synchronize with the resync is averaged out over a fairly
>    small time frame". For full speed resync, 64MB should take less then 1
>    second. When resync is competing with other I/O, it could take up a few
>    minutes. Therefore 64MB size is fairly good range for resync.
>
>  - BARRIER_BUCKETS_NR
>    There are BARRIER_BUCKETS_NR buckets in total, which is defined by,
>         #define BARRIER_BUCKETS_NR_BITS   (PAGE_SHIFT - 2)
>         #define BARRIER_BUCKETS_NR        (1<<BARRIER_BUCKETS_NR_BITS)
>    this patch makes the bellowed members of struct r1conf from integer
>    to array of integers,
>         -       int                     nr_pending;
>         -       int                     nr_waiting;
>         -       int                     nr_queued;
>         -       int                     barrier;
>         +       int                     *nr_pending;
>         +       int                     *nr_waiting;
>         +       int                     *nr_queued;
>         +       int                     *barrier;
>    number of the array elements is defined as BARRIER_BUCKETS_NR. For 4KB
>    kernel space page size, (PAGE_SHIFT - 2) indecates there are 1024 I/O
>    barrier buckets, and each array of integers occupies single memory page.
>    1024 means for a request which is smaller than the I/O barrier unit size
>    has ~0.1% chance to wait for resync to pause, which is quite a small
>    enough fraction. Also requesting single memory page is more friendly to
>    kernel page allocator than larger memory size.
>
>  - I/O barrier bucket is indexed by bio start sector
>    If multiple I/O requests hit different I/O barrier units, they only need
>    to compete I/O barrier with other I/Os which hit the same I/O barrier
>    bucket index with each other. The index of a barrier bucket which a
>    bio should look for is calculated by sector_to_idx() which is defined
>    in raid1.h as an inline function,
>         static inline int sector_to_idx(sector_t sector)
>         {
>                 return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
>                                 BARRIER_BUCKETS_NR_BITS);
>         }
>    Here sector_nr is the start sector number of a bio.

"hash_long() is used so that sequential writes in are region of the
array which is not being resynced will not consistently align with
the buckets that are being sequentially resynced, as described below"

>
>  - Single bio won't go across boundary of a I/O barrier unit
>    If a request goes across boundary of barrier unit, it will be split. A
>    bio may be split in raid1_make_request() or raid1_sync_request(), if
>    sectors returned by align_to_barrier_unit_end() is small than original

smaller

>    bio size.
>
> Comparing to single sliding resync window,
>  - Currently resync I/O grows linearly, therefore regular and resync I/O
>    will have confliction within a single barrier units. So the I/O

... will conflict within ...

>    behavior is similar to single sliding resync window.
>  - But a barrier unit bucket is shared by all barrier units with identical
>    barrier uinit index, the probability of confliction might be higher
>    than single sliding resync window, in condition that writing I/Os
>    always hit barrier units which have identical barrier bucket indexs with
>    the resync I/Os. This is a very rare condition in real I/O work loads,
>    I cannot imagine how it could happen in practice.
>  - Therefore we can achieve a good enough low confliction rate with much

... low conflict rate ...

>    simpler barrier algorithm and implementation.
>
> There are two changes should be noticed,
>  - In raid1d(), I change the code to decrease conf->nr_pending[idx] into
>    single loop, it looks like this,
>         spin_lock_irqsave(&conf->device_lock, flags);
>         conf->nr_queued[idx]--;
>         spin_unlock_irqrestore(&conf->device_lock, flags);
>    This change generates more spin lock operations, but in next patch of
>    this patch set, it will be replaced by a single line code,
>         atomic_dec(&conf->nr_queueud[idx]);
>    So we don't need to worry about spin lock cost here.
>  - Mainline raid1 code split original raid1_make_request() into
>    raid1_read_request() and raid1_write_request(). If the original bio
>    goes across an I/O barrier unit size, this bio will be split before
>    calling raid1_read_request() or raid1_write_request(),  this change
>    the code logic more simple and clear.
>  - In this patch wait_barrier() is moved from raid1_make_request() to
>    raid1_write_request(). In raid_read_request(), original wait_barrier()
>    is replaced by raid1_read_request().
>    The differnece is wait_read_barrier() only waits if array is frozen,
>    using different barrier function in different code path makes the code
>    more clean and easy to read.

Thank you for putting the effort into writing a comprehensve change
description.  I really appreciate it.

>  
> @@ -1447,36 +1501,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>  
>  static void raid1_make_request(struct mddev *mddev, struct bio *bio)
>  {
> -	struct r1conf *conf = mddev->private;
> -	struct r1bio *r1_bio;
> +	void (*make_request_fn)(struct mddev *mddev, struct bio *bio);
> +	struct bio *split;
> +	sector_t sectors;
>  
> -	/*
> -	 * make_request() can abort the operation when read-ahead is being
> -	 * used and no empty request is available.
> -	 *
> -	 */
> -	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
> +	make_request_fn = (bio_data_dir(bio) == READ) ?
> +			  raid1_read_request : raid1_write_request;
>  
> -	r1_bio->master_bio = bio;
> -	r1_bio->sectors = bio_sectors(bio);
> -	r1_bio->state = 0;
> -	r1_bio->mddev = mddev;
> -	r1_bio->sector = bio->bi_iter.bi_sector;
> -
> -	/*
> -	 * We might need to issue multiple reads to different devices if there
> -	 * are bad blocks around, so we keep track of the number of reads in
> -	 * bio->bi_phys_segments.  If this is 0, there is only one r1_bio and
> -	 * no locking will be needed when requests complete.  If it is
> -	 * non-zero, then it is the number of not-completed requests.
> -	 */
> -	bio->bi_phys_segments = 0;
> -	bio_clear_flag(bio, BIO_SEG_VALID);
> +	/* if bio exceeds barrier unit boundary, split it */
> +	do {
> +		sectors = align_to_barrier_unit_end(
> +				bio->bi_iter.bi_sector, bio_sectors(bio));
> +		if (sectors < bio_sectors(bio)) {
> +			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
> +			bio_chain(split, bio);
> +		} else {
> +			split = bio;
> +		}
>  
> -	if (bio_data_dir(bio) == READ)
> -		raid1_read_request(mddev, bio, r1_bio);
> -	else
> -		raid1_write_request(mddev, bio, r1_bio);
> +		make_request_fn(mddev, split);
> +	} while (split != bio);
>  }

I know you are going to change this as Shaohua wantsthe spitting
to happen in a separate function, which I agree with, but there is
something else wrong here.
Calling bio_split/bio_chain repeatedly in a loop is dangerous.
It is OK for simple devices, but when one request can wait for another
request to the same device it can deadlock.
This can happen with raid1.  If a resync request calls raise_barrier()
between one request and the next, then the next has to wait for the
resync request, which has to wait for the first request.
As the first request will be stuck in the queue in
generic_make_request(), you get a deadlock.
It is much safer to:

    if (need to split) {
        split = bio_split(bio, ...)
        bio_chain(...)
        make_request_fn(split);
        generic_make_request(bio);
   } else
        make_request_fn(mddev, bio);

This way we first process the initial section of the bio (in 'split')
which will queue some requests to the underlying devices.  These
requests will be queued in generic_make_request.
Then we queue the remainder of the bio, which will be added to the end
of the generic_make_request queue.
Then we return.
generic_make_request() will pop the lower-level device requests off the
queue and handle them first.  Then it will process the remainder
of the original bio once the first section has been fully processed.


Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH V3 2/2] RAID1: avoid unnecessary spin locks in I/O barrier code
From: NeilBrown @ 2017-02-16  7:04 UTC (permalink / raw)
  To: linux-raid
  Cc: Coly Li, Shaohua Li, Hannes Reinecke, Johannes Thumshirn,
	Guoqing Jiang
In-Reply-To: <1487176523-109075-2-git-send-email-colyli@suse.de>

[-- Attachment #1: Type: text/plain, Size: 1042 bytes --]

On Thu, Feb 16 2017, colyli@suse.de wrote:

> @@ -2393,6 +2455,11 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
>  		idx = sector_to_idx(r1_bio->sector);
>  		conf->nr_queued[idx]++;
>  		spin_unlock_irq(&conf->device_lock);
> +		/*
> +		 * In case freeze_array() is waiting for condition
> +		 * get_unqueued_pending() == extra to be true.
> +		 */
> +		wake_up(&conf->wait_barrier);
>  		md_wakeup_thread(conf->mddev->thread);
>  	} else {
>  		if (test_bit(R1BIO_WriteError, &r1_bio->state))
> @@ -2529,9 +2596,7 @@ static void raid1d(struct md_thread *thread)
>  						  retry_list);
>  			list_del(&r1_bio->retry_list);
>  			idx = sector_to_idx(r1_bio->sector);
> -			spin_lock_irqsave(&conf->device_lock, flags);
>  			conf->nr_queued[idx]--;
> -			spin_unlock_irqrestore(&conf->device_lock, flags);

Why do you think it is safe to decrement nr_queued without holding the
lock?
Surely this could race with handle_write_finished, and an update could
be lost.

Otherwise, looks good.

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* [PATCH 00/17] md: cleanup on direct access to bvec table
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei

In MD's resync I/O path, there are lots of direct access to bio's
bvec table. This patchset kills most of them, and the conversion
is quite straightforward.

Once direct access to bvec table in MD is cleaned up, we may make
multipage bvec moving on.

Thanks,
Ming

Ming Lei (17):
  block: introduce bio_segments_all()
  block: introduce bio_remove_last_page()
  md: raid1/raid10: use bio_remove_last_page()
  md: introduce helpers for dealing with fetch/store preallocated pages
    in bio
  md: raid1/raid10: use the introduced helpers
  md: raid1/raid10: borrow .bi_error as pre-allocated page index
  md: raid1/raid10: don't use .bi_vcnt to check if all pages are added
  md: raid1: simplify r1buf_pool_free()
  md: raid1/raid10: use bio helper in *_pool_free
  md: raid1: remove direct access to bvec table in fix_sync_read_error
  md: raid1: use bio helper in process_checks()
  md: raid1: avoid direct access to bvec table in process_checks()
  md: raid1: use bio_segments_all()
  md: raid10: avoid direct access to bvec table in sync_request_write()
  md: raid10: avoid direct access to bvec table in
    fix_recovery_read_error
  md: raid10: avoid direct access to bvec table in reshape_request
  md: raid10: avoid direct access to bvec table in
    handle_reshape_read_error

 block/bio.c         | 23 ++++++++++++++
 drivers/md/md.h     | 21 +++++++++++++
 drivers/md/raid1.c  | 87 +++++++++++++++++++++++++++++++++++++----------------
 drivers/md/raid10.c | 76 ++++++++++++++++++++++++++++++++++------------
 include/linux/bio.h |  8 +++++
 5 files changed, 169 insertions(+), 46 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH 01/17] block: introduce bio_segments_all()
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

So that we can replace the direct access to .bi_vcnt.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 include/linux/bio.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc..3364b3ed90e7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -293,6 +293,13 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
 		bv->bv_len = iter.bi_bvec_done;
 }
 
+static inline unsigned bio_segments_all(struct bio *bio)
+{
+	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+
+	return bio->bi_vcnt;
+}
+
 enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
 	BIP_MAPPED_INTEGRITY	= 1 << 1, /* ref tag has been remapped */
-- 
2.7.4

^ permalink raw reply related

* [PATCH 02/17] block: introduce bio_remove_last_page()
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

MD need this helper to remove the last added page, so introduce
it.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 block/bio.c         | 23 +++++++++++++++++++++++
 include/linux/bio.h |  1 +
 2 files changed, 24 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index 5eec5e08417f..0ce7ffcd7939 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -837,6 +837,29 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
 EXPORT_SYMBOL(bio_add_pc_page);
 
 /**
+ *	bio_remove_last_page	-	remove the last added page
+ *	@bio: destination bio
+ *
+ *	Attempt to remove the last added page from the bio_vec maplist.
+ */
+void bio_remove_last_page(struct bio *bio)
+{
+	/*
+	 * cloned bio must not modify vec list
+	 */
+	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+		return;
+
+	if (bio->bi_vcnt > 0) {
+		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		bio->bi_iter.bi_size -= bv->bv_len;
+		bio->bi_vcnt--;
+	}
+}
+EXPORT_SYMBOL(bio_remove_last_page);
+
+/**
  *	bio_add_page	-	attempt to add page to bio
  *	@bio: destination bio
  *	@page: page to add
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 3364b3ed90e7..32aeb493d1fe 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -443,6 +443,7 @@ extern void bio_init(struct bio *bio, struct bio_vec *table,
 extern void bio_reset(struct bio *);
 void bio_chain(struct bio *, struct bio *);
 
+extern void bio_remove_last_page(struct bio *bio);
 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
-- 
2.7.4

^ permalink raw reply related

* [PATCH 03/17] md: raid1/raid10: use bio_remove_last_page()
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c  | 3 +--
 drivers/md/raid10.c | 6 ++----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 85f309836fd7..6e4e0b868ff2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2824,8 +2824,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 						if (bio->bi_end_io==NULL)
 							continue;
 						/* remove last page from this bio */
-						bio->bi_vcnt--;
-						bio->bi_iter.bi_size -= len;
+						bio_remove_last_page(bio);
 						bio_clear_flag(bio, BIO_SEG_VALID);
 					}
 					goto bio_full;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 063c43d83b72..aa37d4c7900a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3447,8 +3447,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			     bio2 && bio2 != bio;
 			     bio2 = bio2->bi_next) {
 				/* remove last page from this bio */
-				bio2->bi_vcnt--;
-				bio2->bi_iter.bi_size -= len;
+				bio_remove_last_page(bio2);
 				bio_clear_flag(bio2, BIO_SEG_VALID);
 			}
 			goto bio_full;
@@ -4538,8 +4537,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 			     bio2 && bio2 != bio;
 			     bio2 = bio2->bi_next) {
 				/* Remove last page from this bio */
-				bio2->bi_vcnt--;
-				bio2->bi_iter.bi_size -= len;
+				bio_remove_last_page(bio2);
 				bio_clear_flag(bio2, BIO_SEG_VALID);
 			}
 			goto bio_full;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 04/17] md: introduce helpers for dealing with fetch/store preallocated pages in bio
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

Both raid1 and raid10 uses bio's bvec table to store pre-allocated
pages, then fetch and add it to bio.

This patch introduces two helpers for dealing with the special case,
like what bio_iov_iter_get_pages() does.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index a86ad62079de..21897cb514af 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -708,4 +708,25 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
 {
 	mddev->flags &= ~unsupported_flags;
 }
+
+/*
+ * Both raid1 and raid10 use bio's bvec table to store the preallocated
+ * pages, so we introduces the following helpers for this kind of usage.
+ *
+ * Actually the usage is a bit similar with bio_iov_iter_get_pages().
+ *
+ * Please make sure .bi_io_vec[idx] points to one unused vector by the
+ * bio.
+ */
+static inline struct page *mdev_get_page_from_bio(struct bio *bio, unsigned idx)
+{
+	return bio->bi_io_vec[idx].bv_page;
+}
+
+static inline void mdev_put_page_to_bio(struct bio *bio, unsigned idx,
+					struct page *page)
+{
+	bio->bi_io_vec[idx].bv_page = page;
+}
+
 #endif /* _MD_MD_H */
-- 
2.7.4

^ permalink raw reply related

* [PATCH 05/17] md: raid1/raid10: use the introduced helpers
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

This patch uses the introduced helpers to fetch pre-allocated
page from bio bvec table, and store it back.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c  | 4 ++--
 drivers/md/raid10.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6e4e0b868ff2..c4791fbd69ac 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2814,10 +2814,10 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		for (i = 0 ; i < conf->raid_disks * 2; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io) {
-				page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+				page = mdev_get_page_from_bio(bio, bio->bi_vcnt);
 				if (bio_add_page(bio, page, len, 0) == 0) {
 					/* stop here */
-					bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+					mdev_put_page_to_bio(bio, bio->bi_vcnt, page);
 					while (i > 0) {
 						i--;
 						bio = r1_bio->bios[i];
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index aa37d4c7900a..b7dfbca869a3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3437,12 +3437,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			break;
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
 			struct bio *bio2;
-			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+			page = mdev_get_page_from_bio(bio, bio->bi_vcnt);
 			if (bio_add_page(bio, page, len, 0))
 				continue;
 
 			/* stop here */
-			bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+			mdev_put_page_to_bio(bio, bio->bi_vcnt, page);
 			for (bio2 = biolist;
 			     bio2 && bio2 != bio;
 			     bio2 = bio2->bi_next) {
-- 
2.7.4

^ permalink raw reply related

* [PATCH 06/17] md: raid1/raid10: borrow .bi_error as pre-allocated page index
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

Before bio is submitted, it is safe to borrow .bi_error. This
patch uses .bi_error as index of pre-allocated page in bio, so
that we can avoid to mess .bi_vcnt. Especially the old way
will not work any more when multipage bvec is introduced.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c  | 12 ++++++++++--
 drivers/md/raid10.c | 14 ++++++++++----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c4791fbd69ac..8904a9149671 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2811,13 +2811,14 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 				len = sync_blocks<<9;
 		}
 
+		/* borrow .bi_error as pre-allocated page index */
 		for (i = 0 ; i < conf->raid_disks * 2; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io) {
-				page = mdev_get_page_from_bio(bio, bio->bi_vcnt);
+				page = mdev_get_page_from_bio(bio, bio->bi_error++);
 				if (bio_add_page(bio, page, len, 0) == 0) {
 					/* stop here */
-					mdev_put_page_to_bio(bio, bio->bi_vcnt, page);
+					mdev_put_page_to_bio(bio, --bio->bi_error, page);
 					while (i > 0) {
 						i--;
 						bio = r1_bio->bios[i];
@@ -2836,6 +2837,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		sync_blocks -= (len>>9);
 	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
  bio_full:
+	/* return .bi_error back to bio */
+	for (i = 0 ; i < conf->raid_disks * 2; i++) {
+		bio = r1_bio->bios[i];
+		if (bio->bi_end_io)
+			bio->bi_error = 0;
+	}
+
 	r1_bio->sectors = nr_sectors;
 
 	if (mddev_is_clustered(mddev) &&
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b7dfbca869a3..9cfc22cd1330 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3348,7 +3348,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 			bio = r10_bio->devs[i].bio;
 			bio_reset(bio);
-			bio->bi_error = -EIO;
 			rcu_read_lock();
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
@@ -3392,7 +3391,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			/* Need to set up for writing to the replacement */
 			bio = r10_bio->devs[i].repl_bio;
 			bio_reset(bio);
-			bio->bi_error = -EIO;
 
 			sector = r10_bio->devs[i].addr;
 			bio->bi_next = biolist;
@@ -3435,14 +3433,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			len = (max_sector - sector_nr) << 9;
 		if (len == 0)
 			break;
+		/* borrow .bi_error as pre-allocated page index */
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
 			struct bio *bio2;
-			page = mdev_get_page_from_bio(bio, bio->bi_vcnt);
+			page = mdev_get_page_from_bio(bio, bio->bi_error++);
 			if (bio_add_page(bio, page, len, 0))
 				continue;
 
 			/* stop here */
-			mdev_put_page_to_bio(bio, bio->bi_vcnt, page);
+			mdev_put_page_to_bio(bio, --bio->bi_error, page);
 			for (bio2 = biolist;
 			     bio2 && bio2 != bio;
 			     bio2 = bio2->bi_next) {
@@ -3456,6 +3455,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		sector_nr += len>>9;
 	} while (biolist->bi_vcnt < RESYNC_PAGES);
  bio_full:
+	/* return .bi_error back to bio, and set resync's as -EIO */
+	for (bio= biolist ; bio ; bio=bio->bi_next)
+		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+			bio->bi_error = -EIO;
+		else
+			bio->bi_error = 0;
+
 	r10_bio->sectors = nr_sectors;
 
 	while (biolist) {
-- 
2.7.4


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox