Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* Re: [md PATCH 01/14] md/raid5: use md_write_start to count stripes, not bios
From: Shaohua Li @ 2017-02-16 17:29 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, hch
In-Reply-To: <148721994120.7521.3518782809103694227.stgit@noble>

On Thu, Feb 16, 2017 at 03:39:01PM +1100, Neil Brown wrote:
> We use md_write_start() to increase the count of pending writes, and
> md_write_end() to decrement the count.  We currently count bios
> submitted to md/raid5.  Change it count stripe_heads that a WRITE bio
> has been attached to.
> 
> So now, raid5_make_request() calls md_write_start() and then
> md_write_end() to keep the count elevated during the setup of the
> request.
> 
> add_stripe_bio() calls md_write_start() for each stripe_head, and the
> completion routines always call md_write_end(), instead of only
> calling it when raid5_dec_bi_active_stripes() returns 0.
> make_discard_request also calls md_write_start/end().
> 
> The parallel between md_write_{start,end} and use of bi_phys_segments
> can be seen in that:
>  Whenever we set bi_phys_segments to 1, we now call md_write_start.
>  Whenever we increment it on non-read requests with
>    raid5_inc_bi_active_stripes(), we now call md_write_start().
>  Whenever we decrement bi_phys_segments on non-read requsts with
>     raid5_dec_bi_active_stripes(), we now call md_write_end().
> 
> This reduces our dependence on keeping a per-bio count of active
> stripes in bi_phys_segments.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  drivers/md/raid5-cache.c |    2 +-
>  drivers/md/raid5.c       |   27 +++++++++++++--------------
>  2 files changed, 14 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 302dea3296ba..4b211f914d17 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -265,8 +265,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
>  	while (wbi && wbi->bi_iter.bi_sector <
>  	       dev->sector + STRIPE_SECTORS) {
>  		wbi2 = r5_next_bio(wbi, dev->sector);
> +		md_write_end(conf->mddev);
>  		if (!raid5_dec_bi_active_stripes(wbi)) {
> -			md_write_end(conf->mddev);
>  			bio_list_add(return_bi, wbi);
>  		}
>  		wbi = wbi2;
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 3c7e106c12a2..760b726943c9 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -3075,6 +3075,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
>  		bi->bi_next = *bip;
>  	*bip = bi;
>  	raid5_inc_bi_active_stripes(bi);
> +	md_write_start(conf->mddev, bi);
>  
>  	if (forwrite) {
>  		/* check if page is covered */
> @@ -3198,10 +3199,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>  			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
>  
>  			bi->bi_error = -EIO;
> -			if (!raid5_dec_bi_active_stripes(bi)) {
> -				md_write_end(conf->mddev);
> +			md_write_end(conf->mddev);
> +			if (!raid5_dec_bi_active_stripes(bi))
>  				bio_list_add(return_bi, bi);
> -			}
>  			bi = nextbi;
>  		}
>  		if (bitmap_end)
> @@ -3222,10 +3222,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>  			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
>  
>  			bi->bi_error = -EIO;
> -			if (!raid5_dec_bi_active_stripes(bi)) {
> -				md_write_end(conf->mddev);
> +			md_write_end(conf->mddev);
> +			if (!raid5_dec_bi_active_stripes(bi))
>  				bio_list_add(return_bi, bi);
> -			}
>  			bi = bi2;
>  		}
>  
> @@ -3582,10 +3581,9 @@ static void handle_stripe_clean_event(struct r5conf *conf,
>  				while (wbi && wbi->bi_iter.bi_sector <
>  					dev->sector + STRIPE_SECTORS) {
>  					wbi2 = r5_next_bio(wbi, dev->sector);
> -					if (!raid5_dec_bi_active_stripes(wbi)) {
> -						md_write_end(conf->mddev);
> +					md_write_end(conf->mddev);
> +					if (!raid5_dec_bi_active_stripes(wbi))
>  						bio_list_add(return_bi, wbi);
> -					}
>  					wbi = wbi2;
>  				}
>  				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> @@ -5268,6 +5266,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
>  
>  	bi->bi_next = NULL;
>  	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> +	md_write_start(mddev, bi);

md_write_start calls wait_event, so it can't be called with lock hold. Maybe
add a special __md_write_start which only increases the count, we already wait
in make_request.

>  	stripe_sectors = conf->chunk_sectors *
>  		(conf->raid_disks - conf->max_degraded);
> @@ -5314,6 +5313,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
>  			sh->dev[d].towrite = bi;
>  			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
>  			raid5_inc_bi_active_stripes(bi);
> +			md_write_start(mddev, bi);

Ditto.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: Coly Li @ 2017-02-16 17:05 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-raid, Shaohua Li, Neil Brown, Johannes Thumshirn,
	Guoqing Jiang
In-Reply-To: <20170216022222.73xmrd7lujkpns2x@kernel.org>

On 2017/2/16 上午10:22, Shaohua Li wrote:
> On Thu, Feb 16, 2017 at 12:35:22AM +0800, colyli@suse.de wrote:
>> 'Commit 79ef3a8aa1cb ("raid1: Rewrite the implementation of iobarrier.")'
>> introduces a sliding resync window for raid1 I/O barrier, this idea limits
>> I/O barriers to happen only inside a slidingresync window, for regular
>> I/Os out of this resync window they don't need to wait for barrier any
>> more. On large raid1 device, it helps a lot to improve parallel writing
>> I/O throughput when there are background resync I/Os performing at
>> same time.
>>
>> The idea of sliding resync widow is awesome, but code complexity is a
>> challenge. Sliding resync window requires several veriables to work
>> collectively, this is complexed and very hard to make it work correctly.
>> Just grep "Fixes: 79ef3a8aa1" in kernel git log, there are 8 more patches
>> to fix the original resync window patch. This is not the end, any further
>> related modification may easily introduce more regreassion.
>>
>> Therefore I decide to implement a much simpler raid1 I/O barrier, by
>> removing resync window code, I believe life will be much easier.
>>
>> The brief idea of the simpler barrier is,
>>  - Do not maintain a logbal unique resync window
>>  - Use multiple hash buckets to reduce I/O barrier conflictions, regular
>>    I/O only has to wait for a resync I/O when both them have same barrier
>>    bucket index, vice versa.
>>  - I/O barrier can be recuded to an acceptable number if there are enought
>>    barrier buckets
>>
>> Here I explain how the barrier buckets are designed,
>>  - BARRIER_UNIT_SECTOR_SIZE
>>    The whole LBA address space of a raid1 device is divided into multiple
>>    barrier units, by the size of BARRIER_UNIT_SECTOR_SIZE.
>>    Bio request won't go across border of barrier unit size, that means
>>    maximum bio size is BARRIER_UNIT_SECTOR_SIZE<<9 (64MB) in bytes.
>>    For random I/O 64MB is large enough for both read and write requests,
>>    for sequential I/O considering underlying block layer may merge them
>>    into larger requests, 64MB is still good enough.
>>    Neil also points out that for resync operation, "we want the resync to
>>    move from region to region fairly quickly so that the slowness caused
>>    by having to synchronize with the resync is averaged out over a fairly
>>    small time frame". For full speed resync, 64MB should take less then 1
>>    second. When resync is competing with other I/O, it could take up a few
>>    minutes. Therefore 64MB size is fairly good range for resync.
>>
>>  - BARRIER_BUCKETS_NR
>>    There are BARRIER_BUCKETS_NR buckets in total, which is defined by,
>>         #define BARRIER_BUCKETS_NR_BITS   (PAGE_SHIFT - 2)
>>         #define BARRIER_BUCKETS_NR        (1<<BARRIER_BUCKETS_NR_BITS)
>>    this patch makes the bellowed members of struct r1conf from integer
>>    to array of integers,
>>         -       int                     nr_pending;
>>         -       int                     nr_waiting;
>>         -       int                     nr_queued;
>>         -       int                     barrier;
>>         +       int                     *nr_pending;
>>         +       int                     *nr_waiting;
>>         +       int                     *nr_queued;
>>         +       int                     *barrier;
>>    number of the array elements is defined as BARRIER_BUCKETS_NR. For 4KB
>>    kernel space page size, (PAGE_SHIFT - 2) indecates there are 1024 I/O
>>    barrier buckets, and each array of integers occupies single memory page.
>>    1024 means for a request which is smaller than the I/O barrier unit size
>>    has ~0.1% chance to wait for resync to pause, which is quite a small
>>    enough fraction. Also requesting single memory page is more friendly to
>>    kernel page allocator than larger memory size.
>>
>>  - I/O barrier bucket is indexed by bio start sector
>>    If multiple I/O requests hit different I/O barrier units, they only need
>>    to compete I/O barrier with other I/Os which hit the same I/O barrier
>>    bucket index with each other. The index of a barrier bucket which a
>>    bio should look for is calculated by sector_to_idx() which is defined
>>    in raid1.h as an inline function,
>>         static inline int sector_to_idx(sector_t sector)
>>         {
>>                 return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
>>                                 BARRIER_BUCKETS_NR_BITS);
>>         }
>>    Here sector_nr is the start sector number of a bio.
>>
>>  - Single bio won't go across boundary of a I/O barrier unit
>>    If a request goes across boundary of barrier unit, it will be split. A
>>    bio may be split in raid1_make_request() or raid1_sync_request(), if
>>    sectors returned by align_to_barrier_unit_end() is small than original
>>    bio size.
>>
>> Comparing to single sliding resync window,
>>  - Currently resync I/O grows linearly, therefore regular and resync I/O
>>    will have confliction within a single barrier units. So the I/O
>>    behavior is similar to single sliding resync window.
>>  - But a barrier unit bucket is shared by all barrier units with identical
>>    barrier uinit index, the probability of confliction might be higher
>>    than single sliding resync window, in condition that writing I/Os
>>    always hit barrier units which have identical barrier bucket indexs with
>>    the resync I/Os. This is a very rare condition in real I/O work loads,
>>    I cannot imagine how it could happen in practice.
>>  - Therefore we can achieve a good enough low confliction rate with much
>>    simpler barrier algorithm and implementation.
>>
>> There are two changes should be noticed,
>>  - In raid1d(), I change the code to decrease conf->nr_pending[idx] into
>>    single loop, it looks like this,
>>         spin_lock_irqsave(&conf->device_lock, flags);
>>         conf->nr_queued[idx]--;
>>         spin_unlock_irqrestore(&conf->device_lock, flags);
>>    This change generates more spin lock operations, but in next patch of
>>    this patch set, it will be replaced by a single line code,
>>         atomic_dec(&conf->nr_queueud[idx]);
>>    So we don't need to worry about spin lock cost here.
>>  - Mainline raid1 code split original raid1_make_request() into
>>    raid1_read_request() and raid1_write_request(). If the original bio
>>    goes across an I/O barrier unit size, this bio will be split before
>>    calling raid1_read_request() or raid1_write_request(),  this change
>>    the code logic more simple and clear.
>>  - In this patch wait_barrier() is moved from raid1_make_request() to
>>    raid1_write_request(). In raid_read_request(), original wait_barrier()
>>    is replaced by raid1_read_request().
>>    The differnece is wait_read_barrier() only waits if array is frozen,
>>    using different barrier function in different code path makes the code
>>    more clean and easy to read.
>> Changelog
>> V3:
>> - Rebase the patch against latest upstream kernel code.
>> - Many fixes by review comments from Neil,
>>   - Back to use pointers to replace arraries in struct r1conf
>>   - Remove total_barriers from struct r1conf
>>   - Add more patch comments to explain how/why the values of
>>     BARRIER_UNIT_SECTOR_SIZE and BARRIER_BUCKETS_NR are decided.
>>   - Use get_unqueued_pending() to replace get_all_pendings() and
>>     get_all_queued()
>>   - Increase bucket number from 512 to 1024
>> - Change code comments format by review from Shaohua.
>> V2:
>> - Use bio_split() to split the orignal bio if it goes across barrier unit
>>   bounday, to make the code more simple, by suggestion from Shaohua and
>>   Neil.
>> - Use hash_long() to replace original linear hash, to avoid a possible
>>   confilict between resync I/O and sequential write I/O, by suggestion from
>>   Shaohua.
>> - Add conf->total_barriers to record barrier depth, which is used to
>>   control number of parallel sync I/O barriers, by suggestion from Shaohua.
>> - In V1 patch the bellowed barrier buckets related members in r1conf are
>>   allocated in memory page. To make the code more simple, V2 patch moves
>>   the memory space into struct r1conf, like this,
>>         -       int                     nr_pending;
>>         -       int                     nr_waiting;
>>         -       int                     nr_queued;
>>         -       int                     barrier;
>>         +       int                     nr_pending[BARRIER_BUCKETS_NR];
>>         +       int                     nr_waiting[BARRIER_BUCKETS_NR];
>>         +       int                     nr_queued[BARRIER_BUCKETS_NR];
>>         +       int                     barrier[BARRIER_BUCKETS_NR];
>>   This change is by the suggestion from Shaohua.
>> - Remove some inrelavent code comments, by suggestion from Guoqing.
>> - Add a missing wait_barrier() before jumping to retry_write, in
>>   raid1_make_write_request().
>> V1:
>> - Original RFC patch for comments
> 
> Looks good, two minor issues.
> 
>>  
>> -static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>> -				 struct r1bio *r1_bio)
>> +static void raid1_read_request(struct mddev *mddev, struct bio *bio)
>>  {
>>  	struct r1conf *conf = mddev->private;
>>  	struct raid1_info *mirror;
>> +	struct r1bio *r1_bio;
>>  	struct bio *read_bio;
>>  	struct bitmap *bitmap = mddev->bitmap;
>>  	const int op = bio_op(bio);
>> @@ -1083,7 +1101,34 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>>  	int max_sectors;
>>  	int rdisk;
>>  
>> -	wait_barrier(conf, bio);
>> +	/*
>> +	 * Still need barrier for READ in case that whole
>> +	 * array is frozen.
>> +	 */
>> +	wait_read_barrier(conf, bio->bi_iter.bi_sector);
>> +	bitmap = mddev->bitmap;
>> +
>> +	/*
>> +	 * make_request() can abort the operation when read-ahead is being
>> +	 * used and no empty request is available.
>> +	 *
>> +	 */
>> +	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
>> +	r1_bio->master_bio = bio;
>> +	r1_bio->sectors = bio_sectors(bio);
>> +	r1_bio->state = 0;
>> +	r1_bio->mddev = mddev;
>> +	r1_bio->sector = bio->bi_iter.bi_sector;
> 
> This part looks unnecessary complicated. If you change raid1_make_request to
> something like __raid1_make_reques, add a new raid1_make_request and do bio
> split there, then call __raid1_make_request for each splitted bio, then you
> don't need to duplicate the r1_bio allocation parts for read/write.
> 

Aha, good point! I will do that.

>> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
>> index c52ef42..d3faf30 100644
>> --- a/drivers/md/raid1.h
>> +++ b/drivers/md/raid1.h
>> @@ -1,6 +1,14 @@
>>  #ifndef _RAID1_H
>>  #define _RAID1_H
>>  
>> +/* each barrier unit size is 64MB fow now
>> + * note: it must be larger than RESYNC_DEPTH
>> + */
>> +#define BARRIER_UNIT_SECTOR_BITS	17
>> +#define BARRIER_UNIT_SECTOR_SIZE	(1<<17)
>> +#define BARRIER_BUCKETS_NR_BITS		(PAGE_SHIFT - 2)
> 
> maybe write this as (PAGE_SHIFT - ilog2(sizeof(int)))? To be honest, I don't
> think it really matters if the array is PAGE_SIZE length, maybe just specify a
> const here.

Nice catch! It makes sense, I will do that, and add some comments to
explain why it is sizeof(int).

Thanks.

Coly

^ permalink raw reply

* Re: [md PATCH 09/14] md/raid10: stop using bi_phys_segments
From: Jack Wang @ 2017-02-16 14:26 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, linux-raid, Christoph Hellwig
In-Reply-To: <148721994257.7521.2502119977276847128.stgit@noble>

2017-02-16 5:39 GMT+01:00 NeilBrown <neilb@suse.com>:
> raid10 currently repurposes bi_phys_segments on each
> incoming bio to count how many r10bio was used to encode the
> request.
>
> We need to know when the number of attached r10bio reaches
> zero to:
> 1/ call bio_endio() when all IO on the bio is finished
> 2/ decrement ->nr_pending so that resync IO can proceed.
>
> Now that the bio has its own __bi_remaining counter, that
> can be used instead. We can call bio_inc_remaining to
> increment the counter and call bio_endio() every time an
> r10bio completes, rather than only when bi_phys_segments
> reaches zero.
>
> This addresses point 1, but not point 2.  bio_endio()
> doesn't (and cannot) report when the last r10bio has
> finished, so a different approach is needed.
>
> So: instead of counting bios in ->nr_pending, count r10bios.
> i.e. every time we attach a bio, increment nr_pending.
> Every time an r10bio completes, decrement nr_pending.
>
> Normally we only increment nr_pending after first checking
> that ->barrier is zero, or some other non-trivial tests and
> possible waiting.  When attaching multiple r10bios to a bio,
> we only need the tests and the waiting once.  After the
> first increment, subsequent increments can happen
> unconditionally as they are really all part of the one
> request.
>
> So introduce inc_pending() which can be used when we know
> that nr_pending is already elevated.
>
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  drivers/md/raid10.c |   76 +++++++++++++++++----------------------------------
>  1 file changed, 25 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 9258cbe233bb..6b4d8643c574 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -301,27 +301,18 @@ static void reschedule_retry(struct r10bio *r10_bio)
>  static void raid_end_bio_io(struct r10bio *r10_bio)
>  {
>         struct bio *bio = r10_bio->master_bio;
> -       int done;
>         struct r10conf *conf = r10_bio->mddev->private;
>
> -       if (bio->bi_phys_segments) {
> -               unsigned long flags;
> -               spin_lock_irqsave(&conf->device_lock, flags);
> -               bio->bi_phys_segments--;
> -               done = (bio->bi_phys_segments == 0);
> -               spin_unlock_irqrestore(&conf->device_lock, flags);
> -       } else
> -               done = 1;
>         if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
>                 bio->bi_error = -EIO;
> -       if (done) {
> -               bio_endio(bio);
> -               /*
> -                * Wake up any possible resync thread that waits for the device
> -                * to go idle.
> -                */
> -               allow_barrier(conf);
> -       }
> +
> +       /*
> +        * Wake up any possible resync thread that waits for the device
> +        * to go idle.
> +        */
> +       allow_barrier(conf);
> +       bio_endio(bio);
> +

Hi Neil,

Why do you switch the order of above 2 lines, is there a reason
behind, I notice in raid1 you kept the order?

Regards,
Jack

>         free_r10bio(r10_bio);
>  }
>
> @@ -984,6 +975,15 @@ static void wait_barrier(struct r10conf *conf)
>         spin_unlock_irq(&conf->resync_lock);
>  }
>
> +static void inc_pending(struct r10conf *conf)
> +{
> +       /* The current request requires multiple r10_bio, so
> +        * we need to increment the pending count.
> +        */
> +       WARN_ON(!atomic_read(&conf->nr_pending));
> +       atomic_inc(&conf->nr_pending);
> +}
> +
>  static void allow_barrier(struct r10conf *conf)
>  {
>         if ((atomic_dec_and_test(&conf->nr_pending)) ||
> @@ -1161,12 +1161,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
>                 sectors_handled = (r10_bio->sector + max_sectors
>                                    - bio->bi_iter.bi_sector);
>                 r10_bio->sectors = max_sectors;
> -               spin_lock_irq(&conf->device_lock);
> -               if (bio->bi_phys_segments == 0)
> -                       bio->bi_phys_segments = 2;
> -               else
> -                       bio->bi_phys_segments++;
> -               spin_unlock_irq(&conf->device_lock);
> +               inc_pending(conf);
> +               bio_inc_remaining(bio);
>                 /*
>                  * Cannot call generic_make_request directly as that will be
>                  * queued in __generic_make_request and subsequent
> @@ -1261,9 +1257,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>          * on which we have seen a write error, we want to avoid
>          * writing to those blocks.  This potentially requires several
>          * writes to write around the bad blocks.  Each set of writes
> -        * gets its own r10_bio with a set of bios attached.  The number
> -        * of r10_bios is recored in bio->bi_phys_segments just as with
> -        * the read case.
> +        * gets its own r10_bio with a set of bios attached.
>          */
>
>         r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
> @@ -1481,15 +1475,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>          */
>
>         if (sectors_handled < bio_sectors(bio)) {
> -               /* We need another r10_bio and it needs to be counted
> -                * in bio->bi_phys_segments.
> -                */
> -               spin_lock_irq(&conf->device_lock);
> -               if (bio->bi_phys_segments == 0)
> -                       bio->bi_phys_segments = 2;
> -               else
> -                       bio->bi_phys_segments++;
> -               spin_unlock_irq(&conf->device_lock);
> +               /* We need another r10_bio and it needs to be counted */
> +               inc_pending(conf);
> +               bio_inc_remaining(bio);
>                 one_write_done(r10_bio);
>                 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
>
> @@ -1518,16 +1506,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
>         r10_bio->sector = bio->bi_iter.bi_sector;
>         r10_bio->state = 0;
>
> -       /*
> -        * We might need to issue multiple reads to different devices if there
> -        * are bad blocks around, so we keep track of the number of reads in
> -        * bio->bi_phys_segments.  If this is 0, there is only one r10_bio and
> -        * no locking will be needed when the request completes.  If it is
> -        * non-zero, then it is the number of not-completed requests.
> -        */
> -       bio->bi_phys_segments = 0;
> -       bio_clear_flag(bio, BIO_SEG_VALID);
> -
>         if (bio_data_dir(bio) == READ)
>                 raid10_read_request(mddev, bio, r10_bio);
>         else
> @@ -2662,12 +2640,8 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
>                         r10_bio->sector + max_sectors
>                         - mbio->bi_iter.bi_sector;
>                 r10_bio->sectors = max_sectors;
> -               spin_lock_irq(&conf->device_lock);
> -               if (mbio->bi_phys_segments == 0)
> -                       mbio->bi_phys_segments = 2;
> -               else
> -                       mbio->bi_phys_segments++;
> -               spin_unlock_irq(&conf->device_lock);
> +               bio_inc_remaining(mbio);
> +               inc_pending(conf);
>                 generic_make_request(bio);
>
>                 r10_bio = mempool_alloc(conf->r10bio_pool,
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 02/17] block: introduce bio_remove_last_page()
From: Johannes Thumshirn @ 2017-02-16 14:10 UTC (permalink / raw)
  To: Ming Lei
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <CACVXFVPLXfWKcQGcmX8H6Um+4P0XFpKC0DWC4-JXy94st=QL7Q@mail.gmail.com>

On 02/16/2017 02:59 PM, Ming Lei wrote:
> On Thu, Feb 16, 2017 at 9:40 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
>> On 02/16/2017 02:30 PM, Ming Lei wrote:
>>> On Thu, Feb 16, 2017 at 8:08 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
>>>> On 02/16/2017 12:45 PM, Ming Lei wrote:
>>>>> MD need this helper to remove the last added page, so introduce
>>>>> it.
>>>>>
>>>>> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
>>>>> ---
>>>>>  block/bio.c         | 23 +++++++++++++++++++++++
>>>>>  include/linux/bio.h |  1 +
>>>>>  2 files changed, 24 insertions(+)
>>>>>
>>>>> diff --git a/block/bio.c b/block/bio.c
>>>>> index 5eec5e08417f..0ce7ffcd7939 100644
>>>>> --- a/block/bio.c
>>>>> +++ b/block/bio.c
>>>>> @@ -837,6 +837,29 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
>>>>>  EXPORT_SYMBOL(bio_add_pc_page);
>>>>>
>>>>>  /**
>>>>> + *   bio_remove_last_page    -       remove the last added page
>>>>> + *   @bio: destination bio
>>>>> + *
>>>>> + *   Attempt to remove the last added page from the bio_vec maplist.
>>>>> + */
>>>>> +void bio_remove_last_page(struct bio *bio)
>>>>> +{
>>>>> +     /*
>>>>> +      * cloned bio must not modify vec list
>>>>> +      */
>>>>> +     if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
>>>>> +             return;
>>>>> +
>>>>> +     if (bio->bi_vcnt > 0) {
>>>>
>>>> In patch 1 you introduce bio_segments_all() with the log message 'So
>>>> that we can replace the direct access to .bi_vcnt.' Here you introduce a
>>>> new direct access to it (plus the duplicated WARN_ON_ONCE()).
>>>>
>>>> Maybe use the helper directly here (I admit I haven't gone through the
>>>> whole series yet, so I can't see if the change is made later).
>>>
>>> Firstly MD does need one helper to remove the last added page, as you
>>> can see there are three such uses in patch3.
>>>
>>> Secondly both the two helpers will be changed once multipage bvec
>>> is supported, that means we have to change MD too after multipage bvec
>>> if just using bio_segments_all() to replace .bi_vcnt for removing
>>> the last added page.
>>
>> I'm not sure if we're talking past each other here, I assumed you'd do
>> something like:
>>
>> void bio_remove_last_page(struct bio *bio)
>> {
>>         int vcnt = bio_segments_all(bio);
>>
>>         if (bio_flagged(bio, BIO_CLONED))
>>                 return;
>>
>>         if (vcnt > 0) {
>>                 struct bio_vec *bv = &bio->bi_io_vec[vcnt - 1];
>>
>>                 bio->bi_iter.bi_size -= bv->bv_len;
>>                 bio->bi_vcnt--;
>>         }
>> }
> 
> What we are doing is to remove the external direct access to bvec
> table in drivers or filesystems because they may misuse that, for
> example, drivers often use .bi_vcnt to get the page count in the bio,
> but the actual meaning is just bvec's count.
> 
> And the implementation in block layer still need to play the table directly,
> especially for sake of efficiency, cause we understand the details and won't
> misuse that.

Ah OK, so bio_segments_all() is intended for drivers.

Thanks for the clarification,
	Johannes


-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply

* RAID10 and 'writemostly' support
From: Reindl Harald @ 2017-02-16 14:08 UTC (permalink / raw)
  To: linux-raid

Hi

i am new and was redirected to this list from the bugtracker
please have a look at https://bugzilla.kernel.org/show_bug.cgi?id=194551

currently "writemostly" seems to be only supported on "real" RAID1 while 
i was in hope that by the conecpt of RAID10 having more or less 
RAID0+RAID1 it would also work on RAID10 (and on the virtual machine wre 
i tested mdadm with the flag before buying the disks it did not complain)

RAID10 with "writemostly" makes a lot of sense for large storages to get 
them fast *and* reliable without make it extremly expensive

* you don't want RAID5/RAID6 rebuild over many TB
* very large SSD for RAID1 are much more expensive than smaller ones

so with 4x2 TB disks you get 4 TB useable storage and with "writemostly" 
which would be in the best case "writeonly" you have a lightening fast 
RAID0 on SSD with good redundancy

most workloads are read-intense with less writes (rsync with --checksum 
enabled, booting, starting large applications...)

another benefit: different technologies - it's very unlikely that both 
disks of a stripe fail at the same time or due rebuild when one half is 
a SSD and the other a HDD

^ permalink raw reply

* Re: [PATCH 02/17] block: introduce bio_remove_last_page()
From: Ming Lei @ 2017-02-16 13:59 UTC (permalink / raw)
  To: Johannes Thumshirn
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <d6d67f9b-e011-cc57-e484-e21b66ab9d6b@suse.de>

On Thu, Feb 16, 2017 at 9:40 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
> On 02/16/2017 02:30 PM, Ming Lei wrote:
>> On Thu, Feb 16, 2017 at 8:08 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
>>> On 02/16/2017 12:45 PM, Ming Lei wrote:
>>>> MD need this helper to remove the last added page, so introduce
>>>> it.
>>>>
>>>> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
>>>> ---
>>>>  block/bio.c         | 23 +++++++++++++++++++++++
>>>>  include/linux/bio.h |  1 +
>>>>  2 files changed, 24 insertions(+)
>>>>
>>>> diff --git a/block/bio.c b/block/bio.c
>>>> index 5eec5e08417f..0ce7ffcd7939 100644
>>>> --- a/block/bio.c
>>>> +++ b/block/bio.c
>>>> @@ -837,6 +837,29 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
>>>>  EXPORT_SYMBOL(bio_add_pc_page);
>>>>
>>>>  /**
>>>> + *   bio_remove_last_page    -       remove the last added page
>>>> + *   @bio: destination bio
>>>> + *
>>>> + *   Attempt to remove the last added page from the bio_vec maplist.
>>>> + */
>>>> +void bio_remove_last_page(struct bio *bio)
>>>> +{
>>>> +     /*
>>>> +      * cloned bio must not modify vec list
>>>> +      */
>>>> +     if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
>>>> +             return;
>>>> +
>>>> +     if (bio->bi_vcnt > 0) {
>>>
>>> In patch 1 you introduce bio_segments_all() with the log message 'So
>>> that we can replace the direct access to .bi_vcnt.' Here you introduce a
>>> new direct access to it (plus the duplicated WARN_ON_ONCE()).
>>>
>>> Maybe use the helper directly here (I admit I haven't gone through the
>>> whole series yet, so I can't see if the change is made later).
>>
>> Firstly MD does need one helper to remove the last added page, as you
>> can see there are three such uses in patch3.
>>
>> Secondly both the two helpers will be changed once multipage bvec
>> is supported, that means we have to change MD too after multipage bvec
>> if just using bio_segments_all() to replace .bi_vcnt for removing
>> the last added page.
>
> I'm not sure if we're talking past each other here, I assumed you'd do
> something like:
>
> void bio_remove_last_page(struct bio *bio)
> {
>         int vcnt = bio_segments_all(bio);
>
>         if (bio_flagged(bio, BIO_CLONED))
>                 return;
>
>         if (vcnt > 0) {
>                 struct bio_vec *bv = &bio->bi_io_vec[vcnt - 1];
>
>                 bio->bi_iter.bi_size -= bv->bv_len;
>                 bio->bi_vcnt--;
>         }
> }

What we are doing is to remove the external direct access to bvec
table in drivers or filesystems because they may misuse that, for
example, drivers often use .bi_vcnt to get the page count in the bio,
but the actual meaning is just bvec's count.

And the implementation in block layer still need to play the table directly,
especially for sake of efficiency, cause we understand the details and won't
misuse that.


Thanks,
Ming Lei

^ permalink raw reply

* Re: [PATCH 02/17] block: introduce bio_remove_last_page()
From: Johannes Thumshirn @ 2017-02-16 13:40 UTC (permalink / raw)
  To: Ming Lei
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <CACVXFVPPifd4ktKnCb6v+44dwUOyN6cTNiQp5zT82r9XzP_z1A@mail.gmail.com>

On 02/16/2017 02:30 PM, Ming Lei wrote:
> On Thu, Feb 16, 2017 at 8:08 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
>> On 02/16/2017 12:45 PM, Ming Lei wrote:
>>> MD need this helper to remove the last added page, so introduce
>>> it.
>>>
>>> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
>>> ---
>>>  block/bio.c         | 23 +++++++++++++++++++++++
>>>  include/linux/bio.h |  1 +
>>>  2 files changed, 24 insertions(+)
>>>
>>> diff --git a/block/bio.c b/block/bio.c
>>> index 5eec5e08417f..0ce7ffcd7939 100644
>>> --- a/block/bio.c
>>> +++ b/block/bio.c
>>> @@ -837,6 +837,29 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
>>>  EXPORT_SYMBOL(bio_add_pc_page);
>>>
>>>  /**
>>> + *   bio_remove_last_page    -       remove the last added page
>>> + *   @bio: destination bio
>>> + *
>>> + *   Attempt to remove the last added page from the bio_vec maplist.
>>> + */
>>> +void bio_remove_last_page(struct bio *bio)
>>> +{
>>> +     /*
>>> +      * cloned bio must not modify vec list
>>> +      */
>>> +     if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
>>> +             return;
>>> +
>>> +     if (bio->bi_vcnt > 0) {
>>
>> In patch 1 you introduce bio_segments_all() with the log message 'So
>> that we can replace the direct access to .bi_vcnt.' Here you introduce a
>> new direct access to it (plus the duplicated WARN_ON_ONCE()).
>>
>> Maybe use the helper directly here (I admit I haven't gone through the
>> whole series yet, so I can't see if the change is made later).
> 
> Firstly MD does need one helper to remove the last added page, as you
> can see there are three such uses in patch3.
> 
> Secondly both the two helpers will be changed once multipage bvec
> is supported, that means we have to change MD too after multipage bvec
> if just using bio_segments_all() to replace .bi_vcnt for removing
> the last added page.

I'm not sure if we're talking past each other here, I assumed you'd do
something like:

void bio_remove_last_page(struct bio *bio)
{
	int vcnt = bio_segments_all(bio);

	if (bio_flagged(bio, BIO_CLONED))
		return;

	if (vcnt > 0) {
		struct bio_vec *bv = &bio->bi_io_vec[vcnt - 1];

		bio->bi_iter.bi_size -= bv->bv_len;
		bio->bi_vcnt--;
	}
}

-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply

* Re: [PATCH 13/17] md: raid1: use bio_segments_all()
From: Ming Lei @ 2017-02-16 13:38 UTC (permalink / raw)
  To: Johannes Thumshirn
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <fc4d24ec-55a1-7280-f734-c9b8c4a0ecae@suse.de>

On Thu, Feb 16, 2017 at 9:34 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
> On 02/16/2017 02:32 PM, Ming Lei wrote:
>> On Thu, Feb 16, 2017 at 8:35 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
>>> On 02/16/2017 12:45 PM, Ming Lei wrote:
>>>> @@ -998,7 +998,8 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
>>>>  {
>>>>       int i;
>>>>       struct bio_vec *bvec;
>>>> -     struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
>>>> +     unsigned vcnt = bio_segments_all(bio);
>>>> +     struct bio_vec *bvecs = kzalloc(vcnt * sizeof(struct bio_vec),
>>>>                                       GFP_NOIO);
>>>
>>> Maybe use kcalloc() instead of kzalloc() with a multiplication.
>>
>> That doesn't belong to this patch, which just wants to remove direct
>> access to .bi_vcnt.
>
> But you're touching it anyways, aren't you?

Don't you know the policy of 'do one thing, do it better' in one patch?

If you want to switch to kcalloc(), just post a patch, that is fine, but
please don't push me to do that in this patch.

Thanks,
Ming Lei

^ permalink raw reply

* Re: [PATCH 13/17] md: raid1: use bio_segments_all()
From: Johannes Thumshirn @ 2017-02-16 13:34 UTC (permalink / raw)
  To: Ming Lei
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <CACVXFVOf=4yjNRfemcYEYCc7eeQeL9Esp_WM9ELgX77S1LT+1g@mail.gmail.com>

On 02/16/2017 02:32 PM, Ming Lei wrote:
> On Thu, Feb 16, 2017 at 8:35 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
>> On 02/16/2017 12:45 PM, Ming Lei wrote:
>>> @@ -998,7 +998,8 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
>>>  {
>>>       int i;
>>>       struct bio_vec *bvec;
>>> -     struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
>>> +     unsigned vcnt = bio_segments_all(bio);
>>> +     struct bio_vec *bvecs = kzalloc(vcnt * sizeof(struct bio_vec),
>>>                                       GFP_NOIO);
>>
>> Maybe use kcalloc() instead of kzalloc() with a multiplication.
> 
> That doesn't belong to this patch, which just wants to remove direct
> access to .bi_vcnt.

But you're touching it anyways, aren't you?


-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply

* Re: [PATCH 13/17] md: raid1: use bio_segments_all()
From: Ming Lei @ 2017-02-16 13:32 UTC (permalink / raw)
  To: Johannes Thumshirn
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <4a700de4-8540-d5d8-8c58-3465c0a7a55a@suse.de>

On Thu, Feb 16, 2017 at 8:35 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
> On 02/16/2017 12:45 PM, Ming Lei wrote:
>> @@ -998,7 +998,8 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
>>  {
>>       int i;
>>       struct bio_vec *bvec;
>> -     struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
>> +     unsigned vcnt = bio_segments_all(bio);
>> +     struct bio_vec *bvecs = kzalloc(vcnt * sizeof(struct bio_vec),
>>                                       GFP_NOIO);
>
> Maybe use kcalloc() instead of kzalloc() with a multiplication.

That doesn't belong to this patch, which just wants to remove direct
access to .bi_vcnt.


Thanks,
Ming Lei

^ permalink raw reply

* Re: [PATCH 02/17] block: introduce bio_remove_last_page()
From: Ming Lei @ 2017-02-16 13:30 UTC (permalink / raw)
  To: Johannes Thumshirn
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <b443b3a1-ecc9-8238-1b3d-90870a25c793@suse.de>

On Thu, Feb 16, 2017 at 8:08 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
> On 02/16/2017 12:45 PM, Ming Lei wrote:
>> MD need this helper to remove the last added page, so introduce
>> it.
>>
>> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
>> ---
>>  block/bio.c         | 23 +++++++++++++++++++++++
>>  include/linux/bio.h |  1 +
>>  2 files changed, 24 insertions(+)
>>
>> diff --git a/block/bio.c b/block/bio.c
>> index 5eec5e08417f..0ce7ffcd7939 100644
>> --- a/block/bio.c
>> +++ b/block/bio.c
>> @@ -837,6 +837,29 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
>>  EXPORT_SYMBOL(bio_add_pc_page);
>>
>>  /**
>> + *   bio_remove_last_page    -       remove the last added page
>> + *   @bio: destination bio
>> + *
>> + *   Attempt to remove the last added page from the bio_vec maplist.
>> + */
>> +void bio_remove_last_page(struct bio *bio)
>> +{
>> +     /*
>> +      * cloned bio must not modify vec list
>> +      */
>> +     if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
>> +             return;
>> +
>> +     if (bio->bi_vcnt > 0) {
>
> In patch 1 you introduce bio_segments_all() with the log message 'So
> that we can replace the direct access to .bi_vcnt.' Here you introduce a
> new direct access to it (plus the duplicated WARN_ON_ONCE()).
>
> Maybe use the helper directly here (I admit I haven't gone through the
> whole series yet, so I can't see if the change is made later).

Firstly MD does need one helper to remove the last added page, as you
can see there are three such uses in patch3.

Secondly both the two helpers will be changed once multipage bvec
is supported, that means we have to change MD too after multipage bvec
if just using bio_segments_all() to replace .bi_vcnt for removing
the last added page.



Thanks,
Ming Lei

^ permalink raw reply

* Re: [PATCH 13/17] md: raid1: use bio_segments_all()
From: Johannes Thumshirn @ 2017-02-16 12:35 UTC (permalink / raw)
  To: Ming Lei, Shaohua Li, Jens Axboe, linux-kernel, linux-raid,
	linux-block, Christoph Hellwig, NeilBrown
In-Reply-To: <1487245547-24384-14-git-send-email-tom.leiming@gmail.com>

On 02/16/2017 12:45 PM, Ming Lei wrote:
> @@ -998,7 +998,8 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
>  {
>  	int i;
>  	struct bio_vec *bvec;
> -	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
> +	unsigned vcnt = bio_segments_all(bio);
> +	struct bio_vec *bvecs = kzalloc(vcnt * sizeof(struct bio_vec),
>  					GFP_NOIO);

Maybe use kcalloc() instead of kzalloc() with a multiplication.

Byte,
	Johannes
-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply

* Re: [PATCH 02/17] block: introduce bio_remove_last_page()
From: Johannes Thumshirn @ 2017-02-16 12:08 UTC (permalink / raw)
  To: Ming Lei, Shaohua Li, Jens Axboe, linux-kernel, linux-raid,
	linux-block, Christoph Hellwig, NeilBrown
In-Reply-To: <1487245547-24384-3-git-send-email-tom.leiming@gmail.com>

On 02/16/2017 12:45 PM, Ming Lei wrote:
> MD need this helper to remove the last added page, so introduce
> it.
> 
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  block/bio.c         | 23 +++++++++++++++++++++++
>  include/linux/bio.h |  1 +
>  2 files changed, 24 insertions(+)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 5eec5e08417f..0ce7ffcd7939 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -837,6 +837,29 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
>  EXPORT_SYMBOL(bio_add_pc_page);
>  
>  /**
> + *	bio_remove_last_page	-	remove the last added page
> + *	@bio: destination bio
> + *
> + *	Attempt to remove the last added page from the bio_vec maplist.
> + */
> +void bio_remove_last_page(struct bio *bio)
> +{
> +	/*
> +	 * cloned bio must not modify vec list
> +	 */
> +	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
> +		return;
> +
> +	if (bio->bi_vcnt > 0) {

In patch 1 you introduce bio_segments_all() with the log message 'So
that we can replace the direct access to .bi_vcnt.' Here you introduce a
new direct access to it (plus the duplicated WARN_ON_ONCE()).

Maybe use the helper directly here (I admit I haven't gone through the
whole series yet, so I can't see if the change is made later).

Byte,
	Johannes

-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply

* [PATCH 17/17] md: raid10: avoid direct access to bvec table in handle_reshape_read_error
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

The cost is 128bytes(8*16) stack space in kernel thread context, and
just use the bio helper to retrieve pages from bio.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5016ce8163c3..d9d7c79a3bd4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4689,7 +4689,15 @@ static int handle_reshape_read_error(struct mddev *mddev,
 	struct r10bio *r10b = &on_stack.r10_bio;
 	int slot = 0;
 	int idx = 0;
-	struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
+	struct bio_vec *bvl;
+	struct page *pages[RESYNC_PAGES];
+
+	/*
+	 * This bio is allocated in reshape_request(), and size
+	 * is still RESYNC_PAGES
+	 */
+	bio_for_each_segment_all(bvl, r10_bio->master_bio, idx)
+		pages[idx] = bvl->bv_page;
 
 	r10b->sector = r10_bio->sector;
 	__raid10_find_phys(&conf->prev, r10b);
@@ -4718,7 +4726,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
 			success = sync_page_io(rdev,
 					       addr,
 					       s << 9,
-					       bvec[idx].bv_page,
+					       pages[idx],
 					       REQ_OP_READ, 0, false);
 			rdev_dec_pending(rdev, mddev);
 			rcu_read_lock();
-- 
2.7.4

^ permalink raw reply related

* [PATCH 16/17] md: raid10: avoid direct access to bvec table in reshape_request
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

The cost is 128bytes(8*16) stack space in kernel thread context, and
just use the bio helper to retrieve pages from bio.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c7d2f73565d9..5016ce8163c3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4383,6 +4383,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	struct bio *blist;
 	struct bio *bio, *read_bio;
 	int sectors_done = 0;
+	struct bio_vec *bvl;
+	struct page *pages[RESYNC_PAGES];
 
 	if (sector_nr == 0) {
 		/* If restarting in the middle, skip the initial sectors */
@@ -4546,9 +4548,12 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 
 	/* Now add as many pages as possible to all of these bios. */
 
+	bio_for_each_segment_all(bvl, r10_bio->devs[0].bio, s)
+		pages[s] = bvl->bv_page;
+
 	nr_sectors = 0;
 	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
-		struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
+		struct page *page = pages[s / (PAGE_SIZE >> 9)];
 		int len = (max_sectors - s) << 9;
 		if (len > PAGE_SIZE)
 			len = PAGE_SIZE;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 15/17] md: raid10: avoid direct access to bvec table in fix_recovery_read_error
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

The cost is 128bytes(8*16) stack space in kernel thread context, and just
use the bio helper to retrieve pages from bio.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 69fe2a3cef89..c7d2f73565d9 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2184,6 +2184,11 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 	int idx = 0;
 	int dr = r10_bio->devs[0].devnum;
 	int dw = r10_bio->devs[1].devnum;
+	struct bio_vec *bvl;
+	struct page *pages[RESYNC_PAGES];
+
+	bio_for_each_segment_all(bvl, bio, idx)
+		pages[idx] = bvl->bv_page;
 
 	while (sectors) {
 		int s = sectors;
@@ -2199,7 +2204,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 		ok = sync_page_io(rdev,
 				  addr,
 				  s << 9,
-				  bio->bi_io_vec[idx].bv_page,
+				  pages[idx],
 				  REQ_OP_READ, 0, false);
 		if (ok) {
 			rdev = conf->mirrors[dw].rdev;
@@ -2207,7 +2212,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 			ok = sync_page_io(rdev,
 					  addr,
 					  s << 9,
-					  bio->bi_io_vec[idx].bv_page,
+					  pages[idx],
 					  REQ_OP_WRITE, 0, false);
 			if (!ok) {
 				set_bit(WriteErrorSeen, &rdev->flags);
-- 
2.7.4

^ permalink raw reply related

* [PATCH 14/17] md: raid10: avoid direct access to bvec table in sync_request_write()
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

The cost is 256bytes(8*16*2) stack space, and just use the bio
helper to retrieve pages from bio.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5c698f3d3083..69fe2a3cef89 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2036,6 +2036,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	int i, first;
 	struct bio *tbio, *fbio;
 	int vcnt;
+	struct bio_vec *bvl;
+	struct page *fbio_pages[RESYNC_PAGES], *tbio_pages[RESYNC_PAGES];
 
 	atomic_set(&r10_bio->remaining, 1);
 
@@ -2052,6 +2054,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	fbio->bi_iter.bi_size = r10_bio->sectors << 9;
 	fbio->bi_iter.bi_idx = 0;
 
+	/* the bio has been filled up in raid10_sync_request */
+	bio_for_each_segment_all(bvl, fbio, i)
+		fbio_pages[i] = bvl->bv_page;
+
 	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
 	/* now find blocks with errors */
 	for (i=0 ; i < conf->copies ; i++) {
@@ -2072,12 +2078,17 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 			 * All vec entries are PAGE_SIZE;
 			 */
 			int sectors = r10_bio->sectors;
+
+			/* the bio has been filled up in raid10_sync_request */
+			bio_for_each_segment_all(bvl, tbio, j)
+				tbio_pages[j] = bvl->bv_page;
+
 			for (j = 0; j < vcnt; j++) {
 				int len = PAGE_SIZE;
 				if (sectors < (len / 512))
 					len = sectors * 512;
-				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
-					   page_address(tbio->bi_io_vec[j].bv_page),
+				if (memcmp(page_address(fbio_pages[j]),
+					   page_address(tbio_pages[j]),
 					   len))
 					break;
 				sectors -= len/512;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 13/17] md: raid1: use bio_segments_all()
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

Use this helper, instead of direct access to .bi_vcnt.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 54ec32be3277..02cfece74981 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -998,7 +998,8 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+	unsigned vcnt = bio_segments_all(bio);
+	struct bio_vec *bvecs = kzalloc(vcnt * sizeof(struct bio_vec),
 					GFP_NOIO);
 	if (unlikely(!bvecs))
 		return;
@@ -1014,12 +1015,12 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 		kunmap(bvec->bv_page);
 	}
 	r1_bio->behind_bvecs = bvecs;
-	r1_bio->behind_page_count = bio->bi_vcnt;
+	r1_bio->behind_page_count = vcnt;
 	set_bit(R1BIO_BehindIO, &r1_bio->state);
 	return;
 
 do_sync_io:
-	for (i = 0; i < bio->bi_vcnt; i++)
+	for (i = 0; i < vcnt; i++)
 		if (bvecs[i].bv_page)
 			put_page(bvecs[i].bv_page);
 	kfree(bvecs);
-- 
2.7.4

^ permalink raw reply related

* [PATCH 12/17] md: raid1: avoid direct access to bvec table in process_checks()
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

Given process_checks is only called in resync path, it should be
ok to allocate three stack variable(total 320byteds) to store
pages from bios.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4400fbe7ce8c..54ec32be3277 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2010,6 +2010,9 @@ static void process_checks(struct r1bio *r1_bio)
 	int primary;
 	int i;
 	int vcnt;
+	struct bio_vec *bi;
+	int page_len[RESYNC_PAGES];
+	struct page *pbio_pages[RESYNC_PAGES], *sbio_pages[RESYNC_PAGES];
 
 	/* Fix variable parts of all bios */
 	vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
@@ -2017,7 +2020,6 @@ static void process_checks(struct r1bio *r1_bio)
 		int j;
 		int size;
 		int error;
-		struct bio_vec *bi;
 		struct bio *b = r1_bio->bios[i];
 		if (b->bi_end_io != end_sync_read)
 			continue;
@@ -2051,6 +2053,11 @@ static void process_checks(struct r1bio *r1_bio)
 			break;
 		}
 	r1_bio->read_disk = primary;
+
+	/* .bi_vcnt has been set for all read bios */
+	bio_for_each_segment_all(bi, r1_bio->bios[primary], i)
+		pbio_pages[i] = bi->bv_page;
+
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		int j;
 		struct bio *pbio = r1_bio->bios[primary];
@@ -2062,14 +2069,19 @@ static void process_checks(struct r1bio *r1_bio)
 		/* Now we can 'fixup' the error value */
 		sbio->bi_error = 0;
 
+		bio_for_each_segment_all(bi, sbio, j) {
+			sbio_pages[j] = bi->bv_page;
+			page_len[j] = bi->bv_len;
+		}
+
 		if (!error) {
 			for (j = vcnt; j-- ; ) {
 				struct page *p, *s;
-				p = pbio->bi_io_vec[j].bv_page;
-				s = sbio->bi_io_vec[j].bv_page;
+				p = pbio_pages[j];
+				s = sbio_pages[j];
 				if (memcmp(page_address(p),
 					   page_address(s),
-					   sbio->bi_io_vec[j].bv_len))
+					   page_len[j]))
 					break;
 			}
 		} else
-- 
2.7.4

^ permalink raw reply related

* [PATCH 11/17] md: raid1: use bio helper in process_checks()
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

Avoid to direct access to bvec table.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 02ee8542295d..4400fbe7ce8c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2017,6 +2017,7 @@ static void process_checks(struct r1bio *r1_bio)
 		int j;
 		int size;
 		int error;
+		struct bio_vec *bi;
 		struct bio *b = r1_bio->bios[i];
 		if (b->bi_end_io != end_sync_read)
 			continue;
@@ -2033,9 +2034,7 @@ static void process_checks(struct r1bio *r1_bio)
 		b->bi_private = r1_bio;
 
 		size = b->bi_iter.bi_size;
-		for (j = 0; j < vcnt ; j++) {
-			struct bio_vec *bi;
-			bi = &b->bi_io_vec[j];
+		bio_for_each_segment_all(bi, b, j) {
 			bi->bv_offset = 0;
 			if (size > PAGE_SIZE)
 				bi->bv_len = PAGE_SIZE;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 10/17] md: raid1: remove direct access to bvec table in fix_sync_read_error
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

This patch uses a stack variable to hold the pages in bio, so
that we can remove direct access to bvec table in fix_sync_read_error().

This 16*8 stack variable is just fine for kernel thread context.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1dd6b2760fba..02ee8542295d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1874,6 +1874,16 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 	int sectors = r1_bio->sectors;
 	int idx = 0;
 	struct md_rdev *rdev;
+	struct bio_vec *bvl;
+	int i;
+	struct page *pages[RESYNC_PAGES];
+
+	/*
+	 * bio for read_disk is filled up, so we can use
+	 * bio_for_each_segment_all() to retrieve all pages.
+	 */
+	bio_for_each_segment_all(bvl, bio, i)
+		pages[i] = bvl->bv_page;
 
 	rdev = conf->mirrors[r1_bio->read_disk].rdev;
 	if (test_bit(FailFast, &rdev->flags)) {
@@ -1903,7 +1913,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				 */
 				rdev = conf->mirrors[d].rdev;
 				if (sync_page_io(rdev, sect, s<<9,
-						 bio->bi_io_vec[idx].bv_page,
+						 pages[idx],
 						 REQ_OP_READ, 0, false)) {
 					success = 1;
 					break;
@@ -1958,7 +1968,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				continue;
 			rdev = conf->mirrors[d].rdev;
 			if (r1_sync_page_io(rdev, sect, s,
-					    bio->bi_io_vec[idx].bv_page,
+					    pages[idx],
 					    WRITE) == 0) {
 				r1_bio->bios[d]->bi_end_io = NULL;
 				rdev_dec_pending(rdev, mddev);
@@ -1973,7 +1983,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				continue;
 			rdev = conf->mirrors[d].rdev;
 			if (r1_sync_page_io(rdev, sect, s,
-					    bio->bi_io_vec[idx].bv_page,
+					    pages[idx],
 					    READ) != 0)
 				atomic_add(s, &rdev->corrected_errors);
 		}
-- 
2.7.4

^ permalink raw reply related

* [PATCH 09/17] md: raid1/raid10: use bio helper in *_pool_free
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c  | 13 ++++++++++---
 drivers/md/raid10.c | 11 +++++++----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ebea8615344a..1dd6b2760fba 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -172,9 +172,16 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
 	int i,j;
 	struct r1bio *r1bio = __r1_bio;
 
-	for (i = 0; i < RESYNC_PAGES; i++)
-		for (j = pi->raid_disks; j-- ;)
-			safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
+	for (i = 0; i < pi->raid_disks; i++) {
+		struct bio_vec *bvl;
+		struct bio *bio = r1bio->bios[i];
+
+		/* make sure all pages can be freed */
+		bio->bi_vcnt = RESYNC_PAGES;
+
+		bio_for_each_segment_all(bvl, bio, j)
+			safe_put_page(bvl->bv_page);
+	}
 	for (i=0 ; i < pi->raid_disks; i++)
 		bio_put(r1bio->bios[i]);
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8262f3e1dd93..5c698f3d3083 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -230,10 +230,13 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
 	for (j=0; j < conf->copies; j++) {
 		struct bio *bio = r10bio->devs[j].bio;
 		if (bio) {
-			for (i = 0; i < RESYNC_PAGES; i++) {
-				safe_put_page(bio->bi_io_vec[i].bv_page);
-				bio->bi_io_vec[i].bv_page = NULL;
-			}
+			struct bio_vec *bvl;
+
+			/* make sure all pages can be freed */
+			bio->bi_vcnt = RESYNC_PAGES;
+
+			bio_for_each_segment_all(bvl, bio, i)
+				safe_put_page(bvl->bv_page);
 			bio_put(bio);
 		}
 		bio = r10bio->devs[j].repl_bio;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 08/17] md: raid1: simplify r1buf_pool_free()
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

This patch gets each page's reference of each bio for resync,
then r1buf_pool_free() gets simplified.

The same policy has been taken in raid10's buf pool allocation/free
too.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 23a3a678a9ed..ebea8615344a 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -143,9 +143,12 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	/* If not user-requests, copy the page pointers to all bios */
 	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
 		for (i=0; i<RESYNC_PAGES ; i++)
-			for (j=1; j<pi->raid_disks; j++)
-				r1_bio->bios[j]->bi_io_vec[i].bv_page =
+			for (j=1; j<pi->raid_disks; j++) {
+				struct page *page =
 					r1_bio->bios[0]->bi_io_vec[i].bv_page;
+				get_page(page);
+				r1_bio->bios[j]->bi_io_vec[i].bv_page = page;
+			}
 	}
 
 	r1_bio->master_bio = NULL;
@@ -170,12 +173,8 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
 	struct r1bio *r1bio = __r1_bio;
 
 	for (i = 0; i < RESYNC_PAGES; i++)
-		for (j = pi->raid_disks; j-- ;) {
-			if (j == 0 ||
-			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
-			    r1bio->bios[0]->bi_io_vec[i].bv_page)
-				safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
-		}
+		for (j = pi->raid_disks; j-- ;)
+			safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
 	for (i=0 ; i < pi->raid_disks; i++)
 		bio_put(r1bio->bios[i]);
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 07/17] md: raid1/raid10: don't use .bi_vcnt to check if all pages are added
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

Instead we use the index of the pre-allocated pages buffer, it should
be more explicit because the index(.bi_error) just means how many pages
are added to the bio.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c  | 2 +-
 drivers/md/raid10.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8904a9149671..23a3a678a9ed 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2835,7 +2835,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
 		sync_blocks -= (len>>9);
-	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
+	} while (r1_bio->bios[disk]->bi_error < RESYNC_PAGES);
  bio_full:
 	/* return .bi_error back to bio */
 	for (i = 0 ; i < conf->raid_disks * 2; i++) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9cfc22cd1330..8262f3e1dd93 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3453,7 +3453,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
-	} while (biolist->bi_vcnt < RESYNC_PAGES);
+	} while (biolist->bi_error < RESYNC_PAGES);
  bio_full:
 	/* return .bi_error back to bio, and set resync's as -EIO */
 	for (bio= biolist ; bio ; bio=bio->bi_next)
-- 
2.7.4

^ permalink raw reply related

* [PATCH 06/17] md: raid1/raid10: borrow .bi_error as pre-allocated page index
From: Ming Lei @ 2017-02-16 11:45 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487245547-24384-1-git-send-email-tom.leiming@gmail.com>

Before bio is submitted, it is safe to borrow .bi_error. This
patch uses .bi_error as index of pre-allocated page in bio, so
that we can avoid to mess .bi_vcnt. Especially the old way
will not work any more when multipage bvec is introduced.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c  | 12 ++++++++++--
 drivers/md/raid10.c | 14 ++++++++++----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c4791fbd69ac..8904a9149671 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2811,13 +2811,14 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 				len = sync_blocks<<9;
 		}
 
+		/* borrow .bi_error as pre-allocated page index */
 		for (i = 0 ; i < conf->raid_disks * 2; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io) {
-				page = mdev_get_page_from_bio(bio, bio->bi_vcnt);
+				page = mdev_get_page_from_bio(bio, bio->bi_error++);
 				if (bio_add_page(bio, page, len, 0) == 0) {
 					/* stop here */
-					mdev_put_page_to_bio(bio, bio->bi_vcnt, page);
+					mdev_put_page_to_bio(bio, --bio->bi_error, page);
 					while (i > 0) {
 						i--;
 						bio = r1_bio->bios[i];
@@ -2836,6 +2837,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		sync_blocks -= (len>>9);
 	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
  bio_full:
+	/* return .bi_error back to bio */
+	for (i = 0 ; i < conf->raid_disks * 2; i++) {
+		bio = r1_bio->bios[i];
+		if (bio->bi_end_io)
+			bio->bi_error = 0;
+	}
+
 	r1_bio->sectors = nr_sectors;
 
 	if (mddev_is_clustered(mddev) &&
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b7dfbca869a3..9cfc22cd1330 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3348,7 +3348,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 			bio = r10_bio->devs[i].bio;
 			bio_reset(bio);
-			bio->bi_error = -EIO;
 			rcu_read_lock();
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
@@ -3392,7 +3391,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			/* Need to set up for writing to the replacement */
 			bio = r10_bio->devs[i].repl_bio;
 			bio_reset(bio);
-			bio->bi_error = -EIO;
 
 			sector = r10_bio->devs[i].addr;
 			bio->bi_next = biolist;
@@ -3435,14 +3433,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			len = (max_sector - sector_nr) << 9;
 		if (len == 0)
 			break;
+		/* borrow .bi_error as pre-allocated page index */
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
 			struct bio *bio2;
-			page = mdev_get_page_from_bio(bio, bio->bi_vcnt);
+			page = mdev_get_page_from_bio(bio, bio->bi_error++);
 			if (bio_add_page(bio, page, len, 0))
 				continue;
 
 			/* stop here */
-			mdev_put_page_to_bio(bio, bio->bi_vcnt, page);
+			mdev_put_page_to_bio(bio, --bio->bi_error, page);
 			for (bio2 = biolist;
 			     bio2 && bio2 != bio;
 			     bio2 = bio2->bi_next) {
@@ -3456,6 +3455,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		sector_nr += len>>9;
 	} while (biolist->bi_vcnt < RESYNC_PAGES);
  bio_full:
+	/* return .bi_error back to bio, and set resync's as -EIO */
+	for (bio= biolist ; bio ; bio=bio->bi_next)
+		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+			bio->bi_error = -EIO;
+		else
+			bio->bi_error = 0;
+
 	r10_bio->sectors = nr_sectors;
 
 	while (biolist) {
-- 
2.7.4


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox