Re: [PATCH 1/2] block: Add support for atomic writes

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Shaohua Li <shli@kernel.org>
To: Chris Mason <chris.mason@fusionio.com>
Cc: Linux FS Devel <linux-fsdevel@vger.kernel.org>,
	Jens Axboe <axboe@kernel.dk>
Subject: Re: [PATCH 1/2] block: Add support for atomic writes
Date: Sat, 2 Nov 2013 05:47:28 +0800	[thread overview]
Message-ID: <20131101214728.GA21406@kernel.org> (raw)
In-Reply-To: <20131101212854.10239.19830@localhost.localdomain>

On Fri, Nov 01, 2013 at 05:28:54PM -0400, Chris Mason wrote:
> This allows filesystems and O_DIRECT to send down a list of bios
> flagged for atomic completion.  If the hardware supports atomic
> IO, it is given the whole list in a single make_request_fn
> call.
> 
> In order to limit corner cases, there are a few restrictions in the
> current code:
> 
> * Every bio in the list must be for the same queue
> 
> * Every bio must be a simple write.  No trims or reads may be mixed in
> 
> A new blk_queue_set_atomic_write() sets the number of atomic segments a
> given driver can accept.

If the queue is request based, I think we have a problem with linked bios. For
example, request initialization will be wrong with linked bios.

> Any number greater than one is allowed, but the driver is expected to
> do final checks on the bio list to make sure a given list fits inside
> its atomic capabilities.
> 
> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
> ---
>  block/blk-core.c       | 217 +++++++++++++++++++++++++++++++------------------
>  block/blk-settings.c   |  17 ++++
>  include/linux/blkdev.h |  14 ++++
>  3 files changed, 170 insertions(+), 78 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 39d1261..6a5c292 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1664,95 +1664,131 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
>  	return 0;
>  }
>  
> +static void end_linked_bio(struct bio *bio, int err)
> +{
> +	struct bio *next;
> +	do {
> +		next = bio->bi_next;
> +		bio->bi_next = NULL;
> +		bio_endio(bio, err);
> +		bio = next;
> +	} while (bio);
> +}
> +
>  static noinline_for_stack bool
> -generic_make_request_checks(struct bio *bio)
> +generic_make_request_checks(struct bio *first_bio)
>  {
> -	struct request_queue *q;
> -	int nr_sectors = bio_sectors(bio);
> +	struct request_queue *q = NULL;
> +	int nr_sectors;
>  	int err = -EIO;
>  	char b[BDEVNAME_SIZE];
>  	struct hd_struct *part;
> +	struct bio *bio;
> +	int linked_bio = first_bio->bi_next ? 1 : 0;
>  
>  	might_sleep();
>  
> -	if (bio_check_eod(bio, nr_sectors))
> -		goto end_io;
> +	bio = first_bio;
> +	for_each_bio(bio) {
> +		nr_sectors = bio_sectors(bio);
> +		if (bio_check_eod(bio, nr_sectors))
> +			goto end_io;
>  
> -	q = bdev_get_queue(bio->bi_bdev);
> -	if (unlikely(!q)) {
> -		printk(KERN_ERR
> -		       "generic_make_request: Trying to access "
> -			"nonexistent block-device %s (%Lu)\n",
> -			bdevname(bio->bi_bdev, b),
> -			(long long) bio->bi_iter.bi_sector);
> -		goto end_io;
> -	}
> +		if (!q) {
> +			q = bdev_get_queue(bio->bi_bdev);
> +			if (unlikely(!q)) {
> +				printk(KERN_ERR
> +				       "generic_make_request: Trying to access "
> +					"nonexistent block-device %s (%Lu)\n",
> +					bdevname(bio->bi_bdev, b),
> +					(long long) bio->bi_iter.bi_sector);
> +				goto end_io;
> +			}
> +		} else if (q != bdev_get_queue(bio->bi_bdev)) {
> +			printk(KERN_ERR "generic_make_request: linked bio queue mismatch\n");
> +			goto end_io;
> +		}
>  
> -	if (likely(bio_is_rw(bio) &&
> -		   nr_sectors > queue_max_hw_sectors(q))) {
> -		printk(KERN_ERR "bio too big device %s (%u > %u)\n",
> -		       bdevname(bio->bi_bdev, b),
> -		       bio_sectors(bio),
> -		       queue_max_hw_sectors(q));
> -		goto end_io;
> -	}
> +		if (likely(bio_is_rw(bio) &&
> +			   nr_sectors > queue_max_hw_sectors(q))) {
> +			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
> +			       bdevname(bio->bi_bdev, b),
> +			       bio_sectors(bio),
> +			       queue_max_hw_sectors(q));
> +			goto end_io;
> +		}
>  
> -	part = bio->bi_bdev->bd_part;
> -	if (should_fail_request(part, bio->bi_iter.bi_size) ||
> -	    should_fail_request(&part_to_disk(part)->part0,
> -				bio->bi_iter.bi_size))
> -		goto end_io;
> +		part = bio->bi_bdev->bd_part;
> +		if (should_fail_request(part, bio->bi_iter.bi_size) ||
> +		    should_fail_request(&part_to_disk(part)->part0,
> +					bio->bi_iter.bi_size))
> +			goto end_io;
>  
> -	/*
> -	 * If this device has partitions, remap block n
> -	 * of partition p to block n+start(p) of the disk.
> -	 */
> -	blk_partition_remap(bio);
> +		/*
> +		 * If this device has partitions, remap block n
> +		 * of partition p to block n+start(p) of the disk.
> +		 */
> +		blk_partition_remap(bio);
>  
> -	if (bio_check_eod(bio, nr_sectors))
> -		goto end_io;
> +		if (bio_check_eod(bio, nr_sectors))
> +			goto end_io;
>  
> -	/*
> -	 * Filter flush bio's early so that make_request based
> -	 * drivers without flush support don't have to worry
> -	 * about them.
> -	 */
> -	if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
> -		bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
> -		if (!nr_sectors) {
> -			err = 0;
> +		/*
> +		 * Filter flush bio's early so that make_request based
> +		 * drivers without flush support don't have to worry
> +		 * about them.
> +		 */
> +		if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
> +			bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
> +			if (!nr_sectors) {
> +				/*
> +				 * we don't know how to mix empty flush bios
> +				 * with a list of non-flush bios on devices
> +				 * that don't support flushing
> +				 */
> +				if (linked_bio)
> +					err = -EINVAL;
> +				else
> +					err = 0;
> +				goto end_io;
> +			}
> +		}
> +
> +		if ((bio->bi_rw & REQ_DISCARD) &&
> +		    (!blk_queue_discard(q) ||
> +		     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
> +			err = -EOPNOTSUPP;
>  			goto end_io;
>  		}
> -	}
>  
> -	if ((bio->bi_rw & REQ_DISCARD) &&
> -	    (!blk_queue_discard(q) ||
> -	     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
> -		err = -EOPNOTSUPP;
> -		goto end_io;
> -	}
> +		if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
> +			err = -EOPNOTSUPP;
> +			goto end_io;
> +		}
>  
> -	if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
> -		err = -EOPNOTSUPP;
> -		goto end_io;
> -	}
> +		if ((bio->bi_rw & REQ_ATOMIC) &&
> +		    !q->limits.atomic_write_segments) {
> +			err = -EOPNOTSUPP;
> +			goto end_io;
> +		}

It looks like we don't check whether the queue is a stacked queue (MD/DM). I
also didn't see a check for whether the number of atomic bios exceeds
atomic_write_segments.
  
> -	/*
> -	 * Various block parts want %current->io_context and lazy ioc
> -	 * allocation ends up trading a lot of pain for a small amount of
> -	 * memory.  Just allocate it upfront.  This may fail and block
> -	 * layer knows how to live with it.
> -	 */
> -	create_io_context(GFP_ATOMIC, q->node);
> +		/*
> +		 * Various block parts want %current->io_context and lazy ioc
> +		 * allocation ends up trading a lot of pain for a small amount of
> +		 * memory.  Just allocate it upfront.  This may fail and block
> +		 * layer knows how to live with it.
> +		 */
> +		create_io_context(GFP_ATOMIC, q->node);
>  
> -	if (blk_throtl_bio(q, bio))
> -		return false;	/* throttled, will be resubmitted later */
> +		if (blk_throtl_bio(q, bio))
> +			return false;	/* throttled, will be resubmitted later */
>  
> -	trace_block_bio_queue(q, bio);
> +		trace_block_bio_queue(q, bio);
> +	}
>  	return true;
>  
>  end_io:
> -	bio_endio(bio, err);
> +	end_linked_bio(first_bio, err);
>  	return false;
>  }
>  
> @@ -1788,6 +1824,17 @@ void generic_make_request(struct bio *bio)
>  		return;
>  
>  	/*
> +	 * generic_make_request checks for atomic write support, we'll have
> +	 * failed already if the queue doesn't support it
> +	 */
> +	if (bio->bi_rw & REQ_ATOMIC) {
> +		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +
> +		q->make_request_fn(q, bio);

Sending requests out directly is a problem for MD/DM. But we should fail early
in the MD/DM check, so this isn't a problem if we detect that this is a stacked
queue.

Thanks,
Shaohua

next prev parent reply	other threads:[~2013-11-01 22:35 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-11-01 21:27 [PATCH 0/2] Support for atomic IOs Chris Mason
2013-11-01 21:28 ` [PATCH 1/2] block: Add support for atomic writes Chris Mason
2013-11-01 21:47   ` Shaohua Li [this message]
2013-11-05 17:43   ` Jeff Moyer
2013-11-07 13:52     ` Chris Mason
2013-11-07 15:43       ` Jeff Moyer
2013-11-07 15:55         ` Chris Mason
2013-11-07 16:14           ` Jeff Moyer
2013-11-07 16:52             ` Chris Mason
2013-11-13 23:59             ` Dave Chinner
2013-11-12 15:11       ` Matthew Wilcox
2013-11-13 20:44         ` Chris Mason
2013-11-13 20:53           ` Howard Chu
2013-11-13 21:35           ` Matthew Wilcox
2013-11-01 21:29 ` [PATCH 2/3] fs: Add O_ATOMIC support to direct IO Chris Mason
  -- strict thread matches above, loose matches on Subject: below --
2013-11-20  8:23 [PATCH 1/2] block: Add support for atomic writes Kishore Sampathkumar
2013-11-26  6:24 Kishore Sampathkumar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20131101214728.GA21406@kernel.org \
    --to=shli@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=chris.mason@fusionio.com \
    --cc=linux-fsdevel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.