All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chris Mason <chris.mason@fusionio.com>
To: Linux FS Devel <linux-fsdevel@vger.kernel.org>,
	Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 1/2] block: Add support for atomic writes
Date: Fri, 1 Nov 2013 17:28:54 -0400	[thread overview]
Message-ID: <20131101212854.10239.19830@localhost.localdomain> (raw)
In-Reply-To: <20131101212704.10239.73920@localhost.localdomain>

This allows filesystems and O_DIRECT to send down a list of bios
flagged for atomic completion.  If the hardware supports atomic
IO, it is given the whole list in a single make_request_fn
call.

In order to limit corner cases, there are a few restrictions in the
current code:

* Every bio in the list must be for the same queue.

* Every bio must be a simple write; no trims or reads may be mixed in.

A new blk_queue_set_atomic_write() sets the number of atomic segments a
given driver can accept.

Any number greater than zero is allowed (zero, the default, means the
queue does not support atomic writes), but the driver is expected to
do final checks on the bio list to make sure a given list fits inside
its atomic capabilities.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 block/blk-core.c       | 217 +++++++++++++++++++++++++++++++------------------
 block/blk-settings.c   |  17 ++++
 include/linux/blkdev.h |  14 ++++
 3 files changed, 170 insertions(+), 78 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 39d1261..6a5c292 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1664,95 +1664,131 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
 	return 0;
 }
 
+static void end_linked_bio(struct bio *bio, int err)
+{
+	struct bio *next;
+	do {
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+		bio_endio(bio, err);
+		bio = next;
+	} while (bio);
+}
+
 static noinline_for_stack bool
-generic_make_request_checks(struct bio *bio)
+generic_make_request_checks(struct bio *first_bio)
 {
-	struct request_queue *q;
-	int nr_sectors = bio_sectors(bio);
+	struct request_queue *q = NULL;
+	int nr_sectors;
 	int err = -EIO;
 	char b[BDEVNAME_SIZE];
 	struct hd_struct *part;
+	struct bio *bio;
+	int linked_bio = first_bio->bi_next ? 1 : 0;
 
 	might_sleep();
 
-	if (bio_check_eod(bio, nr_sectors))
-		goto end_io;
+	bio = first_bio;
+	for_each_bio(bio) {
+		nr_sectors = bio_sectors(bio);
+		if (bio_check_eod(bio, nr_sectors))
+			goto end_io;
 
-	q = bdev_get_queue(bio->bi_bdev);
-	if (unlikely(!q)) {
-		printk(KERN_ERR
-		       "generic_make_request: Trying to access "
-			"nonexistent block-device %s (%Lu)\n",
-			bdevname(bio->bi_bdev, b),
-			(long long) bio->bi_iter.bi_sector);
-		goto end_io;
-	}
+		if (!q) {
+			q = bdev_get_queue(bio->bi_bdev);
+			if (unlikely(!q)) {
+				printk(KERN_ERR
+				       "generic_make_request: Trying to access "
+					"nonexistent block-device %s (%Lu)\n",
+					bdevname(bio->bi_bdev, b),
+					(long long) bio->bi_iter.bi_sector);
+				goto end_io;
+			}
+		} else if (q != bdev_get_queue(bio->bi_bdev)) {
+			printk(KERN_ERR "generic_make_request: linked bio queue mismatch\n");
+			goto end_io;
+		}
 
-	if (likely(bio_is_rw(bio) &&
-		   nr_sectors > queue_max_hw_sectors(q))) {
-		printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-		       bdevname(bio->bi_bdev, b),
-		       bio_sectors(bio),
-		       queue_max_hw_sectors(q));
-		goto end_io;
-	}
+		if (likely(bio_is_rw(bio) &&
+			   nr_sectors > queue_max_hw_sectors(q))) {
+			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+			       bdevname(bio->bi_bdev, b),
+			       bio_sectors(bio),
+			       queue_max_hw_sectors(q));
+			goto end_io;
+		}
 
-	part = bio->bi_bdev->bd_part;
-	if (should_fail_request(part, bio->bi_iter.bi_size) ||
-	    should_fail_request(&part_to_disk(part)->part0,
-				bio->bi_iter.bi_size))
-		goto end_io;
+		part = bio->bi_bdev->bd_part;
+		if (should_fail_request(part, bio->bi_iter.bi_size) ||
+		    should_fail_request(&part_to_disk(part)->part0,
+					bio->bi_iter.bi_size))
+			goto end_io;
 
-	/*
-	 * If this device has partitions, remap block n
-	 * of partition p to block n+start(p) of the disk.
-	 */
-	blk_partition_remap(bio);
+		/*
+		 * If this device has partitions, remap block n
+		 * of partition p to block n+start(p) of the disk.
+		 */
+		blk_partition_remap(bio);
 
-	if (bio_check_eod(bio, nr_sectors))
-		goto end_io;
+		if (bio_check_eod(bio, nr_sectors))
+			goto end_io;
 
-	/*
-	 * Filter flush bio's early so that make_request based
-	 * drivers without flush support don't have to worry
-	 * about them.
-	 */
-	if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
-		bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
-		if (!nr_sectors) {
-			err = 0;
+		/*
+		 * Filter flush bio's early so that make_request based
+		 * drivers without flush support don't have to worry
+		 * about them.
+		 */
+		if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+			bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+			if (!nr_sectors) {
+				/*
+				 * we don't know how to mix empty flush bios
+				 * with a list of non-flush bios on devices
+				 * that don't support flushing
+				 */
+				if (linked_bio)
+					err = -EINVAL;
+				else
+					err = 0;
+				goto end_io;
+			}
+		}
+
+		if ((bio->bi_rw & REQ_DISCARD) &&
+		    (!blk_queue_discard(q) ||
+		     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
+			err = -EOPNOTSUPP;
 			goto end_io;
 		}
-	}
 
-	if ((bio->bi_rw & REQ_DISCARD) &&
-	    (!blk_queue_discard(q) ||
-	     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
-		err = -EOPNOTSUPP;
-		goto end_io;
-	}
+		if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
+			err = -EOPNOTSUPP;
+			goto end_io;
+		}
 
-	if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
-		err = -EOPNOTSUPP;
-		goto end_io;
-	}
+		if ((bio->bi_rw & REQ_ATOMIC) &&
+		    !q->limits.atomic_write_segments) {
+			err = -EOPNOTSUPP;
+			goto end_io;
+		}
 
-	/*
-	 * Various block parts want %current->io_context and lazy ioc
-	 * allocation ends up trading a lot of pain for a small amount of
-	 * memory.  Just allocate it upfront.  This may fail and block
-	 * layer knows how to live with it.
-	 */
-	create_io_context(GFP_ATOMIC, q->node);
+		/*
+		 * Various block parts want %current->io_context and lazy ioc
+		 * allocation ends up trading a lot of pain for a small amount of
+		 * memory.  Just allocate it upfront.  This may fail and block
+		 * layer knows how to live with it.
+		 */
+		create_io_context(GFP_ATOMIC, q->node);
 
-	if (blk_throtl_bio(q, bio))
-		return false;	/* throttled, will be resubmitted later */
+		if (blk_throtl_bio(q, bio))
+			return false;	/* throttled, will be resubmitted later */
 
-	trace_block_bio_queue(q, bio);
+		trace_block_bio_queue(q, bio);
+	}
 	return true;
 
 end_io:
-	bio_endio(bio, err);
+	end_linked_bio(first_bio, err);
 	return false;
 }
 
@@ -1788,6 +1824,17 @@ void generic_make_request(struct bio *bio)
 		return;
 
 	/*
+	 * generic_make_request checks for atomic write support, we'll have
+	 * failed already if the queue doesn't support it
+	 */
+	if (bio->bi_rw & REQ_ATOMIC) {
+		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+		q->make_request_fn(q, bio);
+		return;
+	}
+
+	/*
 	 * We only want one ->make_request_fn to be active at a time, else
 	 * stack usage with stacked devices could be a problem.  So use
 	 * current->bio_list to keep a list of requests submited by a
@@ -1815,6 +1862,10 @@ void generic_make_request(struct bio *bio)
 	 * from the top.  In this case we really did just take the bio
 	 * of the top of the list (no pretending) and so remove it from
 	 * bio_list, and call into ->make_request() again.
+	 *
+	 * REQ_ATOMIC bios may have been chained on bi_next, but we
+	 * should have caught them all above.  This BUG_ON(bi_next)
+	 * will catch any lists of bios that were not flagged as atomic
 	 */
 	BUG_ON(bio->bi_next);
 	bio_list_init(&bio_list_on_stack);
@@ -1849,28 +1900,38 @@ void submit_bio(int rw, struct bio *bio)
 	 * go through the normal accounting stuff before submission.
 	 */
 	if (bio_has_data(bio)) {
-		unsigned int count;
-
-		if (unlikely(rw & REQ_WRITE_SAME))
-			count = bdev_logical_block_size(bio->bi_bdev) >> 9;
-		else
-			count = bio_sectors(bio);
+		unsigned int count = 0;
+		unsigned int size = 0;
+		struct bio *walk;
+
+		walk = bio;
+		for_each_bio(walk) {
+			if (unlikely(rw & REQ_WRITE_SAME))
+				count += bdev_logical_block_size(walk->bi_bdev) >> 9;
+			else
+				count += bio_sectors(walk);
+			size += walk->bi_iter.bi_size;
+		}
 
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
 		} else {
-			task_io_account_read(bio->bi_iter.bi_size);
+			task_io_account_read(size);
 			count_vm_events(PGPGIN, count);
 		}
 
 		if (unlikely(block_dump)) {
 			char b[BDEVNAME_SIZE];
-			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
-			current->comm, task_pid_nr(current),
-				(rw & WRITE) ? "WRITE" : "READ",
-				(unsigned long long)bio->bi_iter.bi_sector,
-				bdevname(bio->bi_bdev, b),
-				count);
+
+			walk = bio;
+			for_each_bio(walk) {
+				printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
+				current->comm, task_pid_nr(current),
+					(rw & WRITE) ? "WRITE" : "READ",
+					(unsigned long long)walk->bi_iter.bi_sector,
+					bdevname(walk->bi_bdev, b),
+					bio_sectors(walk));
+			}
 		}
 	}
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5330933..17a6d23 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -119,6 +119,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
 	lim->discard_zeroes_data = 0;
+	lim->atomic_write_segments = 0;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -804,6 +805,22 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
 /**
+ * blk_queue_set_atomic_write - number of segments supported for atomic writes
+ * @q:     the request queue for the device
+ * @segments: number of segments supported
+ *
+ * description:
+ *    If the device supports atomic (or transactional) writes, then it can pass
+ *    the maximum number of segments it supports in here. Atomic writes are
+ *    either completed as a whole, or none of it gets written.
+ **/
+void blk_queue_set_atomic_write(struct request_queue *q, unsigned int segments)
+{
+	q->limits.atomic_write_segments = segments;
+}
+EXPORT_SYMBOL(blk_queue_set_atomic_write);
+
+/**
  * blk_queue_flush - configure queue's cache flush capability
  * @q:		the request queue for the device
  * @flush:	0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca0119d..40238bf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,6 +283,8 @@ struct queue_limits {
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
 
+	unsigned int		atomic_write_segments;
+
 	unsigned short		logical_block_size;
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
@@ -968,6 +970,8 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned short)
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
+extern void blk_queue_set_atomic_write(struct request_queue *q,
+				       unsigned int segments);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
@@ -1190,6 +1194,16 @@ static inline unsigned short queue_logical_block_size(struct request_queue *q)
 	return retval;
 }
 
+static inline unsigned short bdev_atomic_write_segments(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return q->limits.atomic_write_segments;
+
+	return 0;
+}
+
 static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
 {
 	return queue_logical_block_size(bdev_get_queue(bdev));
-- 
1.8.2


  reply	other threads:[~2013-11-01 21:29 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-11-01 21:27 [PATCH 0/2] Support for atomic IOs Chris Mason
2013-11-01 21:28 ` Chris Mason [this message]
2013-11-01 21:47   ` [PATCH 1/2] block: Add support for atomic writes Shaohua Li
2013-11-05 17:43   ` Jeff Moyer
2013-11-07 13:52     ` Chris Mason
2013-11-07 15:43       ` Jeff Moyer
2013-11-07 15:55         ` Chris Mason
2013-11-07 16:14           ` Jeff Moyer
2013-11-07 16:52             ` Chris Mason
2013-11-13 23:59             ` Dave Chinner
2013-11-12 15:11       ` Matthew Wilcox
2013-11-13 20:44         ` Chris Mason
2013-11-13 20:53           ` Howard Chu
2013-11-13 21:35           ` Matthew Wilcox
2013-11-01 21:29 ` [PATCH 2/3] fs: Add O_ATOMIC support to direct IO Chris Mason
  -- strict thread matches above, loose matches on Subject: below --
2013-11-20  8:23 [PATCH 1/2] block: Add support for atomic writes Kishore Sampathkumar
2013-11-26  6:24 Kishore Sampathkumar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20131101212854.10239.19830@localhost.localdomain \
    --to=chris.mason@fusionio.com \
    --cc=axboe@kernel.dk \
    --cc=linux-fsdevel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.