Linux block layer
 help / color / mirror / Atom feed
* [PATCH 3/8] nowait aio: return if direct write will trigger writeback
From: Goldwyn Rodrigues @ 2017-03-15 21:51 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

Find out if the write will trigger a wait due to writeback. If yes,
return -EAGAIN.

This introduces a new function filemap_range_has_page() which
returns true if the file's mapping has a page within the range
mentioned.

Return -EINVAL for buffered AIO: there are multiple causes of
delay such as page locks, dirty throttling logic, page loading
from disk etc. which cannot be taken care of.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 include/linux/fs.h |  2 ++
 mm/filemap.c       | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e8d9346..4a30e8f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2514,6 +2514,8 @@ extern int filemap_fdatawait(struct address_space *);
 extern void filemap_fdatawait_keep_errors(struct address_space *);
 extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
 				   loff_t lend);
+extern int filemap_range_has_page(struct address_space *, loff_t lstart,
+				   loff_t lend);
 extern int filemap_write_and_wait(struct address_space *mapping);
 extern int filemap_write_and_wait_range(struct address_space *mapping,
 				        loff_t lstart, loff_t lend);
diff --git a/mm/filemap.c b/mm/filemap.c
index e08f3b9..c020e23 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -376,6 +376,39 @@ int filemap_flush(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_flush);
 
+/**
+ * filemap_range_has_page - check if a page exists in range.
+ * @mapping:		address space structure to wait for
+ * @start_byte:		offset in bytes where the range starts
+ * @end_byte:		offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback.
+ */
+int filemap_range_has_page(struct address_space *mapping,
+				loff_t start_byte, loff_t end_byte)
+{
+	pgoff_t index = start_byte >> PAGE_SHIFT;
+	pgoff_t end = end_byte >> PAGE_SHIFT;
+	struct pagevec pvec;
+	int ret;
+
+	if (end_byte < start_byte)
+		return 0;
+
+	if (mapping->nrpages == 0)
+		return 0;
+
+	pagevec_init(&pvec, 0);
+	ret = pagevec_lookup(&pvec, mapping, index, 1);
+	if (!ret)
+		return 0;
+	ret = (pvec.pages[0]->index <= end);
+	pagevec_release(&pvec);
+	return ret;
+}
+EXPORT_SYMBOL(filemap_range_has_page);
+
 static int __filemap_fdatawait_range(struct address_space *mapping,
 				     loff_t start_byte, loff_t end_byte)
 {
@@ -2640,6 +2673,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 
 	pos = iocb->ki_pos;
 
+	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+		return -EINVAL;
+
 	if (limit != RLIM_INFINITY) {
 		if (iocb->ki_pos >= limit) {
 			send_sig(SIGXFSZ, current, 0);
@@ -2709,9 +2745,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	write_len = iov_iter_count(from);
 	end = (pos + write_len - 1) >> PAGE_SHIFT;
 
-	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
-	if (written)
-		goto out;
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		/* If there are pages to writeback, return */
+		if (filemap_range_has_page(inode->i_mapping, pos,
+					   pos + iov_iter_count(from)))
+			return -EAGAIN;
+	} else {
+		written = filemap_write_and_wait_range(mapping, pos,
+							pos + write_len - 1);
+		if (written)
+			goto out;
+	}
 
 	/*
 	 * After a write we want buffered reads to be sure to go to disk to get
-- 
2.10.2

^ permalink raw reply related

* [PATCH 4/8] nowait-aio: Introduce IOMAP_NOWAIT
From: Goldwyn Rodrigues @ 2017-03-15 21:51 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

IOCB_NOWAIT translates to IOMAP_NOWAIT for iomaps.
This is used by XFS in the XFS patch.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/iomap.c            | 2 ++
 include/linux/iomap.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/fs/iomap.c b/fs/iomap.c
index 141c3cd..d1c8175 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -885,6 +885,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	} else {
 		dio->flags |= IOMAP_DIO_WRITE;
 		flags |= IOMAP_WRITE;
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			flags |= IOMAP_NOWAIT;
 	}
 
 	if (mapping->nrpages) {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 7291810..53f6af8 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -51,6 +51,7 @@ struct iomap {
 #define IOMAP_REPORT		(1 << 2) /* report extent status, e.g. FIEMAP */
 #define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
 #define IOMAP_DIRECT		(1 << 4) /* direct I/O */
+#define IOMAP_NOWAIT		(1 << 5) /* Don't wait for writeback */
 
 struct iomap_ops {
 	/*
-- 
2.10.2

^ permalink raw reply related

* [PATCH 5/8] nowait aio: return on congested block device
From: Goldwyn Rodrigues @ 2017-03-15 21:51 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

A new flag BIO_NOWAIT is introduced to identify bio's
orignating from iocb with IOCB_NOWAIT. This flag indicates
to return immediately if a request cannot be made instead
of retrying.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 block/blk-core.c          | 12 ++++++++++--
 block/blk-mq-sched.c      |  3 +++
 block/blk-mq.c            |  4 ++++
 fs/direct-io.c            | 11 +++++++++--
 include/linux/bio.h       |  6 ++++++
 include/linux/blk_types.h |  1 +
 6 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 0eeb99e..2e5cba2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1232,6 +1232,11 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
 	if (!IS_ERR(rq))
 		return rq;
 
+	if (bio && bio_flagged(bio, BIO_NOWAIT)) {
+		blk_put_rl(rl);
+		return ERR_PTR(-EAGAIN);
+	}
+
 	if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
 		blk_put_rl(rl);
 		return rq;
@@ -2014,7 +2019,7 @@ blk_qc_t generic_make_request(struct bio *bio)
 	do {
 		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
-		if (likely(blk_queue_enter(q, false) == 0)) {
+		if (likely(blk_queue_enter(q, bio_flagged(bio, BIO_NOWAIT)) == 0)) {
 			struct bio_list hold;
 			struct bio_list lower, same;
 
@@ -2040,7 +2045,10 @@ blk_qc_t generic_make_request(struct bio *bio)
 			bio_list_merge(&bio_list_on_stack, &same);
 			bio_list_merge(&bio_list_on_stack, &hold);
 		} else {
-			bio_io_error(bio);
+			if (unlikely(bio_flagged(bio, BIO_NOWAIT)))
+				bio_wouldblock_error(bio);
+			else
+				bio_io_error(bio);
 		}
 		bio = bio_list_pop(current->bio_list);
 	} while (bio);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 09af8ff..40e78b5 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -119,6 +119,9 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 	if (likely(!data->hctx))
 		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
 
+	if (likely(bio) && bio_flagged(bio, BIO_NOWAIT))
+		data->flags |= BLK_MQ_REQ_NOWAIT;
+
 	if (e) {
 		data->flags |= BLK_MQ_REQ_INTERNAL;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 159187a..942ce8c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1518,6 +1518,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
 		__wbt_done(q->rq_wb, wb_acct);
+		if (bio && bio_flagged(bio, BIO_NOWAIT))
+			bio_wouldblock_error(bio);
 		return BLK_QC_T_NONE;
 	}
 
@@ -1642,6 +1644,8 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
 		__wbt_done(q->rq_wb, wb_acct);
+		if (bio && bio_flagged(bio, BIO_NOWAIT))
+			bio_wouldblock_error(bio);
 		return BLK_QC_T_NONE;
 	}
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index a04ebea..f6835d3 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -386,6 +386,9 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 	else
 		bio->bi_end_io = dio_bio_end_io;
 
+	if (dio->iocb->ki_flags & IOCB_NOWAIT)
+		bio_set_flag(bio, BIO_NOWAIT);
+
 	sdio->bio = bio;
 	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
 }
@@ -480,8 +483,12 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	unsigned i;
 	int err;
 
-	if (bio->bi_error)
-		dio->io_error = -EIO;
+	if (bio->bi_error) {
+		if (bio_flagged(bio, BIO_NOWAIT))
+			dio->io_error = -EAGAIN;
+		else
+			dio->io_error = -EIO;
+	}
 
 	if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
 		err = bio->bi_error;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e52119..1a92707 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -425,6 +425,12 @@ static inline void bio_io_error(struct bio *bio)
 	bio_endio(bio);
 }
 
+static inline void bio_wouldblock_error(struct bio *bio)
+{
+	bio->bi_error = -EAGAIN;
+	bio_endio(bio);
+}
+
 struct request_queue;
 extern int bio_phys_segments(struct request_queue *, struct bio *);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d703acb..514c08e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -102,6 +102,7 @@ struct bio {
 #define BIO_REFFED	8	/* bio has elevated ->bi_cnt */
 #define BIO_THROTTLED	9	/* This bio has already been subjected to
 				 * throttling rules. Don't do it again. */
+#define BIO_NOWAIT	10	/* don't block over blk device congestion */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes
-- 
2.10.2

^ permalink raw reply related

* [PATCH 6/8] nowait aio: ext4
From: Goldwyn Rodrigues @ 2017-03-15 21:51 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

Return EAGAIN if any of the following checks fail for direct I/O:
 + i_rwsem is lockable
 + Writing beyond end of file (will trigger allocation)
 + Blocks are not allocated at the write location

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/ext4/file.c | 48 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8210c1f..e223b9f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -127,27 +127,22 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
 	return 0;
 }
 
-/* Is IO overwriting allocated and initialized blocks? */
-static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
+/* Are IO blocks allocated */
+static bool ext4_blocks_mapped(struct inode *inode, loff_t pos, loff_t len,
+				struct ext4_map_blocks *map)
 {
-	struct ext4_map_blocks map;
 	unsigned int blkbits = inode->i_blkbits;
 	int err, blklen;
 
 	if (pos + len > i_size_read(inode))
 		return false;
 
-	map.m_lblk = pos >> blkbits;
-	map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
-	blklen = map.m_len;
+	map->m_lblk = pos >> blkbits;
+	map->m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
+	blklen = map->m_len;
 
-	err = ext4_map_blocks(NULL, inode, &map, 0);
-	/*
-	 * 'err==len' means that all of the blocks have been preallocated,
-	 * regardless of whether they have been initialized or not. To exclude
-	 * unwritten extents, we need to check m_flags.
-	 */
-	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
+	err = ext4_map_blocks(NULL, inode, map, 0);
+	return err == blklen;
 }
 
 static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
@@ -204,6 +199,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	int o_direct = iocb->ki_flags & IOCB_DIRECT;
+	int nowait = iocb->ki_flags & IOCB_NOWAIT;
 	int unaligned_aio = 0;
 	int overwrite = 0;
 	ssize_t ret;
@@ -216,7 +212,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		return ext4_dax_write_iter(iocb, from);
 #endif
 
-	inode_lock(inode);
+	if (o_direct && nowait) {
+		if (!inode_trylock(inode))
+			return -EAGAIN;
+	} else {
+		inode_lock(inode);
+	}
+
 	ret = ext4_write_checks(iocb, from);
 	if (ret <= 0)
 		goto out;
@@ -235,9 +237,21 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	iocb->private = &overwrite;
 	/* Check whether we do a DIO overwrite or not */
-	if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio &&
-	    ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)))
-		overwrite = 1;
+	if (o_direct && !unaligned_aio) {
+		struct ext4_map_blocks map;
+		if (ext4_blocks_mapped(inode, iocb->ki_pos,
+				      iov_iter_count(from), &map)) {
+	 		/* To exclude unwritten extents, we need to check
+			 * m_flags.
+			 */
+			if (ext4_should_dioread_nolock(inode) &&
+			    (map.m_flags & EXT4_MAP_MAPPED))
+				overwrite = 1;
+		} else if (iocb->ki_flags & IOCB_NOWAIT) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	}
 
 	ret = __generic_file_write_iter(iocb, from);
 	inode_unlock(inode);
-- 
2.10.2

^ permalink raw reply related

* [PATCH 7/8] nowait aio: xfs
From: Goldwyn Rodrigues @ 2017-03-15 21:51 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

If IOCB_NOWAIT is set, bail if the i_rwsem is not lockable
immediately.

IF IOMAP_NOWAIT is set, return EAGAIN in xfs_file_iomap_begin
if it needs allocation either due to file extension, writing to a hole,
or COW or waiting for other DIOs to finish.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/xfs/xfs_file.c  | 15 +++++++++++----
 fs/xfs/xfs_iomap.c | 13 +++++++++++++
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 35703a8..08a5eef 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -541,8 +541,11 @@ xfs_file_dio_aio_write(
 		iolock = XFS_IOLOCK_SHARED;
 	}
 
-	xfs_ilock(ip, iolock);
-
+	if (!xfs_ilock_nowait(ip, iolock)) {
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EAGAIN;
+		xfs_ilock(ip, iolock);
+	}
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
@@ -553,9 +556,13 @@ xfs_file_dio_aio_write(
 	 * otherwise demote the lock if we had to take the exclusive lock
 	 * for other reasons in xfs_file_aio_write_checks.
 	 */
-	if (unaligned_io)
+	if (unaligned_io) {
+		/* If we are going to wait for other DIO to finish, bail */
+		if ((iocb->ki_flags & IOCB_NOWAIT) &&
+		     atomic_read(&inode->i_dio_count))
+			return -EAGAIN;
 		inode_dio_wait(inode);
-	else if (iolock == XFS_IOLOCK_EXCL) {
+	} else if (iolock == XFS_IOLOCK_EXCL) {
 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		iolock = XFS_IOLOCK_SHARED;
 	}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 288ee5b..6843725 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1015,6 +1015,11 @@ xfs_file_iomap_begin(
 
 	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
 		if (flags & IOMAP_DIRECT) {
+			/* A reflinked inode will result in CoW alloc */
+			if (flags & IOMAP_NOWAIT) {
+				error = -EAGAIN;
+				goto out_unlock;
+			}
 			/* may drop and re-acquire the ilock */
 			error = xfs_reflink_allocate_cow(ip, &imap, &shared,
 					&lockmode);
@@ -1032,6 +1037,14 @@ xfs_file_iomap_begin(
 
 	if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
 		/*
+		 * If nowait is set bail since we are going to make
+		 * allocations.
+		 */
+		if (flags & IOMAP_NOWAIT) {
+			error = -EAGAIN;
+			goto out_unlock;
+		}
+		/*
 		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
 		 * pages to keep the chunks of work done where somewhat symmetric
 		 * with the work writeback does. This is a completely arbitrary
-- 
2.10.2

^ permalink raw reply related

* [PATCH 8/8] nowait aio: btrfs
From: Goldwyn Rodrigues @ 2017-03-15 21:51 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

Return EAGAIN if any of the following checks fail
 + i_rwsem is not lockable
 + NODATACOW or PREALLOC is not set
 + Cannot nocow at the desired location
 + Writing beyond end of file which is not allocated

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/btrfs/file.c  | 25 ++++++++++++++++++++-----
 fs/btrfs/inode.c |  3 +++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 520cb72..a870e5d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1823,12 +1823,29 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	ssize_t num_written = 0;
 	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
 	ssize_t err;
-	loff_t pos;
-	size_t count;
+	loff_t pos = iocb->ki_pos;
+	size_t count = iov_iter_count(from);
 	loff_t oldsize;
 	int clean_page = 0;
 
-	inode_lock(inode);
+	if ((iocb->ki_flags & IOCB_NOWAIT) &&
+			(iocb->ki_flags & IOCB_DIRECT)) {
+		/* Don't sleep on inode rwsem */
+		if (!inode_trylock(inode))
+			return -EAGAIN;
+		/*
+		 * We will allocate space in case nodatacow is not set,
+		 * so bail
+		 */
+		if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					      BTRFS_INODE_PREALLOC)) ||
+		    check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+			inode_unlock(inode);
+			return -EAGAIN;
+		}
+	} else
+		inode_lock(inode);
+
 	err = generic_write_checks(iocb, from);
 	if (err <= 0) {
 		inode_unlock(inode);
@@ -1862,8 +1879,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	 */
 	update_time_for_write(inode);
 
-	pos = iocb->ki_pos;
-	count = iov_iter_count(from);
 	start_pos = round_down(pos, fs_info->sectorsize);
 	oldsize = i_size_read(inode);
 	if (start_pos > oldsize) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c40060c..788bb93 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8613,6 +8613,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 			dio_data.overwrite = 1;
 			inode_unlock(inode);
 			relock = true;
+		} else if (iocb->ki_flags & IOCB_NOWAIT) {
+			ret = -EAGAIN;
+			goto out;
 		}
 		ret = btrfs_delalloc_reserve_space(inode, offset, count);
 		if (ret)
-- 
2.10.2

^ permalink raw reply related

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Ming Lei @ 2017-03-15 23:41 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, yizhan@redhat.com, axboe@fb.com,
	stable@vger.kernel.org
In-Reply-To: <1489613676.2660.8.camel@sandisk.com>

On Wed, Mar 15, 2017 at 09:34:50PM +0000, Bart Van Assche wrote:
> On Wed, 2017-03-15 at 20:18 +0800, Ming Lei wrote:
> > On Wed, Mar 15, 2017 at 12:07:37AM +0000, Bart Van Assche wrote:
> > > Both the old and the new check look racy to me. The REQ_ATOM_STARTED bit can
> > > be changed concurrently by blk_mq_start_request(), __blk_mq_finish_request()
> > 
> > blk_mq_start_request() and __blk_mq_finish_request() won't be run concurrently.
> > 
> > From view of __blk_mq_finish_request():
> > 
> > 	- if it is run from merge queue io path(blk_mq_merge_queue_io()),
> > 	blk_mq_start_request() can't be run at all, and the COMPLETE flag
> > 	is kept as previous value(zero)
> > 
> > 	- if it is run from normal complete path, COMPLETE flag is cleared
> > 	before the req/tag is released to tag set.
> > 
> > so there isn't race in blk_mq_start_request() vs. __blk_mq_finish_request()
> > wrt. timeout.
> > 
> > > or __blk_mq_requeue_request(). Another issue with this function is that the
> > 
> > __blk_mq_requeue_request() can be run from two pathes:
> > 
> > 	- dispatch failure, in which case the req/tag isn't released to tag set
> > 	
> > 	- IO completion path, in which COMPLETE flag is cleared before requeue.
> > 	
> > so I can't see races with timeout in case of start rq vs. requeue rq. 
> > 
> > > request passed to this function can be reinitialized concurrently.
> 
> Hello Ming,
> 
> You misinterpret what I wrote. I was referring to manipulation of
> REQ_ATOM_STARTED from different contexts and not to what you explained.

This patch addresses one race between timeout and pre-queuing I/O in block layer
(before .queue_rq), please focus on this patch. And that is definitely correct.

Also I am happy to discuss this kind of races, but maybe we can do that in
another thread if you can describe the issue clearly.


-- 
Ming

^ permalink raw reply

* Re: [PATCH 1/2] blk-mq: don't complete un-started request in timeout handler
From: Ming Lei @ 2017-03-16  0:07 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, yizhan@redhat.com, axboe@fb.com,
	stable@vger.kernel.org
In-Reply-To: <1489613678.2660.9.camel@sandisk.com>

On Wed, Mar 15, 2017 at 09:35:03PM +0000, Bart Van Assche wrote:
> On Thu, 2017-03-16 at 00:22 +0800, Ming Lei wrote:
> > On Wed, Mar 15, 2017 at 03:36:31PM +0000, Bart Van Assche wrote:
> > > Please have another look at __blk_mq_requeue_request(). In that function
> > > the following code occurs: if (test_and_clear_bit(REQ_ATOM_STARTED,
> > > &rq->atomic_flags)) { ... }
> > > 
> > > I think the�REQ_ATOM_STARTED check in blk_mq_check_expired() races with the
> > > test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags) call in
> > > __blk_mq_requeue_request().
> > 
> > OK, this race should only exist in case that the requeue happens after dispatch
> > busy, because COMPLETE flag isn't set. And if the requeue is from io completion,
> > no such race because COMPLETE flag is set.
> > 
> > One solution I thought of is to call blk_mark_rq_complete() before requeuing
> > when dispatch busy happened, but that looks a bit silly. Another way is to
> > set STARTED flag just after .queue_rq returns BLK_MQ_RQ_QUEUE_OK, which looks
> > reasonable too. Any comments on the 2nd solution?
> 
> Sorry but I don't think that would be sufficient. There are several other
> scenarios that have not been mentioned above, e.g. that a tag gets freed and
> reused after the blk_mq_check_expired() call started and before that function
> has had the chance to examine any members of struct request. What is needed in
> blk_mq_check_expired() is the following as a single atomic operation:

We have dealt with this by checking COMPLETE & rq->deadline together in
blk_mq_check_expired() already:

- if new rq->deadline(set in reuse path) has been observed in the later
checking rq of blk_mq_check_expired(), it won't be timeouted because of the timing.

- if new rq->deadline(set in reuse path) hasn't been observed in the
later checking rq of blk_mq_check_expired(), that means COMPLETE flag isn't set
yet in reuse path because we have a barrier to enhance the order in
blk_mq_start_request(), so it won't be timeouted too.

So let me know what is the real race between free/reusing vs. timeout.

> * Check whether REQ_ATOM_STARTED has been set.
> * Check whether REQ_ATOM_COMPLETE has not yet been set.
> * If both conditions have been met, set REQ_ATOM_COMPLETE.
> 
> I don't think there is another solution than using a single state variable to
> represent the REQ_ATOM_STARTED and REQ_ATOM_COMPLETE states instead of two
> independent bits. How about the patch below?

I would review it if you can confirm me that it is a real issue, :-)

Thanks,
Ming

^ permalink raw reply

* Re: [PATCH 3/8] nowait aio: return if direct write will trigger writeback
From: Matthew Wilcox @ 2017-03-16 13:08 UTC (permalink / raw)
  To: Goldwyn Rodrigues
  Cc: linux-fsdevel, jack, hch, linux-block, linux-btrfs, linux-ext4,
	linux-xfs, sagi, avi, axboe, linux-api, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-4-rgoldwyn@suse.de>

On Wed, Mar 15, 2017 at 04:51:02PM -0500, Goldwyn Rodrigues wrote:
> This introduces a new function filemap_range_has_page() which
> returns true if the file's mapping has a page within the range
> mentioned.

I thought you were going to replace this patch with one that starts
writeback for these pages but does not wait for them?

^ permalink raw reply

* Re: [PATCH 3/8] nowait aio: return if direct write will trigger writeback
From: Matthew Wilcox @ 2017-03-16 13:20 UTC (permalink / raw)
  To: Goldwyn Rodrigues
  Cc: linux-fsdevel, jack, hch, linux-block, linux-btrfs, linux-ext4,
	linux-xfs, sagi, avi, axboe, linux-api, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-4-rgoldwyn@suse.de>

On Wed, Mar 15, 2017 at 04:51:02PM -0500, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> Find out if the write will trigger a wait due to writeback. If yes,
> return -EAGAIN.
> 
> This introduces a new function filemap_range_has_page() which
> returns true if the file's mapping has a page within the range
> mentioned.
> 
> Return -EINVAL for buffered AIO: there are multiple causes of
> delay such as page locks, dirty throttling logic, page loading
> from disk etc. which cannot be taken care of.

Also, this patch only touches the write path; we have a similar call to
write_and_wait_range() in generic_file_read_iter().

Actually, why do we even have that?  Why can't we satisfy an O_DIRECT
read from the cache?

^ permalink raw reply

* Re: [PATCH 3/8] nowait aio: return if direct write will trigger writeback
From: Goldwyn Rodrigues @ 2017-03-16 13:46 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-fsdevel, jack, hch, linux-block, linux-btrfs, linux-ext4,
	linux-xfs, sagi, avi, axboe, linux-api, Goldwyn Rodrigues
In-Reply-To: <20170316130824.GF4033@bombadil.infradead.org>



On 03/16/2017 08:08 AM, Matthew Wilcox wrote:
> On Wed, Mar 15, 2017 at 04:51:02PM -0500, Goldwyn Rodrigues wrote:
>> This introduces a new function filemap_range_has_page() which
>> returns true if the file's mapping has a page within the range
>> mentioned.
> 
> I thought you were going to replace this patch with one that starts
> writeback for these pages but does not wait for them?
> 

As mentioned by Jan, Flags to filemap_write_and_wait_range are
unnecessarily complicated. The AIO-DIO API users who eye for performance
usually are careful with page writes/evictions. As a fallback, they can
(and should) go the wait route (without  IOCB_RW_FLAG_NOWAIT).

Finally, my take on this is that we don't want to perform tasks for a
following system call, which may or may not immediately follow the
current one. May not, because an application (DB) will offload the task
from the CPU thread to the I/O thread in case of -EAGAIN. A system call
should be complete in itself (and do the minimum, what is asked).


-- 
Goldwyn

^ permalink raw reply

* Re: [PATCH 3/8] nowait aio: return if direct write will trigger writeback
From: Goldwyn Rodrigues @ 2017-03-16 13:46 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-fsdevel, jack, hch, linux-block, linux-btrfs, linux-ext4,
	linux-xfs, sagi, avi, axboe, linux-api, Goldwyn Rodrigues
In-Reply-To: <20170316132052.GG4033@bombadil.infradead.org>



On 03/16/2017 08:20 AM, Matthew Wilcox wrote:
> On Wed, Mar 15, 2017 at 04:51:02PM -0500, Goldwyn Rodrigues wrote:
>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>
>> Find out if the write will trigger a wait due to writeback. If yes,
>> return -EAGAIN.
>>
>> This introduces a new function filemap_range_has_page() which
>> returns true if the file's mapping has a page within the range
>> mentioned.
>>
>> Return -EINVAL for buffered AIO: there are multiple causes of
>> delay such as page locks, dirty throttling logic, page loading
>> from disk etc. which cannot be taken care of.
> 
> Also, this patch only touches the write path; we have a similar call to
> write_and_wait_range() in generic_file_read_iter().
> 

This patch series is concerned with direct-write AIO paths only.

-- 
Goldwyn

^ permalink raw reply

* Re: [PATCH 5/8] nowait aio: return on congested block device
From: Jens Axboe @ 2017-03-16 14:33 UTC (permalink / raw)
  To: Goldwyn Rodrigues, linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, linux-api, willy, Goldwyn Rodrigues
In-Reply-To: <20170315215107.5628-6-rgoldwyn@suse.de>

On 03/15/2017 03:51 PM, Goldwyn Rodrigues wrote:
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 0eeb99e..2e5cba2 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2014,7 +2019,7 @@ blk_qc_t generic_make_request(struct bio *bio)
>  	do {
>  		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>  
> -		if (likely(blk_queue_enter(q, false) == 0)) {
> +		if (likely(blk_queue_enter(q, bio_flagged(bio, BIO_NOWAIT)) == 0)) {
>  			struct bio_list hold;
>  			struct bio_list lower, same;
>  
> @@ -2040,7 +2045,10 @@ blk_qc_t generic_make_request(struct bio *bio)
>  			bio_list_merge(&bio_list_on_stack, &same);
>  			bio_list_merge(&bio_list_on_stack, &hold);
>  		} else {
> -			bio_io_error(bio);
> +			if (unlikely(bio_flagged(bio, BIO_NOWAIT)))
> +				bio_wouldblock_error(bio);
> +			else
> +				bio_io_error(bio);

This doesn't look right. What if the queue is dying, and BIO_NOWAIT just
happened to be set?

And you're missing wbt_wait() as well as a blocking point. Ditto in
blk-mq.

> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 159187a..942ce8c 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1518,6 +1518,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
>  	rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
>  	if (unlikely(!rq)) {
>  		__wbt_done(q->rq_wb, wb_acct);
> +		if (bio && bio_flagged(bio, BIO_NOWAIT))
> +			bio_wouldblock_error(bio);
>  		return BLK_QC_T_NONE;
>  	}
>  

This seems a little fragile now, since not both paths free the bio.

-- 
Jens Axboe

^ permalink raw reply

* [PATCH v3 01/14] md: raid1/raid10: don't handle failure of bio_add_page()
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

All bio_add_page() is for adding one page into resync bio,
which is big enough to hold RESYNC_PAGES pages, and
the current bio_add_page() doesn't check queue limit any more,
so it won't fail at all.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c  | 21 ++++++---------------
 drivers/md/raid10.c | 41 ++++++++++-------------------------------
 2 files changed, 16 insertions(+), 46 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fbc2d7851b49..4a0b2ad5025e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2917,21 +2917,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io) {
 				page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
-				if (bio_add_page(bio, page, len, 0) == 0) {
-					/* stop here */
-					bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
-					while (i > 0) {
-						i--;
-						bio = r1_bio->bios[i];
-						if (bio->bi_end_io==NULL)
-							continue;
-						/* remove last page from this bio */
-						bio->bi_vcnt--;
-						bio->bi_iter.bi_size -= len;
-						bio_clear_flag(bio, BIO_SEG_VALID);
-					}
-					goto bio_full;
-				}
+
+				/*
+				 * won't fail because the vec table is big
+				 * enough to hold all these pages
+				 */
+				bio_add_page(bio, page, len, 0);
 			}
 		}
 		nr_sectors += len>>9;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0536658c9d40..b4a56a488668 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3437,27 +3437,16 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		if (len == 0)
 			break;
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
-			struct bio *bio2;
 			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
-			if (bio_add_page(bio, page, len, 0))
-				continue;
-
-			/* stop here */
-			bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
-			for (bio2 = biolist;
-			     bio2 && bio2 != bio;
-			     bio2 = bio2->bi_next) {
-				/* remove last page from this bio */
-				bio2->bi_vcnt--;
-				bio2->bi_iter.bi_size -= len;
-				bio_clear_flag(bio2, BIO_SEG_VALID);
-			}
-			goto bio_full;
+			/*
+			 * won't fail because the vec table is big enough
+			 * to hold all these pages
+			 */
+			bio_add_page(bio, page, len, 0);
 		}
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
 	} while (biolist->bi_vcnt < RESYNC_PAGES);
- bio_full:
 	r10_bio->sectors = nr_sectors;
 
 	while (biolist) {
@@ -4530,25 +4519,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 		if (len > PAGE_SIZE)
 			len = PAGE_SIZE;
 		for (bio = blist; bio ; bio = bio->bi_next) {
-			struct bio *bio2;
-			if (bio_add_page(bio, page, len, 0))
-				continue;
-
-			/* Didn't fit, must stop */
-			for (bio2 = blist;
-			     bio2 && bio2 != bio;
-			     bio2 = bio2->bi_next) {
-				/* Remove last page from this bio */
-				bio2->bi_vcnt--;
-				bio2->bi_iter.bi_size -= len;
-				bio_clear_flag(bio2, BIO_SEG_VALID);
-			}
-			goto bio_full;
+			/*
+			 * won't fail because the vec table is big enough
+			 * to hold all these pages
+			 */
+			bio_add_page(bio, page, len, 0);
 		}
 		sector_nr += len >> 9;
 		nr_sectors += len >> 9;
 	}
-bio_full:
 	rcu_read_unlock();
 	r10_bio->sectors = nr_sectors;
 
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 00/14]  md: cleanup on direct access to bvec table
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei

In MD's resync I/O path, there are lots of direct access to bio's
bvec table. This patchset kills almost all, and the conversion
is quite straightforward. One root cause of direct access to bvec
table is that resync I/O uses the bio's bvec to manage pages.
In V1, as suggested by Shaohua, a new approach is used to manage
these pages for resync I/O, turns out code becomes more clean
and readable.

Once direct access to bvec table in MD is cleaned up, we may make
multipage bvec moving on.

V3:
	- improve pages allocation & bio clone for write behind(10/14)
	- retrieve pages from preallocated page array, and avoid to user
	bio helpers(14/14)
	- change on helpers about managing resync io pages(03/14)

V2:
	- remove the patch for introducing/applying bio_remove_last_page()

V1:
	- allocate page array to manage resync pages

Thanks,
Ming

Ming Lei (14):
  md: raid1/raid10: don't handle failure of bio_add_page()
  md: move two macros into md.h
  md: prepare for managing resync I/O pages in clean way
  md: raid1: simplify r1buf_pool_free()
  md: raid1: don't use bio's vec table to manage resync pages
  md: raid1: retrieve page from pre-allocated resync page array
  md: raid1: use bio helper in process_checks()
  block: introduce bio_copy_data_partial
  md: raid1: move 'offset' out of loop
  md: raid1: improve write behind
  md: raid10: refactor code of read reshape's .bi_end_io
  md: raid10: don't use bio's vec table to manage resync pages
  md: raid10: retrieve page from preallocated resync page array
  md: raid10: avoid direct access to bvec table in
    handle_reshape_read_error

 block/bio.c         |  60 +++++++++---
 drivers/md/md.h     |  55 +++++++++++
 drivers/md/raid1.c  | 267 ++++++++++++++++++++++++++++------------------------
 drivers/md/raid1.h  |  10 +-
 drivers/md/raid10.c | 224 +++++++++++++++++++++++--------------------
 include/linux/bio.h |   2 +
 6 files changed, 376 insertions(+), 242 deletions(-)

-- 
2.9.3

^ permalink raw reply

* [PATCH v3 02/14] md: move two macros into md.h
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

Both raid1 and raid10 share common resync
block size and page count, so move them into md.h.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.h     | 5 +++++
 drivers/md/raid1.c  | 2 --
 drivers/md/raid10.c | 3 ---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index b8859cbf84b6..1d63239a1be4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -715,4 +715,9 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
 	    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
 		mddev->queue->limits.max_write_same_sectors = 0;
 }
+
+/* Maximum size of each resync request */
+#define RESYNC_BLOCK_SIZE (64*1024)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+
 #endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4a0b2ad5025e..908e2caeb704 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -94,10 +94,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 	kfree(r1_bio);
 }
 
-#define RESYNC_BLOCK_SIZE (64*1024)
 #define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
 #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
 #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b4a56a488668..2b40420299e3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -125,9 +125,6 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 	kfree(r10_bio);
 }
 
-/* Maximum size of each resync request */
-#define RESYNC_BLOCK_SIZE (64*1024)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 /* amount of memory to reserve for resync requests */
 #define RESYNC_WINDOW (1024*1024)
 /* maximum number of concurrent requests, memory permitting */
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 05/14] md: raid1: don't use bio's vec table to manage resync pages
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

Now we allocate one page array for managing resync pages, instead
of using bio's vec table to do that, and the old way is very hacky
and won't work any more if multipage bvec is enabled.

The introduced cost is that we need to allocate (128 + 16) * raid_disks
bytes per r1_bio, and it is fine because the inflight r1_bio for
resync shouldn't be much, as pointed by Shaohua.

Also the bio_reset() in raid1_sync_request() is removed because
all bios are freshly new now and not necessary to reset any more.

This patch can be thought as a cleanup too

Suggested-by: Shaohua Li <shli@kernel.org>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 94 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 64 insertions(+), 30 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e30d89690109..0e64beb60e4d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -80,6 +80,24 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 #define raid1_log(md, fmt, args...)				\
 	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
 
+/*
+ * 'strct resync_pages' stores actual pages used for doing the resync
+ *  IO, and it is per-bio, so make .bi_private points to it.
+ */
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+	return bio->bi_private;
+}
+
+/*
+ * for resync bio, r1bio pointer can be retrieved from the per-bio
+ * 'struct resync_pages'.
+ */
+static inline struct r1bio *get_resync_r1bio(struct bio *bio)
+{
+	return get_resync_pages(bio)->raid_bio;
+}
+
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
@@ -107,12 +125,18 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	struct r1bio *r1_bio;
 	struct bio *bio;
 	int need_pages;
-	int i, j;
+	int j;
+	struct resync_pages *rps;
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
 	if (!r1_bio)
 		return NULL;
 
+	rps = kmalloc(sizeof(struct resync_pages) * pi->raid_disks,
+		      gfp_flags);
+	if (!rps)
+		goto out_free_r1bio;
+
 	/*
 	 * Allocate bios : 1 for reading, n-1 for writing
 	 */
@@ -132,22 +156,22 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 		need_pages = pi->raid_disks;
 	else
 		need_pages = 1;
-	for (j = 0; j < need_pages; j++) {
+	for (j = 0; j < pi->raid_disks; j++) {
+		struct resync_pages *rp = &rps[j];
+
 		bio = r1_bio->bios[j];
-		bio->bi_vcnt = RESYNC_PAGES;
-
-		if (bio_alloc_pages(bio, gfp_flags))
-			goto out_free_pages;
-	}
-	/* If not user-requests, copy the page pointers to all bios */
-	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
-		for (i=0; i<RESYNC_PAGES ; i++)
-			for (j=1; j<pi->raid_disks; j++) {
-				struct page *page =
-					r1_bio->bios[0]->bi_io_vec[i].bv_page;
-				get_page(page);
-				r1_bio->bios[j]->bi_io_vec[i].bv_page = page;
-			}
+
+		if (j < need_pages) {
+			if (resync_alloc_pages(rp, gfp_flags))
+				goto out_free_pages;
+		} else {
+			memcpy(rp, &rps[0], sizeof(*rp));
+			resync_get_all_pages(rp);
+		}
+
+		rp->idx = 0;
+		rp->raid_bio = r1_bio;
+		bio->bi_private = rp;
 	}
 
 	r1_bio->master_bio = NULL;
@@ -156,11 +180,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 
 out_free_pages:
 	while (--j >= 0)
-		bio_free_pages(r1_bio->bios[j]);
+		resync_free_pages(&rps[j]);
 
 out_free_bio:
 	while (++j < pi->raid_disks)
 		bio_put(r1_bio->bios[j]);
+	kfree(rps);
+
+out_free_r1bio:
 	r1bio_pool_free(r1_bio, data);
 	return NULL;
 }
@@ -168,14 +195,18 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 static void r1buf_pool_free(void *__r1_bio, void *data)
 {
 	struct pool_info *pi = data;
-	int i,j;
+	int i;
 	struct r1bio *r1bio = __r1_bio;
+	struct resync_pages *rp = NULL;
 
-	for (i = 0; i < RESYNC_PAGES; i++)
-		for (j = pi->raid_disks; j-- ;)
-			safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
-	for (i=0 ; i < pi->raid_disks; i++)
+	for (i = pi->raid_disks; i--; ) {
+		rp = get_resync_pages(r1bio->bios[i]);
+		resync_free_pages(rp);
 		bio_put(r1bio->bios[i]);
+	}
+
+	/* resync pages array stored in the 1st bio's .bi_private */
+	kfree(rp);
 
 	r1bio_pool_free(r1bio, data);
 }
@@ -1863,7 +1894,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 
 static void end_sync_read(struct bio *bio)
 {
-	struct r1bio *r1_bio = bio->bi_private;
+	struct r1bio *r1_bio = get_resync_r1bio(bio);
 
 	update_head_pos(r1_bio->read_disk, r1_bio);
 
@@ -1882,7 +1913,7 @@ static void end_sync_read(struct bio *bio)
 static void end_sync_write(struct bio *bio)
 {
 	int uptodate = !bio->bi_error;
-	struct r1bio *r1_bio = bio->bi_private;
+	struct r1bio *r1_bio = get_resync_r1bio(bio);
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
 	sector_t first_bad;
@@ -2099,6 +2130,7 @@ static void process_checks(struct r1bio *r1_bio)
 		int size;
 		int error;
 		struct bio *b = r1_bio->bios[i];
+		struct resync_pages *rp = get_resync_pages(b);
 		if (b->bi_end_io != end_sync_read)
 			continue;
 		/* fixup the bio for reuse, but preserve errno */
@@ -2111,7 +2143,8 @@ static void process_checks(struct r1bio *r1_bio)
 			conf->mirrors[i].rdev->data_offset;
 		b->bi_bdev = conf->mirrors[i].rdev->bdev;
 		b->bi_end_io = end_sync_read;
-		b->bi_private = r1_bio;
+		rp->raid_bio = r1_bio;
+		b->bi_private = rp;
 
 		size = b->bi_iter.bi_size;
 		for (j = 0; j < vcnt ; j++) {
@@ -2769,7 +2802,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct md_rdev *rdev;
 		bio = r1_bio->bios[i];
-		bio_reset(bio);
 
 		rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev == NULL ||
@@ -2825,7 +2857,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 			atomic_inc(&rdev->nr_pending);
 			bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
 			bio->bi_bdev = rdev->bdev;
-			bio->bi_private = r1_bio;
 			if (test_bit(FailFast, &rdev->flags))
 				bio->bi_opf |= MD_FAILFAST;
 		}
@@ -2911,9 +2942,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 
 		for (i = 0 ; i < conf->raid_disks * 2; i++) {
+			struct resync_pages *rp;
+
 			bio = r1_bio->bios[i];
+			rp = get_resync_pages(bio);
 			if (bio->bi_end_io) {
-				page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+				page = resync_fetch_page(rp, rp->idx++);
 
 				/*
 				 * won't fail because the vec table is big
@@ -2925,8 +2959,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
 		sync_blocks -= (len>>9);
-	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
- bio_full:
+	} while (get_resync_pages(r1_bio->bios[disk]->bi_private)->idx < RESYNC_PAGES);
+
 	r1_bio->sectors = nr_sectors;
 
 	if (mddev_is_clustered(mddev) &&
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 06/14] md: raid1: retrieve page from pre-allocated resync page array
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

Now one page array is allocated for each resync bio, and we can
retrieve page from this table directly.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0e64beb60e4d..b1345aa4efd8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1992,6 +1992,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
 	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+	struct page **pages = get_resync_pages(bio)->pages;
 	sector_t sect = r1_bio->sector;
 	int sectors = r1_bio->sectors;
 	int idx = 0;
@@ -2025,7 +2026,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				 */
 				rdev = conf->mirrors[d].rdev;
 				if (sync_page_io(rdev, sect, s<<9,
-						 bio->bi_io_vec[idx].bv_page,
+						 pages[idx],
 						 REQ_OP_READ, 0, false)) {
 					success = 1;
 					break;
@@ -2080,7 +2081,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				continue;
 			rdev = conf->mirrors[d].rdev;
 			if (r1_sync_page_io(rdev, sect, s,
-					    bio->bi_io_vec[idx].bv_page,
+					    pages[idx],
 					    WRITE) == 0) {
 				r1_bio->bios[d]->bi_end_io = NULL;
 				rdev_dec_pending(rdev, mddev);
@@ -2095,7 +2096,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				continue;
 			rdev = conf->mirrors[d].rdev;
 			if (r1_sync_page_io(rdev, sect, s,
-					    bio->bi_io_vec[idx].bv_page,
+					    pages[idx],
 					    READ) != 0)
 				atomic_add(s, &rdev->corrected_errors);
 		}
@@ -2171,6 +2172,8 @@ static void process_checks(struct r1bio *r1_bio)
 		struct bio *pbio = r1_bio->bios[primary];
 		struct bio *sbio = r1_bio->bios[i];
 		int error = sbio->bi_error;
+		struct page **ppages = get_resync_pages(pbio)->pages;
+		struct page **spages = get_resync_pages(sbio)->pages;
 
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
@@ -2179,11 +2182,8 @@ static void process_checks(struct r1bio *r1_bio)
 
 		if (!error) {
 			for (j = vcnt; j-- ; ) {
-				struct page *p, *s;
-				p = pbio->bi_io_vec[j].bv_page;
-				s = sbio->bi_io_vec[j].bv_page;
-				if (memcmp(page_address(p),
-					   page_address(s),
+				if (memcmp(page_address(ppages[j]),
+					   page_address(spages[j]),
 					   sbio->bi_io_vec[j].bv_len))
 					break;
 			}
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 04/14] md: raid1: simplify r1buf_pool_free()
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

This patch gets each page's reference of each bio for resync,
then r1buf_pool_free() gets simplified a lot.

The same policy has been taken in raid10's buf pool allocation/free
too.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 908e2caeb704..e30d89690109 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -142,9 +142,12 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	/* If not user-requests, copy the page pointers to all bios */
 	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
 		for (i=0; i<RESYNC_PAGES ; i++)
-			for (j=1; j<pi->raid_disks; j++)
-				r1_bio->bios[j]->bi_io_vec[i].bv_page =
+			for (j=1; j<pi->raid_disks; j++) {
+				struct page *page =
 					r1_bio->bios[0]->bi_io_vec[i].bv_page;
+				get_page(page);
+				r1_bio->bios[j]->bi_io_vec[i].bv_page = page;
+			}
 	}
 
 	r1_bio->master_bio = NULL;
@@ -169,12 +172,8 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
 	struct r1bio *r1bio = __r1_bio;
 
 	for (i = 0; i < RESYNC_PAGES; i++)
-		for (j = pi->raid_disks; j-- ;) {
-			if (j == 0 ||
-			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
-			    r1bio->bios[0]->bi_io_vec[i].bv_page)
-				safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
-		}
+		for (j = pi->raid_disks; j-- ;)
+			safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
 	for (i=0 ; i < pi->raid_disks; i++)
 		bio_put(r1bio->bios[i]);
 
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 03/14] md: prepare for managing resync I/O pages in clean way
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

Now resync I/O use bio's bec table to manage pages,
this way is very hacky, and may not work any more
once multipage bvec is introduced.

So introduce helpers and new data structure for
managing resync I/O pages more cleanly.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1d63239a1be4..20c48032493b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -720,4 +720,54 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
 #define RESYNC_BLOCK_SIZE (64*1024)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 
+/* for managing resync I/O pages */
+struct resync_pages {
+	unsigned	idx;	/* for get/put page from the pool */
+	void		*raid_bio;
+	struct page	*pages[RESYNC_PAGES];
+};
+
+static inline int resync_alloc_pages(struct resync_pages *rp,
+				     gfp_t gfp_flags)
+{
+	int i;
+
+	for (i = 0; i < RESYNC_PAGES; i++) {
+		rp->pages[i] = alloc_page(gfp_flags);
+		if (!rp->pages[i])
+			goto out_free;
+	}
+
+	return 0;
+
+ out_free:
+	while (--i >= 0)
+		put_page(rp->pages[i]);
+	return -ENOMEM;
+}
+
+static inline void resync_free_pages(struct resync_pages *rp)
+{
+	int i;
+
+	for (i = 0; i < RESYNC_PAGES; i++)
+		put_page(rp->pages[i]);
+}
+
+static inline void resync_get_all_pages(struct resync_pages *rp)
+{
+	int i;
+
+	for (i = 0; i < RESYNC_PAGES; i++)
+		get_page(rp->pages[i]);
+}
+
+static inline struct page *resync_fetch_page(struct resync_pages *rp,
+					     unsigned idx)
+{
+	if (WARN_ON_ONCE(idx >= RESYNC_PAGES))
+		return NULL;
+	return rp->pages[idx];
+}
+
 #endif /* _MD_MD_H */
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 07/14] md: raid1: use bio helper in process_checks()
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

Avoid to direct access to bvec table.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b1345aa4efd8..4034a2963da8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2130,6 +2130,7 @@ static void process_checks(struct r1bio *r1_bio)
 		int j;
 		int size;
 		int error;
+		struct bio_vec *bi;
 		struct bio *b = r1_bio->bios[i];
 		struct resync_pages *rp = get_resync_pages(b);
 		if (b->bi_end_io != end_sync_read)
@@ -2148,9 +2149,7 @@ static void process_checks(struct r1bio *r1_bio)
 		b->bi_private = rp;
 
 		size = b->bi_iter.bi_size;
-		for (j = 0; j < vcnt ; j++) {
-			struct bio_vec *bi;
-			bi = &b->bi_io_vec[j];
+		bio_for_each_segment_all(bi, b, j) {
 			bi->bv_offset = 0;
 			if (size > PAGE_SIZE)
 				bi->bv_len = PAGE_SIZE;
@@ -2174,17 +2173,22 @@ static void process_checks(struct r1bio *r1_bio)
 		int error = sbio->bi_error;
 		struct page **ppages = get_resync_pages(pbio)->pages;
 		struct page **spages = get_resync_pages(sbio)->pages;
+		struct bio_vec *bi;
+		int page_len[RESYNC_PAGES];
 
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
 		/* Now we can 'fixup' the error value */
 		sbio->bi_error = 0;
 
+		bio_for_each_segment_all(bi, sbio, j)
+			page_len[j] = bi->bv_len;
+
 		if (!error) {
 			for (j = vcnt; j-- ; ) {
 				if (memcmp(page_address(ppages[j]),
 					   page_address(spages[j]),
-					   sbio->bi_io_vec[j].bv_len))
+					   page_len[j]))
 					break;
 			}
 		} else
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 09/14] md: raid1: move 'offset' out of loop
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

The 'offset' local variable can't be changed inside the loop, so
move it out.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4034a2963da8..2f3622c695ce 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1317,6 +1317,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
+	sector_t offset;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1481,13 +1482,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 	atomic_set(&r1_bio->behind_remaining, 0);
 
 	first_clone = 1;
+
+	offset = r1_bio->sector - bio->bi_iter.bi_sector;
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio = NULL;
-		sector_t offset;
 		if (!r1_bio->bios[i])
 			continue;
 
-		offset = r1_bio->sector - bio->bi_iter.bi_sector;
 
 		if (first_clone) {
 			/* do behind I/O ?
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 08/14] block: introduce bio_copy_data_partial
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

Turns out we can use bio_copy_data in raid1's write behind,
and we can make alloc_behind_pages() more clean/efficient,
but we need to partial version of bio_copy_data().

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 block/bio.c         | 60 +++++++++++++++++++++++++++++++++++++++++------------
 include/linux/bio.h |  2 ++
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index e75878f8b14a..1ccff0dace89 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1025,19 +1025,8 @@ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(bio_alloc_pages);
 
-/**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
- *
- * Stops when it reaches the end of either @src or @dst - that is, copies
- * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
- */
-void bio_copy_data(struct bio *dst, struct bio *src)
+static void __bio_copy_data(struct bio *dst, struct bio *src,
+			    int offset, int size)
 {
 	struct bvec_iter src_iter, dst_iter;
 	struct bio_vec src_bv, dst_bv;
@@ -1047,6 +1036,12 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 	src_iter = src->bi_iter;
 	dst_iter = dst->bi_iter;
 
+	/* for supporting partial copy */
+	if (offset || size != src->bi_iter.bi_size) {
+		bio_advance_iter(src, &src_iter, offset);
+		src_iter.bi_size = size;
+	}
+
 	while (1) {
 		if (!src_iter.bi_size) {
 			src = src->bi_next;
@@ -1083,8 +1078,47 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 		bio_advance_iter(dst, &dst_iter, bytes);
 	}
 }
+
+/**
+ * bio_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+ * @src and @dst as linked lists of bios.
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
+void bio_copy_data(struct bio *dst, struct bio *src)
+{
+	__bio_copy_data(dst, src, 0, src->bi_iter.bi_size);
+}
 EXPORT_SYMBOL(bio_copy_data);
 
+/**
+ * bio_copy_data_partial - copy partial contents of data buffers from one
+ * chain of bios to another
+ * @dst: destination bio list
+ * @src: source bio list
+ * @offset: starting copy from the offset
+ * @size: how many bytes to copy
+ *
+ * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+ * @src and @dst as linked lists of bios.
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
+void bio_copy_data_partial(struct bio *dst, struct bio *src,
+			   int offset, int size)
+{
+	__bio_copy_data(dst, src, offset, size);
+
+}
+EXPORT_SYMBOL(bio_copy_data_partial);
+
 struct bio_map_data {
 	int is_our_pages;
 	struct iov_iter iter;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc..42b62a0288b0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -468,6 +468,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 #endif
 
 extern void bio_copy_data(struct bio *dst, struct bio *src);
+extern void bio_copy_data_partial(struct bio *dst, struct bio *src,
+				  int offset, int size);
 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 extern void bio_free_pages(struct bio *bio);
 
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 10/14] md: raid1: improve write behind
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

This patch improve handling of write behind in the following ways:

- introduce behind master bio to hold all write behind pages
- fast clone bios from behind master bio
- avoid to change bvec table directly
- use bio_copy_data() and make code more clean

Suggested-by: Shaohua Li <shli@fb.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 118 ++++++++++++++++++++++++-----------------------------
 drivers/md/raid1.h |  10 +++--
 2 files changed, 61 insertions(+), 67 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2f3622c695ce..3c13286190c1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -405,12 +405,9 @@ static void close_write(struct r1bio *r1_bio)
 {
 	/* it really is the end of this request */
 	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-		/* free extra copy of the data pages */
-		int i = r1_bio->behind_page_count;
-		while (i--)
-			safe_put_page(r1_bio->behind_bvecs[i].bv_page);
-		kfree(r1_bio->behind_bvecs);
-		r1_bio->behind_bvecs = NULL;
+		bio_free_pages(r1_bio->behind_master_bio);
+		bio_put(r1_bio->behind_master_bio);
+		r1_bio->behind_master_bio = NULL;
 	}
 	/* clear the bitmap if all writes complete successfully */
 	bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -512,6 +509,10 @@ static void raid1_end_write_request(struct bio *bio)
 	}
 
 	if (behind) {
+		/* we release behind master bio when all write are done */
+		if (r1_bio->behind_master_bio == bio)
+			to_put = NULL;
+
 		if (test_bit(WriteMostly, &rdev->flags))
 			atomic_dec(&r1_bio->behind_remaining);
 
@@ -1096,39 +1097,46 @@ static void unfreeze_array(struct r1conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-/* duplicate the data pages for behind I/O
- */
-static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
+static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio,
+					   struct bio *bio,
+					   int offset, int size)
 {
-	int i;
-	struct bio_vec *bvec;
-	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
-					GFP_NOIO);
-	if (unlikely(!bvecs))
-		return;
+	unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	int i = 0;
+	struct bio *behind_bio = NULL;
+
+	behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev);
+	if (!behind_bio)
+		goto fail;
+
+	while (i < vcnt && size) {
+		struct page *page;
+		int len = min_t(int, PAGE_SIZE, size);
+
+		page = alloc_page(GFP_NOIO);
+		if (unlikely(!page))
+			goto free_pages;
+
+		bio_add_page(behind_bio, page, len, 0);
+
+		size -= len;
+		i++;
+	}
 
-	bio_for_each_segment_all(bvec, bio, i) {
-		bvecs[i] = *bvec;
-		bvecs[i].bv_page = alloc_page(GFP_NOIO);
-		if (unlikely(!bvecs[i].bv_page))
-			goto do_sync_io;
-		memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
-		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
-		kunmap(bvecs[i].bv_page);
-		kunmap(bvec->bv_page);
-	}
-	r1_bio->behind_bvecs = bvecs;
-	r1_bio->behind_page_count = bio->bi_vcnt;
+	bio_copy_data_partial(behind_bio, bio, offset,
+			      behind_bio->bi_iter.bi_size);
+
+	r1_bio->behind_master_bio = behind_bio;;
 	set_bit(R1BIO_BehindIO, &r1_bio->state);
-	return;
 
-do_sync_io:
-	for (i = 0; i < bio->bi_vcnt; i++)
-		if (bvecs[i].bv_page)
-			put_page(bvecs[i].bv_page);
-	kfree(bvecs);
+	return behind_bio;
+
+ free_pages:
 	pr_debug("%dB behind alloc failed, doing sync I/O\n",
 		 bio->bi_iter.bi_size);
+	bio_free_pages(behind_bio);
+ fail:
+	return behind_bio;
 }
 
 struct raid1_plug_cb {
@@ -1499,11 +1507,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 			    (atomic_read(&bitmap->behind_writes)
 			     < mddev->bitmap_info.max_write_behind) &&
 			    !waitqueue_active(&bitmap->behind_wait)) {
-				mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
-								mddev->bio_set,
-								offset << 9,
-								max_sectors << 9);
-				alloc_behind_pages(mbio, r1_bio);
+				mbio = alloc_behind_master_bio(r1_bio, bio,
+							       offset << 9,
+							       max_sectors << 9);
 			}
 
 			bitmap_startwrite(bitmap, r1_bio->sector,
@@ -1514,26 +1520,17 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 		}
 
 		if (!mbio) {
-			if (r1_bio->behind_bvecs)
-				mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
-								mddev->bio_set,
-								offset << 9,
-								max_sectors << 9);
+			if (r1_bio->behind_master_bio)
+				mbio = bio_clone_fast(r1_bio->behind_master_bio,
+						      GFP_NOIO,
+						      mddev->bio_set);
 			else {
 				mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 				bio_trim(mbio, offset, max_sectors);
 			}
 		}
 
-		if (r1_bio->behind_bvecs) {
-			struct bio_vec *bvec;
-			int j;
-
-			/*
-			 * We trimmed the bio, so _all is legit
-			 */
-			bio_for_each_segment_all(bvec, mbio, j)
-				bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
+		if (r1_bio->behind_master_bio) {
 			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
 				atomic_inc(&r1_bio->behind_remaining);
 		}
@@ -2405,18 +2402,11 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 		/* Write at 'sector' for 'sectors'*/
 
 		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-			unsigned vcnt = r1_bio->behind_page_count;
-			struct bio_vec *vec = r1_bio->behind_bvecs;
-
-			while (!vec->bv_page) {
-				vec++;
-				vcnt--;
-			}
-
-			wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
-			memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
-
-			wbio->bi_vcnt = vcnt;
+			wbio = bio_clone_fast(r1_bio->behind_master_bio,
+					      GFP_NOIO,
+					      mddev->bio_set);
+			/* We really need a _all clone */
+			wbio->bi_iter = (struct bvec_iter){ 0 };
 		} else {
 			wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
 					      mddev->bio_set);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index dd22a37d0d83..4271cd7ac2de 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -153,9 +153,13 @@ struct r1bio {
 	int			read_disk;
 
 	struct list_head	retry_list;
-	/* Next two are only valid when R1BIO_BehindIO is set */
-	struct bio_vec		*behind_bvecs;
-	int			behind_page_count;
+
+	/*
+	 * When R1BIO_BehindIO is set, we store pages for write behind
+	 * in behind_master_bio.
+	 */
+	struct bio		*behind_master_bio;
+
 	/*
 	 * if the IO is in WRITE direction, then multiple bios are used.
 	 * We choose the number when they are allocated.
-- 
2.9.3

^ permalink raw reply related

* [PATCH v3 12/14] md: raid10: don't use bio's vec table to manage resync pages
From: Ming Lei @ 2017-03-16 16:12 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <20170316161235.27110-1-tom.leiming@gmail.com>

Now we allocate one page array for managing resync pages, instead
of using bio's vec table to do that, and the old way is very hacky
and won't work any more if multipage bvec is enabled.

The introduced cost is that we need to allocate (128 + 16) * copies
bytes per r10_bio, and it is fine because the inflight r10_bio for
resync shouldn't be much, as pointed by Shaohua.

Also bio_reset() in raid10_sync_request() and reshape_request()
are removed because all bios are freshly new now in these functions
and not necessary to reset any more.

This patch can be thought as cleanup too.

Suggested-by: Shaohua Li <shli@kernel.org>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 134 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 82 insertions(+), 52 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 98e53f39b66e..d2cb68971486 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -110,6 +110,24 @@ static void end_reshape(struct r10conf *conf);
 #define raid10_log(md, fmt, args...)				\
 	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
 
+/*
+ * 'strct resync_pages' stores actual pages used for doing the resync
+ *  IO, and it is per-bio, so make .bi_private points to it.
+ */
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+	return bio->bi_private;
+}
+
+/*
+ * for resync bio, r10bio pointer can be retrieved from the per-bio
+ * 'struct resync_pages'.
+ */
+static inline struct r10bio *get_resync_r10bio(struct bio *bio)
+{
+	return get_resync_pages(bio)->raid_bio;
+}
+
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
@@ -140,11 +158,11 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
-	struct page *page;
 	struct r10bio *r10_bio;
 	struct bio *bio;
-	int i, j;
-	int nalloc;
+	int j;
+	int nalloc, nalloc_rp;
+	struct resync_pages *rps;
 
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
 	if (!r10_bio)
@@ -156,6 +174,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	else
 		nalloc = 2; /* recovery */
 
+	/* allocate once for all bios */
+	if (!conf->have_replacement)
+		nalloc_rp = nalloc;
+	else
+		nalloc_rp = nalloc * 2;
+	rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
+	if (!rps)
+		goto out_free_r10bio;
+
 	/*
 	 * Allocate bios.
 	 */
@@ -175,36 +202,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	 * Allocate RESYNC_PAGES data pages and attach them
 	 * where needed.
 	 */
-	for (j = 0 ; j < nalloc; j++) {
+	for (j = 0; j < nalloc; j++) {
 		struct bio *rbio = r10_bio->devs[j].repl_bio;
+		struct resync_pages *rp, *rp_repl;
+
+		rp = &rps[j];
+		if (rbio)
+			rp_repl = &rps[nalloc + j];
+
 		bio = r10_bio->devs[j].bio;
-		for (i = 0; i < RESYNC_PAGES; i++) {
-			if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
-					       &conf->mddev->recovery)) {
-				/* we can share bv_page's during recovery
-				 * and reshape */
-				struct bio *rbio = r10_bio->devs[0].bio;
-				page = rbio->bi_io_vec[i].bv_page;
-				get_page(page);
-			} else
-				page = alloc_page(gfp_flags);
-			if (unlikely(!page))
+
+		if (!j || test_bit(MD_RECOVERY_SYNC,
+				   &conf->mddev->recovery)) {
+			if (resync_alloc_pages(rp, gfp_flags))
 				goto out_free_pages;
+		} else {
+			memcpy(rp, &rps[0], sizeof(*rp));
+			resync_get_all_pages(rp);
+		}
 
-			bio->bi_io_vec[i].bv_page = page;
-			if (rbio)
-				rbio->bi_io_vec[i].bv_page = page;
+		rp->idx = 0;
+		rp->raid_bio = r10_bio;
+		bio->bi_private = rp;
+		if (rbio) {
+			memcpy(rp_repl, rp, sizeof(*rp));
+			rbio->bi_private = rp_repl;
 		}
 	}
 
 	return r10_bio;
 
 out_free_pages:
-	for ( ; i > 0 ; i--)
-		safe_put_page(bio->bi_io_vec[i-1].bv_page);
-	while (j--)
-		for (i = 0; i < RESYNC_PAGES ; i++)
-			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+	while (--j >= 0)
+		resync_free_pages(&rps[j * 2]);
+
 	j = 0;
 out_free_bio:
 	for ( ; j < nalloc; j++) {
@@ -213,30 +244,34 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 		if (r10_bio->devs[j].repl_bio)
 			bio_put(r10_bio->devs[j].repl_bio);
 	}
+	kfree(rps);
+out_free_r10bio:
 	r10bio_pool_free(r10_bio, conf);
 	return NULL;
 }
 
 static void r10buf_pool_free(void *__r10_bio, void *data)
 {
-	int i;
 	struct r10conf *conf = data;
 	struct r10bio *r10bio = __r10_bio;
 	int j;
+	struct resync_pages *rp = NULL;
 
-	for (j=0; j < conf->copies; j++) {
+	for (j = conf->copies; j--; ) {
 		struct bio *bio = r10bio->devs[j].bio;
-		if (bio) {
-			for (i = 0; i < RESYNC_PAGES; i++) {
-				safe_put_page(bio->bi_io_vec[i].bv_page);
-				bio->bi_io_vec[i].bv_page = NULL;
-			}
-			bio_put(bio);
-		}
+
+		rp = get_resync_pages(bio);
+		resync_free_pages(rp);
+		bio_put(bio);
+
 		bio = r10bio->devs[j].repl_bio;
 		if (bio)
 			bio_put(bio);
 	}
+
+	/* resync pages array stored in the 1st bio's .bi_private */
+	kfree(rp);
+
 	r10bio_pool_free(r10bio, conf);
 }
 
@@ -1937,7 +1972,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
 
 static void end_sync_read(struct bio *bio)
 {
-	struct r10bio *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = get_resync_r10bio(bio);
 	struct r10conf *conf = r10_bio->mddev->private;
 	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
@@ -1946,6 +1981,7 @@ static void end_sync_read(struct bio *bio)
 
 static void end_reshape_read(struct bio *bio)
 {
+	/* reshape read bio isn't allocated from r10buf_pool */
 	struct r10bio *r10_bio = bio->bi_private;
 
 	__end_sync_read(r10_bio, bio, r10_bio->read_slot);
@@ -1980,7 +2016,7 @@ static void end_sync_request(struct r10bio *r10_bio)
 
 static void end_sync_write(struct bio *bio)
 {
-	struct r10bio *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = get_resync_r10bio(bio);
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
 	int d;
@@ -2060,6 +2096,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	for (i=0 ; i < conf->copies ; i++) {
 		int  j, d;
 		struct md_rdev *rdev;
+		struct resync_pages *rp;
 
 		tbio = r10_bio->devs[i].bio;
 
@@ -2101,11 +2138,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		 * First we need to fixup bv_offset, bv_len and
 		 * bi_vecs, as the read request might have corrupted these
 		 */
+		rp = get_resync_pages(tbio);
 		bio_reset(tbio);
 
 		tbio->bi_vcnt = vcnt;
 		tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
-		tbio->bi_private = r10_bio;
+		rp->raid_bio = r10_bio;
+		tbio->bi_private = rp;
 		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
 		tbio->bi_end_io = end_sync_write;
 		bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
@@ -3173,10 +3212,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 					}
 				}
 				bio = r10_bio->devs[0].bio;
-				bio_reset(bio);
 				bio->bi_next = biolist;
 				biolist = bio;
-				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_read;
 				bio_set_op_attrs(bio, REQ_OP_READ, 0);
 				if (test_bit(FailFast, &rdev->flags))
@@ -3200,10 +3237,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 				if (!test_bit(In_sync, &mrdev->flags)) {
 					bio = r10_bio->devs[1].bio;
-					bio_reset(bio);
 					bio->bi_next = biolist;
 					biolist = bio;
-					bio->bi_private = r10_bio;
 					bio->bi_end_io = end_sync_write;
 					bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 					bio->bi_iter.bi_sector = to_addr
@@ -3228,10 +3263,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				if (mreplace == NULL || bio == NULL ||
 				    test_bit(Faulty, &mreplace->flags))
 					break;
-				bio_reset(bio);
 				bio->bi_next = biolist;
 				biolist = bio;
-				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_write;
 				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 				bio->bi_iter.bi_sector = to_addr +
@@ -3353,7 +3386,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
 
 			bio = r10_bio->devs[i].bio;
-			bio_reset(bio);
 			bio->bi_error = -EIO;
 			rcu_read_lock();
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -3378,7 +3410,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			atomic_inc(&r10_bio->remaining);
 			bio->bi_next = biolist;
 			biolist = bio;
-			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_read;
 			bio_set_op_attrs(bio, REQ_OP_READ, 0);
 			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3397,13 +3428,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 			/* Need to set up for writing to the replacement */
 			bio = r10_bio->devs[i].repl_bio;
-			bio_reset(bio);
 			bio->bi_error = -EIO;
 
 			sector = r10_bio->devs[i].addr;
 			bio->bi_next = biolist;
 			biolist = bio;
-			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_write;
 			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3442,7 +3471,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		if (len == 0)
 			break;
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
-			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+			struct resync_pages *rp = get_resync_pages(bio);
+			page = resync_fetch_page(rp, rp->idx++);
 			/*
 			 * won't fail because the vec table is big enough
 			 * to hold all these pages
@@ -3451,7 +3481,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
-	} while (biolist->bi_vcnt < RESYNC_PAGES);
+	} while (get_resync_pages(biolist)->idx < RESYNC_PAGES);
 	r10_bio->sectors = nr_sectors;
 
 	while (biolist) {
@@ -3459,7 +3489,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		biolist = biolist->bi_next;
 
 		bio->bi_next = NULL;
-		r10_bio = bio->bi_private;
+		r10_bio = get_resync_r10bio(bio);
 		r10_bio->sectors = nr_sectors;
 
 		if (bio->bi_end_io == end_sync_read) {
@@ -4354,6 +4384,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	struct bio *blist;
 	struct bio *bio, *read_bio;
 	int sectors_done = 0;
+	struct page **pages;
 
 	if (sector_nr == 0) {
 		/* If restarting in the middle, skip the initial sectors */
@@ -4504,11 +4535,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
 			continue;
 
-		bio_reset(b);
 		b->bi_bdev = rdev2->bdev;
 		b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
 			rdev2->new_data_offset;
-		b->bi_private = r10_bio;
 		b->bi_end_io = end_reshape_write;
 		bio_set_op_attrs(b, REQ_OP_WRITE, 0);
 		b->bi_next = blist;
@@ -4518,8 +4547,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	/* Now add as many pages as possible to all of these bios. */
 
 	nr_sectors = 0;
+	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
 	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
-		struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
+		struct page *page = pages[s / (PAGE_SIZE >> 9)];
 		int len = (max_sectors - s) << 9;
 		if (len > PAGE_SIZE)
 			len = PAGE_SIZE;
@@ -4703,7 +4733,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
 
 static void end_reshape_write(struct bio *bio)
 {
-	struct r10bio *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = get_resync_r10bio(bio);
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
 	int d;
-- 
2.9.3

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox