From: "Darrick J. Wong" <darrick.wong@oracle.com>
To: axboe@kernel.dk, zab@redhat.com, martin.petersen@oracle.com,
darrick.wong@oracle.com, JBottomley@parallels.com,
jmoyer@redhat.com, bcrl@kvack.org, viro@zeniv.linux.org.uk
Cc: linux-fsdevel@vger.kernel.org, linux-aio@kvack.org,
linux-scsi@vger.kernel.org, linux-mm@kvack.org
Subject: [PATCH 3/6] aio/dio: enable PI passthrough
Date: Mon, 24 Mar 2014 09:22:52 -0700 [thread overview]
Message-ID: <20140324162251.10848.56452.stgit@birch.djwong.org> (raw)
In-Reply-To: <20140324162231.10848.4863.stgit@birch.djwong.org>
Provide an IO extension handler that attaches PI data from the io
extension structure to a kiocb, then teach directio how to attach the
pages representing the PI buffer directly to a bio.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
Documentation/block/data-integrity.txt | 11 ++++
fs/aio.c | 62 +++++++++++++++++++++
fs/bio-integrity.c | 94 +++++++++++++++++++++++++++++++-
fs/direct-io.c | 70 +++++++++++++++++++-----
include/linux/aio.h | 10 +++
include/linux/bio.h | 15 +++++
include/uapi/linux/aio_abi.h | 6 ++
mm/filemap.c | 6 ++
8 files changed, 259 insertions(+), 15 deletions(-)
diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt
index 2d735b0a..1d1f070 100644
--- a/Documentation/block/data-integrity.txt
+++ b/Documentation/block/data-integrity.txt
@@ -282,6 +282,17 @@ will require extra work due to the application tag.
It is up to the receiver to process them and verify data
integrity upon completion.
+ int bio_integrity_prep_buffer(struct bio *bio, int rw,
+ struct bio_integrity_prep_iter *pi);
+
+ This function should be called before submit_bio; its purpose is to
+ attach an arbitrary array of struct page * containing integrity data
+ to an existing bio. Primarily this is intended for AIO/DIO to be
+ able to attach a userspace buffer to a bio.
+
+ The bio_integrity_prep_iter should contain the page offset and buffer
+ length of the PI buffer, the number of pages, and the actual array of
+ pages, as returned by get_user_pages.
5.4 REGISTERING A BLOCK DEVICE AS CAPABLE OF EXCHANGING INTEGRITY
METADATA
diff --git a/fs/aio.c b/fs/aio.c
index 0c40bdc..3f932c3 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1379,7 +1379,69 @@ struct io_extension_type {
int (*destroy_fn)(struct kiocb *);
};
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+/*
+ * Undo setup_pi_ext(): drop one reference on each of the pi_nrpages pages
+ * recorded for this kiocb, then free the page-pointer array.  A kiocb with
+ * no page array attached is left untouched.  Always returns 0.
+ */
+static int destroy_pi_ext(struct kiocb *req)
+{
+	struct bio_integrity_prep_iter *iter = &req->ki_ioext->ke_pi_iter;
+	unsigned int idx;
+
+	if (iter->pi_userpages != NULL) {
+		for (idx = 0; idx < iter->pi_nrpages; idx++)
+			page_cache_release(iter->pi_userpages[idx]);
+
+		kfree(iter->pi_userpages);
+		/* Clearing the pointer makes a repeated destroy a no-op. */
+		iter->pi_userpages = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * setup_pi_ext - pin the userspace PI buffer named by the IO extension
+ * @req:	kiocb whose ki_ioext->ke_kern describes the PI buffer
+ * @is_write:	passed through to get_user_pages_fast() as its write flag
+ *
+ * Records the buffer's in-page offset, length and page count in
+ * ki_ioext->ke_pi_iter, allocates the page-pointer array and pins the
+ * pages.  On success the kiocb is marked KIOCB_DIO_ONLY.
+ *
+ * Returns 0 on success, -EINVAL if the file is not O_DIRECT or the buffer
+ * range wraps the address space, -ENOMEM if the page array cannot be
+ * allocated or not every page could be pinned.  After a pinning failure
+ * pi_nrpages holds the number of pages that really were pinned, so that
+ * destroy_pi_ext() releases exactly those.
+ */
+static int setup_pi_ext(struct kiocb *req, int is_write)
+{
+	struct file *file = req->ki_filp;
+	struct io_extension *ext = &req->ki_ioext->ke_kern;
+	struct bio_integrity_prep_iter *iter = &req->ki_ioext->ke_pi_iter;
+	unsigned long start, end;
+	int retval;
+
+	if (!(file->f_flags & O_DIRECT)) {
+		pr_debug("EINVAL: can't use PI without O_DIRECT.\n");
+		return -EINVAL;
+	}
+
+	BUG_ON(iter->pi_userpages);
+
+	end = (((unsigned long)ext->ie_pi_buf) + ext->ie_pi_buflen +
+		PAGE_SIZE - 1) >> PAGE_SHIFT;
+	start = ((unsigned long)ext->ie_pi_buf) >> PAGE_SHIFT;
+
+	/*
+	 * ie_pi_buf and ie_pi_buflen come straight from userspace; if their
+	 * sum wraps, end - start would become an enormous page count.
+	 */
+	if (end < start) {
+		pr_debug("EINVAL: PI buffer wraps the address space.\n");
+		return -EINVAL;
+	}
+
+	iter->pi_offset = offset_in_page(ext->ie_pi_buf);
+	iter->pi_len = ext->ie_pi_buflen;
+	iter->pi_nrpages = end - start;
+
+	/* kcalloc() zeroes the array and refuses a count * size overflow. */
+	iter->pi_userpages = kcalloc(iter->pi_nrpages, sizeof(struct page *),
+				     GFP_NOIO);
+	if (iter->pi_userpages == NULL) {
+		pr_err("%s: no room for page array?\n", __func__);
+		return -ENOMEM;
+	}
+
+	/*
+	 * NOTE(review): the third argument asks for writable mappings.  For
+	 * a read I/O it is the PI buffer that gets written, so this flag
+	 * looks inverted relative to the I/O direction -- confirm against
+	 * the caller's meaning of is_write.
+	 */
+	retval = get_user_pages_fast((unsigned long)ext->ie_pi_buf,
+				     iter->pi_nrpages,
+				     is_write,
+				     iter->pi_userpages);
+	if (retval != iter->pi_nrpages) {
+		pr_err("%s: couldn't map pages?\n", __func__);
+		/*
+		 * retval may be a negative errno; storing that in the
+		 * unsigned pi_nrpages would make destroy_pi_ext() release
+		 * pages far beyond the array.  Record zero pages instead.
+		 */
+		iter->pi_nrpages = retval > 0 ? retval : 0;
+		return -ENOMEM;
+	}
+	req->ki_flags |= KIOCB_DIO_ONLY;
+
+	return 0;
+}
+#endif
+
static struct io_extension_type extensions[] = {
+ {IO_EXT_PI, IO_EXT_SIZE(ie_pi_ret), setup_pi_ext, destroy_pi_ext},
{IO_EXT_INVALID, 0, NULL, NULL},
};
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 413312f..3df9aeb 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -138,7 +138,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
struct bio_vec *iv;
if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
- printk(KERN_ERR "%s: bip_vec full\n", __func__);
+ pr_err("%s: bip_vec full\n", __func__);
return 0;
}
@@ -250,7 +250,7 @@ static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len,
DIV_ROUND_UP(len, bi->tag_size));
if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) {
- printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__,
+ pr_err("%s: tag too big for bio: %u > %u\n", __func__,
nr_sectors * bi->tuple_size, bip->bip_iter.bi_size);
return -1;
}
@@ -375,6 +375,96 @@ static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
}
/**
+ * bio_integrity_prep_buffer - Attach caller-supplied PI pages to a bio
+ * @bio:	bio to prepare
+ * @rw:		data direction for the bio
+ * @pi:		pi data to attach to bio
+ *
+ * Description: Allocates an integrity payload for @bio and adds the
+ * pages described by @pi to it directly; no separate metadata buffer is
+ * allocated.  The bio must have target device and start sector set
+ * prior to calling.  The pages specified in the @pi argument should
+ * contain integrity metadata in the WRITE case, and should be ready to
+ * receive metadata in the READ case.
+ *
+ * @pi is advanced past the bytes consumed by this bio, so the same
+ * iterator can be passed again for the next bio.
+ *
+ * Returns 0 on success, -ENOMEM if @pi has fewer bytes left than the
+ * bio needs, and -EIO if the payload cannot be allocated or a page
+ * cannot be added to it in full.
+ */
+int bio_integrity_prep_buffer(struct bio *bio, int rw,
+			      struct bio_integrity_prep_iter *pi)
+{
+	struct bio_integrity_payload *bip;
+	struct blk_integrity *bi;
+	unsigned long start, end;
+	unsigned int len, nr_pages;
+	unsigned int bytes, i;
+	unsigned int sectors;
+	int ret;
+
+	bi = bdev_get_integrity(bio->bi_bdev);
+	BUG_ON(bi == NULL);
+	BUG_ON(bio_integrity(bio));
+
+	sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
+
+	/* PI bytes this bio needs, and how many pages of @pi they span */
+	len = sectors * blk_integrity_tuple_size(bi);
+	end = (pi->pi_offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	start = pi->pi_offset >> PAGE_SHIFT;
+	nr_pages = end - start;
+
+	if (pi->pi_len < len) {
+		pr_err("%s: not enough space left in buffer!\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* Allocate bio integrity payload and integrity vectors */
+	bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
+	if (unlikely(bip == NULL)) {
+		pr_err("could not allocate data integrity bioset\n");
+		return -EIO;
+	}
+
+	/* The pages belong to the caller; the payload must not free them */
+	bip->bip_owns_buf = 0;
+	bip->bip_buf = NULL;
+	bip->bip_iter.bi_size = len;
+	bip->bip_iter.bi_sector = bio->bi_iter.bi_sector;
+
+	/* Map it */
+	for (i = 0 ; i < nr_pages ; i++) {
+		if (pi->pi_len == 0 || len == 0)
+			break;
+
+		/* Take what is left of the current page, capped by both */
+		/* the remaining buffer and the remaining need of this bio */
+		bytes = PAGE_SIZE - pi->pi_offset;
+		if (bytes > pi->pi_len)
+			bytes = pi->pi_len;
+		if (bytes > len)
+			bytes = len;
+
+		ret = bio_integrity_add_page(bio, *pi->pi_userpages,
+					     bytes, pi->pi_offset);
+
+		/*
+		 * A failed or short add would leave the payload smaller
+		 * than bip_iter.bi_size claims, so treat both as errors.
+		 */
+		if (ret <= 0 || (unsigned int)ret < bytes)
+			return -EIO;
+
+		len -= bytes;
+		pi->pi_len -= bytes;
+		if (pi->pi_offset + bytes == PAGE_SIZE)
+			pi->pi_userpages++;
+		pi->pi_offset = (pi->pi_offset + bytes) % PAGE_SIZE;
+	}
+
+	/* Reads complete through bio_integrity_endio; keep the old handler */
+	if ((rw & WRITE) == READ) {
+		bip->bip_end_io = bio->bi_end_io;
+		bio->bi_end_io = bio_integrity_endio;
+	}
+
+	/*
+	 * Callers treat any non-zero value as failure, so never leak the
+	 * positive byte count from bio_integrity_add_page().
+	 */
+	return 0;
+}
+EXPORT_SYMBOL(bio_integrity_prep_buffer);
+
+/**
* bio_integrity_prep - Prepare bio for integrity I/O
* @bio: bio to prepare
*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 160a548..3f591f8 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -111,6 +111,10 @@ struct dio_submit {
*/
unsigned head; /* next page to process */
unsigned tail; /* last valid page + 1 */
+
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+ struct bio_integrity_prep_iter pi_iter;
+#endif
};
/* dio_state communicated between submission path and end_io */
@@ -221,6 +225,7 @@ static inline struct page *dio_get_page(struct dio *dio,
return dio->pages[sdio->head++];
}
+
/**
* dio_complete() - called when all DIO BIO I/O has been completed
* @offset: the byte offset in the file of the completed operation
@@ -385,6 +390,22 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
+/*
+ * Attach the PI pages tracked in sdio->pi_iter to the bio about to be
+ * submitted.  Does nothing and returns 0 when integrity support is not
+ * built in, when no PI pages were supplied, or when bio_integrity_enabled()
+ * rejects the bio; otherwise returns bio_integrity_prep_buffer()'s result.
+ */
+static int dio_prep_pi_buffers(struct dio *dio, struct dio_submit *sdio)
+{
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+	struct bio *bio = sdio->bio;
+
+	if (sdio->pi_iter.pi_userpages != NULL && bio_integrity_enabled(bio))
+		return bio_integrity_prep_buffer(bio, dio->rw, &sdio->pi_iter);
+#endif
+	return 0;
+}
+
/*
* In the AIO read case we speculatively dirty the pages before starting IO.
* During IO completion, any of these pages which happen to have been written
@@ -392,13 +413,18 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
*
* bios hold a dio reference between submit_bio and ->end_io.
*/
-static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
+static inline int dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
{
struct bio *bio = sdio->bio;
unsigned long flags;
+ int ret = 0;
bio->bi_private = dio;
+ ret = dio_prep_pi_buffers(dio, sdio);
+ if (ret)
+ return ret;
+
spin_lock_irqsave(&dio->bio_lock, flags);
dio->refcount++;
spin_unlock_irqrestore(&dio->bio_lock, flags);
@@ -415,6 +441,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
sdio->bio = NULL;
sdio->boundary = 0;
sdio->logical_offset_in_bio = 0;
+
+ return ret;
}
/*
@@ -736,8 +764,11 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
* have.
*/
if (sdio->final_block_in_bio != sdio->cur_page_block ||
- cur_offset != bio_next_offset)
- dio_bio_submit(dio, sdio);
+ cur_offset != bio_next_offset) {
+ ret = dio_bio_submit(dio, sdio);
+ if (ret)
+ goto out;
+ }
}
if (sdio->bio == NULL) {
@@ -747,7 +778,9 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
}
if (dio_bio_add_page(sdio) != 0) {
- dio_bio_submit(dio, sdio);
+ ret = dio_bio_submit(dio, sdio);
+ if (ret)
+ goto out;
ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
if (ret == 0) {
ret = dio_bio_add_page(sdio);
@@ -823,8 +856,12 @@ out:
* avoid metadata seeks.
*/
if (sdio->boundary) {
+ int ret2;
+
ret = dio_send_cur_page(dio, sdio, map_bh);
- dio_bio_submit(dio, sdio);
+ ret2 = dio_bio_submit(dio, sdio);
+ if (ret == 0)
+ ret = ret2;
page_cache_release(sdio->cur_page);
sdio->cur_page = NULL;
}
@@ -1120,7 +1157,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
loff_t end = offset;
- struct dio *dio;
+ struct dio *dio = NULL;
struct dio_submit sdio = { 0, };
unsigned long user_addr;
size_t bytes;
@@ -1187,8 +1224,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
end - 1);
if (retval) {
mutex_unlock(&inode->i_mutex);
- kmem_cache_free(dio_cache, dio);
- goto out;
+ goto out_dio;
}
}
}
@@ -1217,8 +1253,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
* We grab i_mutex only for reads so we don't have
* to release it here
*/
- kmem_cache_free(dio_cache, dio);
- goto out;
+ goto out_dio;
}
}
@@ -1228,6 +1263,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
atomic_inc(&inode->i_dio_count);
retval = 0;
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ sdio.pi_iter = iocb->ki_ioext->ke_pi_iter;
+#endif
sdio.blkbits = blkbits;
sdio.blkfactor = i_blkbits - blkbits;
sdio.block_in_file = offset >> blkbits;
@@ -1315,8 +1353,12 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
page_cache_release(sdio.cur_page);
sdio.cur_page = NULL;
}
- if (sdio.bio)
- dio_bio_submit(dio, &sdio);
+ if (sdio.bio) {
+ int ret2;
+ ret2 = dio_bio_submit(dio, &sdio);
+ if (retval == 0)
+ retval = ret2;
+ }
blk_finish_plug(&plug);
@@ -1353,7 +1395,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
retval = dio_complete(dio, offset, retval, false);
} else
BUG_ON(retval != -EIOCBQUEUED);
-
+ return retval;
+out_dio:
+ kmem_cache_free(dio_cache, dio);
out:
return retval;
}
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 60f4364..3f142b8 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -6,6 +6,7 @@
#include <linux/aio_abi.h>
#include <linux/uio.h>
#include <linux/rcupdate.h>
+#include <linux/bio.h>
#include <linux/atomic.h>
@@ -14,6 +15,8 @@ struct kiocb;
#define KIOCB_KEY 0
+#define KIOCB_DIO_ONLY (1) /* don't try buffered if directio fails */
+
/*
* We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
* cancelled or completed (this makes a certain amount of sense because
@@ -29,10 +32,15 @@ struct kiocb;
typedef int (kiocb_cancel_fn)(struct kiocb *);
+/* per-kiocb extension data */
struct kio_extension {
struct io_extension __user *ke_user;
struct io_extension ke_kern;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+ struct bio_integrity_prep_iter ke_pi_iter; /* PI buffers */
+#endif
};
+
struct kiocb {
struct file *ki_filp;
struct kioctx *ki_ctx; /* NULL for sync ops */
@@ -59,6 +67,8 @@ struct kiocb {
/* Kernel copy of extension descriptors */
struct kio_extension *ki_ioext;
+
+ unsigned int ki_flags;
};
static inline bool is_sync_kiocb(struct kiocb *kiocb)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5a4d39b..4729ab1 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -635,6 +635,13 @@ struct biovec_slab {
struct kmem_cache *slab;
};
+struct bio_integrity_prep_iter {
+ struct page **pi_userpages; /* Pages holding PI data; bio_integrity_prep_buffer() steps this past each fully used page */
+ size_t pi_nrpages; /* Number of PI data pages in pi_userpages */
+ size_t pi_offset; /* Byte offset of the unconsumed PI data within the current page */
+ size_t pi_len; /* Bytes of the PI buffer not yet attached to a bio */
+};
+
/*
* a small number of entries is fine, not going to be performance critical.
* basically we just need to survive
@@ -663,6 +670,8 @@ extern int bio_integrity_enabled(struct bio *bio);
extern int bio_integrity_set_tag(struct bio *, void *, unsigned int);
extern int bio_integrity_get_tag(struct bio *, void *, unsigned int);
extern int bio_integrity_prep(struct bio *);
+extern int bio_integrity_prep_buffer(struct bio *, int rw,
+ struct bio_integrity_prep_iter *);
extern void bio_integrity_endio(struct bio *, int);
extern void bio_integrity_advance(struct bio *, unsigned int);
extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int);
@@ -693,6 +702,12 @@ static inline void bioset_integrity_free (struct bio_set *bs)
return;
}
+static inline int bio_integrity_prep_buffer(struct bio *bio, int rw,
+ struct bio_integrity_prep_iter *pi)
+{
+ return 0; /* stub: attaches nothing, leaves @pi untouched, reports success */
+}
+
static inline int bio_integrity_prep(struct bio *bio)
{
return 0;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index 07ffd1f..d7b8c68 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -74,11 +74,17 @@ struct io_event {
/* IO extension types */
#define IO_EXT_INVALID (0)
+#define IO_EXT_PI (1) /* protection info (checksums, etc) */
/* IO extension descriptor */
struct io_extension {
__u64 ie_size;
__u64 ie_has;
+
+ /* PI stuff */
+ __u64 ie_pi_buf;
+ __u32 ie_pi_buflen;
+ __u32 ie_pi_ret;
};
/*
diff --git a/mm/filemap.c b/mm/filemap.c
index 7a13f6a..d35ddb3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2477,6 +2477,12 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
ppos, count, ocount);
if (written < 0 || written == count)
goto out;
+
+ if (iocb->ki_flags & KIOCB_DIO_ONLY) {
+ err = -EINVAL;
+ goto out;
+ }
+
/*
* direct-io write to a hole: fall through to buffered I/O
* for completing the rest of the request.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2014-03-24 16:22 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-03-24 16:22 [RFC PATCH DONOTMERGE v2 0/6] userspace PI passthrough via AIO/DIO Darrick J. Wong
2014-03-24 16:22 ` [PATCH 1/6] fs/bio-integrity: remove duplicate code Darrick J. Wong
2014-04-02 19:17 ` Zach Brown
2014-04-02 20:35 ` Darrick J. Wong
2014-03-24 16:22 ` [PATCH 2/6] io: define an interface for IO extensions Darrick J. Wong
2014-04-02 19:22 ` Jeff Moyer
2014-04-02 22:08 ` Darrick J. Wong
2014-04-02 19:49 ` Zach Brown
2014-04-02 22:28 ` Darrick J. Wong
2014-04-02 22:53 ` Zach Brown
2014-04-02 23:06 ` Darrick J. Wong
2014-03-24 16:22 ` Darrick J. Wong [this message]
2014-04-02 20:01 ` [PATCH 3/6] aio/dio: enable PI passthrough Zach Brown
2014-04-02 20:44 ` Darrick J. Wong
2014-04-02 22:33 ` Zach Brown
2014-04-02 22:55 ` Darrick J. Wong
2014-03-24 16:22 ` [PATCH 4/6] PI IO extension: allow user to ask kernel to fill in parts of the protection info Darrick J. Wong
2014-03-24 16:23 ` [PATCH 5/6] PI IO extension: advertise possible userspace flags Darrick J. Wong
2014-03-24 16:23 ` [PATCH 6/6] blk-integrity: refactor various routines Darrick J. Wong
2014-04-02 19:14 ` [RFC PATCH DONOTMERGE v2 0/6] userspace PI passthrough via AIO/DIO Zach Brown
2014-04-02 20:05 ` Zach Brown
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20140324162251.10848.56452.stgit@birch.djwong.org \
--to=darrick.wong@oracle.com \
--cc=JBottomley@parallels.com \
--cc=axboe@kernel.dk \
--cc=bcrl@kvack.org \
--cc=jmoyer@redhat.com \
--cc=linux-aio@kvack.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-scsi@vger.kernel.org \
--cc=martin.petersen@oracle.com \
--cc=viro@zeniv.linux.org.uk \
--cc=zab@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).