All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jens Axboe <jens.axboe@oracle.com>
To: "Leisner, Martin" <Martin.Leisner@xerox.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>,
	marty <martyleisner@yahoo.com>,
	linux-kernel@vger.kernel.org
Subject: Re: disk IO directly from PCI memory to block device sectors
Date: Mon, 29 Sep 2008 15:02:22 +0200	[thread overview]
Message-ID: <20080929130222.GX2677@kernel.dk> (raw)
In-Reply-To: <556445368AFA1C438794ABDA8901891C0999203A@USA0300MS03.na.xerox.net>

On Fri, Sep 26 2008, Leisner, Martin wrote:
> 
> 
> >   -----Original Message-----
> >   From: Jens Axboe [mailto:jens.axboe@oracle.com]
> >   Sent: Friday, September 26, 2008 5:12 AM
> >   To: Alan Cox
> >   Cc: marty; linux-kernel@vger.kernel.org; Leisner, Martin
> >   Subject: Re: disk IO directly from PCI memory to block device
> sectors
> >   
> >   On Fri, Sep 26 2008, Alan Cox wrote:
> >   > > What I'm looking is for a more generic/driver independent way of
> >   sticking
> >   > > contents of PCI ram onto a disk.
> >   >
> >   > Ermm seriously why not have a userspace task with the PCI RAM
> mmapped
> >   > and just use write() like normal sane people do ?
> >   
> >   To avoid the fault and copy, I would assume.
> >   
> >   --
> >   Jens Axboe
> 
> Also:
>    a) to deal with interrupts from the hardware
>    b) using legacy code/design/architecture
>    
> The splice approaches look very interesting...thanks...

Just for kicks, I did the read part of the fast bdev interface as well.
As with the write, it's totally untested (apart from being compiled).
Just in case anyone is curious... I plan to do a bit of testing on this
later this week.

IMHO, this interface totally rocks. It's really async like splice was
intended, and it's fast too. I may have to look into some generic IO
mechanism to unify them all, O_DIRECT/page cache/splice. Famous last
words, I'm sure.


diff --git a/fs/block_dev.c b/fs/block_dev.c
index aff5421..f8df781 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -24,6 +24,7 @@
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <linux/log2.h>
+#include <linux/splice.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -1155,6 +1156,264 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
 }
 
+/*
+ * WRITE end io handling. pipe_to_disk() pinned the page for the duration
+ * of the IO and stashed it in ->bi_private; drop that reference now that
+ * the device is done with it, then drop the bio. The IO status in 'err'
+ * is not reported anywhere, the write side is fire-and-forget.
+ */
+static void block_splice_write_end_io(struct bio *bio, int err)
+{
+	put_page(bio->bi_private);
+	bio_put(bio);
+}
+
+/*
+ * Splice actor: submit the first sd->len bytes of one pipe buffer as a
+ * single-page WRITE bio at file position sd->pos.
+ *
+ * Returns the number of bytes queued (sd->len), -EINVAL if position or
+ * length is not a multiple of the device sector size, -ENOMEM/-EIO if the
+ * bio could not be set up, or whatever ->confirm() returned on failure.
+ */
+static int pipe_to_disk(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+			struct splice_desc *sd)
+{
+	struct block_device *bdev = I_BDEV(sd->u.file->f_mapping->host);
+	struct bio *bio;
+	int ret, bs;
+
+	/*
+	 * The device can only do whole sectors, so both the position and
+	 * the amount we are asked to move must be sector aligned.
+	 */
+	bs = queue_hardsect_size(bdev_get_queue(bdev));
+	if ((sd->pos & (bs - 1)) || (sd->len & (bs - 1)))
+		return -EINVAL;
+
+	ret = buf->ops->confirm(pipe, buf);
+	if (unlikely(ret))
+		return ret;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (unlikely(!bio))
+		return -ENOMEM;
+
+	/*
+	 * ->bi_sector is always in 512 byte units, independent of the
+	 * hardware sector size. Shifting also avoids a 64-bit division.
+	 */
+	bio->bi_sector = sd->pos >> 9;
+	bio->bi_bdev = bdev;
+	bio->bi_private = buf->page;
+	bio->bi_end_io = block_splice_write_end_io;
+
+	/*
+	 * Only sd->len bytes of this buffer belong to the current splice,
+	 * the rest (if any) must stay in the pipe.
+	 */
+	if (bio_add_page(bio, buf->page, sd->len, buf->offset) != sd->len) {
+		bio_put(bio);
+		return -EIO;
+	}
+
+	/*
+	 * The caller may release the pipe buffer as soon as we return,
+	 * while the IO is still in flight. Hold our own page reference
+	 * until block_splice_write_end_io() runs.
+	 */
+	get_page(buf->page);
+
+	submit_bio(WRITE, bio);
+	return sd->len;
+}
+
+/*
+ * O_DIRECT flavour of splice-to-bdev: no page cache involved, every pipe
+ * buffer is handed to pipe_to_disk() which turns it into a bio.
+ *
+ * The position must be 512 byte aligned, otherwise -EINVAL. Returns what
+ * __splice_from_pipe() returned and advances *ppos by that amount when
+ * it is positive.
+ */
+static ssize_t __block_splice_write(struct pipe_inode_info *pipe,
+				    struct file *out, loff_t *ppos, size_t len,
+				    unsigned int flags)
+{
+	struct inode *bd_inode = out->f_mapping->host;
+	struct splice_desc desc = {
+		.u.file = out,
+		.pos = *ppos,
+		.flags = flags,
+		.total_len = len,
+	};
+	ssize_t spliced;
+
+	if (unlikely(*ppos & 511))
+		return -EINVAL;
+
+	/* both inodes stay locked while the pipe is drained */
+	inode_double_lock(bd_inode, pipe->inode);
+	spliced = __splice_from_pipe(pipe, &desc, pipe_to_disk);
+	inode_double_unlock(bd_inode, pipe->inode);
+
+	if (spliced <= 0)
+		return spliced;
+
+	*ppos += spliced;
+	return spliced;
+}
+
+/*
+ * ->splice_write for block devices. Files opened with O_DIRECT take the
+ * direct-to-bio path, everything else goes through the generic page
+ * cache based implementation.
+ */
+static ssize_t block_splice_write(struct pipe_inode_info *pipe,
+				  struct file *out, loff_t *ppos, size_t len,
+				  unsigned int flags)
+{
+	if (!(out->f_flags & O_DIRECT))
+		return generic_file_splice_write(pipe, out, ppos, len, flags);
+
+	return __block_splice_write(pipe, out, ppos, len, flags);
+}
+
+/*
+ * ->release for block_pipe_buf_ops: drop the bio reference kept in
+ * ->private (if one is set) and give back the buffer's page.
+ */
+static void block_pipe_buf_release(struct pipe_inode_info *pipe,
+				   struct pipe_buffer *buf)
+{
+	struct bio *held = (struct bio *) buf->private;
+
+	if (held != NULL)
+		bio_put(held);
+
+	__free_pages(buf->page, 0);
+}
+
+/*
+ * Wait for IO to be done on the bio that this buf belongs to.
+ *
+ * Returns 0 if the read finished successfully and -EIO if the bio ended
+ * with an error (BIO_UPTODATE cleared), so that a page which was never
+ * filled in is not passed on to the consumer as valid data.
+ */
+static int block_pipe_buf_confirm(struct pipe_inode_info *pipe,
+				  struct pipe_buffer *buf)
+{
+	struct bio *bio = (struct bio *) buf->private;
+	struct completion *comp = bio->bi_private;
+
+	wait_for_completion(comp);
+
+	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags)))
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * ->get for block_pipe_buf_ops. ->release drops both a bio reference and
+ * a page reference, so a duplicated buffer has to take both as well;
+ * generic_pipe_buf_get() on its own only pins the page and the bio count
+ * would underflow once every copy has been released.
+ */
+static void block_pipe_buf_get(struct pipe_inode_info *pipe,
+			       struct pipe_buffer *buf)
+{
+	struct bio *bio = (struct bio *) buf->private;
+
+	if (bio)
+		bio_get(bio);
+
+	generic_pipe_buf_get(pipe, buf);
+}
+
+/*
+ * Pipe buffer operations for pages filled by the direct READ path:
+ * ->confirm waits for the bio, ->release and ->get manage the bio
+ * reference kept in ->private, the rest is the generic implementation.
+ */
+static const struct pipe_buf_operations block_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = block_pipe_buf_confirm,
+	.release = block_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = block_pipe_buf_get,
+};
+
+/*
+ * ->spd_release: undo the setup of entry 'i' of the splice_pipe_desc by
+ * dropping the bio reference kept in its partial_page (if one is set)
+ * and freeing the page.
+ */
+static void block_release_page(struct splice_pipe_desc *spd, unsigned int i)
+{
+	struct bio *held = (struct bio *) spd->partial[i].private;
+
+	if (held != NULL)
+		bio_put(held);
+
+	__free_pages(spd->pages[i], 0);
+}
+
+/*
+ * READ end io handling completes the bio, so that we can wakeup
+ * anyone waiting in ->confirm().
+ *
+ * One bio (and so one completion) is shared by every pipe buffer whose
+ * page was added to it, and ->confirm() is called for each of those
+ * buffers. complete() lets only a single wait_for_completion() through,
+ * which would leave every later ->confirm() on the same bio blocked
+ * forever; complete_all() keeps the completion signalled for all of them.
+ */
+static void block_splice_read_end_io(struct bio *bio, int err)
+{
+	struct completion *comp = bio->bi_private;
+
+	complete_all(comp);
+	bio_put(bio);
+}
+
+/*
+ * bio destructor for the direct READ path: free whatever ->bi_private
+ * points at with kfree(), then hand the bio back to fs_bio_set.
+ */
+static void block_splice_bio_destructor(struct bio *bio)
+{
+	void *priv = bio->bi_private;
+
+	kfree(priv);
+	bio_free(bio, fs_bio_set);
+}
+
+/*
+ * Allocate and set up a READ bio starting at 'sector' (512 byte units)
+ * with room for 'nr_vecs' pages, together with the completion that
+ * ->confirm() waits on. Returns NULL if either allocation fails.
+ */
+static struct bio *block_splice_read_bio(struct block_device *bdev,
+					 sector_t sector, int nr_vecs)
+{
+	struct completion *comp;
+	struct bio *bio;
+
+	comp = kmalloc(sizeof(*comp), GFP_KERNEL);
+	if (!comp)
+		return NULL;
+
+	init_completion(comp);
+
+	bio = bio_alloc(GFP_KERNEL, nr_vecs);
+	if (!bio) {
+		kfree(comp);
+		return NULL;
+	}
+
+	bio->bi_sector = sector;
+	bio->bi_bdev = bdev;
+	bio->bi_private = comp;
+	bio->bi_end_io = block_splice_read_end_io;
+
+	/*
+	 * Not too nice...
+	 */
+	bio->bi_destructor = block_splice_bio_destructor;
+	return bio;
+}
+
+/*
+ * Bypass the page cache and allocate pages for IO directly
+ *
+ * Reads up to 'len' bytes (at most PIPE_BUFFERS pages per call) from the
+ * block device at *ppos into freshly allocated pages and queues those
+ * pages in 'pipe'. The IO is only submitted here; consumers wait for it
+ * through ->confirm().
+ *
+ * Returns the number of bytes queued in the pipe and advances *ppos by
+ * that amount, 0 at or beyond end of device, -EINVAL for a position or
+ * length that is not sector aligned, or a negative error if nothing at
+ * all could be set up.
+ *
+ * NOTE(review): the pages are referenced by the pipe buffers only,
+ * nothing pins them on behalf of the bio. If a buffer can be released
+ * before the READ has finished, its page would be freed under IO - TODO
+ * confirm, and take a per-bio page reference if so.
+ */
+static ssize_t __block_splice_read(struct pipe_inode_info *pipe,
+				    struct file *in, loff_t *ppos, size_t len,
+				    unsigned int flags)
+{
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &block_pipe_buf_ops,
+		.spd_release = block_release_page,
+	};
+	struct inode *inode = in->f_mapping->host;
+	struct block_device *bdev = I_BDEV(inode);
+	struct bio *bio = NULL;
+	sector_t sector;
+	loff_t isize, left;
+	ssize_t ret;
+	int bs, err = 0;
+
+	/*
+	 * First do alignment and length sanity checks
+	 */
+	bs = queue_hardsect_size(bdev_get_queue(bdev));
+	if (*ppos & (bs - 1))
+		return -EINVAL;
+
+	isize = i_size_read(inode);
+	if (unlikely(*ppos >= isize))
+		return 0;
+
+	left = isize - *ppos;
+	if (unlikely(left < len))
+		len = left;
+
+	/*
+	 * pages[] and partial[] hold PIPE_BUFFERS entries and each entry
+	 * covers one page, so never set up more than that in one call.
+	 */
+	if (len > PIPE_BUFFERS * PAGE_SIZE)
+		len = PIPE_BUFFERS * PAGE_SIZE;
+
+	/*
+	 * The device can only transfer whole sectors
+	 */
+	if (len & (bs - 1))
+		return -EINVAL;
+
+	spd.nr_pages = 0;
+
+	/*
+	 * ->bi_sector is in 512 byte units, whatever the hardware sector
+	 * size is. Shifting also avoids a 64-bit division.
+	 */
+	sector = *ppos >> 9;
+
+	while (len && spd.nr_pages < PIPE_BUFFERS) {
+		unsigned int this_len;
+		struct page *page;
+
+		this_len = len;
+		if (this_len > PAGE_SIZE)
+			this_len = PAGE_SIZE;
+
+		page = alloc_page(GFP_KERNEL);
+		if (!page) {
+			err = -ENOMEM;
+			break;
+		}
+
+		/*
+		 * if we fail adding page, then submit this bio and get
+		 * a new one
+		 */
+		if (!bio || bio_add_page(bio, page, this_len, 0) != this_len) {
+			if (bio)
+				submit_bio(READ, bio);
+
+			bio = block_splice_read_bio(bdev, sector,
+					(len + PAGE_SIZE - 1) / PAGE_SIZE);
+			if (!bio) {
+				/* page is not in any bio or buffer yet */
+				__free_pages(page, 0);
+				err = -ENOMEM;
+				break;
+			}
+
+			/*
+			 * An empty bio that still rejects the page will
+			 * never take it, so give up instead of looping.
+			 * Dropping the only reference frees the bio and
+			 * its completion through the destructor.
+			 */
+			if (bio_add_page(bio, page, this_len, 0) != this_len) {
+				bio_put(bio);
+				bio = NULL;
+				__free_pages(page, 0);
+				err = -EIO;
+				break;
+			}
+		}
+
+		/*
+		 * The pipe buffer needs to hang on to the bio, so that we
+		 * can reuse it in the ->confirm() part of the pipe ops
+		 */
+		bio_get(bio);
+
+		sector += this_len >> 9;
+		len -= this_len;
+		partial[spd.nr_pages].offset = 0;
+		partial[spd.nr_pages].len = this_len;
+		partial[spd.nr_pages].private = (unsigned long) bio;
+		pages[spd.nr_pages] = page;
+		spd.nr_pages++;
+	}
+
+	if (bio)
+		submit_bio(READ, bio);
+
+	if (!spd.nr_pages)
+		return err;
+
+	/*
+	 * Move the file position past what actually went into the pipe,
+	 * otherwise the next call would read the same sectors again.
+	 */
+	ret = splice_to_pipe(pipe, &spd);
+	if (ret > 0)
+		*ppos += ret;
+
+	return ret;
+}
+
+/*
+ * ->splice_read for block devices. Files opened with O_DIRECT read
+ * straight from the device into private pages, everything else goes
+ * through the generic page cache based implementation.
+ */
+static ssize_t block_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags)
+{
+	if (!(in->f_flags & O_DIRECT))
+		return generic_file_splice_read(in, ppos, pipe, len, flags);
+
+	return __block_splice_read(pipe, in, ppos, len, flags);
+}
+
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.writepage	= blkdev_writepage,
@@ -1179,8 +1438,8 @@ const struct file_operations def_blk_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
-	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_read	= block_splice_read,
+	.splice_write	= block_splice_write,
 };
 
 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)

-- 
Jens Axboe


  reply	other threads:[~2008-09-29 13:02 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-09-26  7:29 disk IO directly from PCI memory to block device sectors marty
2008-09-26  8:03 ` Jens Axboe
2008-09-26  8:46 ` Alan Cox
2008-09-26  9:11   ` Jens Axboe
2008-09-26 10:06     ` Alan Cox
2008-09-26 10:19       ` Jens Axboe
2008-09-26 11:34         ` Jens Axboe
2008-09-26 15:51     ` Leisner, Martin
2008-09-29 13:02       ` Jens Axboe [this message]
2008-10-01 19:05         ` Jens Axboe
2008-10-02 16:15           ` Leon Woestenberg
2008-10-02 16:32             ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080929130222.GX2677@kernel.dk \
    --to=jens.axboe@oracle.com \
    --cc=Martin.Leisner@xerox.com \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=linux-kernel@vger.kernel.org \
    --cc=martyleisner@yahoo.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.