public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Jens Axboe <jens.axboe@oracle.com>
To: Ingo Molnar <mingo@elte.hu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>,
	Jeff Garzik <jeff@garzik.org>, Zach Brown <zach.brown@oracle.com>,
	linux-kernel@vger.kernel.org,
	Arjan van de Ven <arjan@infradead.org>,
	Christoph Hellwig <hch@infradead.org>,
	Andrew Morton <akpm@zip.com.au>,
	Alan Cox <alan@lxorguk.ukuu.org.uk>,
	Ulrich Drepper <drepper@redhat.com>,
	Evgeniy Polyakov <johnpol@2ka.mipt.ru>,
	"David S. Miller" <davem@davemloft.net>,
	Suparna Bhattacharya <suparna@in.ibm.com>,
	Davide Libenzi <davidel@xmailserver.org>,
	Thomas Gleixner <tglx@linutronix.de>
Subject: Re: Syslets, Threadlets, generic AIO support, v6
Date: Wed, 30 May 2007 19:57:59 +0200	[thread overview]
Message-ID: <20070530175759.GP15559@kernel.dk> (raw)
In-Reply-To: <20070530160913.GA14261@elte.hu>

On Wed, May 30 2007, Ingo Molnar wrote:
>  - splice. (a bit too early to tell but it's looking good so far. Would
>    be nice if someone did a brute-force memcpy() based vmsplice to user
>    memory, just to make usage fully symmetric.)

Heh, I actually agree, at least then the interface is complete! We can
always replace it with something more clever, should someone feel so
inclined. Here's a rough patch to do that, it's totally untested (but it
compiles). sparse will warn about the __user removal, though. I'm sure
viro would shoot me dead on the spot, should he see this...

diff --git a/fs/splice.c b/fs/splice.c
index 12f2828..5023c01 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -657,9 +657,9 @@ out_ret:
  * key here is the 'actor' worker passed in that actually moves the data
  * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
  */
-ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
-			   struct file *out, loff_t *ppos, size_t len,
-			   unsigned int flags, splice_actor *actor)
+ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, void *actor_priv,
+			   loff_t *ppos, size_t len, unsigned int flags,
+			   splice_actor *actor)
 {
 	int ret, do_wakeup, err;
 	struct splice_desc sd;
@@ -669,7 +669,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
 
 	sd.total_len = len;
 	sd.flags = flags;
-	sd.file = out;
+	sd.file = actor_priv;
 	sd.pos = *ppos;
 
 	for (;;) {
@@ -1240,28 +1240,104 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 	return error;
 }
 
+static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+			struct splice_desc *sd)
+{
+	int ret;
+
+	ret = buf->ops->pin(pipe, buf);
+	if (!ret) {
+		void __user *dst = sd->userptr;
+		/*
+		 * use non-atomic map, can be optimized to map atomically if we
+		 * prefault the user memory.
+		 */
+		char *src = buf->ops->map(pipe, buf, 0);
+
+		if (copy_to_user(dst, src, sd->len))
+			ret = -EFAULT;
+
+		buf->ops->unmap(pipe, buf, src);
+
+		if (!ret)
+			return sd->len;
+	}
+
+	return ret;
+}
+
+/*
+ * For lack of a better implementation, implement vmsplice() to userspace
+ * as a simple copy of the pipes pages to the user iov.
+ */
+static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
+			     unsigned long nr_segs, unsigned int flags)
+{
+	struct pipe_inode_info *pipe;
+	ssize_t size;
+	int error;
+	long ret;
+
+	pipe = pipe_info(file->f_path.dentry->d_inode);
+	if (!pipe)
+		return -EBADF;
+	if (!nr_segs)
+		return 0;
+
+	if (pipe->inode)
+		mutex_lock(&pipe->inode->i_mutex);
+
+	ret = 0;
+	while (nr_segs) {
+		void __user *base;
+		size_t len;
+
+		/*
+		 * Get user address base and length for this iovec.
+		 */
+		error = get_user(base, &iov->iov_base);
+		if (unlikely(error))
+			break;
+		error = get_user(len, &iov->iov_len);
+		if (unlikely(error))
+			break;
+
+		/*
+		 * Sanity check this iovec. 0 read succeeds.
+		 */
+		if (unlikely(!len))
+			break;
+		error = -EFAULT;
+		if (unlikely(!base))
+			break;
+
+		size = __splice_from_pipe(pipe, (void *) base, NULL, len,
+						flags, pipe_to_user);
+		if (size < 0) {
+			if (!ret)
+				ret = size;
+
+			break;
+		}
+
+		nr_segs--;
+		iov++;
+		ret += size;
+	}
+
+	if (pipe->inode)
+		mutex_unlock(&pipe->inode->i_mutex);
+
+	return ret;
+}
+
 /*
  * vmsplice splices a user address range into a pipe. It can be thought of
  * as splice-from-memory, where the regular splice is splice-from-file (or
  * to file). In both cases the output is a pipe, naturally.
- *
- * Note that vmsplice only supports splicing _from_ user memory to a pipe,
- * not the other way around. Splicing from user memory is a simple operation
- * that can be supported without any funky alignment restrictions or nasty
- * vm tricks. We simply map in the user memory and fill them into a pipe.
- * The reverse isn't quite as easy, though. There are two possible solutions
- * for that:
- *
- *	- memcpy() the data internally, at which point we might as well just
- *	  do a regular read() on the buffer anyway.
- *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
- *	  has restriction limitations on both ends of the pipe).
- *
- * Alas, it isn't here.
- *
  */
-static long do_vmsplice(struct file *file, const struct iovec __user *iov,
-			unsigned long nr_segs, unsigned int flags)
+static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
+			     unsigned long nr_segs, unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
 	struct page *pages[PIPE_BUFFERS];
@@ -1289,6 +1365,22 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov,
 	return splice_to_pipe(pipe, &spd);
 }
 
+/*
+ * Note that vmsplice only really supports true splicing _from_ user memory
+ * to a pipe, not the other way around. Splicing from user memory is a simple
+ * operation that can be supported without any funky alignment restrictions
+ * or nasty vm tricks. We simply map in the user memory and fill them into
+ * a pipe. The reverse isn't quite as easy, though. There are two possible
+ * solutions for that:
+ *
+ *	- memcpy() the data internally, at which point we might as well just
+ *	  do a regular read() on the buffer anyway.
+ *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *	  has restriction limitations on both ends of the pipe).
+ *
+ * Currently we punt and implement it as a normal copy, see pipe_to_user().
+ *
+ */
 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
 			     unsigned long nr_segs, unsigned int flags)
 {
@@ -1300,7 +1392,9 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
 	file = fget_light(fd, &fput);
 	if (file) {
 		if (file->f_mode & FMODE_WRITE)
-			error = do_vmsplice(file, iov, nr_segs, flags);
+			error = vmsplice_to_pipe(file, iov, nr_segs, flags);
+		else if (file->f_mode & FMODE_READ)
+			error = vmsplice_to_user(file, iov, nr_segs, flags);
 
 		fput_light(file, fput);
 	}

-- 
Jens Axboe


  reply	other threads:[~2007-05-30 17:59 UTC|newest]

Thread overview: 71+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-05-29 21:27 Syslets, Threadlets, generic AIO support, v6 Zach Brown
2007-05-29 21:49 ` Linus Torvalds
2007-05-29 22:49   ` Zach Brown
2007-05-29 22:16 ` Jeff Garzik
2007-05-29 23:09   ` Zach Brown
2007-05-29 23:20     ` Ulrich Drepper
2007-05-30  1:11       ` Dave Jones
2007-05-30 17:08         ` Zach Brown
2007-05-30  7:26     ` Ingo Molnar
2007-05-30  7:20   ` Ingo Molnar
2007-05-30  7:31     ` Ulrich Drepper
2007-05-30  8:42       ` Ingo Molnar
2007-05-30  8:51         ` Evgeniy Polyakov
2007-05-30  9:05           ` Ingo Molnar
2007-05-30 15:16         ` Linus Torvalds
2007-05-30 15:39         ` Ulrich Drepper
2007-05-30 19:40         ` Davide Libenzi
2007-05-30 19:55           ` Ulrich Drepper
2007-05-30 20:00           ` Linus Torvalds
2007-05-30 20:21             ` Davide Libenzi
2007-05-30 20:31             ` Eric Dumazet
2007-05-30 20:44               ` Linus Torvalds
2007-05-30 21:53                 ` Eric Dumazet
2007-05-30 21:31               ` Davide Libenzi
2007-05-30 21:16             ` Ulrich Drepper
2007-05-30 21:27               ` Linus Torvalds
2007-05-30 21:47                 ` Ulrich Drepper
2007-05-30 22:06                   ` Davide Libenzi
2007-05-30 21:48                 ` Davide Libenzi
2007-05-30 22:01                   ` Linus Torvalds
2007-05-31  6:13                     ` Ingo Molnar
2007-05-31  7:35                       ` Eric Dumazet
2007-05-31  9:26                         ` Ingo Molnar
2007-05-31  9:02                       ` Ingo Molnar
2007-05-31 10:41                         ` Eric Dumazet
2007-05-31 10:50                           ` Ingo Molnar
2007-05-31  9:32                       ` Ingo Molnar
2007-05-31  9:34                         ` Jens Axboe
2007-05-30 22:09                   ` Eric Dumazet
2007-05-30 21:51                 ` David M. Lloyd
2007-05-30 22:24                 ` William Lee Irwin III
2007-05-30 21:38               ` Jeremy Fitzhardinge
2007-05-30 21:39               ` Davide Libenzi
2007-05-30 21:36             ` Jeremy Fitzhardinge
2007-05-30 21:44               ` Linus Torvalds
2007-05-30 21:48                 ` Linus Torvalds
2007-05-30 21:54                   ` Jeremy Fitzhardinge
2007-05-30 22:27             ` Matt Mackall
2007-05-30 22:38               ` William Lee Irwin III
2007-05-30  8:32     ` Evgeniy Polyakov
2007-05-30  8:54       ` Ingo Molnar
2007-05-30  9:30         ` Evgeniy Polyakov
2007-05-30  9:28     ` Jeff Garzik
2007-05-30 13:02       ` Ingo Molnar
2007-05-30 13:20         ` Ingo Molnar
2007-05-30 15:31       ` Linus Torvalds
2007-05-30 16:09         ` Ingo Molnar
2007-05-30 17:57           ` Jens Axboe [this message]
2007-05-30 19:05           ` Mark Lord
2007-05-30 19:10             ` Jens Axboe
2007-05-30 19:15             ` Linus Torvalds
2007-05-30 19:32               ` Jens Axboe
2007-05-30 20:07               ` Eric Dumazet
2007-05-30 20:31                 ` Linus Torvalds
2007-05-30 20:46                   ` Eric Dumazet
2007-05-30 19:52           ` Davide Libenzi
2007-05-30  7:40 ` Jens Axboe
2007-05-30 16:55   ` Zach Brown
2007-05-30 17:33     ` Jens Axboe
  -- strict thread matches above, loose matches on Subject: below --
2007-05-31  8:15 Albert Cahalan
2007-05-31  9:50 ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070530175759.GP15559@kernel.dk \
    --to=jens.axboe@oracle.com \
    --cc=akpm@zip.com.au \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=arjan@infradead.org \
    --cc=davem@davemloft.net \
    --cc=davidel@xmailserver.org \
    --cc=drepper@redhat.com \
    --cc=hch@infradead.org \
    --cc=jeff@garzik.org \
    --cc=johnpol@2ka.mipt.ru \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=suparna@in.ibm.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    --cc=zach.brown@oracle.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox