public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sys_vmsplice
@ 2006-04-21  8:02 Jens Axboe
  2006-04-21  9:17 ` Andrew Morton
  2006-04-21 20:45 ` [PATCH] sys_vmsplice Linus Torvalds
  0 siblings, 2 replies; 11+ messages in thread
From: Jens Axboe @ 2006-04-21  8:02 UTC (permalink / raw)
  To: linux-kernel; +Cc: torvalds, davem

[-- Attachment #1: Type: text/plain, Size: 13323 bytes --]

Hi,

Here's the first implementation of sys_vmsplice(). I'm attaching a
little test app as well for playing with it, it's also committed to the
splice tools repo at:

        git://brick.kernel.dk/data/git/splice.git

Patch is against current Linus -git, it's also included in the splice
branch of the block git repo.

-----

sys_splice() moves data to/from pipes with a file input/output. sys_vmsplice()
moves data to a pipe, with the input being a user address range instead.

Signed-off-by: Jens Axboe <axboe@suse.de>

---

 arch/ia64/kernel/entry.S     |    1 
 arch/powerpc/kernel/systbl.S |    1 
 fs/fcntl.c                   |    8 ++
 fs/splice.c                  |  181 +++++++++++++++++++++++++++++++++++-------
 include/asm-generic/fcntl.h  |    5 +
 include/asm-i386/unistd.h    |    3 -
 include/asm-ia64/unistd.h    |    1 
 include/asm-powerpc/unistd.h |    3 -
 include/asm-x86_64/unistd.h  |    4 +
 include/linux/syscalls.h     |    3 +
 10 files changed, 176 insertions(+), 34 deletions(-)

888642ed2f315862a4cc815c1a9029a328adbd33
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index e307988..bcb80ca 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1610,5 +1610,6 @@ sys_call_table:
 	data8 sys_get_robust_list
 	data8 sys_sync_file_range		// 1300
 	data8 sys_tee
+	data8 sys_vmsplice
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index a14c964..9730315 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -324,3 +324,4 @@ COMPAT_SYS(ppoll)
 SYSCALL(unshare)
 SYSCALL(splice)
 SYSCALL(tee)
+SYSCALL(vmsplice)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index d35cbc6..56ac96e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -18,6 +18,7 @@ #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/rcupdate.h>
+#include <linux/pipe_fs_i.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -345,6 +346,13 @@ static long do_fcntl(int fd, unsigned in
 	case F_NOTIFY:
 		err = fcntl_dirnotify(fd, filp, arg);
 		break;
+	case F_SETPSZ:
+		err = -EINVAL;
+		break;
+	case F_GETPSZ:
+		/* this assumes user space can reliably tell PAGE_CACHE_SIZE */
+		err = PIPE_BUFFERS;
+		break;
 	default:
 		break;
 	}
diff --git a/fs/splice.c b/fs/splice.c
index 0559e75..57d55f2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -39,6 +39,18 @@ struct splice_desc {
 };
 
 /*
+ * Passed to move_to_pipe
+ */
+struct splice_pipe_desc {
+	struct page **pages;		/* page map */
+	int nr_pages;			/* number of pages in map */
+	unsigned long len;		/* maximum number of bytes to maps */
+	unsigned int offset;		/* offset into first page */
+	unsigned int flags;		/* splice flags */
+	struct pipe_buf_operations *ops;/* ops associated with output pipe */
+};
+
+/*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
  * a vm helper function, it's already simplified quite a bit by the
  * addition of remove_mapping(). If success is returned, the caller may
@@ -128,6 +140,19 @@ static void page_cache_pipe_buf_unmap(st
 	kunmap(buf->page);
 }
 
+static void *user_page_pipe_buf_map(struct file *file,
+				    struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return kmap(buf->page);
+}
+
+static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf)
+{
+	kunmap(buf->page);
+}
+
 static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
 				    struct pipe_buffer *buf)
 {
@@ -143,13 +168,27 @@ static struct pipe_buf_operations page_c
 	.get = page_cache_pipe_buf_get,
 };
 
+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static struct pipe_buf_operations user_page_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = user_page_pipe_buf_map,
+	.unmap = user_page_pipe_buf_unmap,
+	.release = page_cache_pipe_buf_release,
+	.steal = user_page_pipe_buf_steal,
+	.get = page_cache_pipe_buf_get,
+};
+
 /*
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
-			    int nr_pages, unsigned long len,
-			    unsigned int offset, unsigned int flags)
+static ssize_t move_to_pipe(struct pipe_inode_info *pipe,
+			    struct splice_pipe_desc *spd)
 {
 	int ret, do_wakeup, i;
 
@@ -171,27 +210,27 @@ static ssize_t move_to_pipe(struct pipe_
 		if (pipe->nrbufs < PIPE_BUFFERS) {
 			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
-			struct page *page = pages[i++];
+			struct page *page = spd->pages[i++];
 			unsigned long this_len;
 
-			this_len = PAGE_CACHE_SIZE - offset;
-			if (this_len > len)
-				this_len = len;
+			this_len = PAGE_CACHE_SIZE - spd->offset;
+			if (this_len > spd->len)
+				this_len = spd->len;
 
 			buf->page = page;
-			buf->offset = offset;
+			buf->offset = spd->offset;
 			buf->len = this_len;
-			buf->ops = &page_cache_pipe_buf_ops;
+			buf->ops = spd->ops;
 			pipe->nrbufs++;
 			if (pipe->inode)
 				do_wakeup = 1;
 
 			ret += this_len;
-			len -= this_len;
-			offset = 0;
-			if (!--nr_pages)
+			spd->len -= this_len;
+			spd->offset = 0;
+			if (!--spd->nr_pages)
 				break;
-			if (!len)
+			if (!spd->len)
 				break;
 			if (pipe->nrbufs < PIPE_BUFFERS)
 				continue;
@@ -199,7 +238,7 @@ static ssize_t move_to_pipe(struct pipe_
 			break;
 		}
 
-		if (flags & SPLICE_F_NONBLOCK) {
+		if (spd->flags & SPLICE_F_NONBLOCK) {
 			if (!ret)
 				ret = -EAGAIN;
 			break;
@@ -234,8 +273,8 @@ static ssize_t move_to_pipe(struct pipe_
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
 
-	while (i < nr_pages)
-		page_cache_release(pages[i++]);
+	while (i < spd->nr_pages)
+		page_cache_release(spd->pages[i++]);
 
 	return ret;
 }
@@ -246,17 +285,21 @@ __generic_file_splice_read(struct file *
 			   unsigned int flags)
 {
 	struct address_space *mapping = in->f_mapping;
-	unsigned int loff, offset, nr_pages;
+	unsigned int loff, nr_pages;
 	struct page *pages[PIPE_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
-	size_t bytes;
-	int i, error;
+	int error;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.flags = flags,
+		.ops = &page_cache_pipe_buf_ops,
+	};
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
-	loff = offset = *ppos & ~PAGE_CACHE_MASK;
-	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	loff = spd.offset = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	if (nr_pages > PIPE_BUFFERS)
 		nr_pages = PIPE_BUFFERS;
@@ -266,15 +309,14 @@ __generic_file_splice_read(struct file *
 	 * read-ahead if this is a non-zero offset (we are likely doing small
 	 * chunk splice and the page is already there) for a single page.
 	 */
-	if (!offset || nr_pages > 1)
-		do_page_cache_readahead(mapping, in, index, nr_pages);
+	if (!spd.offset || spd.nr_pages > 1)
+		do_page_cache_readahead(mapping, in, index, spd.nr_pages);
 
 	/*
 	 * Now fill in the holes:
 	 */
 	error = 0;
-	bytes = 0;
-	for (i = 0; i < nr_pages; i++, index++) {
+	for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
 		unsigned int this_len;
 
 		if (!len)
@@ -367,26 +409,26 @@ readpage:
 			 */
 			if (end_index == index) {
 				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
-				if (bytes + loff > isize) {
+				if (spd.len + loff > isize) {
 					page_cache_release(page);
 					break;
 				}
 				/*
 				 * force quit after adding this page
 				 */
-				nr_pages = i;
+				nr_pages = spd.nr_pages;
 				this_len = min(this_len, loff);
 			}
 		}
 fill_it:
-		pages[i] = page;
-		bytes += this_len;
+		pages[spd.nr_pages] = page;
+		spd.len += this_len;
 		len -= this_len;
 		loff = 0;
 	}
 
-	if (i)
-		return move_to_pipe(pipe, pages, i, bytes, offset, flags);
+	if (spd.nr_pages)
+		return move_to_pipe(pipe, &spd);
 
 	return error;
 }
@@ -1010,6 +1052,83 @@ static long do_splice(struct file *in, l
 	return -EINVAL;
 }
 
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ *
+ * Note that vmsplice only supports splicing _from_ user memory to a pipe,
+ * not the other way around. Splicing from user memory is a simple operation
+ * that can be supported without any funky alignment restrictions or nasty
+ * vm tricks. We simply map in the user memory and fill them into a pipe.
+ * The reverse isn't quite as easy, though. There are two possible solutions
+ * for that:
+ *
+ *	- memcpy() the data internally, at which point we might as well just
+ *	  do a regular read() on the buffer anyway.
+ *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *	  has restriction limitations on both ends of the pipe).
+ *
+ * Alas, it isn't here.
+ *
+ */
+static long do_vmsplice(struct file *file, void __user *buffer, size_t len,
+			unsigned int flags)
+{
+	unsigned long uaddr = (unsigned long) buffer;
+	struct pipe_inode_info *pipe;
+	struct page *pages[PIPE_BUFFERS];
+	unsigned int nr_pages;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.len = len,
+		.flags = flags,
+		.ops = &user_page_pipe_buf_ops,
+	};
+
+	pipe = file->f_dentry->d_inode->i_pipe;
+	if (unlikely(!pipe))
+		return -EBADF;
+
+	spd.offset = uaddr & ~PAGE_CACHE_MASK;
+	nr_pages = (len + spd.offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	if (nr_pages > PIPE_BUFFERS)
+		nr_pages = PIPE_BUFFERS;
+
+	down_read(&current->mm->mmap_sem);
+	spd.nr_pages = get_user_pages(current, current->mm, uaddr, nr_pages, 0,
+				      0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+
+	if (spd.nr_pages > 0)
+		return move_to_pipe(pipe, &spd);
+
+	return spd.nr_pages;
+}
+
+asmlinkage long sys_vmsplice(int fd, void __user *buffer, size_t len,
+			     unsigned int flags)
+{
+	long error;
+	struct file *file;
+	int fput;
+
+	if (unlikely(!len))
+		return 0;
+
+	error = -EBADF;
+	file = fget_light(fd, &fput);
+	if (file) {
+		if (file->f_mode & FMODE_WRITE)
+			error = do_vmsplice(file, buffer, len, flags);
+
+		fput_light(file, fput);
+	}
+
+	return error;
+}
+
 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags)
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index b663520..1da0fba 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -146,4 +146,9 @@ struct flock64 {
 #endif
 #endif /* !CONFIG_64BIT */
 
+#ifndef F_SETPSZ
+#define F_SETPSZ	15	/* for pipes. */
+#define F_GETPSZ	16	/* for pipes. */
+#endif
+
 #endif /* _ASM_GENERIC_FCNTL_H */
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index d81d6cf..eb4b152 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -321,8 +321,9 @@ #define __NR_get_robust_list	312
 #define __NR_splice		313
 #define __NR_sync_file_range	314
 #define __NR_tee		315
+#define __NR_vmsplice		316
 
-#define NR_syscalls 316
+#define NR_syscalls 317
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index a40ebec..9aa3487 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -290,6 +290,7 @@ #define __NR_set_robust_list		1298
 #define __NR_get_robust_list		1299
 #define __NR_sync_file_range		1300
 #define __NR_tee			1301
+#define __NR_vmsplice			1302
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index c612f1a..34325e2 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -303,8 +303,9 @@ #define __NR_ppoll		281
 #define __NR_unshare		282
 #define __NR_splice		283
 #define __NR_tee		284
+#define __NR_vmsplice		285
 
-#define __NR_syscalls		285
+#define __NR_syscalls		286
 
 #ifdef __KERNEL__
 #define __NR__exit __NR_exit
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 98c36ea..feb77cb 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -615,8 +615,10 @@ #define __NR_tee		276
 __SYSCALL(__NR_tee, sys_tee)
 #define __NR_sync_file_range	277
 __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
+#define __NR_vmsplice		278
+__SYSCALL(__NR_vmsplice, sys_vmsplice)
 
-#define __NR_syscall_max __NR_sync_file_range
+#define __NR_syscall_max __NR_vmsplice
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d3ebc0e..fc9392c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -574,6 +574,9 @@ asmlinkage long sys_splice(int fd_in, lo
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags);
 
+asmlinkage long sys_vmsplice(int fd, void __user *buffer, size_t len,
+			     unsigned int flags);
+
 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
 
 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
-- 
1.3.0.g2473


-- 
Jens Axboe


[-- Attachment #2: vmsplice.c --]
[-- Type: text/x-c++src, Size: 2286 bytes --]

/*
 * Use vmsplice to fill some user memory into a pipe. vmsplice writes
 * to stdout, so that must be a pipe.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <limits.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/types.h>

#include "splice.h"

#define ALIGN_BUF

#ifdef ALIGN_BUF
#define ALIGN_MASK	(65535)	/* 64k-1, should just be PAGE_SIZE - 1 */
#define ALIGN(buf)	(void *) (((unsigned long) (buf) + ALIGN_MASK) & ~ALIGN_MASK)
#else
#define ALIGN_MASK	(0)
#define ALIGN(buf)	(buf)
#endif

int do_vmsplice(int fd, void *buffer, int len)
{
	struct pollfd pfd = { .fd = fd, .events = POLLOUT, };
	int written;

	while (len) {
		/*
		 * in a real app you'd be more clever with poll of course,
		 * here we are basically just blocking on output room and
		 * not using the free time for anything interesting.
		 */
		if (poll(&pfd, 1, -1) < 0)
			return error("poll");

		written = vmsplice(fd, buffer, min(SPLICE_SIZE, len), 0);

		if (written <= 0)
			return error("vmsplice");

		len -= written;
	}

	return 0;
}

int main(int argc, char *argv[])
{
	unsigned char *buffer;
	struct stat sb;
	long page_size;
	int i, ret;

	if (fstat(STDOUT_FILENO, &sb) < 0)
		return error("stat");
	if (!S_ISFIFO(sb.st_mode)) {
		fprintf(stderr, "stdout must be a pipe\n");
		return 1;
	}

	ret = fcntl(STDOUT_FILENO, F_GETPSZ);
	if (ret < 0)
		return error("F_GETPSZ");

	page_size = sysconf(_SC_PAGESIZE);
	if (page_size < 0)
		return error("_SC_PAGESIZE");

	fprintf(stderr, "Pipe size: %d pages / %ld bytes\n", ret, ret * page_size);

	buffer = ALIGN(malloc(2 * SPLICE_SIZE + ALIGN_MASK));
	for (i = 0; i < 2 * SPLICE_SIZE; i++)
		buffer[i] = (i & 0xff);

	do {
		/*
		 * vmsplice the first half of the buffer into the pipe
		 */
		if (do_vmsplice(STDOUT_FILENO, buffer, SPLICE_SIZE))
			break;

		/*
		 * first half is now in pipe, but we don't quite know when
		 * we can reuse it.
		 */

		/*
		 * vmsplice second half
		 */
		if (do_vmsplice(STDOUT_FILENO, buffer + SPLICE_SIZE, SPLICE_SIZE))
			break;

		/*
		 * We still don't know when we can reuse the second half of
		 * the buffer, but we do now know that all parts of the first
		 * half have been consumed from the pipe - so we can reuse that.
		 */
	} while (0);

	return 0;
}

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] sys_vmsplice
  2006-04-21  8:02 [PATCH] sys_vmsplice Jens Axboe
@ 2006-04-21  9:17 ` Andrew Morton
  2006-04-21  9:21   ` Jens Axboe
  2006-04-21 20:45 ` [PATCH] sys_vmsplice Linus Torvalds
  1 sibling, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2006-04-21  9:17 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, torvalds, davem

Jens Axboe <axboe@suse.de> wrote:
>
> ...
>
> sys_splice() moves data to/from pipes with a file input/output. sys_vmsplice()
> moves data to a pipe, with the input being a user address range instead.
> 
> ...
>
> +static void *user_page_pipe_buf_map(struct file *file,
> +				    struct pipe_inode_info *pipe,
> +				    struct pipe_buffer *buf)
> +{
> +	return kmap(buf->page);
> +}
> +
> +static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
> +				     struct pipe_buffer *buf)
> +{
> +	kunmap(buf->page);
> +}

All these new kmaps and kunmaps hurt.  It's supposed to be
kinda-deprecated and scales poorly.  Oh well.

> -	while (i < nr_pages)
> -		page_cache_release(pages[i++]);
> +	while (i < spd->nr_pages)
> +		page_cache_release(spd->pages[i++]);

grumble.  I'm going to march over there and take away your `i' key.  Or
make it emit "page_nr".

> +static long do_vmsplice(struct file *file, void __user *buffer, size_t len,
> +			unsigned int flags)
> +{
> +	unsigned long uaddr = (unsigned long) buffer;
> +	struct pipe_inode_info *pipe;
> +	struct page *pages[PIPE_BUFFERS];
> +	unsigned int nr_pages;
> +	struct splice_pipe_desc spd = {
> +		.pages = pages,
> +		.len = len,
> +		.flags = flags,
> +		.ops = &user_page_pipe_buf_ops,
> +	};
> +
> +	pipe = file->f_dentry->d_inode->i_pipe;
> +	if (unlikely(!pipe))
> +		return -EBADF;
> +
> +	spd.offset = uaddr & ~PAGE_CACHE_MASK;
> +	nr_pages = (len + spd.offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

Can I feed this 16TB of memory?   If so, we just overflowed `nr_pages'.

> --- a/include/asm-powerpc/unistd.h
> +++ b/include/asm-powerpc/unistd.h
> @@ -303,8 +303,9 @@ #define __NR_ppoll		281
>  #define __NR_unshare		282
>  #define __NR_splice		283
>  #define __NR_tee		284
> +#define __NR_vmsplice		285
>  
> -#define __NR_syscalls		285
> +#define __NR_syscalls		286
>  

Every time we do this the spufs build breaks:

long spu_sys_callback(struct spu_syscall_block *s)
{
	long (*syscall)(u64 a1, u64 a2, u64 a3, u64 a4, u64 a5, u64 a6);

	BUILD_BUG_ON(ARRAY_SIZE(spu_syscall_table) != __NR_syscalls);



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sys_vmsplice
  2006-04-21  9:17 ` Andrew Morton
@ 2006-04-21  9:21   ` Jens Axboe
  2006-04-21  9:25     ` Andrew Morton
  0 siblings, 1 reply; 11+ messages in thread
From: Jens Axboe @ 2006-04-21  9:21 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, torvalds, davem

On Fri, Apr 21 2006, Andrew Morton wrote:
> Jens Axboe <axboe@suse.de> wrote:
> >
> > ...
> >
> > sys_splice() moves data to/from pipes with a file input/output. sys_vmsplice()
> > moves data to a pipe, with the input being a user address range instead.
> > 
> > ...
> >
> > +static void *user_page_pipe_buf_map(struct file *file,
> > +				    struct pipe_inode_info *pipe,
> > +				    struct pipe_buffer *buf)
> > +{
> > +	return kmap(buf->page);
> > +}
> > +
> > +static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
> > +				     struct pipe_buffer *buf)
> > +{
> > +	kunmap(buf->page);
> > +}
> 
> All these new kmaps and kunmaps hurt.  It's supposed to be
> kinda-deprecated and scales poorly.  Oh well.

I know, it's not a new complaint and I fully agree. It's not too bad to
fix, it just needs 1/2 new pipe_buffer_ops hooks to split the mapping
from the 'checking' part. For now we blissfully ignore highmem boxes and
rejoice on 64-bit machines :-)

> > -	while (i < nr_pages)
> > -		page_cache_release(pages[i++]);
> > +	while (i < spd->nr_pages)
> > +		page_cache_release(spd->pages[i++]);
> 
> grumble.  I'm going to march over there and take away your `i' key.  Or
> make it emit "page_nr".

;-)

> > +static long do_vmsplice(struct file *file, void __user *buffer, size_t len,
> > +			unsigned int flags)
> > +{
> > +	unsigned long uaddr = (unsigned long) buffer;
> > +	struct pipe_inode_info *pipe;
> > +	struct page *pages[PIPE_BUFFERS];
> > +	unsigned int nr_pages;
> > +	struct splice_pipe_desc spd = {
> > +		.pages = pages,
> > +		.len = len,
> > +		.flags = flags,
> > +		.ops = &user_page_pipe_buf_ops,
> > +	};
> > +
> > +	pipe = file->f_dentry->d_inode->i_pipe;
> > +	if (unlikely(!pipe))
> > +		return -EBADF;
> > +
> > +	spd.offset = uaddr & ~PAGE_CACHE_MASK;
> > +	nr_pages = (len + spd.offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> 
> Can I feed this 16TB of memory?   If so, we just overflowed `nr_pages'.

I'll make it a long type instead.

> > --- a/include/asm-powerpc/unistd.h
> > +++ b/include/asm-powerpc/unistd.h
> > @@ -303,8 +303,9 @@ #define __NR_ppoll		281
> >  #define __NR_unshare		282
> >  #define __NR_splice		283
> >  #define __NR_tee		284
> > +#define __NR_vmsplice		285
> >  
> > -#define __NR_syscalls		285
> > +#define __NR_syscalls		286
> >  
> 
> Every time we do this the spufs build breaks:
> 
> long spu_sys_callback(struct spu_syscall_block *s)
> {
> 	long (*syscall)(u64 a1, u64 a2, u64 a3, u64 a4, u64 a5, u64 a6);
> 
> 	BUILD_BUG_ON(ARRAY_SIZE(spu_syscall_table) != __NR_syscalls);

I'll leave the ppc syscall update out of it.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sys_vmsplice
  2006-04-21  9:21   ` Jens Axboe
@ 2006-04-21  9:25     ` Andrew Morton
  2006-04-21  9:52       ` Jens Axboe
  2006-04-21 10:41       ` Arnd Bergmann
  0 siblings, 2 replies; 11+ messages in thread
From: Andrew Morton @ 2006-04-21  9:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, torvalds, davem

Jens Axboe <axboe@suse.de> wrote:
>
>  > long spu_sys_callback(struct spu_syscall_block *s)
>  > {
>  > 	long (*syscall)(u64 a1, u64 a2, u64 a3, u64 a4, u64 a5, u64 a6);
>  > 
>  > 	BUILD_BUG_ON(ARRAY_SIZE(spu_syscall_table) != __NR_syscalls);
> 
>  I'll leave the ppc syscall update out of it.

It might be better to just stick the new entry into the spufs table, make
sure that the powerpc guys see it go in.  That way, ppc64 people (Linus,
maybe you?) can test it.

I guess mapping it onto sys_ni_syscall would be safest.

(It's been broken since sys_tee went in, btw).

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sys_vmsplice
  2006-04-21  9:25     ` Andrew Morton
@ 2006-04-21  9:52       ` Jens Axboe
  2006-04-21 10:41       ` Arnd Bergmann
  1 sibling, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2006-04-21  9:52 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, torvalds, davem

On Fri, Apr 21 2006, Andrew Morton wrote:
> Jens Axboe <axboe@suse.de> wrote:
> >
> >  > long spu_sys_callback(struct spu_syscall_block *s)
> >  > {
> >  > 	long (*syscall)(u64 a1, u64 a2, u64 a3, u64 a4, u64 a5, u64 a6);
> >  > 
> >  > 	BUILD_BUG_ON(ARRAY_SIZE(spu_syscall_table) != __NR_syscalls);
> > 
> >  I'll leave the ppc syscall update out of it.
> 
> It might be better to just stick the new entry into the spufs table, make
> sure that the powerpc guys see it go in.  That way, ppc64 people (Linus,
> maybe you?) can test it.

No ppc64 here unfortunately, and my ppc box (powerbook) has since moved
on to more relaxing pastures (my wife inherited it). But certainly, I'll
update it then.

> I guess mapping it onto sys_ni_syscall would be safest.
> 
> (It's been broken since sys_tee went in, btw).

Noted.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sys_vmsplice
  2006-04-21  9:25     ` Andrew Morton
  2006-04-21  9:52       ` Jens Axboe
@ 2006-04-21 10:41       ` Arnd Bergmann
  2006-04-21 10:45         ` [PATCH] fix spu_callbacks BUILD_BUG_ON Arnd Bergmann
  1 sibling, 1 reply; 11+ messages in thread
From: Arnd Bergmann @ 2006-04-21 10:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jens Axboe, linux-kernel, torvalds, davem, Paul Mackerras,
	linuxppc-dev

On Friday 21 April 2006 11:25, Andrew Morton wrote:
> It might be better to just stick the new entry into the spufs table, make
> sure that the powerpc guys see it go in.  That way, ppc64 people (Linus,
> maybe you?) can test it.
> 
> I guess mapping it onto sys_ni_syscall would be safest.
> 
> (It's been broken since sys_tee went in, btw).

The BUILD_BUG_ON in there was just overkill. How about if we just add a
small comment to the systbl on powerpc to remind people about the fact
that there is another file to edit? Patch follows.

	Arnd <><

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] fix spu_callbacks BUILD_BUG_ON
  2006-04-21 10:41       ` Arnd Bergmann
@ 2006-04-21 10:45         ` Arnd Bergmann
  2006-04-21 13:14           ` Jan Engelhardt
  0 siblings, 1 reply; 11+ messages in thread
From: Arnd Bergmann @ 2006-04-21 10:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jens Axboe, linux-kernel, torvalds, davem, Paul Mackerras,
	linuxppc-dev

Every time a new syscall gets added, a BUILD_BUG_ON in
arch/powerpc/platforms/cell/spu_callbacks.c gets triggered.
Since the addition of a new syscall is rather harmless,
the error should just be removed.

While we're here, add sys_tee to the list and add a comment
to systbl.S to remind people that there is another list
on powerpc.

Signed-of-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>

--- linus-2.6.orig/arch/powerpc/platforms/cell/spu_callbacks.c
+++ linus-2.6/arch/powerpc/platforms/cell/spu_callbacks.c
@@ -317,17 +317,16 @@ void *spu_syscall_table[] = {
 	[__NR_ppoll]			sys_ni_syscall, /* sys_ppoll */
 	[__NR_unshare]			sys_unshare,
 	[__NR_splice]			sys_splice,
+	[__NR_tee]			sys_tee,
 };
 
 long spu_sys_callback(struct spu_syscall_block *s)
 {
 	long (*syscall)(u64 a1, u64 a2, u64 a3, u64 a4, u64 a5, u64 a6);
 
-	BUILD_BUG_ON(ARRAY_SIZE(spu_syscall_table) != __NR_syscalls);
-
 	syscall = spu_syscall_table[s->nr_ret];
 
-	if (s->nr_ret >= __NR_syscalls) {
+	if (s->nr_ret >= ARRAY_SIZE(spu_syscall_table)) {
 		pr_debug("%s: invalid syscall #%ld", __FUNCTION__, s->nr_ret);
 		return -ENOSYS;
 	}
Index: linus-2.6/arch/powerpc/kernel/systbl.S
===================================================================
--- linus-2.6.orig/arch/powerpc/kernel/systbl.S
+++ linus-2.6/arch/powerpc/kernel/systbl.S
@@ -324,3 +324,8 @@ COMPAT_SYS(ppoll)
 SYSCALL(unshare)
 SYSCALL(splice)
 SYSCALL(tee)
+
+/*
+ * please add new calls to arch/powerpc/platforms/cell/spu_callbacks.c
+ * as well when appropriate.
+ */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] fix spu_callbacks BUILD_BUG_ON
  2006-04-21 10:45         ` [PATCH] fix spu_callbacks BUILD_BUG_ON Arnd Bergmann
@ 2006-04-21 13:14           ` Jan Engelhardt
  2006-04-21 13:52             ` Arnd Bergmann
  0 siblings, 1 reply; 11+ messages in thread
From: Jan Engelhardt @ 2006-04-21 13:14 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Andrew Morton, Jens Axboe, linux-kernel, torvalds, davem,
	Paul Mackerras, linuxppc-dev


>--- linus-2.6.orig/arch/powerpc/platforms/cell/spu_callbacks.c
>+++ linus-2.6/arch/powerpc/platforms/cell/spu_callbacks.c
>@@ -317,17 +317,16 @@ void *spu_syscall_table[] = {
> 	[__NR_ppoll]			sys_ni_syscall, /* sys_ppoll */
> 	[__NR_unshare]			sys_unshare,
> 	[__NR_splice]			sys_splice,
>+	[__NR_tee]			sys_tee,

+      [__NR_syscalls] = NULL,

> };
> 
> long spu_sys_callback(struct spu_syscall_block *s)
> {
> 	long (*syscall)(u64 a1, u64 a2, u64 a3, u64 a4, u64 a5, u64 a6);
> 
>-	BUILD_BUG_ON(ARRAY_SIZE(spu_syscall_table) != __NR_syscalls);
>-
> 	syscall = spu_syscall_table[s->nr_ret];
> 
>-	if (s->nr_ret >= __NR_syscalls) {
>+	if (s->nr_ret >= ARRAY_SIZE(spu_syscall_table)) {

+       if(syscall == NULL) {



That way, syscalls could be added in the master table while spu does not 
break. Comments?


Jan Engelhardt
-- 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] fix spu_callbacks BUILD_BUG_ON
  2006-04-21 13:14           ` Jan Engelhardt
@ 2006-04-21 13:52             ` Arnd Bergmann
  0 siblings, 0 replies; 11+ messages in thread
From: Arnd Bergmann @ 2006-04-21 13:52 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Andrew Morton, Jens Axboe, linux-kernel, torvalds, davem,
	Paul Mackerras, linuxppc-dev

On Friday 21 April 2006 15:14, Jan Engelhardt wrote:
> +      [__NR_syscalls] = NULL,
> 
> > };
> > 

> >-      if (s->nr_ret >= __NR_syscalls) {
> >+      if (s->nr_ret >= ARRAY_SIZE(spu_syscall_table)) {
> 
> +       if(syscall == NULL) {
> 
> 
> 
> That way, syscalls could be added in the master table while spu does not 
> break. Comments?

Hmm, my idea was not having to check for NULL pointers when we know
that they are valid function calls. But you are right that your approach
is more robust. It also means that we might just leave out all
the assignments to sys_ni_syscall in order to make the source a little
shorter.

	Arnd <><

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sys_vmsplice
  2006-04-21  8:02 [PATCH] sys_vmsplice Jens Axboe
  2006-04-21  9:17 ` Andrew Morton
@ 2006-04-21 20:45 ` Linus Torvalds
  2006-04-21 21:01   ` Jens Axboe
  1 sibling, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2006-04-21 20:45 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, davem



On Fri, 21 Apr 2006, Jens Axboe wrote:
> 
> Here's the first implementation of sys_vmsplice(). I'm attaching a
> little test app as well for playing with it, it's also committed to the
> splice tools repo at:

Btw, I think we'll have to think a bit more about the buffer size issues:

> @@ -345,6 +346,13 @@ static long do_fcntl(int fd, unsigned in
>  	case F_NOTIFY:
>  		err = fcntl_dirnotify(fd, filp, arg);
>  		break;
> +	case F_SETPSZ:
> +		err = -EINVAL;
> +		break;
> +	case F_GETPSZ:
> +		/* this assumes user space can reliably tell PAGE_CACHE_SIZE */
> +		err = PIPE_BUFFERS;
> +		break;

The above obviously isn't incorrect, but it really isn't enough to handle 
the case I was talking about - for somebody to be able to re-use a 
vmsplice'd page, it's not enough that his buffer is bigger than the pipe 
buffer, it needs to be bigger than the end-to-end buffer. Or rather, 
bigger than the "end-to-copy" buffer.

Ie there may be more buffering after the local pipe buffer - other pipes, 
network buffers, etc etc.

It may even be that my idea to try to limit the buffer size just isn't 
workable, and that we need to do it in a page-per-page basis (ie instead 
of depending on the buffer sizes, actually inspect the page and wait for 
it to be flushed all the way).

So while thinking about that, it might be best to forget about the pipe 
buffer size issue until the final solution is clearer. The pipe buffer may 
well be a _part_ of it, though.

		Linus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sys_vmsplice
  2006-04-21 20:45 ` [PATCH] sys_vmsplice Linus Torvalds
@ 2006-04-21 21:01   ` Jens Axboe
  0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2006-04-21 21:01 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, davem

On Fri, Apr 21 2006, Linus Torvalds wrote:
> 
> 
> On Fri, 21 Apr 2006, Jens Axboe wrote:
> > 
> > Here's the first implementation of sys_vmsplice(). I'm attaching a
> > little test app as well for playing with it, it's also committed to the
> > splice tools repo at:
> 
> Btw, I think we'll have to think a bit more about the buffer size issues:
> 
> > @@ -345,6 +346,13 @@ static long do_fcntl(int fd, unsigned in
> >  	case F_NOTIFY:
> >  		err = fcntl_dirnotify(fd, filp, arg);
> >  		break;
> > +	case F_SETPSZ:
> > +		err = -EINVAL;
> > +		break;
> > +	case F_GETPSZ:
> > +		/* this assumes user space can reliably tell PAGE_CACHE_SIZE */
> > +		err = PIPE_BUFFERS;
> > +		break;
> 
> The above obviously isn't incorrect, but it really isn't enough to handle 
> the case I was talking about - for somebody to be able to re-use a 
> vmsplice'd page, it's not enough that his buffer is bigger than the pipe 
> buffer, it needs to be bigger than the end-to-end buffer. Or rather, 
> bigger than the "end-to-copy" buffer.
> 
> Ie there may be more buffering after the local pipe buffer - other pipes, 
> network buffers, etc etc.
> 
> It may even be that my idea to try to limit the buffer size just isn't 
> workable, and that we need to do it in a page-per-page basis (ie instead 
> of depending on the buffer sizes, actually inspect the page and wait for 
> it to be flushed all the way).
> 
> So while thinking about that, it might be best to forget about the pipe 
> buffer size issue until the final solution is clearer. The pipe buffer may 
> well be a _part_ of it, though.

Obviously the posted patch doesn't handle the reuse case yet. If you
play with the splice tools and do:

# vmsplice | ktee-net bla port | cat > file

with the vmsplice modified to memset the first half of the buffer after
the second half has been vmspliced, you'll see bad data going out over
the network.

One way that works fine but is hackish is just to monitor the page
count. So don't rely solely on reusing the page once ->release has been
called, but also check buf->page on reuse if the ->release function
hasn't cleared ->page (the page cache ops would, the user_page would
not) so move_to_pipe() can break out appropriately just like if
pipe->nr_buffers was too large. Works, is easily doable, but isn't a
very nice design - it basically works around the problem that
->release() is called too early.

The better way is imho to forget ->sendpage() and do a proper hook that
actually accepts the pipe buffer. Then you don't have to work around
premature release. And hey, it'd be more efficient too. With tee and
friends, you'd want to keep them referenced as well.

This all provided that the method of choice for signalling safe reuse is
still either blocking on vmsplice or using poll() intelligently to do
that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2006-04-21 21:01 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-04-21  8:02 [PATCH] sys_vmsplice Jens Axboe
2006-04-21  9:17 ` Andrew Morton
2006-04-21  9:21   ` Jens Axboe
2006-04-21  9:25     ` Andrew Morton
2006-04-21  9:52       ` Jens Axboe
2006-04-21 10:41       ` Arnd Bergmann
2006-04-21 10:45         ` [PATCH] fix spu_callbacks BUILD_BUG_ON Arnd Bergmann
2006-04-21 13:14           ` Jan Engelhardt
2006-04-21 13:52             ` Arnd Bergmann
2006-04-21 20:45 ` [PATCH] sys_vmsplice Linus Torvalds
2006-04-21 21:01   ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox