linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] splice support #2
@ 2006-03-30 10:06 Jens Axboe
  2006-03-30 10:16 ` Andrew Morton
  2006-03-30 12:00 ` Ingo Molnar
  0 siblings, 2 replies; 48+ messages in thread
From: Jens Axboe @ 2006-03-30 10:06 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, torvalds

Hi,

This patch should resolve all issues mentioned so far. I'd still like to
implement the page moving, but that should just be a separate patch.
After this round of patching, I thought I'd toss a fresh version out
there for all to look at.

Signed-off-by: Jens Axboe <axboe@suse.de>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 326595f..ce3ef4f 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -312,3 +312,4 @@ ENTRY(sys_call_table)
 	.long sys_unshare		/* 310 */
 	.long sys_set_robust_list
 	.long sys_get_robust_list
+	.long sys_splice
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 0e3eda9..14480d9 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1605,5 +1605,6 @@ sys_call_table:
 	data8 sys_ni_syscall			// reserved for pselect
 	data8 sys_ni_syscall			// 1295 reserved for ppoll
 	data8 sys_unshare
+	date8 sys_splice
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
diff --git a/fs/Makefile b/fs/Makefile
index 080b386..f3a4f70 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -10,7 +10,7 @@ obj-y :=	open.o read_write.o file_table.
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o pnode.o drop_caches.o
+		ioprio.o pnode.o drop_caches.o splice.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 509ccec..23e2c7c 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -53,6 +53,8 @@ const struct file_operations ext2_file_o
 	.readv		= generic_file_readv,
 	.writev		= generic_file_writev,
 	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 };
 
 #ifdef CONFIG_EXT2_FS_XIP
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 783a796..1efefb6 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -119,6 +119,8 @@ const struct file_operations ext3_file_o
 	.release	= ext3_release_file,
 	.fsync		= ext3_sync_file,
 	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 };
 
 struct inode_operations ext3_file_inode_operations = {
diff --git a/fs/pipe.c b/fs/pipe.c
index e2f4f1d..aef9ad7 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -15,6 +15,7 @@ #include <linux/mount.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
 #include <linux/highmem.h>
+#include <linux/pagemap.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -94,11 +95,20 @@ static void anon_pipe_buf_release(struct
 {
 	struct page *page = buf->page;
 
-	if (info->tmp_page) {
-		__free_page(page);
+	/*
+	 * If nobody else uses this page, and we don't already have a
+	 * temporary page, let's keep track of it as a one-deep
+	 * allocation cache
+	 */
+	if (page_count(page) == 1 && !info->tmp_page) {
+		info->tmp_page = page;
 		return;
 	}
-	info->tmp_page = page;
+
+	/*
+	 * Otherwise just release our reference to it
+	 */
+	page_cache_release(page);
 }
 
 static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf)
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 010094d..cf6e1cf 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1576,6 +1576,8 @@ const struct file_operations reiserfs_fi
 	.sendfile = generic_file_sendfile,
 	.aio_read = generic_file_aio_read,
 	.aio_write = reiserfs_aio_write,
+	.splice_read = generic_file_splice_read,
+	.splice_write = generic_file_splice_write,
 };
 
 struct inode_operations reiserfs_file_inode_operations = {
diff --git a/fs/splice.c b/fs/splice.c
new file mode 100644
index 0000000..b63db3f
--- /dev/null
+++ b/fs/splice.c
@@ -0,0 +1,602 @@
+/*
+ * "splice": joining two ropes together by interweaving their strands.
+ *
+ * This is the "extended pipe" functionality, where a pipe is used as
+ * an arbitrary in-memory buffer. Think of a pipe as a small kernel
+ * buffer that you can use to transfer data from one end to the other.
+ *
+ * The traditional unix read/write is extended with a "splice()" operation
+ * that transfers data buffers to or from a pipe buffer.
+ *
+ * Named by Larry McVoy, original implementation from Linus, extended by
+ * Jens to support splicing to files and fixing the initial implementation
+ * bugs.
+ *
+ * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
+ *
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mm_inline.h>
+
+/*
+ * Passed to the actors
+ */
+struct splice_desc {
+	unsigned int len, total_len;	/* current and remaining length */
+	unsigned int flags;		/* splice flags */
+	struct file *file;		/* file to read/write */
+	loff_t pos;			/* file position */
+};
+
+static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
+					struct pipe_buffer *buf)
+{
+	page_cache_release(buf->page);
+	buf->page = NULL;
+}
+
+static void *page_cache_pipe_buf_map(struct file *file,
+				     struct pipe_inode_info *info,
+				     struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+
+	lock_page(page);
+
+	if (!PageUptodate(page)) {
+		unlock_page(page);
+		return NULL;
+	}
+
+	return kmap(buf->page);
+}
+
+static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
+				      struct pipe_buffer *buf)
+{
+	unlock_page(buf->page);
+	kunmap(buf->page);
+}
+
+static struct pipe_buf_operations page_cache_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = page_cache_pipe_buf_map,
+	.unmap = page_cache_pipe_buf_unmap,
+	.release = page_cache_pipe_buf_release,
+};
+
+static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
+			    int nr_pages, unsigned long offset,
+			    unsigned long len)
+{
+	struct pipe_inode_info *info;
+	int ret, do_wakeup, i;
+
+	ret = 0;
+	do_wakeup = 0;
+	i = 0;
+
+	mutex_lock(PIPE_MUTEX(*inode));
+
+	info = inode->i_pipe;
+	for (;;) {
+		int bufs;
+
+		if (!PIPE_READERS(*inode)) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+
+		bufs = info->nrbufs;
+		if (bufs < PIPE_BUFFERS) {
+			int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
+			struct pipe_buffer *buf = info->bufs + newbuf;
+			struct page *page = pages[i++];
+			unsigned long this_len;
+
+			this_len = PAGE_CACHE_SIZE - offset;
+			if (this_len > len)
+				this_len = len;
+
+			buf->page = page;
+			buf->offset = offset;
+			buf->len = this_len;
+			buf->ops = &page_cache_pipe_buf_ops;
+			info->nrbufs = ++bufs;
+			do_wakeup = 1;
+
+			ret += this_len;
+			len -= this_len;
+			offset = 0;
+			if (!--nr_pages)
+				break;
+			if (!len)
+				break;
+			if (bufs < PIPE_BUFFERS)
+				continue;
+
+			break;
+		}
+
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (do_wakeup) {
+			wake_up_interruptible_sync(PIPE_WAIT(*inode));
+			kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
+				    POLL_IN);
+			do_wakeup = 0;
+		}
+
+		PIPE_WAITING_WRITERS(*inode)++;
+		pipe_wait(inode);
+		PIPE_WAITING_WRITERS(*inode)--;
+	}
+
+	mutex_unlock(PIPE_MUTEX(*inode));
+
+	if (do_wakeup) {
+		wake_up_interruptible(PIPE_WAIT(*inode));
+		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+	}
+
+	while (i < nr_pages)
+		page_cache_release(pages[i++]);
+
+	return ret;
+}
+
+static int __generic_file_splice_read(struct file *in, struct inode *pipe,
+				      size_t len)
+{
+	struct address_space *mapping = in->f_mapping;
+	unsigned int offset, nr_pages;
+	struct page *pages[PIPE_BUFFERS];
+	struct page *page;
+	pgoff_t index, pidx;
+	int i;
+
+	index = in->f_pos >> PAGE_CACHE_SHIFT;
+	offset = in->f_pos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	if (nr_pages > PIPE_BUFFERS)
+		nr_pages = PIPE_BUFFERS;
+
+	/*
+	 * initiate read-ahead on this page range
+	 */
+	do_page_cache_readahead(mapping, in, index, nr_pages);
+
+	/*
+	 * Get as many pages from the page cache as possible..
+	 * Start IO on the page cache entries we create (we
+	 * can assume that any pre-existing ones we find have
+	 * already had IO started on them).
+	 */
+	i = find_get_pages(mapping, index, nr_pages, pages);
+
+	/*
+	 * common case - we found all pages and they are contiguous,
+	 * kick them off
+	 */
+	if (i && (pages[i - 1]->index == index + i - 1))
+		goto splice_them;
+
+	memset(&pages[i], 0, (nr_pages - i) * sizeof(struct page *));
+
+	/*
+	 * find_get_pages() may not return consecutive pages, so loop
+	 * over the array moving pages and filling the rest, if need be.
+	 */
+	for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
+		if (!pages[i]) {
+			int error;
+fill_page:
+			/*
+			 * no page there, look one up / create it
+			 */
+			page = find_or_create_page(mapping, pidx,
+						   mapping_gfp_mask(mapping));
+			if (!page)
+				break;
+
+			if (PageUptodate(page))
+				unlock_page(page);
+			else {
+				error = mapping->a_ops->readpage(in, page);
+
+				if (unlikely(error)) {
+					page_cache_release(page);
+					break;
+				}
+			}
+			pages[i] = page;
+		} else if (pages[i]->index != pidx) {
+			page = pages[i];
+			/*
+			 * page isn't in the right spot, move it and jump
+			 * back to filling this one. we know that ->index
+			 * is larger than pidx
+			 */
+			pages[i + page->index - pidx] = page;
+			pages[i] = NULL;
+			goto fill_page;
+		}
+	}
+
+	if (!i)
+		return 0;
+
+	/*
+	 * Now we splice them into the pipe..
+	 */
+splice_them:
+	return move_to_pipe(pipe, pages, i, offset, len);
+}
+
+ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
+				 size_t len, unsigned int flags)
+{
+	ssize_t spliced;
+	int ret;
+
+	ret = 0;
+	spliced = 0;
+	while (len) {
+		ret = __generic_file_splice_read(in, pipe, len);
+
+		if (ret <= 0)
+			break;
+
+		in->f_pos += ret;
+		len -= ret;
+		spliced += ret;
+	}
+
+	if (spliced)
+		return spliced;
+
+	return ret;
+}
+
+/*
+ * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage().
+ */
+static int pipe_to_sendpage(struct pipe_inode_info *info,
+			    struct pipe_buffer *buf, struct splice_desc *sd)
+{
+	struct file *file = sd->file;
+	loff_t pos = sd->pos;
+	unsigned int offset;
+	ssize_t ret;
+
+	/*
+	 * sub-optimal, but we are limited by the pipe ->map. we don't
+	 * need a kmap'ed buffer here, we just want to make sure we
+	 * have the page pinned if the pipe page originates from the
+	 * page cache
+	 */
+	if (!buf->ops->map(file, info, buf))
+		return -EIO;
+
+	offset = pos & ~PAGE_CACHE_MASK;
+
+	ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
+					sd->len < sd->total_len);
+
+	buf->ops->unmap(info, buf);
+	if (ret == sd->len)
+		return 0;
+
+	return -EIO;
+}
+
+/*
+ * This is a little more tricky than the file -> pipe splicing. There are
+ * basically three cases:
+ *
+ *	- Destination page already exists in the address space and there
+ *	  are users of it. For that case we have no other option that
+ *	  copying the data. Tough luck.
+ *	- Destination page already exists in the address space, but there
+ *	  are no users of it. Make sure it's uptodate, then drop it. Fall
+ *	  through to last case.
+ *	- Destination page does not exist, we can add the pipe page to
+ *	  the page cache and avoid the copy.
+ *
+ * For now we just do the slower thing and always copy pages over, it's
+ * easier than migrating pages from the pipe to the target file. For the
+ * case of doing file | file splicing, the migrate approach had some LRU
+ * nastiness...
+ */
+static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
+			struct splice_desc *sd)
+{
+	struct file *file = sd->file;
+	struct address_space *mapping = file->f_mapping;
+	unsigned int offset;
+	struct page *page;
+	char *src, *dst;
+	pgoff_t index;
+	int ret;
+
+	/*
+	 * after this, page will be locked and unmapped
+	 */
+	src = buf->ops->map(file, info, buf);
+	if (!src)
+		return -EIO;
+
+	index = sd->pos >> PAGE_CACHE_SHIFT;
+	offset = sd->pos & ~PAGE_CACHE_MASK;
+
+find_page:
+	ret = -ENOMEM;
+	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
+	if (!page)
+		goto out;
+
+	/*
+	 * If the page is uptodate, it is also locked. If it isn't
+	 * uptodate, we can mark it uptodate if we are filling the
+	 * full page. Otherwise we need to read it in first...
+	 */
+	if (!PageUptodate(page)) {
+		if (sd->len < PAGE_CACHE_SIZE) {
+			ret = mapping->a_ops->readpage(file, page);
+			if (unlikely(ret))
+				goto out;
+
+			lock_page(page);
+
+			if (!PageUptodate(page)) {
+				/*
+				 * page got invalidated, repeat
+				 */
+				if (page->mapping == NULL) {
+					unlock_page(page);
+					page_cache_release(page);
+					goto find_page;
+				}
+				ret = -EIO;
+				goto out;
+			}
+		} else {
+			WARN_ON(!PageLocked(page));
+			SetPageUptodate(page);
+		}
+	}
+
+	ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
+	if (ret)
+		goto out;
+
+	dst = kmap_atomic(page, KM_USER0);
+	memcpy(dst + offset, src + buf->offset, sd->len);
+	flush_dcache_page(page);
+	kunmap_atomic(dst, KM_USER0);
+
+	ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
+	if (ret < 0)
+		goto out;
+
+	set_page_dirty(page);
+	ret = write_one_page(page, 0);
+out:
+	if (ret < 0)
+		unlock_page(page);
+	page_cache_release(page);
+	buf->ops->unmap(info, buf);
+	return ret;
+}
+
+typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
+			   struct splice_desc *);
+
+static ssize_t move_from_pipe(struct inode *inode, struct file *out,
+			      size_t len, unsigned int flags,
+			      splice_actor *actor)
+{
+	struct pipe_inode_info *info;
+	int ret, do_wakeup, err;
+	struct splice_desc sd;
+
+	ret = 0;
+	do_wakeup = 0;
+
+	sd.total_len = len;
+	sd.flags = flags;
+	sd.file = out;
+	sd.pos = out->f_pos;
+
+	mutex_lock(PIPE_MUTEX(*inode));
+
+	info = inode->i_pipe;
+	for (;;) {
+		int bufs = info->nrbufs;
+
+		if (bufs) {
+			int curbuf = info->curbuf;
+			struct pipe_buffer *buf = info->bufs + curbuf;
+			struct pipe_buf_operations *ops = buf->ops;
+
+			sd.len = buf->len;
+			if (sd.len > sd.total_len)
+				sd.len = sd.total_len;
+
+			err = actor(info, buf, &sd);
+			if (err) {
+				if (!ret)
+					ret = err;
+
+				break;
+			}
+
+			ret += sd.len;
+			buf->offset += sd.len;
+			buf->len -= sd.len;
+			if (!buf->len) {
+				buf->ops = NULL;
+				ops->release(info, buf);
+				curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
+				info->curbuf = curbuf;
+				info->nrbufs = --bufs;
+				do_wakeup = 1;
+			}
+
+			sd.pos += sd.len;
+			sd.total_len -= sd.len;
+			if (!sd.total_len)
+				break;
+		}
+
+		if (bufs)
+			continue;
+		if (!PIPE_WRITERS(*inode))
+			break;
+		if (!PIPE_WAITING_WRITERS(*inode)) {
+			if (ret)
+				break;
+		}
+
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (do_wakeup) {
+			wake_up_interruptible_sync(PIPE_WAIT(*inode));
+			kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
+			do_wakeup = 0;
+		}
+
+		pipe_wait(inode);
+	}
+
+	mutex_unlock(PIPE_MUTEX(*inode));
+
+	if (do_wakeup) {
+		wake_up_interruptible(PIPE_WAIT(*inode));
+		kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
+	}
+
+	mutex_lock(&out->f_mapping->host->i_mutex);
+	out->f_pos = sd.pos;
+	mutex_unlock(&out->f_mapping->host->i_mutex);
+	return ret;
+
+}
+
+ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
+				  size_t len, unsigned int flags)
+{
+	return move_from_pipe(inode, out, len, flags, pipe_to_file);
+}
+
+ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
+				size_t len, unsigned int flags)
+{
+	return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
+}
+
+static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
+			   unsigned int flags)
+{
+	loff_t pos;
+	int ret;
+
+	if (!out->f_op || !out->f_op->splice_write)
+		return -EINVAL;
+
+	if (!(out->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	pos = out->f_pos;
+	ret = rw_verify_area(WRITE, out, &pos, len);
+	if (unlikely(ret < 0))
+		return ret;
+
+	return out->f_op->splice_write(pipe, out, len, flags);
+}
+
+static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
+			 unsigned int flags)
+{
+	loff_t pos, isize, left;
+	int ret;
+
+	if (!in->f_op || !in->f_op->splice_read)
+		return -EINVAL;
+
+	if (!(in->f_mode & FMODE_READ))
+		return -EBADF;
+
+	pos = in->f_pos;
+	ret = rw_verify_area(READ, in, &pos, len);
+	if (unlikely(ret < 0))
+		return ret;
+
+	isize = i_size_read(in->f_mapping->host);
+	if (unlikely(in->f_pos >= isize))
+		return 0;
+	
+	left = isize - in->f_pos;
+	if (left < len)
+		len = left;
+
+	return in->f_op->splice_read(in, pipe, len, flags);
+}
+
+static long do_splice(struct file *in, struct file *out, size_t len,
+		      unsigned int flags)
+{
+	struct inode *pipe;
+
+	pipe = in->f_dentry->d_inode;
+	if (pipe->i_pipe)
+		return do_splice_from(pipe, out, len, flags);
+
+	pipe = out->f_dentry->d_inode;
+	if (pipe->i_pipe)
+		return do_splice_to(in, pipe, len, flags);
+
+	return -EINVAL;
+}
+
+asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
+{
+	long error;
+	struct file *in, *out;
+	int fput_in, fput_out;
+
+	if (unlikely(!len))
+		return 0;
+
+	error = -EBADF;
+	in = fget_light(fdin, &fput_in);
+	if (in) {
+		if (in->f_mode & FMODE_READ) {
+			out = fget_light(fdout, &fput_out);
+			if (out) {
+				if (out->f_mode & FMODE_WRITE)
+					error = do_splice(in, out, len, flags);
+				fput_light(out, fput_out);
+			}
+		}
+
+		fput_light(in, fput_in);
+	}
+
+	return error;
+}
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 014e356..789e9bd 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -318,8 +318,9 @@ #define __NR_ppoll		309
 #define __NR_unshare		310
 #define __NR_set_robust_list	311
 #define __NR_get_robust_list	312
+#define __NR_sys_splice		313
 
-#define NR_syscalls 313
+#define NR_syscalls 314
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 019956c..36070c1 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -285,12 +285,13 @@ #define __NR_fchmodat			1292
 #define __NR_faccessat			1293
 /* 1294, 1295 reserved for pselect/ppoll */
 #define __NR_unshare			1296
+#define __NR_splice			1297
 
 #ifdef __KERNEL__
 
 #include <linux/config.h>
 
-#define NR_syscalls			273 /* length of syscall table */
+#define NR_syscalls			274 /* length of syscall table */
 
 #define __ARCH_WANT_SYS_RT_SIGACTION
 
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index 1e99074..536ba08 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -301,8 +301,9 @@ #define __NR_spu_create		279
 #define __NR_pselect6		280
 #define __NR_ppoll		281
 #define __NR_unshare		282
+#define __NR_splice		283
 
-#define __NR_syscalls		283
+#define __NR_syscalls		284
 
 #ifdef __KERNEL__
 #define __NR__exit __NR_exit
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index fcc5163..f21ff2c 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -609,8 +609,10 @@ #define __NR_set_robust_list	273
 __SYSCALL(__NR_set_robust_list, sys_set_robust_list)
 #define __NR_get_robust_list	274
 __SYSCALL(__NR_get_robust_list, sys_get_robust_list)
+#define __NR_splice		275
+__SYSCALL(__NR_splice, sys_splice)
 
-#define __NR_syscall_max __NR_get_robust_list
+#define __NR_syscall_max __NR_splice
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 408fe89..20fa5f6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1032,6 +1032,8 @@ struct file_operations {
 	int (*check_flags)(int);
 	int (*dir_notify)(struct file *filp, unsigned long arg);
 	int (*flock) (struct file *, int, struct file_lock *);
+	ssize_t (*splice_write)(struct inode *, struct file *, size_t, unsigned int);
+	ssize_t (*splice_read)(struct file *, struct inode *, size_t, unsigned int);
 };
 
 struct inode_operations {
@@ -1609,6 +1611,8 @@ extern ssize_t generic_file_sendfile(str
 extern void do_generic_mapping_read(struct address_space *mapping,
 				    struct file_ra_state *, struct file *,
 				    loff_t *, read_descriptor_t *, read_actor_t);
+extern ssize_t generic_file_splice_read(struct file *, struct inode *, size_t, unsigned int);
+extern ssize_t generic_file_splice_write(struct inode *, struct file *, size_t, unsigned int);
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e487e3b..e78ffc7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -569,5 +569,7 @@ asmlinkage long compat_sys_newfstatat(un
 asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename,
 				   int flags, int mode);
 asmlinkage long sys_unshare(unsigned long unshare_flags);
+asmlinkage long sys_splice(int fdin, int fdout, size_t len,
+				unsigned int flags);
 
 #endif
diff --git a/net/socket.c b/net/socket.c
index fcd77ea..b13042f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -119,7 +119,10 @@ static ssize_t sock_writev(struct file *
 static ssize_t sock_sendpage(struct file *file, struct page *page,
 			     int offset, size_t size, loff_t *ppos, int more);
 
+extern ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
+				size_t len, unsigned int flags);
 
+
 /*
  *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
  *	in the operation structures but are done directly via the socketcall() multiplexor.
@@ -141,7 +144,8 @@ #endif
 	.fasync =	sock_fasync,
 	.readv =	sock_readv,
 	.writev =	sock_writev,
-	.sendpage =	sock_sendpage
+	.sendpage =	sock_sendpage,
+	.splice_write = generic_splice_sendpage,
 };
 
 /*

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 10:06 Jens Axboe
@ 2006-03-30 10:16 ` Andrew Morton
  2006-03-30 10:24   ` Jens Axboe
  2006-03-30 12:00 ` Ingo Molnar
  1 sibling, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2006-03-30 10:16 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, torvalds

Jens Axboe <axboe@suse.de> wrote:
>
> Hi,
> 
> This patch should resolve all issues mentioned so far. I'd still like to
> implement the page moving, but that should just be a separate patch.
> After this round of patching, I thought I'd toss a fresh version out
> there for all to look at.
> 
> Signed-off-by: Jens Axboe <axboe@suse.de>
> 
> ...
>
> --- a/arch/ia64/kernel/entry.S
> +++ b/arch/ia64/kernel/entry.S
> @@ -1605,5 +1605,6 @@ sys_call_table:
>  	data8 sys_ni_syscall			// reserved for pselect
>  	data8 sys_ni_syscall			// 1295 reserved for ppoll
>  	data8 sys_unshare
> +	date8 sys_splice

typo

> +static void *page_cache_pipe_buf_map(struct file *file,
> +				     struct pipe_inode_info *info,
> +				     struct pipe_buffer *buf)
> +{
> +	struct page *page = buf->page;
> +
> +	lock_page(page);
> +
> +	if (!PageUptodate(page)) {

 || page->mapping == NULL

> +	struct page *pages[PIPE_BUFFERS];
> +	struct page *page;
> +	pgoff_t index, pidx;
> +	int i;
> +
> +	index = in->f_pos >> PAGE_CACHE_SHIFT;
> +	offset = in->f_pos & ~PAGE_CACHE_MASK;
> +	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> +
> +	if (nr_pages > PIPE_BUFFERS)
> +		nr_pages = PIPE_BUFFERS;
> +
> +	/*
> +	 * initiate read-ahead on this page range
> +	 */
> +	do_page_cache_readahead(mapping, in, index, nr_pages);
> +
> +	/*
> +	 * Get as many pages from the page cache as possible..
> +	 * Start IO on the page cache entries we create (we
> +	 * can assume that any pre-existing ones we find have
> +	 * already had IO started on them).
> +	 */
> +	i = find_get_pages(mapping, index, nr_pages, pages);
> +
> +	/*
> +	 * common case - we found all pages and they are contiguous,
> +	 * kick them off
> +	 */
> +	if (i && (pages[i - 1]->index == index + i - 1))
> +		goto splice_them;
> +
> +	memset(&pages[i], 0, (nr_pages - i) * sizeof(struct page *));
> +
> +	/*
> +	 * find_get_pages() may not return consecutive pages, so loop
> +	 * over the array moving pages and filling the rest, if need be.
> +	 */
> +	for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
> +		if (!pages[i]) {
> +			int error;
> +fill_page:
> +			/*
> +			 * no page there, look one up / create it
> +			 */
> +			page = find_or_create_page(mapping, pidx,
> +						   mapping_gfp_mask(mapping));
> +			if (!page)
> +				break;
> +
> +			if (PageUptodate(page))
> +				unlock_page(page);
> +			else {
> +				error = mapping->a_ops->readpage(in, page);
> +
> +				if (unlikely(error)) {
> +					page_cache_release(page);
> +					break;
> +				}
> +			}
> +			pages[i] = page;
> +		} else if (pages[i]->index != pidx) {
> +			page = pages[i];
> +			/*
> +			 * page isn't in the right spot, move it and jump
> +			 * back to filling this one. we know that ->index
> +			 * is larger than pidx
> +			 */
> +			pages[i + page->index - pidx] = page;
> +			pages[i] = NULL;
> +			goto fill_page;
> +		}
> +	}
> +
> +	if (!i)
> +		return 0;
> +
> +	/*
> +	 * Now we splice them into the pipe..
> +	 */
> +splice_them:
> +	return move_to_pipe(pipe, pages, i, offset, len);
> +}

OK, and this returns the number of bytes transferred all the way up to
userspace.

And, like read(), if we hit an IO error or ENOMEM partway through we'll
just return a short read which is indistinguishable from EOF.

But if we do get an IO error, we'll detect it in page_cache_pipe_buf_map().
Does the code still follow these read() return value semantics in that
case?

> ...

argh, am all reviewed out, and infiniband isn't compiling.  Will look later.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 10:16 ` Andrew Morton
@ 2006-03-30 10:24   ` Jens Axboe
  2006-03-30 11:16     ` Andrew Morton
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2006-03-30 10:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, torvalds

On Thu, Mar 30 2006, Andrew Morton wrote:
> Jens Axboe <axboe@suse.de> wrote:
> >
> > Hi,
> > 
> > This patch should resolve all issues mentioned so far. I'd still like to
> > implement the page moving, but that should just be a separate patch.
> > After this round of patching, I thought I'd toss a fresh version out
> > there for all to look at.
> > 
> > Signed-off-by: Jens Axboe <axboe@suse.de>
> > 
> > ...
> >
> > --- a/arch/ia64/kernel/entry.S
> > +++ b/arch/ia64/kernel/entry.S
> > @@ -1605,5 +1605,6 @@ sys_call_table:
> >  	data8 sys_ni_syscall			// reserved for pselect
> >  	data8 sys_ni_syscall			// 1295 reserved for ppoll
> >  	data8 sys_unshare
> > +	date8 sys_splice
> 
> typo

Oops, fixed.

> > +static void *page_cache_pipe_buf_map(struct file *file,
> > +				     struct pipe_inode_info *info,
> > +				     struct pipe_buffer *buf)
> > +{
> > +	struct page *page = buf->page;
> > +
> > +	lock_page(page);
> > +
> > +	if (!PageUptodate(page)) {
> 
>  || page->mapping == NULL

Fixed

> > +	struct page *pages[PIPE_BUFFERS];
> > +	struct page *page;
> > +	pgoff_t index, pidx;
> > +	int i;
> > +
> > +	index = in->f_pos >> PAGE_CACHE_SHIFT;
> > +	offset = in->f_pos & ~PAGE_CACHE_MASK;
> > +	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> > +
> > +	if (nr_pages > PIPE_BUFFERS)
> > +		nr_pages = PIPE_BUFFERS;
> > +
> > +	/*
> > +	 * initiate read-ahead on this page range
> > +	 */
> > +	do_page_cache_readahead(mapping, in, index, nr_pages);
> > +
> > +	/*
> > +	 * Get as many pages from the page cache as possible..
> > +	 * Start IO on the page cache entries we create (we
> > +	 * can assume that any pre-existing ones we find have
> > +	 * already had IO started on them).
> > +	 */
> > +	i = find_get_pages(mapping, index, nr_pages, pages);
> > +
> > +	/*
> > +	 * common case - we found all pages and they are contiguous,
> > +	 * kick them off
> > +	 */
> > +	if (i && (pages[i - 1]->index == index + i - 1))
> > +		goto splice_them;
> > +
> > +	memset(&pages[i], 0, (nr_pages - i) * sizeof(struct page *));
> > +
> > +	/*
> > +	 * find_get_pages() may not return consecutive pages, so loop
> > +	 * over the array moving pages and filling the rest, if need be.
> > +	 */
> > +	for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
> > +		if (!pages[i]) {
> > +			int error;
> > +fill_page:
> > +			/*
> > +			 * no page there, look one up / create it
> > +			 */
> > +			page = find_or_create_page(mapping, pidx,
> > +						   mapping_gfp_mask(mapping));
> > +			if (!page)
> > +				break;
> > +
> > +			if (PageUptodate(page))
> > +				unlock_page(page);
> > +			else {
> > +				error = mapping->a_ops->readpage(in, page);
> > +
> > +				if (unlikely(error)) {
> > +					page_cache_release(page);
> > +					break;
> > +				}
> > +			}
> > +			pages[i] = page;
> > +		} else if (pages[i]->index != pidx) {
> > +			page = pages[i];
> > +			/*
> > +			 * page isn't in the right spot, move it and jump
> > +			 * back to filling this one. we know that ->index
> > +			 * is larger than pidx
> > +			 */
> > +			pages[i + page->index - pidx] = page;
> > +			pages[i] = NULL;
> > +			goto fill_page;
> > +		}
> > +	}
> > +
> > +	if (!i)
> > +		return 0;
> > +
> > +	/*
> > +	 * Now we splice them into the pipe..
> > +	 */
> > +splice_them:
> > +	return move_to_pipe(pipe, pages, i, offset, len);
> > +}
> 
> OK, and this returns the number of bytes transferred all the way up to
> userspace.
> 
> And, like read(), if we hit an IO error or ENOMEM partway through we'll
> just return a short read which is indistinguishable from EOF.
> 
> But if we do get an IO error, we'll detect it in page_cache_pipe_buf_map().
> Does the code still follow these read() return value semantics in that
> case?

Yes, see:

                        err = actor(info, buf, &sd);
                        if (err) {
                                if (!ret)
                                        ret = err;

                                break;
                        }

so if we get -EIO from the actor, then we return bytes transferred _or_
-EIO in 0.

> argh, am all reviewed out, and infiniband isn't compiling.  Will look later.

I don't blame you, you've been (as always) a huge help! Thanks a lot.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 10:24   ` Jens Axboe
@ 2006-03-30 11:16     ` Andrew Morton
  2006-03-30 11:55       ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2006-03-30 11:16 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, torvalds

Jens Axboe <axboe@suse.de> wrote:
>
> > > +static void *page_cache_pipe_buf_map(struct file *file,
>  > > +				     struct pipe_inode_info *info,
>  > > +				     struct pipe_buffer *buf)
>  > > +{
>  > > +	struct page *page = buf->page;
>  > > +
>  > > +	lock_page(page);
>  > > +
>  > > +	if (!PageUptodate(page)) {
>  > 
>  >  || page->mapping == NULL
> 
>  Fixed

Actually that wasn't right.  If the page was truncated then we shouldn't
return -EIO to userspace.  We should return zero, as this is the first
page.

Which means either returning an ERR_PTR here for -EIO or re-checking i_size
in the caller.   Maybe the latter?


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 11:16     ` Andrew Morton
@ 2006-03-30 11:55       ` Jens Axboe
  2006-03-30 12:30         ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2006-03-30 11:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, torvalds

On Thu, Mar 30 2006, Andrew Morton wrote:
> Jens Axboe <axboe@suse.de> wrote:
> >
> > > > +static void *page_cache_pipe_buf_map(struct file *file,
> >  > > +				     struct pipe_inode_info *info,
> >  > > +				     struct pipe_buffer *buf)
> >  > > +{
> >  > > +	struct page *page = buf->page;
> >  > > +
> >  > > +	lock_page(page);
> >  > > +
> >  > > +	if (!PageUptodate(page)) {
> >  > 
> >  >  || page->mapping == NULL
> > 
> >  Fixed
> 
> Actually that wasn't right.  If the page was truncated then we shouldn't
> return -EIO to userspace.  We should return zero, as this is the first
> page.

_if_ this is the first page, then agree.

> Which means either returning an ERR_PTR here for -EIO or re-checking i_size
> in the caller.   Maybe the latter?

The latter sounds better, I'll add that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 10:06 Jens Axboe
  2006-03-30 10:16 ` Andrew Morton
@ 2006-03-30 12:00 ` Ingo Molnar
  2006-03-30 12:05   ` Jens Axboe
  1 sibling, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2006-03-30 12:00 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, akpm, torvalds


* Jens Axboe <axboe@suse.de> wrote:

> Hi,
> 
> This patch should resolve all issues mentioned so far. I'd still like 
> to implement the page moving, but that should just be a separate 
> patch.

neat stuff. One question: why do we require fdin or fdout to be a pipe?  
Is there any fundamental problem with implementing what Larry's original 
paper described too: straight pagecache -> socket transfers? Without a
pipe intermediary forced inbetween. It only adds unnecessary overhead.

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:00 ` Ingo Molnar
@ 2006-03-30 12:05   ` Jens Axboe
  2006-03-30 12:10     ` Ingo Molnar
  2006-03-30 17:02     ` Linus Torvalds
  0 siblings, 2 replies; 48+ messages in thread
From: Jens Axboe @ 2006-03-30 12:05 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, akpm, torvalds

On Thu, Mar 30 2006, Ingo Molnar wrote:
> 
> * Jens Axboe <axboe@suse.de> wrote:
> 
> > Hi,
> > 
> > This patch should resolve all issues mentioned so far. I'd still like 
> > to implement the page moving, but that should just be a separate 
> > patch.
> 
> neat stuff. One question: why do we require fdin or fdout to be a pipe?  
> Is there any fundamental problem with implementing what Larry's original 
> paper described too: straight pagecache -> socket transfers? Without a
> pipe intermediary forced inbetween. It only adds unnecessary overhead.

No, not a fundamental problem. I think I even hid that in some comment
in there, at least if it's decipharable by someone else than myself...
Basically I think it would be nice in the future to tidy this a little
bit and separate the actual container from the pipe itself - and have
the pipe just fill/use the same container.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:05   ` Jens Axboe
@ 2006-03-30 12:10     ` Ingo Molnar
  2006-03-30 12:16       ` Jens Axboe
  2006-03-30 17:02     ` Linus Torvalds
  1 sibling, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2006-03-30 12:10 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, akpm, torvalds


* Jens Axboe <axboe@suse.de> wrote:

> On Thu, Mar 30 2006, Ingo Molnar wrote:
> > 
> > * Jens Axboe <axboe@suse.de> wrote:
> > 
> > > Hi,
> > > 
> > > This patch should resolve all issues mentioned so far. I'd still like 
> > > to implement the page moving, but that should just be a separate 
> > > patch.
> > 
> > neat stuff. One question: why do we require fdin or fdout to be a pipe?  
> > Is there any fundamental problem with implementing what Larry's original 
> > paper described too: straight pagecache -> socket transfers? Without a
> > pipe intermediary forced inbetween. It only adds unnecessary overhead.
> 
> No, not a fundamental problem. I think I even hid that in some comment 
> in there, at least if it's decipharable by someone else than myself... 
> Basically I think it would be nice in the future to tidy this a little 
> bit and separate the actual container from the pipe itself - and have 
> the pipe just fill/use the same container.

why is there a container needed at all? If i splice pagecache->socket, 
we can use sendpage to send it off immediately. There is no need for any 
container - both the pagecache and sendpage use struct page, and when we 
iterate to create a container we might as well ->sendpage() those pages 
off immediately instead.

I agree with the purpose of making sys_splice() generic and in 
particular usable in scripts/shells where pipes are commonly used, but 
we should also fulfill the original promise (outlined 15 years ago or 
so) and not limit this to pipes. That way i could improve TUX to make 
use of it for example ;)

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:10     ` Ingo Molnar
@ 2006-03-30 12:16       ` Jens Axboe
  2006-03-30 12:38         ` Ingo Molnar
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2006-03-30 12:16 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, akpm, torvalds

On Thu, Mar 30 2006, Ingo Molnar wrote:
> 
> * Jens Axboe <axboe@suse.de> wrote:
> 
> > On Thu, Mar 30 2006, Ingo Molnar wrote:
> > > 
> > > * Jens Axboe <axboe@suse.de> wrote:
> > > 
> > > > Hi,
> > > > 
> > > > This patch should resolve all issues mentioned so far. I'd still like 
> > > > to implement the page moving, but that should just be a separate 
> > > > patch.
> > > 
> > > neat stuff. One question: why do we require fdin or fdout to be a pipe?  
> > > Is there any fundamental problem with implementing what Larry's original 
> > > paper described too: straight pagecache -> socket transfers? Without a
> > > pipe intermediary forced inbetween. It only adds unnecessary overhead.
> > 
> > No, not a fundamental problem. I think I even hid that in some comment 
> > in there, at least if it's decipharable by someone else than myself... 
> > Basically I think it would be nice in the future to tidy this a little 
> > bit and separate the actual container from the pipe itself - and have 
> > the pipe just fill/use the same container.
> 
> why is there a container needed at all? If i splice pagecache->socket, 
> we can use sendpage to send it off immediately. There is no need for any 
> container - both the pagecache and sendpage use struct page, and when we 
> iterate to create a container we might as well ->sendpage() those pages 
> off immediately instead.
> 
> I agree with the purpose of making sys_splice() generic and in 
> particular usable in scripts/shells where pipes are commonly used, but 
> we should also fulfill the original promise (outlined 15 years ago or 
> so) and not limit this to pipes. That way i could improve TUX to make 
> use of it for example ;)

There's absolutely no reason why we can't add fd -> fd splicing as well,
so no worries. Right now we just require a pipe transport. It's
extendable :-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 11:55       ` Jens Axboe
@ 2006-03-30 12:30         ` Jens Axboe
  2006-03-30 12:30           ` Jens Axboe
  2006-03-30 19:19           ` Andrew Morton
  0 siblings, 2 replies; 48+ messages in thread
From: Jens Axboe @ 2006-03-30 12:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, torvalds

On Thu, Mar 30 2006, Jens Axboe wrote:
> On Thu, Mar 30 2006, Andrew Morton wrote:
> > Jens Axboe <axboe@suse.de> wrote:
> > >
> > > > > +static void *page_cache_pipe_buf_map(struct file *file,
> > >  > > +				     struct pipe_inode_info *info,
> > >  > > +				     struct pipe_buffer *buf)
> > >  > > +{
> > >  > > +	struct page *page = buf->page;
> > >  > > +
> > >  > > +	lock_page(page);
> > >  > > +
> > >  > > +	if (!PageUptodate(page)) {
> > >  > 
> > >  >  || page->mapping == NULL
> > > 
> > >  Fixed
> > 
> > Actually that wasn't right.  If the page was truncated then we shouldn't
> > return -EIO to userspace.  We should return zero, as this is the first
> > page.
> 
> _if_ this is the first page, then agree.
> 
> > Which means either returning an ERR_PTR here for -EIO or re-checking i_size
> > in the caller.   Maybe the latter?
> 
> The latter sounds better, I'll add that.

Actually that's a little hard, since you don't know what state buf->page
is in - it might be page cache, it might not be. So I opted for the
ERR_PTR approach, just returning -ENODATA on !page->mapping.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:30         ` Jens Axboe
@ 2006-03-30 12:30           ` Jens Axboe
  2006-03-30 19:19           ` Andrew Morton
  1 sibling, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2006-03-30 12:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, torvalds

On Thu, Mar 30 2006, Jens Axboe wrote:
> On Thu, Mar 30 2006, Jens Axboe wrote:
> > On Thu, Mar 30 2006, Andrew Morton wrote:
> > > Jens Axboe <axboe@suse.de> wrote:
> > > >
> > > > > > +static void *page_cache_pipe_buf_map(struct file *file,
> > > >  > > +				     struct pipe_inode_info *info,
> > > >  > > +				     struct pipe_buffer *buf)
> > > >  > > +{
> > > >  > > +	struct page *page = buf->page;
> > > >  > > +
> > > >  > > +	lock_page(page);
> > > >  > > +
> > > >  > > +	if (!PageUptodate(page)) {
> > > >  > 
> > > >  >  || page->mapping == NULL
> > > > 
> > > >  Fixed
> > > 
> > > Actually that wasn't right.  If the page was truncated then we shouldn't
> > > return -EIO to userspace.  We should return zero, as this is the first
> > > page.
> > 
> > _if_ this is the first page, then agree.
> > 
> > > Which means either returning an ERR_PTR here for -EIO or re-checking i_size
> > > in the caller.   Maybe the latter?
> > 
> > The latter sounds better, I'll add that.
> 
> Actually that's a little hard, since you don't know what state buf->page
> is in - it might be page cache, it might not be. So I opted for the
> ERR_PTR approach, just returning -ENODATA on !page->mapping.

Ala

diff --git a/fs/splice.c b/fs/splice.c
index 7927dff..68b98bb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -47,9 +47,14 @@ static void *page_cache_pipe_buf_map(str
 
 	lock_page(page);
 
-	if (!PageUptodate(page) || !page->mapping) {
+	if (!PageUptodate(page)) {
+		unlock_page(page);
+		return ERR_PTR(-EIO);
+	}
+
+	if (!page->mapping) {
 		unlock_page(page);
-		return NULL;
+		return ERR_PTR(-ENODATA);
 	}
 
 	return kmap(buf->page);
@@ -279,6 +293,7 @@ static int pipe_to_sendpage(struct pipe_
 	loff_t pos = sd->pos;
 	unsigned int offset;
 	ssize_t ret;
+	void *ptr;
 
 	/*
 	 * sub-optimal, but we are limited by the pipe ->map. we don't
@@ -286,8 +301,9 @@ static int pipe_to_sendpage(struct pipe_
 	 * have the page pinned if the pipe page originates from the
 	 * page cache
 	 */
-	if (!buf->ops->map(file, info, buf))
-		return -EIO;
+	ptr = buf->ops->map(file, info, buf);
+	if (IS_ERR(ptr))
+		return PTR_ERR(ptr);
 
 	offset = pos & ~PAGE_CACHE_MASK;
 
@@ -334,8 +350,8 @@ static int pipe_to_file(struct pipe_inod
 	 * after this, page will be locked and unmapped
 	 */
 	src = buf->ops->map(file, info, buf);
-	if (!src)
-		return -EIO;
+	if (IS_ERR(src))
+		return PTR_ERR(src);
 
 	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
@@ -436,7 +452,7 @@ static ssize_t move_from_pipe(struct ino
 
 			err = actor(info, buf, &sd);
 			if (err) {
-				if (!ret)
+				if (!ret && err != -ENODATA)
 					ret = err;
 
 				break;

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:16       ` Jens Axboe
@ 2006-03-30 12:38         ` Ingo Molnar
  2006-03-30 12:42           ` Jens Axboe
  2006-03-30 14:20           ` Christoph Hellwig
  0 siblings, 2 replies; 48+ messages in thread
From: Ingo Molnar @ 2006-03-30 12:38 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, akpm, torvalds


* Jens Axboe <axboe@suse.de> wrote:

> > I agree with the purpose of making sys_splice() generic and in 
> > particular usable in scripts/shells where pipes are commonly used, but 
> > we should also fulfill the original promise (outlined 15 years ago or 
> > so) and not limit this to pipes. That way i could improve TUX to make 
> > use of it for example ;)
> 
> There's absolutely no reason why we can't add fd -> fd splicing as 
> well, so no worries. Right now we just require a pipe transport. It's 
> extendable :-)

ok :) I think this code should be merged into v2.6.17 - it's very clean 
and unintrusive.

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:38         ` Ingo Molnar
@ 2006-03-30 12:42           ` Jens Axboe
  2006-03-30 12:42             ` Ingo Molnar
  2006-03-30 14:20           ` Christoph Hellwig
  1 sibling, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2006-03-30 12:42 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, akpm, torvalds

On Thu, Mar 30 2006, Ingo Molnar wrote:
> 
> * Jens Axboe <axboe@suse.de> wrote:
> 
> > > I agree with the purpose of making sys_splice() generic and in 
> > > particular usable in scripts/shells where pipes are commonly used, but 
> > > we should also fulfill the original promise (outlined 15 years ago or 
> > > so) and not limit this to pipes. That way i could improve TUX to make 
> > > use of it for example ;)
> > 
> > There's absolutely no reason why we can't add fd -> fd splicing as 
> > well, so no worries. Right now we just require a pipe transport. It's 
> > extendable :-)
> 
> ok :) I think this code should be merged into v2.6.17 - it's very clean 
> and unintrusive.

I hope so, too. I'll post a hopefully finished patch real soon, and then
the move pages addon afterwards (which works now).

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:42           ` Jens Axboe
@ 2006-03-30 12:42             ` Ingo Molnar
  2006-03-30 13:02               ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2006-03-30 12:42 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, akpm, torvalds


* Jens Axboe <axboe@suse.de> wrote:

> On Thu, Mar 30 2006, Ingo Molnar wrote:
> > 
> > * Jens Axboe <axboe@suse.de> wrote:
> > 
> > > > I agree with the purpose of making sys_splice() generic and in 
> > > > particular usable in scripts/shells where pipes are commonly used, but 
> > > > we should also fulfill the original promise (outlined 15 years ago or 
> > > > so) and not limit this to pipes. That way i could improve TUX to make 
> > > > use of it for example ;)
> > > 
> > > There's absolutely no reason why we can't add fd -> fd splicing as 
> > > well, so no worries. Right now we just require a pipe transport. It's 
> > > extendable :-)
> > 
> > ok :) I think this code should be merged into v2.6.17 - it's very clean 
> > and unintrusive.
> 
> I hope so, too. I'll post a hopefully finished patch real soon, and 
> then the move pages addon afterwards (which works now).

what does the move pages addon do? pagecache->pagecache transfer?

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:42             ` Ingo Molnar
@ 2006-03-30 13:02               ` Jens Axboe
  0 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2006-03-30 13:02 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, akpm, torvalds

On Thu, Mar 30 2006, Ingo Molnar wrote:
> 
> * Jens Axboe <axboe@suse.de> wrote:
> 
> > On Thu, Mar 30 2006, Ingo Molnar wrote:
> > > 
> > > * Jens Axboe <axboe@suse.de> wrote:
> > > 
> > > > > I agree with the purpose of making sys_splice() generic and in 
> > > > > particular usable in scripts/shells where pipes are commonly used, but 
> > > > > we should also fulfill the original promise (outlined 15 years ago or 
> > > > > so) and not limit this to pipes. That way i could improve TUX to make 
> > > > > use of it for example ;)
> > > > 
> > > > There's absolutely no reason why we can't add fd -> fd splicing as 
> > > > well, so no worries. Right now we just require a pipe transport. It's 
> > > > extendable :-)
> > > 
> > > ok :) I think this code should be merged into v2.6.17 - it's very clean 
> > > and unintrusive.
> > 
> > I hope so, too. I'll post a hopefully finished patch real soon, and 
> > then the move pages addon afterwards (which works now).
> 
> what does the move pages addon do? pagecache->pagecache transfer?

Precisely, if you do file1 | file2 splicing, you can move the pages to
file2's address space. So say I do splice-cp file1 file2, I would only
have file2 contents in page cache aftwards. And I'd save a memcpy of all
the data.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:38         ` Ingo Molnar
  2006-03-30 12:42           ` Jens Axboe
@ 2006-03-30 14:20           ` Christoph Hellwig
  1 sibling, 0 replies; 48+ messages in thread
From: Christoph Hellwig @ 2006-03-30 14:20 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Jens Axboe, linux-kernel, akpm, torvalds

On Thu, Mar 30, 2006 at 02:38:05PM +0200, Ingo Molnar wrote:
> ok :) I think this code should be merged into v2.6.17 - it's very clean 
> and unintrusive.

no.  there's a reason we have a staging area in -mm.  Especially for things
introducing a new user ABI.  Give it a few weeks exposure in which a few
things will change for sure instead of rushing it into a release.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:05   ` Jens Axboe
  2006-03-30 12:10     ` Ingo Molnar
@ 2006-03-30 17:02     ` Linus Torvalds
  2006-03-30 17:17       ` Linus Torvalds
                         ` (4 more replies)
  1 sibling, 5 replies; 48+ messages in thread
From: Linus Torvalds @ 2006-03-30 17:02 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Ingo Molnar, linux-kernel, akpm



On Thu, 30 Mar 2006, Jens Axboe wrote:
> On Thu, Mar 30 2006, Ingo Molnar wrote:
> >
> > neat stuff. One question: why do we require fdin or fdout to be a pipe?  
> > Is there any fundamental problem with implementing what Larry's original 
> > paper described too: straight pagecache -> socket transfers? Without a
> > pipe intermediary forced inbetween. It only adds unnecessary overhead.
> 
> No, not a fundamental problem. I think I even hid that in some comment
> in there, at least if it's decipharable by someone else than myself...

Actually, there _is_ a fundamental problem. Two of them, in fact.

The reason it goes through a pipe is two-fold:

 - the pipe _is_ the buffer. The reason sendfile() sucks is that sendfile 
   cannot work with <n> different buffer representations. sendfile() only 
   works with _one_ buffer representation, namely the "page cache of the 
   file".

   By using the page cache directly, sendfile() doesn't need any extra 
   buffering, but that's also why sendfile() fundamentally _cannot_ work 
   with anything else. You cannot do "sendfile" between two sockets to 
   forward data from one place to another, for example. You cannot do 
   sendfile from a streaming device.

   The pipe is just the standard in-kernel buffer between two arbitrary 
   points. Think of it as a scatter-gather list with a wait-queue. That's 
   what a pipe _is_. Trying to get rid of the pipe totally misses the 
   whole point of splice().

   Now, we could have a splice call that has an _implicit_ pipe, ie if 
   neither side is a pipe, we could create a temporary pipe and thus 
   allow what looks like a direct splice. But the pipe should still be 
   there.

 - The pipe is the buffer #2: it's what allows you to do _other_ things 
   with splice that are simply impossible to do with sendfile. Notably, 
   splice allows very naturally the "readv/writev" scatter-gather 
   behaviour of _mixing_ streams. If you're a web-server, with splice you 
   can do

	write(pipefd, header, header_len);
	splice(file, pipefd, file_len);
	splice(pipefd, socket, total_len);

   (this is all conceptual pseudo-code, of course), and this very 
   naturally has none of the issues that sendfile() has with plugging etc. 
   There's never any "send header separately and do extra work to make 
   sure it is in the same packet as the start of the data".

   So having a separate buffer even when you _do_ have a buffer like the 
   page cache is still something you want to do.

So there.

		Linus

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 17:02     ` Linus Torvalds
@ 2006-03-30 17:17       ` Linus Torvalds
  2006-03-31 20:38         ` Hua Zhong
  2006-03-30 20:48       ` Jeff Garzik
                         ` (3 subsequent siblings)
  4 siblings, 1 reply; 48+ messages in thread
From: Linus Torvalds @ 2006-03-30 17:17 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Ingo Molnar, linux-kernel, akpm



On Thu, 30 Mar 2006, Linus Torvalds wrote:
> 
> Actually, there _is_ a fundamental problem. Two of them, in fact.

Actually, four.

The third reason the pipe buffer is so useful is that it's literally a 
stream with no position.

That may sound bad, but it's actually a huge deal. It's why standard unix 
pipelines are so powerful. You don't pass around "this file, this offset, 
this length" - you pass around a simple fd, and you can feed that fd data 
without ever having to worry about what the reader of the data does. The 
reader cannot seek around to places that you didn't want him to see, and 
the reader cannot get confused about where the end is.

The 4th reason is "tee". Again, you _could_ perhaps do "tee" without the 
pipe, but it would be a total nightmare. Now, tee isn't that common, but 
it does happen, and in particular it happens a lot with certain streaming 
content.

Doing a "tee" with regular pipes is not that common: you usually just use 
it for debugging or logging (ie you have a pipeline you want to debug, and 
inserting "tee" in the middle is a good way to keep the same pipeline, but 
also being able to look at the intermediate data when something went 
wrong).

However, one reason "tee" _isn't_ that common with regular pipe usage is 
that normal programs never need to do that anyway: all the pipe data 
always goes through user space, so you can trivially do a "tee" inside of 
the application itself without any external support. You just log the data 
as you receive it. 

But with splice(), the whole _point_ of the system call is that at least a 
portion of the data never hits a user space buffer at all. Which means 
that suddenly "tee" becomes much more important, because it's the _only_ 
way to insert a point where you can do logging/debugging of the data.

Now, I didn't do the "tee()" system call in my initial example thing, and 
Jens didn't add it either, but I described it back in Jan-2005 with the 
original description. It really is very fundamental if you ever want to 
have a "plugin" kind of model, where you plug in different users to the 
same data stream.

The canonical example is getting video input from an mpeg encoder, and 
_both_ saving it to a file and sending it on in real-time to the app that 
shows it in a window. Again, having the pipe is what allows this.

		Linus

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 12:30         ` Jens Axboe
  2006-03-30 12:30           ` Jens Axboe
@ 2006-03-30 19:19           ` Andrew Morton
  1 sibling, 0 replies; 48+ messages in thread
From: Andrew Morton @ 2006-03-30 19:19 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, torvalds

Jens Axboe <axboe@suse.de> wrote:
>
> On Thu, Mar 30 2006, Jens Axboe wrote:
> > On Thu, Mar 30 2006, Andrew Morton wrote:
> > > Jens Axboe <axboe@suse.de> wrote:
> > > >
> > > > > > +static void *page_cache_pipe_buf_map(struct file *file,
> > > >  > > +				     struct pipe_inode_info *info,
> > > >  > > +				     struct pipe_buffer *buf)
> > > >  > > +{
> > > >  > > +	struct page *page = buf->page;
> > > >  > > +
> > > >  > > +	lock_page(page);
> > > >  > > +
> > > >  > > +	if (!PageUptodate(page)) {
> > > >  > 
> > > >  >  || page->mapping == NULL
> > > > 
> > > >  Fixed
> > > 
> > > Actually that wasn't right.  If the page was truncated then we shouldn't
> > > return -EIO to userspace.  We should return zero, as this is the first
> > > page.
> > 
> > _if_ this is the first page, then agree.
> > 
> > > Which means either returning an ERR_PTR here for -EIO or re-checking i_size
> > > in the caller.   Maybe the latter?
> > 
> > The latter sounds better, I'll add that.
> 
> Actually that's a little hard, since you don't know what state buf->page
> is in - it might be page cache, it might not be. So I opted for the
> ERR_PTR approach, just returning -ENODATA on !page->mapping.
> 

That's good, because the recheck-i_size idea was racy too.  If another
thread were to do a quick truncate-then-extend then there's a window where
we'd see a truncated page, return NULL from page_cache_pipe_buf_map() and
then by the time i_size is re-checked, i_size has grown again so we end up
deciding it must have been an IO error.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 17:02     ` Linus Torvalds
  2006-03-30 17:17       ` Linus Torvalds
@ 2006-03-30 20:48       ` Jeff Garzik
  2006-03-30 21:16         ` Linus Torvalds
  2006-03-31  9:56       ` Jens Axboe
                         ` (2 subsequent siblings)
  4 siblings, 1 reply; 48+ messages in thread
From: Jeff Garzik @ 2006-03-30 20:48 UTC (permalink / raw)
  To: Linus Torvalds, Jens Axboe; +Cc: Ingo Molnar, linux-kernel, akpm


Let me pull back from the details a bit, to note a code pattern I'm 
beginning to see:

	if (src_fd is a file &&
	    dest_fd is a socket)
		sendfile()
	else
		hand code fd->fd data move

with splice this becomes

	if (special case fd combination #1)
		sendfile()
	else (special case fd combination #2)
		splice()
	else
		hand code fd->fd data move

Creating a syscall for each fd->fd data move case seems wasteful.  I 
would rather that the kernel Does The Right Thing so the app doesn't 
have to support all these special cases.  Handling the implicit buffer 
case in the kernel, when needed, means that the app is future-proofed: 
when another fd->fd optimization is implemented, the app automatically 
takes advantage of it.

	Jeff



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 20:48       ` Jeff Garzik
@ 2006-03-30 21:16         ` Linus Torvalds
  2006-03-31  0:59           ` Nick Piggin
                             ` (2 more replies)
  0 siblings, 3 replies; 48+ messages in thread
From: Linus Torvalds @ 2006-03-30 21:16 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Jens Axboe, Ingo Molnar, linux-kernel, akpm



On Thu, 30 Mar 2006, Jeff Garzik wrote:
> 
> with splice this becomes
> 
> 	if (special case fd combination #1)
> 		sendfile()
> 	else (special case fd combination #2)
> 		splice()
> 	else
> 		hand code fd->fd data move

No, it really should be splice for all combinations (possibly with a 
manual read/write fallback for stuff where it just hasn't been done).

The fact that we don't have splice for certain fd combinations is really 
just a result of it not being done yet. One of the reasons I wanted to 
merge asap was that the original example patch was done over a year ago, 
and not a lot happened, so I'm hoping that the fact that the core code is 
now in the base tree is going to make people do the necessary splice input 
functions for different file types.

For filesystems, splice support tends to be really easy (both read and 
write). For other things, it depends a bit. But unlike sendfile(), it 
really is quite possible to splice _from_ a socket too, not just _to_ a 
socket. But no, that case hasn't been written yet.

(In contrast, with "sendfile()", you just fundamentally couldn't do it).

> Creating a syscall for each fd->fd data move case seems wasteful.  I would
> rather that the kernel Does The Right Thing so the app doesn't have to support
> all these special cases.  Handling the implicit buffer case in the kernel,
> when needed, means that the app is future-proofed: when another fd->fd
> optimization is implemented, the app automatically takes advantage of it.

splice() really can handle any fd->fd move.

The reason you want to have a pipe in the middle is that you have to 
handle partial moves _some_ way. And the pipe being the buffer really does 
allow that, and also handles the case of "what happens when we received 
more data than we could write".

So the way to do copies is

	int pipefd[2];
	unsigned long copied = 0;

	if (pipe(pipefd) < 0)
		error

	for (;;) {
		int nr = splice(in, pipefd[1], MAX_INT, 0);
		if (nr <= 0)
			break;
		do {
			int ret = splice(pipefd[0], out, nr, 0);
			if (ret <= 0) {
				error: couldn't write 'nr' bytes
				break;
			}

			nr -= ret;
		} while (nr);
	}
	close(pipefd[0]);
	close(pipefd[1]);

which may _seem_ very complicated and the extra pipe seems "useless", but 
it's (a) actually pretty efficient and (b) allows error recovery.

That (b) is the really important part. I can pretty much guarantee that 
without the "useless" pipe, you simply couldn't do it.

In particular, what happens when you try to connect two streaming devices, 
but the destination stops accepting data? You cannot put the received data 
"back" into the streaming source any way - so if you actually want to be 
able to handle error recovery, you _have_ to get access to the source 
buffers.

Also, for signal handling, you need to have some way to keep the pipe 
around for several iterations on the sender side, while still returning to 
user space to do the signal handler. 

And guess what? That's exactly what you get with that "useless" pipe. For 
error handling, you can decide to throw the extra data that the recipient 
didn't want away (just close the pipe), and maybe that's going to be what 
most people do. But you could also decide to just do a "read()" on the 
pipefd, to just read the data into user space for debugging and/or error 
recovery..

Similarly, for signals, the pipe _is_ that buffer that you need that is 
consistent across multiple invocations.

So that pipe is not at all unnecessary, and in fact, it's critical. It may 
look more complex than sendfile(), but it's more complex exactly because 
it can handle cases that sendfile never could, and just punted on (for 
regular files, you never have the issue of half-way buffers, since you 
just re-read them. Which is why sendfile() could do with it's simple 
interface, but is also why sendfile() was never good for anything else).

		Linus

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
@ 2006-03-31  0:03 linux
  0 siblings, 0 replies; 48+ messages in thread
From: linux @ 2006-03-31  0:03 UTC (permalink / raw)
  To: linux-kernel; +Cc: torvalds

The Lord High Penguin spake:
> In particular, what happens when you try to connect two streaming devices,
> but the destination stops accepting data? You cannot put the received
> data "back" into the streaming source any way - so if you actually want
> to be able to handle error recovery, you _have_ to get access to the
> source buffers.

Ding!  Ding!  Ding!  The light dawns.

Oh my, that *is* clever.  And I remember wrestling with the problem before.
If you're reading from a seekable fd (a la sendfile()), you can always
back up the read pointer, but if you're reading from a non-seekable
fd, what *do* you do if there's a write problem?  The pipe gives you
a solution.


However, that gets me thinking... what about a pipe lets it be used as
a splice() source?  The answer is that, like seekable files, it's peekable.
You can "peek at" an efficiently-sized chunk of data before deciding
to remove it from the source.  You don't have to read(buf)/write(buf),
you can buf=peek()/len=write(buf)/accept(len).

Network sockets have too many features (what with urgent data pointers
and crap) to make that easy, and some character devices have
side effects from read() that can't be hidden at all.  So you need
an explicit buffer.


Next, what about a pipe makes it an acceptable splice() destination?
The answer is that space is reservable.  You can know that a write()
won't fail before read()ing the source.  But a great many devices with
non-synchronous write semantics behave the same way.  Certainly it's
not hard to make a non-O_SYNC file system behave that way.


Give all that, it it necessary to use the fallback of a pipe all
the time?  It seems that doing without the pipe would be
feasible a large fraction of the time And would a

bytes_written = splice2(src, pipefd[1], pipefd[0], dst, &bytes_buffered);

system call be worth it to reduce overhead?  (I'm assuming that
bytes_read = bytes_written + bytes_buffered, but you can use any two
of the three in the interface.)  Note that it could bypass the pipe
entirely for those cases where it's possible.


Finally, has anyone thought about the semantics of tee(2)?
It seems that it could be used on block files to get a second
handle that does NOT share the file pointer.  But if used on
a pipe, does the write block if EITHER reader is behind?
So one reader could block the other?  I can't see any other way
to implement it, but I would like to be sure those semantics are
expected.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 21:16         ` Linus Torvalds
@ 2006-03-31  0:59           ` Nick Piggin
  2006-03-31  2:43             ` Andrew Morton
  2006-03-31  7:09           ` Ingo Molnar
  2006-03-31 12:46           ` Bernd Petrovitsch
  2 siblings, 1 reply; 48+ messages in thread
From: Nick Piggin @ 2006-03-31  0:59 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jeff Garzik, Jens Axboe, Ingo Molnar, linux-kernel, akpm

Linus Torvalds wrote:

> splice() really can handle any fd->fd move.
> 
> The reason you want to have a pipe in the middle is that you have to 
> handle partial moves _some_ way. And the pipe being the buffer really does 
> allow that, and also handles the case of "what happens when we received 
> more data than we could write".
> 
> So the way to do copies is
> 
> 	int pipefd[2];
> 	unsigned long copied = 0;
> 
> 	if (pipe(pipefd) < 0)
> 		error
> 
> 	for (;;) {
> 		int nr = splice(in, pipefd[1], MAX_INT, 0);
> 		if (nr <= 0)
> 			break;
> 		do {
> 			int ret = splice(pipefd[0], out, nr, 0);
> 			if (ret <= 0) {
> 				error: couldn't write 'nr' bytes
> 				break;
> 			}
> 
> 			nr -= ret;
> 		} while (nr);
> 	}
> 	close(pipefd[0]);
> 	close(pipefd[1]);
> 

I think it makes sense to have a 64-bit length. It just seems
cleaner because it is in userspace units of file offset. Also,
might you be able to do a single file-sized file<->file splice,
and have it do a remote copy on a suitable network fs, or a
whole-file COW on some local fs (maybe not, as splice doesn't
deal with metadata... I don't know the tricky details of fses).

But your argument against a 64-bit length seemed to involve
limiting the usage that sys_splice would see. Make it 64-bit
instead and someone might come up with something that you
hadn't thought of. Is there any downside?

No offsets :(
Don't they only increase flexibility? Or would you prefer to
add a new sys_psplice for that?

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31  0:59           ` Nick Piggin
@ 2006-03-31  2:43             ` Andrew Morton
  2006-03-31  2:51               ` Andrew Morton
  0 siblings, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2006-03-31  2:43 UTC (permalink / raw)
  To: Nick Piggin; +Cc: torvalds, jeff, axboe, mingo, linux-kernel, Robert S Peterson

Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> Also,
>  might you be able to do a single file-sized file<->file splice,
>  and have it do a remote copy on a suitable network fs

splice() may not be suitable for such filesystems.  Quoting Robert Peterson
from earlier this week:


> The problem is that
> loop.c circumvents proper locking by going directly to prepare_write
> rather than following the normal process.  If I added some kind of
> library callbacks to allow cluster locking, it would break the
> normal locking sequence of an ordinary write.
>
> The normal sequence of an ordinary write typically looks like:
>
> 1. write
> 2.   (Take care of cluster locking if necessary)
> 3.   generic_file_write_nolock
> 4.      generic_file_aio_write_nolock
> 5.         generic_file_buffered_write
> 6.            a_ops->prepare_write
> etc.
>
> Right now, loop.c is circumventing step 2 (optional cluster locking
> if needed by the underlying fs) by bypassing the "write" op.  If I 
> added callbacks to prepare_write at (7) as you suggest, it would either
> (a) introduce deadlocks conflicting with step 2 above or
> (b) require the underlying fs to use its own versions of 3,4,5, and 6,
> thus bypassing a great deal of vfs, which is not good for anyone.
>
> Some could argue that loop.c should always use write rather than
> prepare_write/commit_write, but most fs's don't have special
> locking needs and therefore using them is a performance boost.
>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31  2:43             ` Andrew Morton
@ 2006-03-31  2:51               ` Andrew Morton
  2006-03-31  3:20                 ` Nick Piggin
  2006-03-31  6:35                 ` Christoph Hellwig
  0 siblings, 2 replies; 48+ messages in thread
From: Andrew Morton @ 2006-03-31  2:51 UTC (permalink / raw)
  To: nickpiggin, torvalds, jeff, axboe, mingo, linux-kernel, rpeterso

Andrew Morton <akpm@osdl.org> wrote:
>
> splice() may not be suitable for such filesystems.

OK, splice() cuts in at the file_operations level, so sych a clustered
filesystem _could_ implement it, but none of the code we have there will be
usable by it.  If the operations in splice.c were to operate at the
file_operations level (->read, ->write) then probably they could be used
thusly.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31  2:51               ` Andrew Morton
@ 2006-03-31  3:20                 ` Nick Piggin
  2006-03-31  6:35                 ` Christoph Hellwig
  1 sibling, 0 replies; 48+ messages in thread
From: Nick Piggin @ 2006-03-31  3:20 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, jeff, axboe, mingo, linux-kernel, rpeterso

Andrew Morton wrote:
> Andrew Morton <akpm@osdl.org> wrote:
> 
>>splice() may not be suitable for such filesystems.
> 

Well OK, but my point about making the syscall API use 64-bit
size is not completely invalid. Unless there is some downside
to it.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
@ 2006-03-31  6:06 tridge
  2006-03-31  6:59 ` Antonio Vargas
  2006-03-31  9:57 ` Jens Axboe
  0 siblings, 2 replies; 48+ messages in thread
From: tridge @ 2006-03-31  6:06 UTC (permalink / raw)
  To: linux-kernel

Linus and Jens,

Stephen Rothwell just pointed me at the new splice() interface. Looks
really nice!

One comment though. Could we add a off_t to splice to make it more
like pwrite() ? Otherwise threads could get painful with race
conditions between the seek and the splice() call.

Either that or add psplice() like this:

  ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);

where ofs sets the offset to read from fdin. 

Cheers, Tridge

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31  2:51               ` Andrew Morton
  2006-03-31  3:20                 ` Nick Piggin
@ 2006-03-31  6:35                 ` Christoph Hellwig
  1 sibling, 0 replies; 48+ messages in thread
From: Christoph Hellwig @ 2006-03-31  6:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: nickpiggin, torvalds, jeff, axboe, mingo, linux-kernel, rpeterso

On Thu, Mar 30, 2006 at 06:51:33PM -0800, Andrew Morton wrote:
> Andrew Morton <akpm@osdl.org> wrote:
> >
> > splice() may not be suitable for such filesystems.
> 
> OK, splice() cuts in at the file_operations level, so sych a clustered
> filesystem _could_ implement it, but none of the code we have there will be
> usable by it.  If the operations in splice.c were to operate at the
> file_operations level (->read, ->write) then probably they could be used
> thusly.

Of course the code would be useable, same as current generic_file_* code
is useable with small wrappers.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31  6:06 [PATCH] splice support #2 tridge
@ 2006-03-31  6:59 ` Antonio Vargas
  2006-03-31  7:37   ` tridge
  2006-03-31  9:57 ` Jens Axboe
  1 sibling, 1 reply; 48+ messages in thread
From: Antonio Vargas @ 2006-03-31  6:59 UTC (permalink / raw)
  To: tridge; +Cc: linux-kernel

On 3/31/06, tridge@samba.org <tridge@samba.org> wrote:
> Linus and Jens,
>
> Stephen Rothwell just pointed me at the new splice() interface. Looks
> really nice!
>
> One comment though. Could we add a off_t to splice to make it more
> like pwrite() ? Otherwise threads could get painful with race
> conditions between the seek and the splice() call.
>
> Either that or add psplice() like this:
>
>   ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);
>
> where ofs sets the offset to read from fdin.
>
> Cheers, Tridge

Maybe "ssize_t psplice(int fdin, int fdout, size_t len, off_t
fdin_ofs, unsigned flags);"
would be better at documenting that the offset is for the input? Or
maybe adding also a fdout_ofs for when fdout is also a file...

--
Greetz, Antonio Vargas aka winden of network

http://wind.codepixel.com/
windNOenSPAMntw@gmail.com
thesameasabove@amigascne.org

Every day, every year
you have to work
you have to study
you have to scene.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 21:16         ` Linus Torvalds
  2006-03-31  0:59           ` Nick Piggin
@ 2006-03-31  7:09           ` Ingo Molnar
  2006-04-02 22:33             ` Pavel Machek
  2006-03-31 12:46           ` Bernd Petrovitsch
  2 siblings, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2006-03-31  7:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeff Garzik, Jens Axboe, linux-kernel, akpm, Christoph Hellwig


* Linus Torvalds <torvalds@osdl.org> wrote:

> In particular, what happens when you try to connect two streaming 
> devices, but the destination stops accepting data? You cannot put the 
> received data "back" into the streaming source any way - so if you 
> actually want to be able to handle error recovery, you _have_ to get 
> access to the source buffers.

i'd rather implement this error case as an exception mechanism, instead 
of a forced intermediary buffer mechanism.

We should extend the userspace API so that it is prepared to receive 
'excess data' via a separate 'flush excess data to' file descriptor:

	sys_splice(fd_in, fd_out, fd_flush, size,
                   max_flush_size, *bytes_flushed)

Note1: fd_flush can be a pipe too! This would avoid copies in the 
       exception case - if the exception case is expected to be common.

Note2: max_flush_size serves as hint and as a natural 'buffering limit' 
       for the kernel-internal loops. I believe it's more natural than 
       the implicit 'pipe buffering limit' we currently have.
       max_flush_size == 0 would say to the kernel: 'use whatever 
       buffering is natural or necessary'. E.g. if fd_flush is a pipe, 
       it would automatically set the buffering size to the flush-pipe's
       internal buffering limit.

Note3: we could even eliminate the "*bytes_flushed" parameter from 
       the syscall: as fd_flush's seek offset gives userspace an idea 
       about how much data was written to it.

Note4: if the user messes up fd_flush so that the kernel's "excess data" 
       transfer into fd_flush failes then that's 'tough luck' and flush 
       data may be lost. Users can use pipes [if the exception case is 
       common and they want to optimize that codepath] or can pre-write 
       their files if they need a 100% guarantee.
       In fact, the kernel doesnt even have to _look up_ fd_flush in the 
       common case. It's the application's responsibility to make sure 
       the exception case will work. This means that the _only_ overhead 
       from this exception mechanism are the 2-3 extra parameters to
       sys_splice(). That's _much_ faster.

Just look at the beauty of this generalization. fd_flush can be 
_anything_. It could be a pipe. It could be a temporary file in /tmp. It 
could be a file over the network. fd_flush could be mmap()-ed to 
user-space! Or it could even be -1 if the user is not interested in the 
error case for the streaming data. (For example a good portion of video 
and audio playback applications are not interested in the fd_out error 
case at all: such data can easily lose 'value' if it gets delayed by 
more than a few milliseconds and the right answer is to skip the frame 
or display an error message, ignoring the lost data.)

But for heaven's sake: do not slow down the 99.9999999999% fastpath by 
forcing a pipe inbetween on the ABI level! I really have nothing against 
making sys_splice() generic and i agree that a very good first step to 
achieve that is to include pipes in the implementation, but i dont think 
pipes are (or should be) all that critical and fundamental to the splice 
data-streaming concept itself, as you are suggesting.

> Also, for signal handling, you need to have some way to keep the pipe 
> around for several iterations on the sender side, while still 
> returning to user space to do the signal handler.

i believe the signal case is naturally handled by the fd_flush approach 
too - in fact it can also acts as a nice tester for the exception 
handling mechanism.

If the application in question expects to get many signals then it can 
use a pipe as fd_flush. (But signal-heavy apps are quite rare: most 
performance-critical apps avoid them for the fastpath like the plague, 
on modern CPUs it's more expensive to receive and handle a single signal 
than to create and tear down a completely new thread (!))

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31  6:59 ` Antonio Vargas
@ 2006-03-31  7:37   ` tridge
  0 siblings, 0 replies; 48+ messages in thread
From: tridge @ 2006-03-31  7:37 UTC (permalink / raw)
  To: Antonio Vargas; +Cc: linux-kernel

Antonio,

 > Maybe "ssize_t psplice(int fdin, int fdout, size_t len, off_t
 > fdin_ofs, unsigned flags);"

yep, thats clearer

I think ideally splice() would just be changed to always take a off_t
as the interface is so new hopefully no apps depend on it yet. Then we
don't need psplice(). 

A while ago we got bitten with tdb in Samba and systems that only have
write(), and no pwrite(). If a process forks then accesses the db in
both parent and child the shared seek pointer semantics of inherited
file descriptors causes races with seek/write. It would be nice if
splice() was immune from this from the start.

If people want it both ways, then an alternative would be to have a
SPLICE_FLAG_OFS_CURRENT flag to splice(), which would tell splice() to
ignore the off_t and use the current seek ptr. I guess that would save
some book keeping for some apps that know they don't have any seek
races and want to always write at the current position. I wouldn't
need this, but some people might find it convenient.

 > would be better at documenting that the offset is for the input? Or
 > maybe adding also a fdout_ofs for when fdout is also a file...

I thought about that, but Stephen convinced me it was a bad idea, as
the semantics with whats in the pipe buffer at any one time gets too
messy if the output is a socket. I guess you could return -EINVAL if
you used the fdout_ofs and its not a seekable file, but that seems to
be overloading the system call too much for my liking.

Cheers, Tridge

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 17:02     ` Linus Torvalds
  2006-03-30 17:17       ` Linus Torvalds
  2006-03-30 20:48       ` Jeff Garzik
@ 2006-03-31  9:56       ` Jens Axboe
  2006-03-31 12:18       ` Ingo Molnar
  2006-03-31 12:27       ` Ingo Molnar
  4 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2006-03-31  9:56 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Ingo Molnar, linux-kernel, akpm

On Thu, Mar 30 2006, Linus Torvalds wrote:
> 
> 
> On Thu, 30 Mar 2006, Jens Axboe wrote:
> > On Thu, Mar 30 2006, Ingo Molnar wrote:
> > >
> > > neat stuff. One question: why do we require fdin or fdout to be a pipe?  
> > > Is there any fundamental problem with implementing what Larry's original 
> > > paper described too: straight pagecache -> socket transfers? Without a
> > > pipe intermediary forced inbetween. It only adds unnecessary overhead.
> > 
> > No, not a fundamental problem. I think I even hid that in some comment
> > in there, at least if it's decipharable by someone else than myself...
> 
> Actually, there _is_ a fundamental problem. Two of them, in fact.
> 
> The reason it goes through a pipe is two-fold:
> 
>  - the pipe _is_ the buffer. The reason sendfile() sucks is that sendfile 
>    cannot work with <n> different buffer representations. sendfile() only 
>    works with _one_ buffer representation, namely the "page cache of the 
>    file".
> 
>    By using the page cache directly, sendfile() doesn't need any extra 
>    buffering, but that's also why sendfile() fundamentally _cannot_ work 
>    with anything else. You cannot do "sendfile" between two sockets to 
>    forward data from one place to another, for example. You cannot do 
>    sendfile from a streaming device.
> 
>    The pipe is just the standard in-kernel buffer between two arbitrary 
>    points. Think of it as a scatter-gather list with a wait-queue. That's 
>    what a pipe _is_. Trying to get rid of the pipe totally misses the 
>    whole point of splice().
> 
>    Now, we could have a splice call that has an _implicit_ pipe, ie if 
>    neither side is a pipe, we could create a temporary pipe and thus 
>    allow what looks like a direct splice. But the pipe should still be 
>    there.
> 
>  - The pipe is the buffer #2: it's what allows you to do _other_ things 
>    with splice that are simply impossible to do with sendfile. Notably, 
>    splice allows very naturally the "readv/writev" scatter-gather 
>    behaviour of _mixing_ streams. If you're a web-server, with splice you 
>    can do
> 
> 	write(pipefd, header, header_len);
> 	splice(file, pipefd, file_len);
> 	splice(pipefd, socket, total_len);
> 
>    (this is all conceptual pseudo-code, of course), and this very 
>    naturally has none of the issues that sendfile() has with plugging etc. 
>    There's never any "send header separately and do extra work to make 
>    sure it is in the same packet as the start of the data".
> 
>    So having a separate buffer even when you _do_ have a buffer like the 
>    page cache is still something you want to do.
> 
> So there.

My point was mainly that the buffer itself need not necessarily be a
pipe, it could be implemented with a pipe just using the same buffer
type. But I guess it doesn't make much sense, the pipe has nice
advantages in itself.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31  6:06 [PATCH] splice support #2 tridge
  2006-03-31  6:59 ` Antonio Vargas
@ 2006-03-31  9:57 ` Jens Axboe
  2006-03-31 19:11   ` Linus Torvalds
  1 sibling, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2006-03-31  9:57 UTC (permalink / raw)
  To: tridge; +Cc: linux-kernel, Linus Torvalds

On Fri, Mar 31 2006, tridge@samba.org wrote:
> Linus and Jens,
> 
> Stephen Rothwell just pointed me at the new splice() interface. Looks
> really nice!
> 
> One comment though. Could we add a off_t to splice to make it more
> like pwrite() ? Otherwise threads could get painful with race
> conditions between the seek and the splice() call.
> 
> Either that or add psplice() like this:
> 
>   ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);
> 
> where ofs sets the offset to read from fdin. 

I definitely see some valid reasons for adding a file offset instead of
using ->f_pos, I'll leave that decision up to Linus though. Linus?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 17:02     ` Linus Torvalds
                         ` (2 preceding siblings ...)
  2006-03-31  9:56       ` Jens Axboe
@ 2006-03-31 12:18       ` Ingo Molnar
  2006-03-31 12:23         ` Jens Axboe
  2006-03-31 12:27       ` Ingo Molnar
  4 siblings, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2006-03-31 12:18 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, linux-kernel, akpm


* Linus Torvalds <torvalds@osdl.org> wrote:

>  - The pipe is the buffer #2: it's what allows you to do _other_ things 
>    with splice that are simply impossible to do with sendfile. Notably, 
>    splice allows very naturally the "readv/writev" scatter-gather 
>    behaviour of _mixing_ streams. If you're a web-server, with splice you 
>    can do
> 
> 	write(pipefd, header, header_len);
> 	splice(file, pipefd, file_len);
> 	splice(pipefd, socket, total_len);
> 
>    (this is all conceptual pseudo-code, of course), and this very 
>    naturally has none of the issues that sendfile() has with plugging etc. 
>    There's never any "send header separately and do extra work to make 
>    sure it is in the same packet as the start of the data".

with pipe-based buffering this approach has still the very same problems 
that sendfile() has with packet boundaries, because it's not enough to 
have "large enough" buffering (like a pipe has), the pipe also has to be 
drained, and the networking layer has to know the precise boundary of 
data.

the right solution to the packet boundary problem is to pass in a proper 
"does userspace expect more data right now" flag, or to let userspace 
'flush' the socket independently - which is independent of the 
pipe-in-slice issue. This solution already exists: the MSG_MORE flag.

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31 12:18       ` Ingo Molnar
@ 2006-03-31 12:23         ` Jens Axboe
  2006-03-31 12:26           ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2006-03-31 12:23 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linus Torvalds, linux-kernel, akpm

On Fri, Mar 31 2006, Ingo Molnar wrote:
> 
> * Linus Torvalds <torvalds@osdl.org> wrote:
> 
> >  - The pipe is the buffer #2: it's what allows you to do _other_ things 
> >    with splice that are simply impossible to do with sendfile. Notably, 
> >    splice allows very naturally the "readv/writev" scatter-gather 
> >    behaviour of _mixing_ streams. If you're a web-server, with splice you 
> >    can do
> > 
> > 	write(pipefd, header, header_len);
> > 	splice(file, pipefd, file_len);
> > 	splice(pipefd, socket, total_len);
> > 
> >    (this is all conceptual pseudo-code, of course), and this very 
> >    naturally has none of the issues that sendfile() has with plugging etc. 
> >    There's never any "send header separately and do extra work to make 
> >    sure it is in the same packet as the start of the data".
> 
> with pipe-based buffering this approach has still the very same problems 
> that sendfile() has with packet boundaries, because it's not enough to 
> have "large enough" buffering (like a pipe has), the pipe also has to be 
> drained, and the networking layer has to know the precise boundary of 
> data.
> 
> the right solution to the packet boundary problem is to pass in a proper 
> "does userspace expect more data right now" flag, or to let userspace 
> 'flush' the socket independently - which is independent of the 
> pipe-in-slice issue. This solution already exists: the MSG_MORE flag.

We can add a SPLICE_F_MORE flag for this, right now splice doesn't set
the MSG_MORE flag for the end of the pipe.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31 12:23         ` Jens Axboe
@ 2006-03-31 12:26           ` Jens Axboe
  2006-03-31 12:47             ` Ingo Molnar
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2006-03-31 12:26 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linus Torvalds, linux-kernel, akpm

On Fri, Mar 31 2006, Jens Axboe wrote:
> On Fri, Mar 31 2006, Ingo Molnar wrote:
> > 
> > * Linus Torvalds <torvalds@osdl.org> wrote:
> > 
> > >  - The pipe is the buffer #2: it's what allows you to do _other_ things 
> > >    with splice that are simply impossible to do with sendfile. Notably, 
> > >    splice allows very naturally the "readv/writev" scatter-gather 
> > >    behaviour of _mixing_ streams. If you're a web-server, with splice you 
> > >    can do
> > > 
> > > 	write(pipefd, header, header_len);
> > > 	splice(file, pipefd, file_len);
> > > 	splice(pipefd, socket, total_len);
> > > 
> > >    (this is all conceptual pseudo-code, of course), and this very 
> > >    naturally has none of the issues that sendfile() has with plugging etc. 
> > >    There's never any "send header separately and do extra work to make 
> > >    sure it is in the same packet as the start of the data".
> > 
> > with pipe-based buffering this approach has still the very same problems 
> > that sendfile() has with packet boundaries, because it's not enough to 
> > have "large enough" buffering (like a pipe has), the pipe also has to be 
> > drained, and the networking layer has to know the precise boundary of 
> > data.
> > 
> > the right solution to the packet boundary problem is to pass in a proper 
> > "does userspace expect more data right now" flag, or to let userspace 
> > 'flush' the socket independently - which is independent of the 
> > pipe-in-slice issue. This solution already exists: the MSG_MORE flag.
> 
> We can add a SPLICE_F_MORE flag for this, right now splice doesn't set
> the MSG_MORE flag for the end of the pipe.

Ala

diff --git a/fs/splice.c b/fs/splice.c
index d8787c1..8a9ef67 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -343,15 +343,16 @@ static int pipe_to_sendpage(struct pipe_
 	loff_t pos = sd->pos;
 	unsigned int offset;
 	ssize_t ret;
+	int more;
 
 	ret = buf->ops->map(file, info, buf);
 	if (unlikely(ret))
 		return ret;
 
 	offset = pos & ~PAGE_CACHE_MASK;
+	more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 
-	ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
-					sd->len < sd->total_len);
+	ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more);
 
 	buf->ops->unmap(info, buf);
 	if (ret == sd->len)
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index bdf9d57..b003e3d 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -59,5 +59,6 @@ void free_pipe_info(struct inode* inode)
  * add the splice flags here.
  */
 #define SPLICE_F_MOVE	(0x01)	/* move pages instead of copying */
+#define SPLICE_F_MORE	(0x02)	/* expect more data */
 
 #endif

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 17:02     ` Linus Torvalds
                         ` (3 preceding siblings ...)
  2006-03-31 12:18       ` Ingo Molnar
@ 2006-03-31 12:27       ` Ingo Molnar
  4 siblings, 0 replies; 48+ messages in thread
From: Ingo Molnar @ 2006-03-31 12:27 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, linux-kernel, akpm


* Linus Torvalds <torvalds@osdl.org> wrote:

> The reason it goes through a pipe is two-fold:
> 
>  - the pipe _is_ the buffer. The reason sendfile() sucks is that sendfile 
>    cannot work with <n> different buffer representations. sendfile() only 
>    works with _one_ buffer representation, namely the "page cache of the 
>    file".

The main problem of sendfile() is that it exposes an incoming-fd 
'offset', which pretty much hardcodes the assumption that the incoming 
file is a pagecache-backed object. I believe the flush_fd solution to 
sys_splice() would solve that range of problems, and thus slice could be 
used for offset-less file objects too.

all the remaining problems of sendfile() just derive from this basic 
pagecache assumption that is hardcoded in the ABI. Once this one is 
overcome (and i believe fd_flush overcomes it), there's no reason why we 
couldnt implement the complete range of sys_splice() transfers.

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-30 21:16         ` Linus Torvalds
  2006-03-31  0:59           ` Nick Piggin
  2006-03-31  7:09           ` Ingo Molnar
@ 2006-03-31 12:46           ` Bernd Petrovitsch
  2 siblings, 0 replies; 48+ messages in thread
From: Bernd Petrovitsch @ 2006-03-31 12:46 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jeff Garzik, Jens Axboe, Ingo Molnar, linux-kernel, akpm

On Thu, 2006-03-30 at 13:16 -0800, Linus Torvalds wrote:
[...]
> splice() really can handle any fd->fd move.
> 
> The reason you want to have a pipe in the middle is that you have to 
> handle partial moves _some_ way. And the pipe being the buffer really does 
> allow that, and also handles the case of "what happens when we received 
> more data than we could write".
> 
> So the way to do copies is
> 
> 	int pipefd[2];
> 	unsigned long copied = 0;
> 
> 	if (pipe(pipefd) < 0)
> 		error
> 
> 	for (;;) {
> 		int nr = splice(in, pipefd[1], MAX_INT, 0);
> 		if (nr <= 0)
> 			break;
> 		do {
> 			int ret = splice(pipefd[0], out, nr, 0);
> 			if (ret <= 0) {
> 				error: couldn't write 'nr' bytes
> 				break;
> 			}
> 
> 			nr -= ret;
> 		} while (nr);
> 	}
> 	close(pipefd[0]);
> 	close(pipefd[1]);
> 
> which may _seem_ very complicated and the extra pipe seems "useless", but 
> it's (a) actually pretty efficient and (b) allows error recovery.
> 
> That (b) is the really important part. I can pretty much guarantee that 
> without the "useless" pipe, you simply couldn't do it.
> 
> In particular, what happens when you try to connect two streaming devices, 
> but the destination stops accepting data? You cannot put the received data 
> "back" into the streaming source any way - so if you actually want to be 
> able to handle error recovery, you _have_ to get access to the source 
> buffers.
> 
> Also, for signal handling, you need to have some way to keep the pipe 
> around for several iterations on the sender side, while still returning to 
> user space to do the signal handler. 
> 
> And guess what? That's exactly what you get with that "useless" pipe. For 
> error handling, you can decide to throw the extra data that the recipient 
> didn't want away (just close the pipe), and maybe that's going to be what 
> most people do. But you could also decide to just do a "read()" on the 
> pipefd, to just read the data into user space for debugging and/or error 
> recovery..
> 
> Similarly, for signals, the pipe _is_ that buffer that you need that is 
> consistent across multiple invocations.
> 
> So that pipe is not at all unnecessary, and in fact, it's critical. It may 

IOW the "pipe" above is in fact just a "handle" of a kernel-internal
buffer (of one page) and the two fd's can be used to tell sys-calls to
read from or write into the buffer. But it is much faster because it
avoids real copying of the data into a user-space buffer and out of the
user-space buffer since the kernel can manipulate the underlying data
directly as a typical implementation of the above code does (i.e. use
read(2) and write(2) instead of the two splice(2) calls).
Uuuuuh, real zero-copy seems possible.

	Bernd
-- 
Firmix Software GmbH                   http://www.firmix.at/
mobil: +43 664 4416156                 fax: +43 1 7890849-55
          Embedded Linux Development and Services


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31 12:26           ` Jens Axboe
@ 2006-03-31 12:47             ` Ingo Molnar
  2006-03-31 18:18               ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Ingo Molnar @ 2006-03-31 12:47 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linus Torvalds, linux-kernel, akpm


* Jens Axboe <axboe@suse.de> wrote:

> > > with pipe-based buffering this approach has still the very same problems 
> > > that sendfile() has with packet boundaries, because it's not enough to 
> > > have "large enough" buffering (like a pipe has), the pipe also has to be 
> > > drained, and the networking layer has to know the precise boundary of 
> > > data.
> > > 
> > > the right solution to the packet boundary problem is to pass in a proper 
> > > "does userspace expect more data right now" flag, or to let userspace 
> > > 'flush' the socket independently - which is independent of the 
> > > pipe-in-slice issue. This solution already exists: the MSG_MORE flag.
> > 
> > We can add a SPLICE_F_MORE flag for this, right now splice doesn't set
> > the MSG_MORE flag for the end of the pipe.
> 
> Ala

>  #define SPLICE_F_MOVE	(0x01)	/* move pages instead of copying */
> +#define SPLICE_F_MORE	(0x02)	/* expect more data */

ok, nice - something like this should work. The direction of the flag is 
a philosophical question i guess: i believe in Linux we prefer to 
default to "buffering enabled", i.e. the default flag should be "expect 
more data". So maybe it would be better to pass in PLICE_F_END, to 
indicate end-of-data. [it doesnt mean 'permanent end', so all the files 
still remain open: this could be something like a HTTP 1.1 pipelined 
request.]

furthermore, the internal implementation should also get smarter and do 
a flush-socket if it would e.g. block on a pagecache page. [we often 
prefer a partial packet in such cases instead of having a half-built 
packet hang around.]

btw., that 'data boundary' detail is likely lost with the pipe 
intermediary solution: there is no direct connection between 'input 
file' and 'output socket', so a 'flush now' event doesnt get propagated 
in a natural way. (unless we extend pipes with 'data boundary' markers, 
or force their flushing, which looks a whole lot of complexity for such 
a simple thing.)

straight pagecache->socket splicing on the other hand preserves 'data 
boundary' markers in a natural way for two reasons: 1) the input and 
output objects are known to the kernel at once 2) because there is no 
internal buffering to begin with.

	Ingo

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31 12:47             ` Ingo Molnar
@ 2006-03-31 18:18               ` Jens Axboe
  0 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2006-03-31 18:18 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linus Torvalds, linux-kernel, akpm

On Fri, Mar 31 2006, Ingo Molnar wrote:
> 
> * Jens Axboe <axboe@suse.de> wrote:
> 
> > > > with pipe-based buffering this approach has still the very same problems 
> > > > that sendfile() has with packet boundaries, because it's not enough to 
> > > > have "large enough" buffering (like a pipe has), the pipe also has to be 
> > > > drained, and the networking layer has to know the precise boundary of 
> > > > data.
> > > > 
> > > > the right solution to the packet boundary problem is to pass in a proper 
> > > > "does userspace expect more data right now" flag, or to let userspace 
> > > > 'flush' the socket independently - which is independent of the 
> > > > pipe-in-slice issue. This solution already exists: the MSG_MORE flag.
> > > 
> > > We can add a SPLICE_F_MORE flag for this, right now splice doesn't set
> > > the MSG_MORE flag for the end of the pipe.
> > 
> > Ala
> 
> >  #define SPLICE_F_MOVE	(0x01)	/* move pages instead of copying */
> > +#define SPLICE_F_MORE	(0x02)	/* expect more data */
> 
> ok, nice - something like this should work. The direction of the flag is 
> a philosophical question i guess: i believe in Linux we prefer to 
> default to "buffering enabled", i.e. the default flag should be "expect 
> more data". So maybe it would be better to pass in PLICE_F_END, to 
> indicate end-of-data. [it doesnt mean 'permanent end', so all the files 
> still remain open: this could be something like a HTTP 1.1 pipelined 
> request.]

Hmm I do prefer a MORE as to an END, as it will always give predictable
results. The MORE is a performance hit, where as a missing END wont
necessarily be detected until you start looking at latencies.

> furthermore, the internal implementation should also get smarter and do 
> a flush-socket if it would e.g. block on a pagecache page. [we often 
> prefer a partial packet in such cases instead of having a half-built 
> packet hang around.]

That's a really good idea! Wont be too much trouble to add, probably the
easiest is to add a 'wait' variable to the ->map() function or something
along those lines.

> btw., that 'data boundary' detail is likely lost with the pipe 
> intermediary solution: there is no direct connection between 'input 
> file' and 'output socket', so a 'flush now' event doesnt get propagated 
> in a natural way. (unless we extend pipes with 'data boundary' markers, 
> or force their flushing, which looks a whole lot of complexity for such 
> a simple thing.)

Probably not.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31  9:57 ` Jens Axboe
@ 2006-03-31 19:11   ` Linus Torvalds
  2006-03-31 19:40     ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Linus Torvalds @ 2006-03-31 19:11 UTC (permalink / raw)
  To: Jens Axboe; +Cc: tridge, linux-kernel



On Fri, 31 Mar 2006, Jens Axboe wrote:
> > 
> >   ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);
> 
> I definitely see some valid reasons for adding a file offset instead of
> using ->f_pos, I'll leave that decision up to Linus though. Linus?

I think a file offset is fine, the one thing holding me back was just the 
interface. One file offset per fd? Or just have the rule that the file 
offset is for the "non-pipe" device?

		Linus

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31 19:11   ` Linus Torvalds
@ 2006-03-31 19:40     ` Jens Axboe
  2006-04-04 17:16       ` Andy Lutomirski
  0 siblings, 1 reply; 48+ messages in thread
From: Jens Axboe @ 2006-03-31 19:40 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: tridge, linux-kernel

On Fri, Mar 31 2006, Linus Torvalds wrote:
> 
> 
> On Fri, 31 Mar 2006, Jens Axboe wrote:
> > > 
> > >   ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);
> > 
> > I definitely see some valid reasons for adding a file offset instead of
> > using ->f_pos, I'll leave that decision up to Linus though. Linus?
> 
> I think a file offset is fine, the one thing holding me back was just the 
> interface. One file offset per fd? Or just have the rule that the file 
> offset is for the "non-pipe" device?

Intuitively, I'd expect the offset to be tied to the non-pipe if I were
to eg see this for the first time. So my vote would go for that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* RE: [PATCH] splice support #2
  2006-03-30 17:17       ` Linus Torvalds
@ 2006-03-31 20:38         ` Hua Zhong
  2006-03-31 20:49           ` Linus Torvalds
  0 siblings, 1 reply; 48+ messages in thread
From: Hua Zhong @ 2006-03-31 20:38 UTC (permalink / raw)
  To: 'Linus Torvalds', 'Jens Axboe'
  Cc: 'Ingo Molnar', linux-kernel, akpm

Hi Linus,

> The 4th reason is "tee". Again, you _could_ perhaps do "tee" 
> without the pipe, but it would be a total nightmare. Now, tee 
> isn't that common, but it does happen, and in particular it 
> happens a lot with certain streaming content.

If I understand correctly:

splice is one fd in, one fd out
tee is one fd in, two fd out (and I'd assume the "one fd in" would always be
a pipe)

How about one fd in, N fd out? Do you then stack the tee calls using
temporary pipes?

i.e., if N=3, then we'd have:

pipe(fd_tmp_pipe);
tee(fd_in, fd_out1, fd_tmp_pipe[0];
tee(fd_tmp_pipe[1], fd_out2, fd_out3);

Basically, N-2 temporary pipes would be required.

Is this the intention?

Hua



^ permalink raw reply	[flat|nested] 48+ messages in thread

* RE: [PATCH] splice support #2
  2006-03-31 20:38         ` Hua Zhong
@ 2006-03-31 20:49           ` Linus Torvalds
  0 siblings, 0 replies; 48+ messages in thread
From: Linus Torvalds @ 2006-03-31 20:49 UTC (permalink / raw)
  To: Hua Zhong; +Cc: 'Jens Axboe', 'Ingo Molnar', linux-kernel, akpm



On Fri, 31 Mar 2006, Hua Zhong wrote:
> 
> If I understand correctly:
> 
> splice is one fd in, one fd out

Yes, and one of the fd's have to be a pipe.

> tee is one fd in, two fd out (and I'd assume the "one fd in" would always be
> a pipe)

Actually, all three of them would have to be pipes. The tee() thing has to 
push to both sources in a "synchronized" manner, so it can't just take 
arbitrary file descriptors that it doesn't know the exact buffering rules 
for.

(Otherwise you wouldn't need "tee()" at all - you could have a "splice()" 
that just takes several output fd's).

> How about one fd in, N fd out? Do you then stack the tee calls using
> temporary pipes?

I didn't write the tee() logic, but making it 1:N instead of 1:2 is not 
conceptually a problem at all. The exact system call interface might limit 
it some way, of course. 

		Linus

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31  7:09           ` Ingo Molnar
@ 2006-04-02 22:33             ` Pavel Machek
  0 siblings, 0 replies; 48+ messages in thread
From: Pavel Machek @ 2006-04-02 22:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Jeff Garzik, Jens Axboe, linux-kernel, akpm,
	Christoph Hellwig

Hi!

> We should extend the userspace API so that it is prepared to receive 
> 'excess data' via a separate 'flush excess data to' file descriptor:
> 
> 	sys_splice(fd_in, fd_out, fd_flush, size,
>                    max_flush_size, *bytes_flushed)

I still see problems with error handling here. You get -EIO to
userspace. How do you know if it is error reading fd_in, or error
writing fd_out?
								Pavel
-- 
Picture of sleeping (Linux) penguin wanted...

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
       [not found]             ` <5XjoS-8t9-11@gated-at.bofh.it>
@ 2006-04-03 12:39               ` Bodo Eggert
  0 siblings, 0 replies; 48+ messages in thread
From: Bodo Eggert @ 2006-04-03 12:39 UTC (permalink / raw)
  To: Pavel Machek, Ingo Molnar, Linus Torvalds, Jeff Garzik,
	Jens Axboe, linux-kernel, akpm, Christoph Hellwig

Pavel Machek <pavel@suse.cz> wrote:

>> We should extend the userspace API so that it is prepared to receive
>> 'excess data' via a separate 'flush excess data to' file descriptor:
>> 
>> sys_splice(fd_in, fd_out, fd_flush, size,
>>                    max_flush_size, *bytes_flushed)
> 
> I still see problems with error handling here. You get -EIO to
> userspace. How do you know if it is error reading fd_in, or error
> writing fd_out?

ACK.

Possibly you'll get an error reading from fd_in while also getting an error
flushing the data to fd_out, and an additional error while trying to rescue
the data on fd_flush.

Additionally you'll want to know the amount of transfered bytes even in case
of an error.

struct splice_fds {
        int in_fd,  int in_errno,
        int out_fd, int out_errno,

        /* used on transfer error to rescue the to-be-flushed data: */
        int err_fd, int err_errno,
        /* err_fd == -1 -> don't use */
        int err_size
        /* in: max. size, out: used size, unchanged if no error */
}
int sys_splice(struct splice_fds *fds, size, flags)
/* returns transfered_bytes even in case of an error */

Possible flags:
- Streaming, copied data won't be used too soon.
- Unconditionally use err_fd as a buffer, e.g. because we know that seeking
  from in_fd to out_fd is slow while err_fd is mapped memory.
- fail if err_fd == -1 and an error could cause data loss (out_fd must have
  a buffer like a pipe)

Dangers:
splice(in_fd = /dev/zero, out_fd = /dev/null, ...)
...

</brainstorming>
-- 
Ich danke GMX dafür, die Verwendung meiner Adressen mittels per SPF
verbreiteten Lügen zu sabotieren.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-03-31 19:40     ` Jens Axboe
@ 2006-04-04 17:16       ` Andy Lutomirski
  2006-04-04 17:34         ` Jens Axboe
  0 siblings, 1 reply; 48+ messages in thread
From: Andy Lutomirski @ 2006-04-04 17:16 UTC (permalink / raw)
  To: Jens Axboe; +Cc: tridge, linux-kernel

Jens Axboe wrote:
> On Fri, Mar 31 2006, Linus Torvalds wrote:
> 
>>
>>On Fri, 31 Mar 2006, Jens Axboe wrote:
>>
>>>>  ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);
>>>
>>>I definitely see some valid reasons for adding a file offset instead of
>>>using ->f_pos, I'll leave that decision up to Linus though. Linus?
>>
>>I think a file offset is fine, the one thing holding me back was just the 
>>interface. One file offset per fd? Or just have the rule that the file 
>>offset is for the "non-pipe" device?
> 
> 
> Intuitively, I'd expect the offset to be tied to the non-pipe if I were
> to eg see this for the first time. So my vote would go for that.
> 

Eee!  That means that splice(file_fd, pipe_fd, 1000) and splice(pipe_fd, 
file_fd, 1000) have different semantics.  It also would seem to prevent 
ever implementing direct file-to-file splicing.

--Andy

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] splice support #2
  2006-04-04 17:16       ` Andy Lutomirski
@ 2006-04-04 17:34         ` Jens Axboe
  0 siblings, 0 replies; 48+ messages in thread
From: Jens Axboe @ 2006-04-04 17:34 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: tridge, linux-kernel

On Tue, Apr 04 2006, Andy Lutomirski wrote:
> Jens Axboe wrote:
> >On Fri, Mar 31 2006, Linus Torvalds wrote:
> >
> >>
> >>On Fri, 31 Mar 2006, Jens Axboe wrote:
> >>
> >>>> ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned 
> >>>> flags);
> >>>
> >>>I definitely see some valid reasons for adding a file offset instead of
> >>>using ->f_pos, I'll leave that decision up to Linus though. Linus?
> >>
> >>I think a file offset is fine, the one thing holding me back was just the 
> >>interface. One file offset per fd? Or just have the rule that the file 
> >>offset is for the "non-pipe" device?
> >
> >
> >Intuitively, I'd expect the offset to be tied to the non-pipe if I were
> >to eg see this for the first time. So my vote would go for that.
> >
> 
> Eee!  That means that splice(file_fd, pipe_fd, 1000) and splice(pipe_fd, 
> file_fd, 1000) have different semantics.  It also would seem to prevent 
> ever implementing direct file-to-file splicing.

The semantics as written: (fdin, fdout, len). But yes the offset gets
ugly, unless you add two offsets. And you are right it would not go well
with file -> file splicing, which btw is already implemented (just not
in 2.6.17-rc1) along with file -> socket.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2006-04-04 17:34 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-03-31  6:06 [PATCH] splice support #2 tridge
2006-03-31  6:59 ` Antonio Vargas
2006-03-31  7:37   ` tridge
2006-03-31  9:57 ` Jens Axboe
2006-03-31 19:11   ` Linus Torvalds
2006-03-31 19:40     ` Jens Axboe
2006-04-04 17:16       ` Andy Lutomirski
2006-04-04 17:34         ` Jens Axboe
     [not found] <5W2gv-Tp-19@gated-at.bofh.it>
     [not found] ` <5W48C-3KW-17@gated-at.bofh.it>
     [not found]   ` <5W48D-3KW-21@gated-at.bofh.it>
     [not found]     ` <5W8OT-2ms-17@gated-at.bofh.it>
     [not found]       ` <5WcfS-7x9-23@gated-at.bofh.it>
     [not found]         ` <5WcIT-8nr-13@gated-at.bofh.it>
     [not found]           ` <5Wm5I-53z-7@gated-at.bofh.it>
     [not found]             ` <5XjoS-8t9-11@gated-at.bofh.it>
2006-04-03 12:39               ` Bodo Eggert
  -- strict thread matches above, loose matches on Subject: below --
2006-03-31  0:03 linux
2006-03-30 10:06 Jens Axboe
2006-03-30 10:16 ` Andrew Morton
2006-03-30 10:24   ` Jens Axboe
2006-03-30 11:16     ` Andrew Morton
2006-03-30 11:55       ` Jens Axboe
2006-03-30 12:30         ` Jens Axboe
2006-03-30 12:30           ` Jens Axboe
2006-03-30 19:19           ` Andrew Morton
2006-03-30 12:00 ` Ingo Molnar
2006-03-30 12:05   ` Jens Axboe
2006-03-30 12:10     ` Ingo Molnar
2006-03-30 12:16       ` Jens Axboe
2006-03-30 12:38         ` Ingo Molnar
2006-03-30 12:42           ` Jens Axboe
2006-03-30 12:42             ` Ingo Molnar
2006-03-30 13:02               ` Jens Axboe
2006-03-30 14:20           ` Christoph Hellwig
2006-03-30 17:02     ` Linus Torvalds
2006-03-30 17:17       ` Linus Torvalds
2006-03-31 20:38         ` Hua Zhong
2006-03-31 20:49           ` Linus Torvalds
2006-03-30 20:48       ` Jeff Garzik
2006-03-30 21:16         ` Linus Torvalds
2006-03-31  0:59           ` Nick Piggin
2006-03-31  2:43             ` Andrew Morton
2006-03-31  2:51               ` Andrew Morton
2006-03-31  3:20                 ` Nick Piggin
2006-03-31  6:35                 ` Christoph Hellwig
2006-03-31  7:09           ` Ingo Molnar
2006-04-02 22:33             ` Pavel Machek
2006-03-31 12:46           ` Bernd Petrovitsch
2006-03-31  9:56       ` Jens Axboe
2006-03-31 12:18       ` Ingo Molnar
2006-03-31 12:23         ` Jens Axboe
2006-03-31 12:26           ` Jens Axboe
2006-03-31 12:47             ` Ingo Molnar
2006-03-31 18:18               ` Jens Axboe
2006-03-31 12:27       ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).