public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sendfile removal
@ 2007-05-31 10:33 Jens Axboe
  2007-05-31 10:47 ` Jens Axboe
                   ` (5 more replies)
  0 siblings, 6 replies; 37+ messages in thread
From: Jens Axboe @ 2007-05-31 10:33 UTC (permalink / raw)
  To: linux-kernel; +Cc: cotte, hugh, neilb, zanussi, Linus Torvalds, hch

Hi,

This patch removes the ->sendfile() hook from the file_operations
structure, and replaces the sys_sendfile() mechanism to be based on
->splice_read() instead. There should be no functional changes.

Work to be done:

- The ext2 xip support needs a splice_read() implementation, currently I
  just if 0'ed out the send xip_file_sendfile(). CC'ed Carsten, who
  seems to be the author of this code.

- shmem needs a splice_read() implementation. Optimistically CC'ed Hugh.

- nfds: The ->rq_sendfile_ok optimization is gone for now. I can't
  determine the value of it, but I'm assuming it's there for a reason.
  Any chance this can be converted to splice, or use something else than
  ->sendfile()? CC'ed Neil.

- relay: needs a splice_read() implementation. I think Tom already has
  one, CC'ed him.

Apart from that, it was mostly straight forward. Almost everybody uses
generic_file_sendfile(), which makes the conversion easy. I changed loop
to use do_generic_file_read() instead of sendfile, it works for me...

Patch is against current git.

 drivers/block/loop.c              |   22 ++++++++++-----
 fs/adfs/file.c                    |    2 -
 fs/affs/file.c                    |    2 -
 fs/afs/file.c                     |    2 -
 fs/bad_inode.c                    |    7 ----
 fs/bfs/file.c                     |    2 -
 fs/block_dev.c                    |    1 
 fs/cifs/cifsfs.c                  |    8 ++---
 fs/coda/file.c                    |   11 ++++---
 fs/ecryptfs/file.c                |   15 +++++-----
 fs/ext2/file.c                    |    5 ++-
 fs/ext3/file.c                    |    1 
 fs/ext4/file.c                    |    1 
 fs/fat/file.c                     |    2 -
 fs/fuse/file.c                    |    4 +-
 fs/gfs2/ops_file.c                |    1 
 fs/hfs/inode.c                    |    2 -
 fs/hfsplus/inode.c                |    2 -
 fs/hostfs/hostfs_kern.c           |    2 -
 fs/hpfs/file.c                    |    2 -
 fs/jffs2/file.c                   |    2 -
 fs/jfs/file.c                     |    1 
 fs/minix/file.c                   |    2 -
 fs/nfs/file.c                     |   15 ++++++----
 fs/nfsd/vfs.c                     |   19 ++++++------
 fs/ntfs/file.c                    |    2 -
 fs/ocfs2/file.c                   |    1 
 fs/qnx4/file.c                    |    2 -
 fs/ramfs/file-mmu.c               |    2 -
 fs/ramfs/file-nommu.c             |    2 -
 fs/read_write.c                   |   13 ++++++--
 fs/reiserfs/file.c                |    1 
 fs/smbfs/file.c                   |    9 +++---
 fs/sysv/file.c                    |    2 -
 fs/udf/file.c                     |    2 -
 fs/ufs/file.c                     |    2 -
 fs/xfs/linux-2.6/xfs_file.c       |   26 -----------------
 fs/xfs/linux-2.6/xfs_linux.h      |    1 
 fs/xfs/linux-2.6/xfs_lrw.c        |   44 ------------------------------
 fs/xfs/linux-2.6/xfs_lrw.h        |    3 --
 fs/xfs/linux-2.6/xfs_vnode.h      |    6 ----
 fs/xfs/xfs_vnodeops.c             |    3 --
 include/linux/fs.h                |    5 ---
 include/linux/sunrpc/svc.h        |    2 -
 kernel/relay.c                    |   16 ----------
 mm/filemap.c                      |   20 -------------
 mm/filemap_xip.c                  |    2 +
 mm/shmem.c                        |   32 ++++++++-------------
 net/sunrpc/auth_gss/svcauth_gss.c |    2 -
 net/sunrpc/svc.c                  |    2 -
 50 files changed, 105 insertions(+), 230 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5526ead..92bac14 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -435,16 +435,24 @@ do_lo_receive(struct loop_device *lo,
 {
 	struct lo_read_data cookie;
 	struct file *file;
-	int retval;
+	read_descriptor_t desc;
+
+	desc.written = 0;
+	desc.count = bvec->bv_len;
+	desc.arg.data = &cookie;
+	desc.error = 0;
 
 	cookie.lo = lo;
 	cookie.page = bvec->bv_page;
 	cookie.offset = bvec->bv_offset;
 	cookie.bsize = bsize;
 	file = lo->lo_backing_file;
-	retval = file->f_op->sendfile(file, &pos, bvec->bv_len,
-			lo_read_actor, &cookie);
-	return (retval < 0)? retval: 0;
+
+	do_generic_file_read(file, &pos, &desc, lo_read_actor);
+	if (desc.written)
+		return 0;
+
+	return desc.error;
 }
 
 static int
@@ -679,8 +687,8 @@ static int loop_change_fd(struct loop_device *lo, struct file *lo_file,
 	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
 		goto out_putf;
 
-	/* new backing store needs to support loop (eg sendfile) */
-	if (!inode->i_fop->sendfile)
+	/* new backing store needs to support loop (eg readpage) */
+	if (!file->f_mapping->a_ops->readpage)
 		goto out_putf;
 
 	/* size of the new backing store needs to be the same */
@@ -760,7 +768,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 		 * If we can't read - sorry. If we only can't write - well,
 		 * it's going to be read-only.
 		 */
-		if (!file->f_op->sendfile)
+		if (!aops->readpage)
 			goto out_putf;
 		if (aops->prepare_write && aops->commit_write)
 			lo_flags |= LO_FLAGS_USE_AOPS;
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index f544a28..36e381c 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -33,7 +33,7 @@ const struct file_operations adfs_file_operations = {
 	.fsync		= file_fsync,
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 };
 
 const struct inode_operations adfs_file_inode_operations = {
diff --git a/fs/affs/file.c b/fs/affs/file.c
index c879690..c314a35 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -35,7 +35,7 @@ const struct file_operations affs_file_operations = {
 	.open		= affs_file_open,
 	.release	= affs_file_release,
 	.fsync		= file_fsync,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 };
 
 const struct inode_operations affs_file_inode_operations = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 9c0e721..aede7eb 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -32,7 +32,7 @@ const struct file_operations afs_file_operations = {
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= afs_file_write,
 	.mmap		= generic_file_readonly_mmap,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 	.fsync		= afs_fsync,
 };
 
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 329ee47..521ff7c 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -114,12 +114,6 @@ static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl)
 	return -EIO;
 }
 
-static ssize_t bad_file_sendfile(struct file *in_file, loff_t *ppos,
-			size_t count, read_actor_t actor, void *target)
-{
-	return -EIO;
-}
-
 static ssize_t bad_file_sendpage(struct file *file, struct page *page,
 			int off, size_t len, loff_t *pos, int more)
 {
@@ -182,7 +176,6 @@ static const struct file_operations bad_file_ops =
 	.aio_fsync	= bad_file_aio_fsync,
 	.fasync		= bad_file_fasync,
 	.lock		= bad_file_lock,
-	.sendfile	= bad_file_sendfile,
 	.sendpage	= bad_file_sendpage,
 	.get_unmapped_area = bad_file_get_unmapped_area,
 	.check_flags	= bad_file_check_flags,
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index ef4d1fa..24310e9 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -24,7 +24,7 @@ const struct file_operations bfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 };
 
 static int bfs_move_block(unsigned long from, unsigned long to, struct super_block *sb)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ea1480a..b3e9bfa 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1346,7 +1346,6 @@ const struct file_operations def_blk_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
-	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 };
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d38c69b..c05fa32 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -616,7 +616,7 @@ const struct file_operations cifs_file_ops = {
 	.fsync = cifs_fsync,
 	.flush = cifs_flush,
 	.mmap  = cifs_file_mmap,
-	.sendfile = generic_file_sendfile,
+	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_POSIX
 	.ioctl	= cifs_ioctl,
@@ -637,7 +637,7 @@ const struct file_operations cifs_file_direct_ops = {
 	.lock = cifs_lock,
 	.fsync = cifs_fsync,
 	.flush = cifs_flush,
-	.sendfile = generic_file_sendfile, /* BB removeme BB */
+	.splice_read = generic_file_splice_read,
 #ifdef CONFIG_CIFS_POSIX
 	.ioctl  = cifs_ioctl,
 #endif /* CONFIG_CIFS_POSIX */
@@ -656,7 +656,7 @@ const struct file_operations cifs_file_nobrl_ops = {
 	.fsync = cifs_fsync,
 	.flush = cifs_flush,
 	.mmap  = cifs_file_mmap,
-	.sendfile = generic_file_sendfile,
+	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_POSIX
 	.ioctl	= cifs_ioctl,
@@ -676,7 +676,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 	.release = cifs_close,
 	.fsync = cifs_fsync,
 	.flush = cifs_flush,
-	.sendfile = generic_file_sendfile, /* BB removeme BB */
+	.splice_read = generic_file_splice_read,
 #ifdef CONFIG_CIFS_POSIX
 	.ioctl  = cifs_ioctl,
 #endif /* CONFIG_CIFS_POSIX */
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 5ef2b60..99dbe86 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -47,8 +47,9 @@ coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *p
 }
 
 static ssize_t
-coda_file_sendfile(struct file *coda_file, loff_t *ppos, size_t count,
-		   read_actor_t actor, void *target)
+coda_file_splice_read(struct file *coda_file, loff_t *ppos,
+		      struct pipe_inode_info *pipe, size_t count,
+		      unsigned int flags)
 {
 	struct coda_file_info *cfi;
 	struct file *host_file;
@@ -57,10 +58,10 @@ coda_file_sendfile(struct file *coda_file, loff_t *ppos, size_t count,
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (!host_file->f_op || !host_file->f_op->sendfile)
+	if (!host_file->f_op || !host_file->f_op->splice_read)
 		return -EINVAL;
 
-	return host_file->f_op->sendfile(host_file, ppos, count, actor, target);
+	return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags);
 }
 
 static ssize_t
@@ -295,6 +296,6 @@ const struct file_operations coda_file_operations = {
 	.flush		= coda_flush,
 	.release	= coda_release,
 	.fsync		= coda_fsync,
-	.sendfile	= coda_file_sendfile,
+	.splice_read	= coda_file_splice_read,
 };
 
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 59288d8..94f456f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -338,16 +338,17 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
 	return rc;
 }
 
-static ssize_t ecryptfs_sendfile(struct file *file, loff_t * ppos,
-				 size_t count, read_actor_t actor, void *target)
+static ssize_t ecryptfs_splice_read(struct file *file, loff_t * ppos,
+				    struct pipe_inode_info *pipe, size_t count,
+				    unsigned int flags)
 {
 	struct file *lower_file = NULL;
 	int rc = -EINVAL;
 
 	lower_file = ecryptfs_file_to_lower(file);
-	if (lower_file->f_op && lower_file->f_op->sendfile)
-		rc = lower_file->f_op->sendfile(lower_file, ppos, count,
-						actor, target);
+	if (lower_file->f_op && lower_file->f_op->splice_read)
+		rc = lower_file->f_op->splice_read(lower_file, ppos, pipe,
+						count, flags);
 
 	return rc;
 }
@@ -364,7 +365,7 @@ const struct file_operations ecryptfs_dir_fops = {
 	.release = ecryptfs_release,
 	.fsync = ecryptfs_fsync,
 	.fasync = ecryptfs_fasync,
-	.sendfile = ecryptfs_sendfile,
+	.splice_read = ecryptfs_splice_read,
 };
 
 const struct file_operations ecryptfs_main_fops = {
@@ -381,7 +382,7 @@ const struct file_operations ecryptfs_main_fops = {
 	.release = ecryptfs_release,
 	.fsync = ecryptfs_fsync,
 	.fasync = ecryptfs_fasync,
-	.sendfile = ecryptfs_sendfile,
+	.splice_read = ecryptfs_splice_read,
 };
 
 static int
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 566d4e2..a89edc9 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -53,7 +53,6 @@ const struct file_operations ext2_file_operations = {
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_sync_file,
-	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 };
@@ -71,7 +70,9 @@ const struct file_operations ext2_xip_file_operations = {
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_sync_file,
-	.sendfile	= xip_file_sendfile,
+#if 0
+	.splice_read	= xip_file_splice_read,
+#endif
 };
 #endif
 
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 1e6f138..acc4913 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -120,7 +120,6 @@ const struct file_operations ext3_file_operations = {
 	.open		= generic_file_open,
 	.release	= ext3_release_file,
 	.fsync		= ext3_sync_file,
-	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 };
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3c6c1fd..d4c8186 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -120,7 +120,6 @@ const struct file_operations ext4_file_operations = {
 	.open		= generic_file_open,
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
-	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 };
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 55d3c74..69a83b5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -134,7 +134,7 @@ const struct file_operations fat_file_operations = {
 	.release	= fat_file_release,
 	.ioctl		= fat_generic_ioctl,
 	.fsync		= file_fsync,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 };
 
 static int fat_cont_expand(struct inode *inode, loff_t size)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index adf7995..f79de7c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -802,7 +802,7 @@ static const struct file_operations fuse_file_operations = {
 	.release	= fuse_release,
 	.fsync		= fuse_fsync,
 	.lock		= fuse_file_lock,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 };
 
 static const struct file_operations fuse_direct_io_file_operations = {
@@ -814,7 +814,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.release	= fuse_release,
 	.fsync		= fuse_fsync,
 	.lock		= fuse_file_lock,
-	/* no mmap and sendfile */
+	/* no mmap and splice_read */
 };
 
 static const struct address_space_operations fuse_file_aops  = {
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 064df88..7dc3be1 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -635,7 +635,6 @@ const struct file_operations gfs2_file_fops = {
 	.release	= gfs2_close,
 	.fsync		= gfs2_fsync,
 	.lock		= gfs2_lock,
-	.sendfile	= generic_file_sendfile,
 	.flock		= gfs2_flock,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9a934db..bc835f2 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -607,7 +607,7 @@ static const struct file_operations hfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 	.fsync		= file_fsync,
 	.open		= hfs_file_open,
 	.release	= hfs_file_release,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 45dab5d..409ce54 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -288,7 +288,7 @@ static const struct file_operations hfsplus_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 	.fsync		= file_fsync,
 	.open		= hfsplus_file_open,
 	.release	= hfsplus_file_release,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 8286491..c778620 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -390,7 +390,7 @@ int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 static const struct file_operations hostfs_file_fops = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.write		= do_sync_write,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index b4eafc0..5b53e5c 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -129,7 +129,7 @@ const struct file_operations hpfs_file_ops =
 	.mmap		= generic_file_mmap,
 	.release	= hpfs_file_release,
 	.fsync		= hpfs_file_fsync,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 };
 
 const struct inode_operations hpfs_file_iops =
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 9987127..c253019 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -47,7 +47,7 @@ const struct file_operations jffs2_file_operations =
 	.ioctl =	jffs2_ioctl,
 	.mmap =		generic_file_readonly_mmap,
 	.fsync =	jffs2_fsync,
-	.sendfile =	generic_file_sendfile
+	.splice_read =	generic_file_splice_read,
 };
 
 /* jffs2_file_inode_operations */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index f7f8eff..87eb936 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -108,7 +108,6 @@ const struct file_operations jfs_file_operations = {
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 	.fsync		= jfs_fsync,
diff --git a/fs/minix/file.c b/fs/minix/file.c
index f92baa1..17765f6 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -23,7 +23,7 @@ const struct file_operations minix_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= minix_sync_file,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 };
 
 const struct inode_operations minix_file_inode_operations = {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 9eb8eb4..8689b73 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -41,7 +41,9 @@ static int nfs_file_open(struct inode *, struct file *);
 static int nfs_file_release(struct inode *, struct file *);
 static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
 static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
-static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
+static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
+					struct pipe_inode_info *pipe,
+					size_t count, unsigned int flags);
 static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos);
 static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
@@ -65,7 +67,7 @@ const struct file_operations nfs_file_operations = {
 	.fsync		= nfs_fsync,
 	.lock		= nfs_lock,
 	.flock		= nfs_flock,
-	.sendfile	= nfs_file_sendfile,
+	.splice_read	= nfs_file_splice_read,
 	.check_flags	= nfs_check_flags,
 };
 
@@ -224,20 +226,21 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
 }
 
 static ssize_t
-nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count,
-		read_actor_t actor, void *target)
+nfs_file_splice_read(struct file *filp, loff_t *ppos,
+		     struct pipe_inode_info *pipe, size_t count,
+		     unsigned int flags)
 {
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	ssize_t res;
 
-	dfprintk(VFS, "nfs: sendfile(%s/%s, %lu@%Lu)\n",
+	dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long long) *ppos);
 
 	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (!res)
-		res = generic_file_sendfile(filp, ppos, count, actor, target);
+		res = generic_file_splice_read(filp, ppos, pipe, count, flags);
 	return res;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7e6aa24..1676e34 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -805,6 +805,7 @@ found:
  * so that they can be passed to the netowork sendmsg/sendpage routines
  * directrly. They will be released after the sending has completed.
  */
+#if 0
 static int
 nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset , unsigned long size)
 {
@@ -836,6 +837,7 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset
 	desc->written += size;
 	return size;
 }
+#endif
 
 static __be32
 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
@@ -861,16 +863,13 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	if (ra && ra->p_set)
 		file->f_ra = ra->p_ra;
 
-	if (file->f_op->sendfile && rqstp->rq_sendfile_ok) {
-		rqstp->rq_resused = 1;
-		host_err = file->f_op->sendfile(file, &offset, *count,
-						 nfsd_read_actor, rqstp);
-	} else {
-		oldfs = get_fs();
-		set_fs(KERNEL_DS);
-		host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
-		set_fs(oldfs);
-	}
+	/*
+	 * Can we use ->splice_read() for this?
+	 */
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+	set_fs(oldfs);
 
 	/* Write back readahead params */
 	if (ra) {
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7ed5639..ffcc504 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2276,7 +2276,7 @@ const struct file_operations ntfs_file_ops = {
 						    mounted filesystem. */
 	.mmap		= generic_file_mmap,	 /* Mmap file. */
 	.open		= ntfs_file_open,	 /* Open file. */
-	.sendfile	= generic_file_sendfile, /* Zero-copy data send with
+	.splice_read	= generic_file_splice_read /* Zero-copy data send with
 						    the data source being on
 						    the ntfs partition.  We do
 						    not need to care about the
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ac6c964..84e2b1f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1817,7 +1817,6 @@ const struct inode_operations ocfs2_special_file_iops = {
 const struct file_operations ocfs2_fops = {
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.sendfile	= generic_file_sendfile,
 	.mmap		= ocfs2_mmap,
 	.fsync		= ocfs2_sync_file,
 	.release	= ocfs2_file_release,
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index 4464998..867f42b 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -25,7 +25,7 @@ const struct file_operations qnx4_file_operations =
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_mmap,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 #ifdef CONFIG_QNX4FS_RW
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 2f14774..97bdc0b 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -41,7 +41,7 @@ const struct file_operations ramfs_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= simple_sync_file,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 	.llseek		= generic_file_llseek,
 };
 
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 9345a46..69ff5f9 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
 	.write			= do_sync_write,
 	.aio_write		= generic_file_aio_write,
 	.fsync			= simple_sync_file,
-	.sendfile		= generic_file_sendfile,
+	.splice_read		= generic_file_splice_read,
 	.llseek			= generic_file_llseek,
 };
 
diff --git a/fs/read_write.c b/fs/read_write.c
index 4d03008..cb4f456 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/syscalls.h>
 #include <linux/pagemap.h>
+#include <linux/pipe_fs_i.h>
 #include "read_write.h"
 
 #include <asm/uaccess.h>
@@ -25,7 +26,7 @@ const struct file_operations generic_ro_fops = {
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_readonly_mmap,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 };
 
 EXPORT_SYMBOL(generic_ro_fops);
@@ -708,7 +709,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	struct inode * in_inode, * out_inode;
 	loff_t pos;
 	ssize_t retval;
-	int fput_needed_in, fput_needed_out;
+	int fput_needed_in, fput_needed_out, fl;
 
 	/*
 	 * Get input file, and verify that it is ok..
@@ -723,7 +724,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	in_inode = in_file->f_path.dentry->d_inode;
 	if (!in_inode)
 		goto fput_in;
-	if (!in_file->f_op || !in_file->f_op->sendfile)
+	if (!in_file->f_op || !in_file->f_op->splice_read)
 		goto fput_in;
 	retval = -ESPIPE;
 	if (!ppos)
@@ -776,7 +777,11 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		count = max - pos;
 	}
 
-	retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file);
+	fl = 0;
+	if (in_file->f_flags & O_NONBLOCK)
+		fl = SPLICE_F_NONBLOCK;
+
+	retval = do_splice_direct(in_file, ppos, out_file, count, fl);
 
 	if (retval > 0) {
 		add_rchar(current, retval);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9e451a6..30eebfb 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1531,7 +1531,6 @@ const struct file_operations reiserfs_file_operations = {
 	.open = generic_file_open,
 	.release = reiserfs_file_release,
 	.fsync = reiserfs_sync_file,
-	.sendfile = generic_file_sendfile,
 	.aio_read = generic_file_aio_read,
 	.aio_write = generic_file_aio_write,
 	.splice_read = generic_file_splice_read,
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index aea3f8a..c5d78a7 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -262,8 +262,9 @@ out:
 }
 
 static ssize_t
-smb_file_sendfile(struct file *file, loff_t *ppos,
-		  size_t count, read_actor_t actor, void *target)
+smb_file_splice_read(struct file *file, loff_t *ppos,
+		     struct pipe_inode_info *pipe, size_t count,
+		     unsigned int flags)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	ssize_t status;
@@ -277,7 +278,7 @@ smb_file_sendfile(struct file *file, loff_t *ppos,
 			 DENTRY_PATH(dentry), status);
 		goto out;
 	}
-	status = generic_file_sendfile(file, ppos, count, actor, target);
+	status = generic_file_splice_read(file, ppos, pipe, count, flags);
 out:
 	return status;
 }
@@ -416,7 +417,7 @@ const struct file_operations smb_file_operations =
 	.open		= smb_file_open,
 	.release	= smb_file_release,
 	.fsync		= smb_fsync,
-	.sendfile	= smb_file_sendfile,
+	.splice_read	= smb_file_splice_read,
 };
 
 const struct inode_operations smb_file_inode_operations =
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 0732ddb..589be21 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -27,7 +27,7 @@ const struct file_operations sysv_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= sysv_sync_file,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 };
 
 const struct inode_operations sysv_file_inode_operations = {
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 51b5764..df070be 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -261,7 +261,7 @@ const struct file_operations udf_file_operations = {
 	.aio_write		= udf_file_aio_write,
 	.release		= udf_release_file,
 	.fsync			= udf_fsync_file,
-	.sendfile		= generic_file_sendfile,
+	.splice_read		= generic_file_splice_read,
 };
 
 const struct inode_operations udf_file_inode_operations = {
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 1e09632..6705d74 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -60,5 +60,5 @@ const struct file_operations ufs_file_operations = {
 	.mmap		= generic_file_mmap,
 	.open           = generic_file_open,
 	.fsync		= ufs_sync_file,
-	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
 };
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index cb51dc9..8c43cd2 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -124,30 +124,6 @@ xfs_file_aio_write_invis(
 }
 
 STATIC ssize_t
-xfs_file_sendfile(
-	struct file		*filp,
-	loff_t			*pos,
-	size_t			count,
-	read_actor_t		actor,
-	void			*target)
-{
-	return bhv_vop_sendfile(vn_from_inode(filp->f_path.dentry->d_inode),
-				filp, pos, 0, count, actor, target, NULL);
-}
-
-STATIC ssize_t
-xfs_file_sendfile_invis(
-	struct file		*filp,
-	loff_t			*pos,
-	size_t			count,
-	read_actor_t		actor,
-	void			*target)
-{
-	return bhv_vop_sendfile(vn_from_inode(filp->f_path.dentry->d_inode),
-				filp, pos, IO_INVIS, count, actor, target, NULL);
-}
-
-STATIC ssize_t
 xfs_file_splice_read(
 	struct file		*infilp,
 	loff_t			*ppos,
@@ -452,7 +428,6 @@ const struct file_operations xfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= xfs_file_aio_read,
 	.aio_write	= xfs_file_aio_write,
-	.sendfile	= xfs_file_sendfile,
 	.splice_read	= xfs_file_splice_read,
 	.splice_write	= xfs_file_splice_write,
 	.unlocked_ioctl	= xfs_file_ioctl,
@@ -475,7 +450,6 @@ const struct file_operations xfs_invis_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= xfs_file_aio_read_invis,
 	.aio_write	= xfs_file_aio_write_invis,
-	.sendfile	= xfs_file_sendfile_invis,
 	.splice_read	= xfs_file_splice_read_invis,
 	.splice_write	= xfs_file_splice_write_invis,
 	.unlocked_ioctl	= xfs_file_ioctl_invis,
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 715adad..af24a45 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -101,7 +101,6 @@
  * Feature macros (disable/enable)
  */
 #undef  HAVE_REFCACHE	/* reference cache not needed for NFS in 2.6 */
-#define HAVE_SENDFILE	/* sendfile(2) exists in 2.6, but not in 2.4 */
 #define HAVE_SPLICE	/* a splice(2) exists in 2.6, but not in 2.4 */
 #ifdef CONFIG_SMP
 #define HAVE_PERCPU_SB	/* per cpu superblock counters are a 2.6 feature */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 86fb671..23978d3 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -287,50 +287,6 @@ xfs_read(
 }
 
 ssize_t
-xfs_sendfile(
-	bhv_desc_t		*bdp,
-	struct file		*filp,
-	loff_t			*offset,
-	int			ioflags,
-	size_t			count,
-	read_actor_t		actor,
-	void			*target,
-	cred_t			*credp)
-{
-	xfs_inode_t		*ip = XFS_BHVTOI(bdp);
-	xfs_mount_t		*mp = ip->i_mount;
-	ssize_t			ret;
-
-	XFS_STATS_INC(xs_read_calls);
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
-	if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
-	    (!(ioflags & IO_INVIS))) {
-		bhv_vrwlock_t locktype = VRWLOCK_READ;
-		int error;
-
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
-				      *offset, count,
-				      FILP_DELAY_FLAG(filp), &locktype);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return -error;
-		}
-	}
-	xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
-		   (void *)(unsigned long)target, count, *offset, ioflags);
-	ret = generic_file_sendfile(filp, offset, count, actor, target);
-	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
-
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	return ret;
-}
-
-ssize_t
 xfs_splice_read(
 	bhv_desc_t		*bdp,
 	struct file		*infilp,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index 7ac51b1..7c60a1e 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -90,9 +90,6 @@ extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *,
 extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *,
 				const struct iovec *, unsigned int,
 				loff_t *, int, struct cred *);
-extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *,
-				loff_t *, int, size_t, read_actor_t,
-				void *, struct cred *);
 extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *,
 				struct pipe_inode_info *, size_t, int, int,
 				struct cred *);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index d1b2d01..013048a 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -139,9 +139,6 @@ typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *,
 typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *,
 				const struct iovec *, unsigned int,
 				loff_t *, int, struct cred *);
-typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *,
-				loff_t *, int, size_t, read_actor_t,
-				void *, struct cred *);
 typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *,
 				struct pipe_inode_info *, size_t, int, int,
 				struct cred *);
@@ -206,7 +203,6 @@ typedef struct bhv_vnodeops {
 	vop_close_t		vop_close;
 	vop_read_t		vop_read;
 	vop_write_t		vop_write;
-	vop_sendfile_t		vop_sendfile;
 	vop_splice_read_t	vop_splice_read;
 	vop_splice_write_t	vop_splice_write;
 	vop_ioctl_t		vop_ioctl;
@@ -254,8 +250,6 @@ typedef struct bhv_vnodeops {
 		VOP(vop_read, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr)
 #define bhv_vop_write(vp,file,iov,segs,offset,ioflags,cr)		\
 		VOP(vop_write, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr)
-#define bhv_vop_sendfile(vp,f,off,ioflags,cnt,act,targ,cr)		\
-		VOP(vop_sendfile, vp)(VNHEAD(vp),f,off,ioflags,cnt,act,targ,cr)
 #define bhv_vop_splice_read(vp,f,o,pipe,cnt,fl,iofl,cr)			\
 		VOP(vop_splice_read, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr)
 #define bhv_vop_splice_write(vp,f,o,pipe,cnt,fl,iofl,cr)		\
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index de17aed..70bc82f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -4680,9 +4680,6 @@ bhv_vnodeops_t xfs_vnodeops = {
 	.vop_open		= xfs_open,
 	.vop_close		= xfs_close,
 	.vop_read		= xfs_read,
-#ifdef HAVE_SENDFILE
-	.vop_sendfile		= xfs_sendfile,
-#endif
 #ifdef HAVE_SPLICE
 	.vop_splice_read	= xfs_splice_read,
 	.vop_splice_write	= xfs_splice_write,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7cf0c54..4c73cd6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1104,7 +1104,6 @@ struct file_operations {
 	int (*aio_fsync) (struct kiocb *, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
-	ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
@@ -1734,7 +1733,6 @@ extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
 		unsigned long, loff_t, loff_t *, size_t, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
-extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
 extern void do_generic_mapping_read(struct address_space *mapping,
 				    struct file_ra_state *, struct file *,
 				    loff_t *, read_descriptor_t *, read_actor_t);
@@ -1764,9 +1762,6 @@ extern int nonseekable_open(struct inode * inode, struct file * filp);
 #ifdef CONFIG_FS_XIP
 extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
 			     loff_t *ppos);
-extern ssize_t xip_file_sendfile(struct file *in_file, loff_t *ppos,
-				 size_t count, read_actor_t actor,
-				 void *target);
 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
 extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
 			      size_t len, loff_t *ppos);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 4a7ae8a..129d50f 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -253,7 +253,7 @@ struct svc_rqst {
 						 * determine what device number
 						 * to report (real or virtual)
 						 */
-	int			rq_sendfile_ok; /* turned off in gss privacy
+	int			rq_splice_ok;   /* turned off in gss privacy
 						 * to prevent encrypting page
 						 * cache pages */
 	wait_queue_head_t	rq_wait;	/* synchronization */
diff --git a/kernel/relay.c b/kernel/relay.c
index 4311101..86a6579 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1060,21 +1060,6 @@ static ssize_t relay_file_read(struct file *filp,
 				       NULL, &desc);
 }
 
-static ssize_t relay_file_sendfile(struct file *filp,
-				   loff_t *ppos,
-				   size_t count,
-				   read_actor_t actor,
-				   void *target)
-{
-	read_descriptor_t desc;
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-	return relay_file_read_subbufs(filp, ppos, subbuf_send_actor,
-				       actor, &desc);
-}
-
 const struct file_operations relay_file_operations = {
 	.open		= relay_file_open,
 	.poll		= relay_file_poll,
@@ -1082,7 +1067,6 @@ const struct file_operations relay_file_operations = {
 	.read		= relay_file_read,
 	.llseek		= no_llseek,
 	.release	= relay_file_release,
-	.sendfile       = relay_file_sendfile,
 };
 EXPORT_SYMBOL_GPL(relay_file_operations);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index edb1b0b..14ad45a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1245,26 +1245,6 @@ int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long o
 	return written;
 }
 
-ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
-			 size_t count, read_actor_t actor, void *target)
-{
-	read_descriptor_t desc;
-
-	if (!count)
-		return 0;
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-
-	do_generic_file_read(in_file, ppos, &desc, actor);
-	if (desc.written)
-		return desc.written;
-	return desc.error;
-}
-EXPORT_SYMBOL(generic_file_sendfile);
-
 static ssize_t
 do_readahead(struct address_space *mapping, struct file *filp,
 	     unsigned long index, unsigned long nr)
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index fa360e5..e85bade 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -159,6 +159,7 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 }
 EXPORT_SYMBOL_GPL(xip_file_read);
 
+#if 0
 ssize_t
 xip_file_sendfile(struct file *in_file, loff_t *ppos,
 	     size_t count, read_actor_t actor, void *target)
@@ -180,6 +181,7 @@ xip_file_sendfile(struct file *in_file, loff_t *ppos,
 	return desc.error;
 }
 EXPORT_SYMBOL_GPL(xip_file_sendfile);
+#endif
 
 /*
  * __xip_unmap is invoked from xip_unmap and
diff --git a/mm/shmem.c b/mm/shmem.c
index e537317..71f480b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1483,6 +1483,17 @@ static const struct inode_operations shmem_symlink_inode_operations;
 static const struct inode_operations shmem_symlink_inline_operations;
 
 /*
+ * FIXME: need a real implementation :-)
+ */
+static ssize_t
+shmem_file_splice_read(struct file *in, loff_t *ppos,
+		       struct pipe_inode_info *pipe, size_t len,
+		       unsigned int flags)
+{
+	return -EINVAL;
+}
+
+/*
  * Normally tmpfs makes no use of shmem_prepare_write, but it
  * lets a tmpfs file be used read-write below the loop driver.
  */
@@ -1709,25 +1720,6 @@ static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count
 	return desc.error;
 }
 
-static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
-			 size_t count, read_actor_t actor, void *target)
-{
-	read_descriptor_t desc;
-
-	if (!count)
-		return 0;
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-
-	do_shmem_file_read(in_file, ppos, &desc, actor);
-	if (desc.written)
-		return desc.written;
-	return desc.error;
-}
-
 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -2397,7 +2389,7 @@ static const struct file_operations shmem_file_operations = {
 	.read		= shmem_file_read,
 	.write		= shmem_file_write,
 	.fsync		= simple_sync_file,
-	.sendfile	= shmem_file_sendfile,
+	.splice_read	= shmem_file_splice_read,
 #endif
 };
 
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 099a983..c094583 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -853,7 +853,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs
 	u32 priv_len, maj_stat;
 	int pad, saved_len, remaining_len, offset;
 
-	rqstp->rq_sendfile_ok = 0;
+	rqstp->rq_splice_ok = 0;
 
 	priv_len = svc_getnl(&buf->head[0]);
 	if (rqstp->rq_deferred) {
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e673ef9..55ea6df 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -814,7 +814,7 @@ svc_process(struct svc_rqst *rqstp)
 	rqstp->rq_res.tail[0].iov_base = NULL;
 	rqstp->rq_res.tail[0].iov_len = 0;
 	/* Will be turned off only in gss privacy case: */
-	rqstp->rq_sendfile_ok = 1;
+	rqstp->rq_splice_ok = 1;
 	/* tcp needs a space for the record length... */
 	if (rqstp->rq_prot == IPPROTO_TCP)
 		svc_putnl(resv, 0);

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 10:33 [PATCH] sendfile removal Jens Axboe
@ 2007-05-31 10:47 ` Jens Axboe
  2007-05-31 10:47 ` Eric Dumazet
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 37+ messages in thread
From: Jens Axboe @ 2007-05-31 10:47 UTC (permalink / raw)
  To: linux-kernel; +Cc: cotte, hugh, neilb, zanussi, Linus Torvalds, hch

On Thu, May 31 2007, Jens Axboe wrote:
> Hi,
> 
> This patch removes the ->sendfile() hook from the file_operations
> structure, and replaces the sys_sendfile() mechanism to be based on
> ->splice_read() instead. There should be no functional changes.

Forgot to mention, that the modified kernel of course passes ltp
sendfile tests. FWIW.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 10:33 [PATCH] sendfile removal Jens Axboe
  2007-05-31 10:47 ` Jens Axboe
@ 2007-05-31 10:47 ` Eric Dumazet
  2007-05-31 10:53   ` Jens Axboe
  2007-05-31 10:55 ` Christoph Hellwig
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 37+ messages in thread
From: Eric Dumazet @ 2007-05-31 10:47 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, cotte, hugh, neilb, zanussi, Linus Torvalds, hch

On Thu, 31 May 2007 12:33:16 +0200
Jens Axboe <jens.axboe@oracle.com> wrote:

> Hi,
> 
> This patch removes the ->sendfile() hook from the file_operations
> structure, and replaces the sys_sendfile() mechanism to be based on
> ->splice_read() instead. There should be no functional changes.
> 
  
> -	retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file);
> +	fl = 0;
> +	if (in_file->f_flags & O_NONBLOCK)
> +		fl = SPLICE_F_NONBLOCK;
> +
> +	retval = do_splice_direct(in_file, ppos, out_file, count, fl);

I like this, but are you sure it wont break user land ?

Some applications might react badly if sendfile() returns EAGAIN ?


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 10:47 ` Eric Dumazet
@ 2007-05-31 10:53   ` Jens Axboe
  2007-06-01  4:09     ` H. Peter Anvin
  0 siblings, 1 reply; 37+ messages in thread
From: Jens Axboe @ 2007-05-31 10:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: linux-kernel, cotte, hugh, neilb, zanussi, Linus Torvalds, hch

On Thu, May 31 2007, Eric Dumazet wrote:
> On Thu, 31 May 2007 12:33:16 +0200
> Jens Axboe <jens.axboe@oracle.com> wrote:
> 
> > Hi,
> > 
> > This patch removes the ->sendfile() hook from the file_operations
> > structure, and replaces the sys_sendfile() mechanism to be based on
> > ->splice_read() instead. There should be no functional changes.
> > 
>   
> > -	retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file);
> > +	fl = 0;
> > +	if (in_file->f_flags & O_NONBLOCK)
> > +		fl = SPLICE_F_NONBLOCK;
> > +
> > +	retval = do_splice_direct(in_file, ppos, out_file, count, fl);
> 
> I like this, but are you sure it wont break user land ?
> 
> Some applications might react badly if sendfile() returns EAGAIN ?

Yeah, I didn't actually intend for that to sneak in. I'd think that
userspace should handle it if they opened the file O_NONBLOCK (or used
fcntl()), but it's a change in behaviour none the less and probably not
a good idea.

Encourage those people to use splice() instead :-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 10:33 [PATCH] sendfile removal Jens Axboe
  2007-05-31 10:47 ` Jens Axboe
  2007-05-31 10:47 ` Eric Dumazet
@ 2007-05-31 10:55 ` Christoph Hellwig
  2007-05-31 11:05   ` Jens Axboe
  2007-05-31 11:04 ` Carsten Otte
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 37+ messages in thread
From: Christoph Hellwig @ 2007-05-31 10:55 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, cotte, hugh, neilb, zanussi, Linus Torvalds, hch

On Thu, May 31, 2007 at 12:33:16PM +0200, Jens Axboe wrote:
> - nfds: The ->rq_sendfile_ok optimization is gone for now. I can't
>   determine the value of it, but I'm assuming it's there for a reason.
>   Any chance this can be converted to splice, or use something else than
>   ->sendfile()? CC'ed Neil.

sendfile useage in nfsd avoids a data copy and allows to use checksum
offloading.  it's quite important for nfs server workloads.

> Apart from that, it was mostly straight forward. Almost everybody uses
> generic_file_sendfile(), which makes the conversion easy. I changed loop
> to use do_generic_file_read() instead of sendfile, it works for me...

> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 5526ead..92bac14 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -435,16 +435,24 @@ do_lo_receive(struct loop_device *lo,
>  {
>  	struct lo_read_data cookie;
>  	struct file *file;
> -	int retval;
> +	read_descriptor_t desc;
> +
> +	desc.written = 0;
> +	desc.count = bvec->bv_len;
> +	desc.arg.data = &cookie;
> +	desc.error = 0;
>  
>  	cookie.lo = lo;
>  	cookie.page = bvec->bv_page;
>  	cookie.offset = bvec->bv_offset;
>  	cookie.bsize = bsize;
>  	file = lo->lo_backing_file;
> -	retval = file->f_op->sendfile(file, &pos, bvec->bv_len,
> -			lo_read_actor, &cookie);
> -	return (retval < 0)? retval: 0;
> +
> +	do_generic_file_read(file, &pos, &desc, lo_read_actor);

This change is wrong.  loop or any existing user of ->sendfile absolutely
needs to go through a file operations vector so that file-system specific
actions such as locking are performed.  This is required at least for the
clustered filesystems and XFS.   The right way to implement this is
via do_splice_direct or something similar.

do_generic_file_read is only a library function for filesystem use
and should never be called directly.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 10:33 [PATCH] sendfile removal Jens Axboe
                   ` (2 preceding siblings ...)
  2007-05-31 10:55 ` Christoph Hellwig
@ 2007-05-31 11:04 ` Carsten Otte
  2007-05-31 11:06   ` Jens Axboe
  2007-05-31 15:33 ` Tom Zanussi
  2007-05-31 17:06 ` Hugh Dickins
  5 siblings, 1 reply; 37+ messages in thread
From: Carsten Otte @ 2007-05-31 11:04 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, hugh, neilb, zanussi, Linus Torvalds, hch

Jens Axboe wrote:
> Work to be done:
> 
> - The ext2 xip support needs a splice_read() implementation, currently I
>   just if 0'ed out the send xip_file_sendfile(). CC'ed Carsten, who
>   seems to be the author of this code.
Yup. Will do the splice_read implementation for xip. Do you want to 
see it in splice.c or should it go to filemap_xip?

Acked-by: Carsten Otte <cotte@de.ibm.com>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 10:55 ` Christoph Hellwig
@ 2007-05-31 11:05   ` Jens Axboe
  2007-05-31 12:26     ` Neil Brown
  2007-06-01  8:15     ` [PATCH] sendfile removal Jens Axboe
  0 siblings, 2 replies; 37+ messages in thread
From: Jens Axboe @ 2007-05-31 11:05 UTC (permalink / raw)
  To: Christoph Hellwig, linux-kernel, cotte, hugh, neilb, zanussi,
	Linus Torvalds

On Thu, May 31 2007, Christoph Hellwig wrote:
> On Thu, May 31, 2007 at 12:33:16PM +0200, Jens Axboe wrote:
> > - nfds: The ->rq_sendfile_ok optimization is gone for now. I can't
> >   determine the value of it, but I'm assuming it's there for a reason.
> >   Any chance this can be converted to splice, or use something else than
> >   ->sendfile()? CC'ed Neil.
> 
> sendfile useage in nfsd avoids a data copy and allows to use checksum
> offloading.  it's quite important for nfs server workloads.

OK, I hope Neil can provide some input on how to convert it. Of course
I'm just fishing for Neil to actually do that work :-)

> > Apart from that, it was mostly straight forward. Almost everybody uses
> > generic_file_sendfile(), which makes the conversion easy. I changed loop
> > to use do_generic_file_read() instead of sendfile, it works for me...
> 
> > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > index 5526ead..92bac14 100644
> > --- a/drivers/block/loop.c
> > +++ b/drivers/block/loop.c
> > @@ -435,16 +435,24 @@ do_lo_receive(struct loop_device *lo,
> >  {
> >  	struct lo_read_data cookie;
> >  	struct file *file;
> > -	int retval;
> > +	read_descriptor_t desc;
> > +
> > +	desc.written = 0;
> > +	desc.count = bvec->bv_len;
> > +	desc.arg.data = &cookie;
> > +	desc.error = 0;
> >  
> >  	cookie.lo = lo;
> >  	cookie.page = bvec->bv_page;
> >  	cookie.offset = bvec->bv_offset;
> >  	cookie.bsize = bsize;
> >  	file = lo->lo_backing_file;
> > -	retval = file->f_op->sendfile(file, &pos, bvec->bv_len,
> > -			lo_read_actor, &cookie);
> > -	return (retval < 0)? retval: 0;
> > +
> > +	do_generic_file_read(file, &pos, &desc, lo_read_actor);
> 
> This change is wrong.  loop or any existing user of ->sendfile absolutely
> needs to go through a file operations vector so that file-system specific
> actions such as locking are performed.  This is required at least for the
> clustered filesystems and XFS.   The right way to implement this is
> via do_splice_direct or something similar.
> 
> do_generic_file_read is only a library function for filesystem use
> and should never be called directly.

I'll convert it to do_splice_direct(), thanks.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 11:04 ` Carsten Otte
@ 2007-05-31 11:06   ` Jens Axboe
  0 siblings, 0 replies; 37+ messages in thread
From: Jens Axboe @ 2007-05-31 11:06 UTC (permalink / raw)
  To: carsteno; +Cc: linux-kernel, hugh, neilb, zanussi, Linus Torvalds, hch

On Thu, May 31 2007, Carsten Otte wrote:
> Jens Axboe wrote:
> >Work to be done:
> >
> >- The ext2 xip support needs a splice_read() implementation, currently I
> >  just if 0'ed out the send xip_file_sendfile(). CC'ed Carsten, who
> >  seems to be the author of this code.
> Yup. Will do the splice_read implementation for xip. Do you want to 
> see it in splice.c or should it go to filemap_xip?

Just put it in filemap_xip.c, I think that would be the best place.
Thanks!

> Acked-by: Carsten Otte <cotte@de.ibm.com>

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 11:05   ` Jens Axboe
@ 2007-05-31 12:26     ` Neil Brown
  2007-05-31 12:27       ` Jens Axboe
  2007-06-01  8:15     ` [PATCH] sendfile removal Jens Axboe
  1 sibling, 1 reply; 37+ messages in thread
From: Neil Brown @ 2007-05-31 12:26 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, linux-kernel, cotte, hugh, zanussi,
	Linus Torvalds

On Thursday May 31, jens.axboe@oracle.com wrote:
> On Thu, May 31 2007, Christoph Hellwig wrote:
> > On Thu, May 31, 2007 at 12:33:16PM +0200, Jens Axboe wrote:
> > > - nfds: The ->rq_sendfile_ok optimization is gone for now. I can't
> > >   determine the value of it, but I'm assuming it's there for a reason.
> > >   Any chance this can be converted to splice, or use something else than
> > >   ->sendfile()? CC'ed Neil.
> > 
> > sendfile useage in nfsd avoids a data copy and allows to use checksum
> > offloading.  it's quite important for nfs server workloads.
> 
> OK, I hope Neil can provide some input on how to convert it. Of course
> I'm just fishing for Neil to actually do that work :-)

Well, seeing you did that zero-length-barrier thing for me, how can I
complain :-)

I'll have a look in the morning.

NeilBrown

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 12:26     ` Neil Brown
@ 2007-05-31 12:27       ` Jens Axboe
  2007-06-01  2:44         ` [PATCH] sendfile removal (nfsd update) Neil Brown
  0 siblings, 1 reply; 37+ messages in thread
From: Jens Axboe @ 2007-05-31 12:27 UTC (permalink / raw)
  To: Neil Brown
  Cc: Christoph Hellwig, linux-kernel, cotte, hugh, zanussi,
	Linus Torvalds

On Thu, May 31 2007, Neil Brown wrote:
> On Thursday May 31, jens.axboe@oracle.com wrote:
> > On Thu, May 31 2007, Christoph Hellwig wrote:
> > > On Thu, May 31, 2007 at 12:33:16PM +0200, Jens Axboe wrote:
> > > > - nfds: The ->rq_sendfile_ok optimization is gone for now. I can't
> > > >   determine the value of it, but I'm assuming it's there for a reason.
> > > >   Any chance this can be converted to splice, or use something else than
> > > >   ->sendfile()? CC'ed Neil.
> > > 
> > > sendfile useage in nfsd avoids a data copy and allows to use checksum
> > > offloading.  it's quite important for nfs server workloads.
> > 
> > OK, I hope Neil can provide some input on how to convert it. Of course
> > I'm just fishing for Neil to actually do that work :-)
> 
> Well, seeing you did that zero-length-barrier thing for me, how can I
> complain :-)
> 
> I'll have a look in the morning.

Woo, thanks Neil :-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 10:33 [PATCH] sendfile removal Jens Axboe
                   ` (3 preceding siblings ...)
  2007-05-31 11:04 ` Carsten Otte
@ 2007-05-31 15:33 ` Tom Zanussi
  2007-05-31 19:01   ` Jens Axboe
  2007-05-31 17:06 ` Hugh Dickins
  5 siblings, 1 reply; 37+ messages in thread
From: Tom Zanussi @ 2007-05-31 15:33 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, cotte, hugh, neilb, Linus Torvalds, hch

On Thu, 2007-05-31 at 12:33 +0200, Jens Axboe wrote:
> Hi,
> 
> This patch removes the ->sendfile() hook from the file_operations
> structure, and replaces the sys_sendfile() mechanism to be based on
> ->splice_read() instead. There should be no functional changes.
> 
> Work to be done:
> 
> - The ext2 xip support needs a splice_read() implementation, currently I
>   just if 0'ed out the send xip_file_sendfile(). CC'ed Carsten, who
>   seems to be the author of this code.
> 
> - shmem needs a splice_read() implementation. Optimistically CC'ed Hugh.
> 
> - nfds: The ->rq_sendfile_ok optimization is gone for now. I can't
>   determine the value of it, but I'm assuming it's there for a reason.
>   Any chance this can be converted to splice, or use something else than
>   ->sendfile()? CC'ed Neil.
> 
> - relay: needs a splice_read() implementation. I think Tom already has
>   one, CC'ed him.
> 

Hi Jens,

Yeah, I have it lying around somewhere - the patch I sent you awhile
back of an initial version of splice_read() is the last work I did on
this.  Basically, it worked OK but IIRC there were a couple things in
the code that needed fixing.  I also modified blktrace to use splice so
I could use it for testing; I'll send a patch for that too once I get it
working again.  

Anyway, that was ages ago (sometime last year ;-) so I'm going to have
to dig up the code and probably rework it a bit.

Tom



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 10:33 [PATCH] sendfile removal Jens Axboe
                   ` (4 preceding siblings ...)
  2007-05-31 15:33 ` Tom Zanussi
@ 2007-05-31 17:06 ` Hugh Dickins
  2007-05-31 17:31   ` Christoph Hellwig
  2007-05-31 19:03   ` Jens Axboe
  5 siblings, 2 replies; 37+ messages in thread
From: Hugh Dickins @ 2007-05-31 17:06 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, cotte, neilb, zanussi, Linus Torvalds, hch

On Thu, 31 May 2007, Jens Axboe wrote:
> 
> This patch removes the ->sendfile() hook from the file_operations
> structure, and replaces the sys_sendfile() mechanism to be based on
> ->splice_read() instead. There should be no functional changes.
> 
> Work to be done:
> 
> - The ext2 xip support needs a splice_read() implementation, currently I
>   just if 0'ed out the send xip_file_sendfile(). CC'ed Carsten, who
>   seems to be the author of this code.
> 
> - shmem needs a splice_read() implementation. Optimistically CC'ed Hugh.

I'll take that "Optimistically" as a special compliment,
rather than as a particular insult ;)

Yes, thanks, please leave shmem_file_splice_read() to me, I'll give
it priority now.  Not deep enough in yet, but I'll probably aim for
something simple-minded (correct but slow once it hits swap).

> 
> - nfds: The ->rq_sendfile_ok optimization is gone for now. I can't
>   determine the value of it, but I'm assuming it's there for a reason.
>   Any chance this can be converted to splice, or use something else than
>   ->sendfile()? CC'ed Neil.
> 
> - relay: needs a splice_read() implementation. I think Tom already has
>   one, CC'ed him.
> 
> Apart from that, it was mostly straight forward. Almost everybody uses
> generic_file_sendfile(), which makes the conversion easy. I changed loop
> to use do_generic_file_read() instead of sendfile, it works for me...

Christoph already picked up on that, and it's of interest to shmem too:
loop over tmpfs in 2.6 was relying on shmem_file_sendfile, for which
the generic route is not good enough.

If we're giving a .splice_read to everything which used to have a
.sendfile, then I think you just need to make do_lo_read() use
->splice_read now?

Hugh

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 17:06 ` Hugh Dickins
@ 2007-05-31 17:31   ` Christoph Hellwig
  2007-05-31 19:03   ` Jens Axboe
  1 sibling, 0 replies; 37+ messages in thread
From: Christoph Hellwig @ 2007-05-31 17:31 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Jens Axboe, linux-kernel, cotte, neilb, zanussi, Linus Torvalds,
	hch

On Thu, May 31, 2007 at 06:06:19PM +0100, Hugh Dickins wrote:
> Christoph already picked up on that, and it's of interest to shmem too:
> loop over tmpfs in 2.6 was relying on shmem_file_sendfile, for which
> the generic route is not good enough.
> 
> If we're giving a .splice_read to everything which used to have a
> .sendfile, then I think you just need to make do_lo_read() use
> ->splice_read now?

I'd be perfectly happy with killing support for using ->read.  Any
simple filesystem can use the generic methods, and any more complex
filesystem should better have splice aswell.  Btw, someone should
look into using splice for the loop write path aswell, the current
useage of ->prepare_write and ->commit_write has similar locking
problems aswell.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 15:33 ` Tom Zanussi
@ 2007-05-31 19:01   ` Jens Axboe
  0 siblings, 0 replies; 37+ messages in thread
From: Jens Axboe @ 2007-05-31 19:01 UTC (permalink / raw)
  To: Tom Zanussi; +Cc: linux-kernel, cotte, hugh, neilb, Linus Torvalds, hch

On Thu, May 31 2007, Tom Zanussi wrote:
> On Thu, 2007-05-31 at 12:33 +0200, Jens Axboe wrote:
> > Hi,
> > 
> > This patch removes the ->sendfile() hook from the file_operations
> > structure, and replaces the sys_sendfile() mechanism to be based on
> > ->splice_read() instead. There should be no functional changes.
> > 
> > Work to be done:
> > 
> > - The ext2 xip support needs a splice_read() implementation, currently I
> >   just if 0'ed out the send xip_file_sendfile(). CC'ed Carsten, who
> >   seems to be the author of this code.
> > 
> > - shmem needs a splice_read() implementation. Optimistically CC'ed Hugh.
> > 
> > - nfds: The ->rq_sendfile_ok optimization is gone for now. I can't
> >   determine the value of it, but I'm assuming it's there for a reason.
> >   Any chance this can be converted to splice, or use something else than
> >   ->sendfile()? CC'ed Neil.
> > 
> > - relay: needs a splice_read() implementation. I think Tom already has
> >   one, CC'ed him.
> > 
> 
> Hi Jens,
> 
> Yeah, I have it lying around somewhere - the patch I sent you awhile
> back of an initial version of splice_read() is the last work I did on
> this.  Basically, it worked OK but IIRC there were a couple things in
> the code that needed fixing.  I also modified blktrace to use splice so
> I could use it for testing; I'll send a patch for that too once I get it
> working again.  
> 
> Anyway, that was ages ago (sometime last year ;-) so I'm going to have
> to dig up the code and probably rework it a bit.

It'd be great if you can polish it a bit (basically just make it apply
to the current kernel). I don't think it's hard, aside from moving the
code from fs/relayfs/ to kernel/relay.c, there hasn't been many changes.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 17:06 ` Hugh Dickins
  2007-05-31 17:31   ` Christoph Hellwig
@ 2007-05-31 19:03   ` Jens Axboe
  1 sibling, 0 replies; 37+ messages in thread
From: Jens Axboe @ 2007-05-31 19:03 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: linux-kernel, cotte, neilb, zanussi, Linus Torvalds, hch

On Thu, May 31 2007, Hugh Dickins wrote:
> On Thu, 31 May 2007, Jens Axboe wrote:
> > 
> > This patch removes the ->sendfile() hook from the file_operations
> > structure, and replaces the sys_sendfile() mechanism to be based on
> > ->splice_read() instead. There should be no functional changes.
> > 
> > Work to be done:
> > 
> > - The ext2 xip support needs a splice_read() implementation, currently I
> >   just if 0'ed out the send xip_file_sendfile(). CC'ed Carsten, who
> >   seems to be the author of this code.
> > 
> > - shmem needs a splice_read() implementation. Optimistically CC'ed Hugh.
> 
> I'll take that "Optimistically" as a special compliment,
> rather than as a particular insult ;)

It's a compliment for sure, the "optimistically" was just mentioned
because there are lots of people mentioned in that file, but I mostly
remember you doing some heavy lifting there :-)

> Yes, thanks, please leave shmem_file_splice_read() to me, I'll give
> it priority now.  Not deep enough in yet, but I'll probably aim for
> something simple-minded (correct but slow once it hits swap).

Super, thanks!!

> > - nfds: The ->rq_sendfile_ok optimization is gone for now. I can't
> >   determine the value of it, but I'm assuming it's there for a reason.
> >   Any chance this can be converted to splice, or use something else than
> >   ->sendfile()? CC'ed Neil.
> > 
> > - relay: needs a splice_read() implementation. I think Tom already has
> >   one, CC'ed him.
> > 
> > Apart from that, it was mostly straight forward. Almost everybody uses
> > generic_file_sendfile(), which makes the conversion easy. I changed loop
> > to use do_generic_file_read() instead of sendfile, it works for me...
> 
> Christoph already picked up on that, and it's of interest to shmem too:
> loop over tmpfs in 2.6 was relying on shmem_file_sendfile, for which
> the generic route is not good enough.
> 
> If we're giving a .splice_read to everything which used to have a
> .sendfile, then I think you just need to make do_lo_read() use
> ->splice_read now?

Yes, I will make that change myself. I can do that, as I seem to have
sucessfully pushed out the remaining work to others :-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal (nfsd update)
  2007-05-31 12:27       ` Jens Axboe
@ 2007-06-01  2:44         ` Neil Brown
  2007-06-01  5:44           ` Jens Axboe
  0 siblings, 1 reply; 37+ messages in thread
From: Neil Brown @ 2007-06-01  2:44 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, linux-kernel, cotte, hugh, zanussi,
	Linus Torvalds


Ok, here is a patch that makes nfsd use splice instead of sendfile.
It appears to both compile and work.

Some observations:
  - __splice_from_pipe wants a "struct file*" and I wanted to pass a
    "struct svcrqst *".  Maybe it should take a void * ?
  - It also wants a *ppos which I had no use for.. It that really
    need?  Cannot &file->f_pos be used?
  - I copied do_splice_to from splice.c as it wasn't exported, and
    then found I couldn't compile because rw_verify_area wasn't
    exported. As nfsd doesn't need that (we never export
    mandatory-locking files) I just remove it and some other cruft
    that I didn't need.... Not sure if that was the best approach.
  - I needed to export alloc_pipe_info.  Maybe there should be a 
    get_current_pipe instead which does the alloc if needed.
  - I would much rather have something like free_pipe_info exported
    than open code it in do_splice_read (which is based heavily on
    do_splice_direct).
  
NeilBrown

-------------------------------
Replace ->sendfile with ->splice_read

Apparently ->sendfile is going away, so change nfsd to use ->splice_read
to get pages for a file.

Signed-off-by: Neil Brown <neilb@suse.de>

### Diffstat output
 ./fs/nfsd/vfs.c |  125 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 ./fs/pipe.c     |    1 
 2 files changed, 109 insertions(+), 17 deletions(-)

diff .prev/fs/nfsd/vfs.c ./fs/nfsd/vfs.c
--- .prev/fs/nfsd/vfs.c	2007-06-01 10:41:27.000000000 +1000
+++ ./fs/nfsd/vfs.c	2007-06-01 12:32:51.000000000 +1000
@@ -23,7 +23,7 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/major.h>
-#include <linux/ext2_fs.h>
+#include <linux/pipe_fs_i.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
@@ -801,26 +801,32 @@ found:
 }
 
 /*
- * Grab and keep cached pages assosiated with a file in the svc_rqst
- * so that they can be passed to the netowork sendmsg/sendpage routines
- * directrly. They will be released after the sending has completed.
+ * Grab and keep cached pages associated with a file in the svc_rqst
+ * so that they can be passed to the network sendmsg/sendpage routines
+ * directly. They will be released after the sending has completed.
  */
 static int
-nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset , unsigned long size)
+nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+		  struct splice_desc *sd)
 {
-	unsigned long count = desc->count;
-	struct svc_rqst *rqstp = desc->arg.data;
+	struct svc_rqst *rqstp = (struct svc_rqst *)sd->file;
 	struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
+	struct page *page = buf->page;
+	size_t size;
+	int ret;
+
+	ret = buf->ops->pin(pipe, buf);
+	if (unlikely(ret))
+		return ret;
 
-	if (size > count)
-		size = count;
+	size = sd->len;
 
 	if (rqstp->rq_res.page_len == 0) {
 		get_page(page);
 		put_page(*pp);
 		*pp = page;
 		rqstp->rq_resused++;
-		rqstp->rq_res.page_base = offset;
+		rqstp->rq_res.page_base = buf->offset;
 		rqstp->rq_res.page_len = size;
 	} else if (page != pp[-1]) {
 		get_page(page);
@@ -832,11 +838,98 @@ nfsd_read_actor(read_descriptor_t *desc,
 	} else
 		rqstp->rq_res.page_len += size;
 
-	desc->count = count - size;
-	desc->written += size;
 	return size;
 }
 
+static long do_splice_to(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe, size_t len,
+			 unsigned int flags)
+{
+	loff_t isize, left;
+
+	isize = i_size_read(in->f_mapping->host);
+	if (unlikely(*ppos >= isize))
+		return 0;
+
+	left = isize - *ppos;
+	if (unlikely(left < len))
+		len = left;
+
+	return in->f_op->splice_read(in, ppos, pipe, len, flags);
+}
+
+static int do_splice_read(struct file *in, loff_t *ppos, size_t count,
+			  struct svc_rqst *rqstp)
+{
+	struct pipe_inode_info *pipe;
+	long ret, bytes;
+	int i;
+
+	rqstp->rq_resused = 1;
+
+	pipe = current->splice_pipe;
+	if (unlikely(!pipe)) {
+		pipe = alloc_pipe_info(NULL);
+		if (!pipe)
+			return -ENOMEM;
+
+		pipe->readers = 1;
+		current->splice_pipe = pipe;
+	}
+
+	ret = 0;
+	bytes = 0;
+
+	while (count) {
+		loff_t unused = 0;
+		size_t read_len, max_read_len;
+
+		max_read_len = min(count, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
+
+		ret = do_splice_to(in, ppos, pipe, max_read_len, 0);
+		if (unlikely(ret < 0))
+			goto out_release;
+
+		read_len = ret;
+
+		ret = __splice_from_pipe(pipe, (struct file *)rqstp,
+					 &unused, read_len, 0,
+					 nfsd_splice_actor);
+		if (unlikely(ret < 0))
+			goto out_release;
+
+		bytes += ret;
+		count -= ret;
+	}
+
+	pipe->nrbufs = pipe->curbuf = 0;
+
+	return bytes;
+
+ out_release:
+	/*
+	 * If we did an incomplete transfer we must release
+	 * the pipe buffers in question:
+	 */
+	for (i = 0; i < PIPE_BUFFERS; i++) {
+		struct pipe_buffer *buf = pipe->bufs + i;
+
+		if (buf->ops) {
+			buf->ops->release(pipe, buf);
+			buf->ops = NULL;
+		}
+	}
+	pipe->nrbufs = pipe->curbuf = 0;
+
+	/*
+	 * If we transferred some data, return the number of bytes:
+	 */
+	if (bytes > 0)
+		return bytes;
+
+	return ret;
+}
+
 static __be32
 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
               loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -861,11 +954,9 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st
 	if (ra && ra->p_set)
 		file->f_ra = ra->p_ra;
 
-	if (file->f_op->sendfile && rqstp->rq_sendfile_ok) {
-		rqstp->rq_resused = 1;
-		host_err = file->f_op->sendfile(file, &offset, *count,
-						 nfsd_read_actor, rqstp);
-	} else {
+	if (file->f_op->splice_read && rqstp->rq_sendfile_ok)
+		host_err = do_splice_read(file, &offset, *count, rqstp);
+	else {
 		oldfs = get_fs();
 		set_fs(KERNEL_DS);
 		host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);

diff .prev/fs/pipe.c ./fs/pipe.c
--- .prev/fs/pipe.c	2007-06-01 12:30:43.000000000 +1000
+++ ./fs/pipe.c	2007-06-01 12:31:58.000000000 +1000
@@ -865,6 +865,7 @@ struct pipe_inode_info * alloc_pipe_info
 
 	return pipe;
 }
+EXPORT_SYMBOL(alloc_pipe_info);
 
 void __free_pipe_info(struct pipe_inode_info *pipe)
 {

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 10:53   ` Jens Axboe
@ 2007-06-01  4:09     ` H. Peter Anvin
  2007-06-01  5:41       ` Jens Axboe
  0 siblings, 1 reply; 37+ messages in thread
From: H. Peter Anvin @ 2007-06-01  4:09 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Eric Dumazet, linux-kernel, cotte, hugh, neilb, zanussi,
	Linus Torvalds, hch

Jens Axboe wrote:
>>   
>>> -	retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file);
>>> +	fl = 0;
>>> +	if (in_file->f_flags & O_NONBLOCK)
>>> +		fl = SPLICE_F_NONBLOCK;
>>> +
>>> +	retval = do_splice_direct(in_file, ppos, out_file, count, fl);
>> I like this, but are you sure it wont break user land ?
>>
>> Some applications might react badly if sendfile() returns EAGAIN ?
> 
> Yeah, I didn't actually intend for that to sneak in. I'd think that
> userspace should handle it if they opened the file O_NONBLOCK (or used
> fcntl()), but it's a change in behaviour none the less and probably not
> a good idea.
> 

I would personally argue that sendfile() blocking on an O_NONBLOCK
desriptor, as opposed to returning EAGAIN, is a bug, and a fairly
serious such.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-01  4:09     ` H. Peter Anvin
@ 2007-06-01  5:41       ` Jens Axboe
  2007-06-01  5:50         ` H. Peter Anvin
  0 siblings, 1 reply; 37+ messages in thread
From: Jens Axboe @ 2007-06-01  5:41 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric Dumazet, linux-kernel, cotte, hugh, neilb, zanussi,
	Linus Torvalds, hch

On Thu, May 31 2007, H. Peter Anvin wrote:
> Jens Axboe wrote:
> >>   
> >>> -	retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file);
> >>> +	fl = 0;
> >>> +	if (in_file->f_flags & O_NONBLOCK)
> >>> +		fl = SPLICE_F_NONBLOCK;
> >>> +
> >>> +	retval = do_splice_direct(in_file, ppos, out_file, count, fl);
> >> I like this, but are you sure it wont break user land ?
> >>
> >> Some applications might react badly if sendfile() returns EAGAIN ?
> > 
> > Yeah, I didn't actually intend for that to sneak in. I'd think that
> > userspace should handle it if they opened the file O_NONBLOCK (or used
> > fcntl()), but it's a change in behaviour none the less and probably not
> > a good idea.
> > 
> 
> I would personally argue that sendfile() blocking on an O_NONBLOCK
> desriptor, as opposed to returning EAGAIN, is a bug, and a fairly
> serious such.

I agree, but it's still a change in behaviour. Even if we consider the
app buggy (it is), can we potentially break it?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal (nfsd update)
  2007-06-01  2:44         ` [PATCH] sendfile removal (nfsd update) Neil Brown
@ 2007-06-01  5:44           ` Jens Axboe
  2007-06-01  8:01             ` Jens Axboe
  0 siblings, 1 reply; 37+ messages in thread
From: Jens Axboe @ 2007-06-01  5:44 UTC (permalink / raw)
  To: Neil Brown
  Cc: Christoph Hellwig, linux-kernel, cotte, hugh, zanussi,
	Linus Torvalds

On Fri, Jun 01 2007, Neil Brown wrote:
> 
> Ok, here is a patch that makes nfsd use splice instead of sendfile.
> It appears to both compile and work.
> 
> Some observations:
>   - __splice_from_pipe wants a "struct file*" and I wanted to pass a
>     "struct svcrqst *".  Maybe it should take a void * ?
>   - It also wants a *ppos which I had no use for.. It that really
>     need?  Cannot &file->f_pos be used?

See:

http://git.kernel.dk/?p=linux-2.6-block.git;a=commitdiff;h=c73a9509ef7877d31e0c97c684ee8b7ed13ecbbe;hp=07f0e716250d4a3a550b2f39bd0a7e4e6566b3c2

I'll rebase the conversion on top of this one, loop will need the same
change.

>   - I copied do_splice_to from splice.c as it wasn't exported, and
>     then found I couldn't compile because rw_verify_area wasn't
>     exported. As nfsd doesn't need that (we never export
>     mandatory-locking files) I just remove it and some other cruft
>     that I didn't need.... Not sure if that was the best approach.
>   - I needed to export alloc_pipe_info.  Maybe there should be a 
>     get_current_pipe instead which does the alloc if needed.
>   - I would much rather have something like free_pipe_info exported
>     than open code it in do_splice_read (which is based heavily on
>     do_splice_direct).

I need the same thing for loop. I think I'll add do_splice_foo() to
fs/splice.c in a way that loop can use instead of do_splice_direct(),
then nfsd should be able to do the same.

Thanks for this Neil!

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-01  5:41       ` Jens Axboe
@ 2007-06-01  5:50         ` H. Peter Anvin
  2007-06-01  7:22           ` Eric Dumazet
  0 siblings, 1 reply; 37+ messages in thread
From: H. Peter Anvin @ 2007-06-01  5:50 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Eric Dumazet, linux-kernel, cotte, hugh, neilb, zanussi,
	Linus Torvalds, hch

Jens Axboe wrote:
>>>
>> I would personally argue that sendfile() blocking on an O_NONBLOCK
>> desriptor, as opposed to returning EAGAIN, is a bug, and a fairly
>> serious such.
> 
> I agree, but it's still a change in behaviour. Even if we consider the
> app buggy (it is), can we potentially break it?
> 

It depends on which app it is, of course.  However, I think we have to
smoke that out the hard way.  I don't think we should retain a bug in
the kernel just because some unknown app might depend on that bug --
taking that to the extreme  we could never fix bugs at all...

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-01  5:50         ` H. Peter Anvin
@ 2007-06-01  7:22           ` Eric Dumazet
  2007-06-01 15:52             ` H. Peter Anvin
  0 siblings, 1 reply; 37+ messages in thread
From: Eric Dumazet @ 2007-06-01  7:22 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Jens Axboe, linux-kernel, cotte, hugh, neilb, zanussi,
	Linus Torvalds, hch

H. Peter Anvin a écrit :
> Jens Axboe wrote:
>>> I would personally argue that sendfile() blocking on an O_NONBLOCK
>>> desriptor, as opposed to returning EAGAIN, is a bug, and a fairly
>>> serious such.
>> I agree, but it's still a change in behaviour. Even if we consider the
>> app buggy (it is), can we potentially break it?
>>
> 
> It depends on which app it is, of course.  However, I think we have to
> smoke that out the hard way.  I don't think we should retain a bug in
> the kernel just because some unknown app might depend on that bug --
> taking that to the extreme  we could never fix bugs at all...
> 

As I said, this new non blocking feature on the input side (disk), is nice and 
usefull. (For people scared by splice() syscall :) )

Just have to mention it is a change of behavior, and documentation probably 
needs to reflect this change. "Since linux 2.6.23, sendfile() repects 
O_NONBLOCK on in_fd as well"


My man page here says :

ERRORS
        EAGAIN Non-blocking I/O has been selected using O_NONBLOCK and the 
write would block.

        EBADF  The input file was not opened for reading or the output file 
was not opened for writing.

        EFAULT Bad address.

        EINVAL Descriptor is not valid or locked, or an mmap()-like operation 
is not available for in_fd.

        EIO    Unspecified error while reading from in_fd.

        ENOMEM Insufficient memory to read from in_fd.


This implies O_NONBLOCK on the out filedesc, not the input one :)


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal (nfsd update)
  2007-06-01  5:44           ` Jens Axboe
@ 2007-06-01  8:01             ` Jens Axboe
  0 siblings, 0 replies; 37+ messages in thread
From: Jens Axboe @ 2007-06-01  8:01 UTC (permalink / raw)
  To: Neil Brown
  Cc: Christoph Hellwig, linux-kernel, cotte, hugh, zanussi,
	Linus Torvalds

On Fri, Jun 01 2007, Jens Axboe wrote:
> On Fri, Jun 01 2007, Neil Brown wrote:
> > 
> > Ok, here is a patch that makes nfsd use splice instead of sendfile.
> > It appears to both compile and work.
> > 
> > Some observations:
> >   - __splice_from_pipe wants a "struct file*" and I wanted to pass a
> >     "struct svcrqst *".  Maybe it should take a void * ?
> >   - It also wants a *ppos which I had no use for.. It that really
> >     need?  Cannot &file->f_pos be used?
> 
> See:
> 
> http://git.kernel.dk/?p=linux-2.6-block.git;a=commitdiff;h=c73a9509ef7877d31e0c97c684ee8b7ed13ecbbe;hp=07f0e716250d4a3a550b2f39bd0a7e4e6566b3c2
> 
> I'll rebase the conversion on top of this one, loop will need the same
> change.
> 
> >   - I copied do_splice_to from splice.c as it wasn't exported, and
> >     then found I couldn't compile because rw_verify_area wasn't
> >     exported. As nfsd doesn't need that (we never export
> >     mandatory-locking files) I just remove it and some other cruft
> >     that I didn't need.... Not sure if that was the best approach.
> >   - I needed to export alloc_pipe_info.  Maybe there should be a 
> >     get_current_pipe instead which does the alloc if needed.
> >   - I would much rather have something like free_pipe_info exported
> >     than open code it in do_splice_read (which is based heavily on
> >     do_splice_direct).
> 
> I need the same thing for loop. I think I'll add do_splice_foo() to
> fs/splice.c in a way that loop can use instead of do_splice_direct(),
> then nfsd should be able to do the same.

I rebased and added a splice_direct_to_actor() helper that nfsd can then
use. This makes the nfsd diff look like the below, which I think you'll
agree is a lot more pleasant. You can view the tree here:

http://git.kernel.dk/?p=linux-2.6-block.git;a=shortlog;h=splice

The vmsplice patch probably needs to be split in two, for now it is the
one introducing the framework that nfsd (and next, loop) can use.

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7e6aa24..96f76e2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -23,7 +23,7 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/major.h>
-#include <linux/ext2_fs.h>
+#include <linux/pipe_fs_i.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
@@ -801,26 +801,32 @@ found:
 }
 
 /*
- * Grab and keep cached pages assosiated with a file in the svc_rqst
- * so that they can be passed to the netowork sendmsg/sendpage routines
- * directrly. They will be released after the sending has completed.
+ * Grab and keep cached pages associated with a file in the svc_rqst
+ * so that they can be passed to the network sendmsg/sendpage routines
+ * directly. They will be released after the sending has completed.
  */
 static int
-nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset , unsigned long size)
+nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+		  struct splice_desc *sd)
 {
-	unsigned long count = desc->count;
-	struct svc_rqst *rqstp = desc->arg.data;
+	struct svc_rqst *rqstp = sd->data;
 	struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
+	struct page *page = buf->page;
+	size_t size;
+	int ret;
+
+	ret = buf->ops->pin(pipe, buf);
+	if (unlikely(ret))
+		return ret;
 
-	if (size > count)
-		size = count;
+	size = sd->len;
 
 	if (rqstp->rq_res.page_len == 0) {
 		get_page(page);
 		put_page(*pp);
 		*pp = page;
 		rqstp->rq_resused++;
-		rqstp->rq_res.page_base = offset;
+		rqstp->rq_res.page_base = buf->offset;
 		rqstp->rq_res.page_len = size;
 	} else if (page != pp[-1]) {
 		get_page(page);
@@ -832,11 +838,15 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset
 	} else
 		rqstp->rq_res.page_len += size;
 
-	desc->count = count - size;
-	desc->written += size;
 	return size;
 }
 
+static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
+				    struct splice_desc *sd)
+{
+	return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
+}
+
 static __be32
 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
               loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -861,10 +871,15 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	if (ra && ra->p_set)
 		file->f_ra = ra->p_ra;
 
-	if (file->f_op->sendfile && rqstp->rq_sendfile_ok) {
-		rqstp->rq_resused = 1;
-		host_err = file->f_op->sendfile(file, &offset, *count,
-						 nfsd_read_actor, rqstp);
+	if (file->f_op->splice_read && rqstp->rq_splice_ok) {
+		struct splice_desc sd = {
+			.len		= 0,
+			.total_len	= *count,
+			.pos		= offset,
+		};
+
+		sd.data = rqstp;
+		host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
 	} else {
 		oldfs = get_fs();
 		set_fs(KERNEL_DS);


-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-05-31 11:05   ` Jens Axboe
  2007-05-31 12:26     ` Neil Brown
@ 2007-06-01  8:15     ` Jens Axboe
  1 sibling, 0 replies; 37+ messages in thread
From: Jens Axboe @ 2007-06-01  8:15 UTC (permalink / raw)
  To: Christoph Hellwig, linux-kernel, cotte, hugh, neilb, zanussi,
	Linus Torvalds

On Thu, May 31 2007, Jens Axboe wrote:
> > This change is wrong.  loop or any existing user of ->sendfile absolutely
> > needs to go through a file operations vector so that file-system specific
> > actions such as locking are performed.  This is required at least for the
> > clustered filesystems and XFS.   The right way to implement this is
> > via do_splice_direct or something similar.
> > 
> > do_generic_file_read is only a library function for filesystem use
> > and should never be called directly.
> 
> I'll convert it to do_splice_direct(), thanks.

So how does this look? Totally untested, will do that now. It only
covers the read side, the write conversion to splice will happen later.

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 92bac14..3714e60 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -74,6 +74,7 @@
 #include <linux/highmem.h>
 #include <linux/gfp.h>
 #include <linux/kthread.h>
+#include <linux/pipe_fs_i.h>
 
 #include <asm/uaccess.h>
 
@@ -401,58 +402,70 @@ struct lo_read_data {
 };
 
 static int
-lo_read_actor(read_descriptor_t *desc, struct page *page,
-	      unsigned long offset, unsigned long size)
+lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+		struct splice_desc *sd)
 {
-	unsigned long count = desc->count;
-	struct lo_read_data *p = desc->arg.data;
+	struct lo_read_data *p = sd->data;
 	struct loop_device *lo = p->lo;
+	struct page *page = buf->page;
 	sector_t IV;
+	size_t size;
+	int ret;
 
-	IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
+	ret = buf->ops->pin(pipe, buf);
+	if (unlikely(ret))
+		return ret;
 
-	if (size > count)
-		size = count;
+	IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9))+(buf->offset >> 9);
+	size = sd->len;
 
-	if (lo_do_transfer(lo, READ, page, offset, p->page, p->offset, size, IV)) {
+	if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) {
 		size = 0;
 		printk(KERN_ERR "loop: transfer error block %ld\n",
 		       page->index);
-		desc->error = -EINVAL;
+		size = -EINVAL;
 	}
 
 	flush_dcache_page(p->page);
 
-	desc->count = count - size;
-	desc->written += size;
-	p->offset += size;
+	if (size > 0)
+		p->offset += size;
 	return size;
 }
 
 static int
+lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
+{
+	return __splice_from_pipe(pipe, sd, lo_splice_actor);
+}
+
+static int
 do_lo_receive(struct loop_device *lo,
 	      struct bio_vec *bvec, int bsize, loff_t pos)
 {
 	struct lo_read_data cookie;
+	struct splice_desc sd;
 	struct file *file;
-	read_descriptor_t desc;
-
-	desc.written = 0;
-	desc.count = bvec->bv_len;
-	desc.arg.data = &cookie;
-	desc.error = 0;
+	int retval;
 
 	cookie.lo = lo;
 	cookie.page = bvec->bv_page;
 	cookie.offset = bvec->bv_offset;
 	cookie.bsize = bsize;
+
+	sd.len = 0;
+	sd.total_len = bsize;
+	sd.flags = 0;
+	sd.pos = pos;
+	sd.data = &cookie;
+
 	file = lo->lo_backing_file;
+	retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor);
 
-	do_generic_file_read(file, &pos, &desc, lo_read_actor);
-	if (desc.written)
-		return 0;
+	if (retval < 0)
+		return retval;
 
-	return desc.error;
+	return 0;
 }
 
 static int
@@ -687,8 +700,8 @@ static int loop_change_fd(struct loop_device *lo, struct file *lo_file,
 	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
 		goto out_putf;
 
-	/* new backing store needs to support loop (eg readpage) */
-	if (!file->f_mapping->a_ops->readpage)
+	/* new backing store needs to support loop (eg splice_read) */
+	if (!inode->i_fop->splice_read)
 		goto out_putf;
 
 	/* size of the new backing store needs to be the same */
@@ -768,7 +781,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 		 * If we can't read - sorry. If we only can't write - well,
 		 * it's going to be read-only.
 		 */
-		if (!aops->readpage)
+		if (!file->f_op->splice_read)
 			goto out_putf;
 		if (aops->prepare_write && aops->commit_write)
 			lo_flags |= LO_FLAGS_USE_AOPS;

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-01  7:22           ` Eric Dumazet
@ 2007-06-01 15:52             ` H. Peter Anvin
  2007-06-01 16:18               ` Linus Torvalds
  2007-06-01 16:22               ` Pádraig Brady
  0 siblings, 2 replies; 37+ messages in thread
From: H. Peter Anvin @ 2007-06-01 15:52 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jens Axboe, linux-kernel, cotte, hugh, neilb, zanussi,
	Linus Torvalds, hch

Eric Dumazet wrote:
> 
> As I said, this new non blocking feature on the input side (disk), is
> nice and usefull. (For people scared by splice() syscall :) )
> 
> Just have to mention it is a change of behavior, and documentation
> probably needs to reflect this change. "Since linux 2.6.23, sendfile()
> repects O_NONBLOCK on in_fd as well"
> 

Fair enough.  Unix has traditionally not acknowledged the possibility of
nonblocking I/O on conventional files, for some odd reason.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-01 15:52             ` H. Peter Anvin
@ 2007-06-01 16:18               ` Linus Torvalds
  2007-06-01 16:47                 ` Eric Dumazet
                                   ` (2 more replies)
  2007-06-01 16:22               ` Pádraig Brady
  1 sibling, 3 replies; 37+ messages in thread
From: Linus Torvalds @ 2007-06-01 16:18 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric Dumazet, Jens Axboe, linux-kernel, cotte, hugh, neilb,
	zanussi, hch



On Fri, 1 Jun 2007, H. Peter Anvin wrote:
> 
> Fair enough.  Unix has traditionally not acknowledged the possibility of
> nonblocking I/O on conventional files, for some odd reason.

It's not odd at all.

If you return EAGAIN, you had better have a way to _wait_ for that EAGAIN 
to go away, otherwise the EAGAIN is just a total waste of time.

So the rule about EAGAIN is very simple:
 (a) the file descriptor must be O_NONBLOCK
 (b) the access must otherwise block
AND
 (c) the condition must be something we can wait for with poll/select

I don't know why people continually ignore that (c) point, even though 
it's obvious and very very important!

If you cannot wait for it, tell me why the kernel should _ever_ return 
EAGAIN? The only option for the user is to just do the operation again 
immediately.

And the thing is, neither poll nor select work on regular files. And no, 
that is _not_ just an implementation issue. It's very fundamental: neither 
poll nor select get the file offset to wait for!

And that file offset is _critical_ for a regular file, in a way it 
obviously is _not_ for a socket, pipe, or other special file. Because 
without knowing the file offset, you cannot know which page you should be 
waiting for!

And no, the file offset is not "f_pos". sendfile(), along with 
pread/pwrite, uses a totally separate file offset, so if select/poll were 
to base their decision on f_pos, they'd be _wrong_.

This really is very fundamental. 

Now, you can argue that you can always just return -EAGAIN anyway, but 
then the calling process will basically be busy-looping, calling 
sendfile() (or splice()) over and over again. That's _horrible_. It's much 
better to just not return EAGAIN, and sleep like a good process should!

So there's a few things to take away from this:

 - regular file access MUST NOT return EAGAIN just because a page isn't 
   in the cache. Doing so is simply a bug. No ifs, buts or maybe's about 
   it!

   Busy-looping is NOT ACCEPTABLE!

 - you *could* make some alternative conventions:

	(a) you could make O_NONBLOCK mean that you'll at least 
	    guarantee that you *start* the IO, and while you never return 
	    EAGAIN, you migth validly return a _partial_ result!

	(b) variation on (a): it's ok to return EAGAIN if _you_ were the 
	    one who started the IO during this particular time aroudn the 
	    loop. But if you find a page that isn't up-to-date yet, and 
	    you didn't start the IO, you *must* wait for it, so that you 
	    end up returning EAGAIN atmost once! Exactly because 
	    busy-looping is simply not acceptable behaviour!

I have to admit that I didn't look at what raw splice() itself does these 
days. I would not be surprised if Jens also didn't realize this very 
fundamental issue. It seems too easy to miss, because people think 
that EAGAIN stands on its own, and don't realize that EAGAIN must be 
paired with select/poll to make sense.

Jens?

		Linus

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-01 15:52             ` H. Peter Anvin
  2007-06-01 16:18               ` Linus Torvalds
@ 2007-06-01 16:22               ` Pádraig Brady
  1 sibling, 0 replies; 37+ messages in thread
From: Pádraig Brady @ 2007-06-01 16:22 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric Dumazet, Jens Axboe, linux-kernel, cotte, hugh, neilb,
	zanussi, Linus Torvalds, hch

H. Peter Anvin wrote:
> Eric Dumazet wrote:
>> As I said, this new non blocking feature on the input side (disk), is
>> nice and usefull. (For people scared by splice() syscall :) )
>>
>> Just have to mention it is a change of behavior, and documentation
>> probably needs to reflect this change. "Since linux 2.6.23, sendfile()
>> repects O_NONBLOCK on in_fd as well"
>>
> 
> Fair enough.  Unix has traditionally not acknowledged the possibility of
> nonblocking I/O on conventional files, for some odd reason.

That reminds me of this patch:
http://lkml.org/lkml/2004/10/1/217
which went in for a while but was reverted:
http://lkml.org/lkml/2004/10/17/17

Pádraig

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-01 16:18               ` Linus Torvalds
@ 2007-06-01 16:47                 ` Eric Dumazet
  2007-06-01 16:53                 ` H. Peter Anvin
  2007-06-02 15:01                 ` Jens Axboe
  2 siblings, 0 replies; 37+ messages in thread
From: Eric Dumazet @ 2007-06-01 16:47 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: H. Peter Anvin, Jens Axboe, linux-kernel, cotte, hugh, neilb,
	zanussi, hch

Linus Torvalds a écrit :
> 
> On Fri, 1 Jun 2007, H. Peter Anvin wrote:
>> Fair enough.  Unix has traditionally not acknowledged the possibility of
>> nonblocking I/O on conventional files, for some odd reason.
> 
> It's not odd at all.
> 
> If you return EAGAIN, you had better have a way to _wait_ for that EAGAIN 
> to go away, otherwise the EAGAIN is just a total waste of time.
> 
> So the rule about EAGAIN is very simple:
>  (a) the file descriptor must be O_NONBLOCK
>  (b) the access must otherwise block
> AND
>  (c) the condition must be something we can wait for with poll/select
> 
> I don't know why people continually ignore that (c) point, even though 
> it's obvious and very very important!
> 
> If you cannot wait for it, tell me why the kernel should _ever_ return 
> EAGAIN? The only option for the user is to just do the operation again 
> immediately.
> 
> And the thing is, neither poll nor select work on regular files. And no, 
> that is _not_ just an implementation issue. It's very fundamental: neither 
> poll nor select get the file offset to wait for!
> 
> And that file offset is _critical_ for a regular file, in a way it 
> obviously is _not_ for a socket, pipe, or other special file. Because 
> without knowing the file offset, you cannot know which page you should be 
> waiting for!
> 
> And no, the file offset is not "f_pos". sendfile(), along with 
> pread/pwrite, uses a totally separate file offset, so if select/poll were 
> to base their decision on f_pos, they'd be _wrong_.
> 
> This really is very fundamental. 
> 
> Now, you can argue that you can always just return -EAGAIN anyway, but 
> then the calling process will basically be busy-looping, calling 
> sendfile() (or splice()) over and over again. That's _horrible_. It's much 
> better to just not return EAGAIN, and sleep like a good process should!
> 
> So there's a few things to take away from this:
> 
>  - regular file access MUST NOT return EAGAIN just because a page isn't 
>    in the cache. Doing so is simply a bug. No ifs, buts or maybe's about 
>    it!
> 
>    Busy-looping is NOT ACCEPTABLE!

yes, very true, but then some apps do this (and sometimes depends on yield())


> 
>  - you *could* make some alternative conventions:
> 
> 	(a) you could make O_NONBLOCK mean that you'll at least 
> 	    guarantee that you *start* the IO, and while you never return 
> 	    EAGAIN, you migth validly return a _partial_ result!
> 
> 	(b) variation on (a): it's ok to return EAGAIN if _you_ were the 
> 	    one who started the IO during this particular time aroudn the 
> 	    loop. But if you find a page that isn't up-to-date yet, and 
> 	    you didn't start the IO, you *must* wait for it, so that you 
> 	    end up returning EAGAIN atmost once! Exactly because 
> 	    busy-looping is simply not acceptable behaviour!
> 
> I have to admit that I didn't look at what raw splice() itself does these 
> days. I would not be surprised if Jens also didn't realize this very 
> fundamental issue. It seems too easy to miss, because people think 
> that EAGAIN stands on its own, and don't realize that EAGAIN must be 
> paired with select/poll to make sense.
> 

Right now, splice() has one SPLICE_F_NONBLOCK flag, and this flag is applied 
on both sides (in & out)

So either :

1) We separate the flag into two flags NONBLOCK_IN & NONBLOCK_OUT, so that the 
application is free to chose to busy-loop/yield if it wants.

2) We ignore NONBLOCK flag for regular files in splice() (and sendfile()), 
just following current facto

3) We consider select()/poll()/splice() can be extended to regular files on 
[f_pos] (select() and related functions have a meaning on non-seekable files, 
so consider it can be extended on files only on current file pos)



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-01 16:18               ` Linus Torvalds
  2007-06-01 16:47                 ` Eric Dumazet
@ 2007-06-01 16:53                 ` H. Peter Anvin
  2007-06-02 15:02                   ` Jens Axboe
  2007-06-02 15:01                 ` Jens Axboe
  2 siblings, 1 reply; 37+ messages in thread
From: H. Peter Anvin @ 2007-06-01 16:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Jens Axboe, linux-kernel, cotte, hugh, neilb,
	zanussi, hch

Linus Torvalds wrote:
> And the thing is, neither poll nor select work on regular files. And no, 
> that is _not_ just an implementation issue. It's very fundamental: neither 
> poll nor select get the file offset to wait for!
> 
> And that file offset is _critical_ for a regular file, in a way it 
> obviously is _not_ for a socket, pipe, or other special file. Because 
> without knowing the file offset, you cannot know which page you should be 
> waiting for!
> 
> And no, the file offset is not "f_pos". sendfile(), along with 
> pread/pwrite, uses a totally separate file offset, so if select/poll were 
> to base their decision on f_pos, they'd be _wrong_.

This is obviously correct, although at the time those interfaces were
designed, I don't believe either pread/pwrite nor sendfile() existed,
and they still couldn't wait on real files.  That there isn't a suitable
way to wait for a file at an offset is probably a result of that past
history.

Waiting at f_pos is still a possible interface, of course; it would mean
that pread/pwrite/sendfile users would have to seek before waiting.
However, implementing waiting on files in select/poll is prohibited by
POSIX, so it would at least need some sort of Linux-specific flag anyway.

It seems that being able to do nonblocking I/O on files would be a
useful thing.  This really *does* require proper nonblocking I/O and not
just the ability to wait, since you can never know when the kernel
decides to recycle the page you are just about to want from the cache.

> So there's a few things to take away from this:
> 
>  - regular file access MUST NOT return EAGAIN just because a page isn't 
>    in the cache. Doing so is simply a bug. No ifs, buts or maybe's about 
>    it!
> 
>    Busy-looping is NOT ACCEPTABLE!
> 
>  - you *could* make some alternative conventions:
> 
> 	(a) you could make O_NONBLOCK mean that you'll at least 
> 	    guarantee that you *start* the IO, and while you never return 
> 	    EAGAIN, you migth validly return a _partial_ result!
> 
> 	(b) variation on (a): it's ok to return EAGAIN if _you_ were the 
> 	    one who started the IO during this particular time aroudn the 
> 	    loop. But if you find a page that isn't up-to-date yet, and 
> 	    you didn't start the IO, you *must* wait for it, so that you 
> 	    end up returning EAGAIN atmost once! Exactly because 
> 	    busy-looping is simply not acceptable behaviour!

(b) seems really ugly.  (a) is at least well-defined.  Either seems
wrong, though.

	-hpa

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-01 16:18               ` Linus Torvalds
  2007-06-01 16:47                 ` Eric Dumazet
  2007-06-01 16:53                 ` H. Peter Anvin
@ 2007-06-02 15:01                 ` Jens Axboe
  2007-06-02 15:40                   ` Linus Torvalds
  2 siblings, 1 reply; 37+ messages in thread
From: Jens Axboe @ 2007-06-02 15:01 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: H. Peter Anvin, Eric Dumazet, linux-kernel, cotte, hugh, neilb,
	zanussi, hch

On Fri, Jun 01 2007, Linus Torvalds wrote:
> 
> 
> On Fri, 1 Jun 2007, H. Peter Anvin wrote:
> > 
> > Fair enough.  Unix has traditionally not acknowledged the possibility of
> > nonblocking I/O on conventional files, for some odd reason.
> 
> It's not odd at all.
> 
> If you return EAGAIN, you had better have a way to _wait_ for that EAGAIN 
> to go away, otherwise the EAGAIN is just a total waste of time.
> 
> So the rule about EAGAIN is very simple:
>  (a) the file descriptor must be O_NONBLOCK
>  (b) the access must otherwise block
> AND
>  (c) the condition must be something we can wait for with poll/select
> 
> I don't know why people continually ignore that (c) point, even though 
> it's obvious and very very important!
> 
> If you cannot wait for it, tell me why the kernel should _ever_ return 
> EAGAIN? The only option for the user is to just do the operation again 
> immediately.
> 
> And the thing is, neither poll nor select work on regular files. And no, 
> that is _not_ just an implementation issue. It's very fundamental: neither 
> poll nor select get the file offset to wait for!
> 
> And that file offset is _critical_ for a regular file, in a way it 
> obviously is _not_ for a socket, pipe, or other special file. Because 
> without knowing the file offset, you cannot know which page you should be 
> waiting for!
> 
> And no, the file offset is not "f_pos". sendfile(), along with 
> pread/pwrite, uses a totally separate file offset, so if select/poll were 
> to base their decision on f_pos, they'd be _wrong_.
> 
> This really is very fundamental. 
> 
> Now, you can argue that you can always just return -EAGAIN anyway, but 
> then the calling process will basically be busy-looping, calling 
> sendfile() (or splice()) over and over again. That's _horrible_. It's much 
> better to just not return EAGAIN, and sleep like a good process should!
> 
> So there's a few things to take away from this:
> 
>  - regular file access MUST NOT return EAGAIN just because a page isn't 
>    in the cache. Doing so is simply a bug. No ifs, buts or maybe's about 
>    it!
> 
>    Busy-looping is NOT ACCEPTABLE!
> 
>  - you *could* make some alternative conventions:
> 
> 	(a) you could make O_NONBLOCK mean that you'll at least 
> 	    guarantee that you *start* the IO, and while you never return 
> 	    EAGAIN, you migth validly return a _partial_ result!
> 
> 	(b) variation on (a): it's ok to return EAGAIN if _you_ were the 
> 	    one who started the IO during this particular time aroudn the 
> 	    loop. But if you find a page that isn't up-to-date yet, and 
> 	    you didn't start the IO, you *must* wait for it, so that you 
> 	    end up returning EAGAIN atmost once! Exactly because 
> 	    busy-looping is simply not acceptable behaviour!
> 
> I have to admit that I didn't look at what raw splice() itself does these 
> days. I would not be surprised if Jens also didn't realize this very 
> fundamental issue. It seems too easy to miss, because people think 
> that EAGAIN stands on its own, and don't realize that EAGAIN must be 
> paired with select/poll to make sense.

splice() WILL return EAGAIN, but in that case it should have triggered
the read-ahead and thus started some IO. At least that was/is the
intention, and it worked like that originally. Since I'm deep on the
splice updating for sendfile etc right now anyway, I'll doubly verify
that it still works that way as expected!

For the from-file case, see __generic_file_splice_read(). splice does:

       if (!PageUptodate(page)) {
               /*
                * If in nonblock mode then dont block on
                * waiting
                * for an in-flight io page
                */
               if (flags & SPLICE_F_NONBLOCK) {
                       if (TestSetPageLocked(page))
                               break;
               } else
                       lock_page(page);

but it doesn't know whether this call into __generic_file_splice_read()
initiated IO to that page or if someone else did. Naturally this will
most often be the case that __generic_file_splice_read() was the one
that initiated IO to that page, but it could be someone else of course.

The code above doesn't actually return EAGAIN, but
generic_file_splice_read() turns a 0 return for SPLICE_F_NONBLOCK into
EAGAIN.

So we need a bit of tweaking. I don't see how we can make b) work well
enough without getting hackish and breaking into the page cache and
read-ahead code, so I'd be inclined to suggest we make it conform to
your a) case. Now I didn't test yet, but it seems that just doing:

      if ((flags & SPLICE_F_NONBLOCK) && spd.nr_pages) {
              if (TestSetPageLocked(page))
                      break;
      } else
              lock_page(page);

should do that - always block for the first page and potentially return
a partial results for the remaining pages that read-ahead kicked into
gear.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-01 16:53                 ` H. Peter Anvin
@ 2007-06-02 15:02                   ` Jens Axboe
  0 siblings, 0 replies; 37+ messages in thread
From: Jens Axboe @ 2007-06-02 15:02 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Linus Torvalds, Eric Dumazet, linux-kernel, cotte, hugh, neilb,
	zanussi, hch

On Fri, Jun 01 2007, H. Peter Anvin wrote:
> > So there's a few things to take away from this:
> > 
> >  - regular file access MUST NOT return EAGAIN just because a page isn't 
> >    in the cache. Doing so is simply a bug. No ifs, buts or maybe's about 
> >    it!
> > 
> >    Busy-looping is NOT ACCEPTABLE!
> > 
> >  - you *could* make some alternative conventions:
> > 
> > 	(a) you could make O_NONBLOCK mean that you'll at least 
> > 	    guarantee that you *start* the IO, and while you never return 
> > 	    EAGAIN, you migth validly return a _partial_ result!
> > 
> > 	(b) variation on (a): it's ok to return EAGAIN if _you_ were the 
> > 	    one who started the IO during this particular time aroudn the 
> > 	    loop. But if you find a page that isn't up-to-date yet, and 
> > 	    you didn't start the IO, you *must* wait for it, so that you 
> > 	    end up returning EAGAIN atmost once! Exactly because 
> > 	    busy-looping is simply not acceptable behaviour!
> 
> (b) seems really ugly.  (a) is at least well-defined.  Either seems
> wrong, though.

I totally agree, b) would get nasty. And while a) isn't perfect by any
means, I do follow Linus' logic and agree it's probably the best (only?)
way to handle it.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-02 15:01                 ` Jens Axboe
@ 2007-06-02 15:40                   ` Linus Torvalds
  2007-06-02 16:35                     ` Jens Axboe
       [not found]                     ` <20070603130507.GA11170@mail.ustc.edu.cn>
  0 siblings, 2 replies; 37+ messages in thread
From: Linus Torvalds @ 2007-06-02 15:40 UTC (permalink / raw)
  To: Jens Axboe
  Cc: H. Peter Anvin, Eric Dumazet, linux-kernel, cotte, hugh, neilb,
	zanussi, hch



On Sat, 2 Jun 2007, Jens Axboe wrote:
>
> splice() WILL return EAGAIN, but in that case it should have triggered
> the read-ahead and thus started some IO.

That's not enough.

If the IO has already been started, splice needs to wait.

> For the from-file case, see __generic_file_splice_read(). splice does:
> 
>        if (!PageUptodate(page)) {
>                /*
>                 * If in nonblock mode then dont block on
>                 * waiting
>                 * for an in-flight io page
>                 */
>                if (flags & SPLICE_F_NONBLOCK) {
>                        if (TestSetPageLocked(page))
>                                break;
>                } else
>                        lock_page(page);

Yeah, that's just wrong.

Your suggested:

>       if ((flags & SPLICE_F_NONBLOCK) && spd.nr_pages) {
>               if (TestSetPageLocked(page))
>                       break;
>       } else
>               lock_page(page);
> 
> should do that - always block for the first page and potentially return
> a partial results for the remaining pages that read-ahead kicked into
> gear.

would work, but I suspect that for a server, returning EAGAIN once is 
actually the best option - if it has a select() loop, and something else 
is running, the "return EAGAIN once" actually makes tons of sense (it 
would basically boil down to the kernel effectively saying "ok, try 
anything else you might have pending in your queues first, if you get back 
to me, I'll block then").

		Linus

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
  2007-06-02 15:40                   ` Linus Torvalds
@ 2007-06-02 16:35                     ` Jens Axboe
       [not found]                     ` <20070603130507.GA11170@mail.ustc.edu.cn>
  1 sibling, 0 replies; 37+ messages in thread
From: Jens Axboe @ 2007-06-02 16:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: H. Peter Anvin, Eric Dumazet, linux-kernel, cotte, hugh, neilb,
	zanussi, hch

On Sat, Jun 02 2007, Linus Torvalds wrote:
> 
> 
> On Sat, 2 Jun 2007, Jens Axboe wrote:
> >
> > splice() WILL return EAGAIN, but in that case it should have triggered
> > the read-ahead and thus started some IO.
> 
> That's not enough.
> 
> If the IO has already been started, splice needs to wait.

But splice doesn't know, page_cache_readahead() may not have started
anything.

> > For the from-file case, see __generic_file_splice_read(). splice does:
> > 
> >        if (!PageUptodate(page)) {
> >                /*
> >                 * If in nonblock mode then dont block on
> >                 * waiting
> >                 * for an in-flight io page
> >                 */
> >                if (flags & SPLICE_F_NONBLOCK) {
> >                        if (TestSetPageLocked(page))
> >                                break;
> >                } else
> >                        lock_page(page);
> 
> Yeah, that's just wrong.
> 
> Your suggested:
> 
> >       if ((flags & SPLICE_F_NONBLOCK) && spd.nr_pages) {
> >               if (TestSetPageLocked(page))
> >                       break;
> >       } else
> >               lock_page(page);
> > 
> > should do that - always block for the first page and potentially return
> > a partial results for the remaining pages that read-ahead kicked into
> > gear.
> 
> would work, but I suspect that for a server, returning EAGAIN once is 
> actually the best option - if it has a select() loop, and something else 
> is running, the "return EAGAIN once" actually makes tons of sense (it 
> would basically boil down to the kernel effectively saying "ok, try 
> anything else you might have pending in your queues first, if you get back 
> to me, I'll block then").

Well then the current code should work, _provided_ that we know
read-ahead kicked off the IO. Well almost, still needs a bit of
tweaking, with some knowledge of whether page_cache_readahead() actually
called into read_pages() or not.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
       [not found]                     ` <20070603130507.GA11170@mail.ustc.edu.cn>
@ 2007-06-03 13:05                       ` Fengguang Wu
       [not found]                       ` <20070603142931.GA5916@mail.ustc.edu.cn>
  1 sibling, 0 replies; 37+ messages in thread
From: Fengguang Wu @ 2007-06-03 13:05 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jens Axboe, H. Peter Anvin, Eric Dumazet, linux-kernel, cotte,
	hugh, neilb, zanussi, hch

On Sat, Jun 02, 2007 at 08:40:04AM -0700, Linus Torvalds wrote:
> On Sat, 2 Jun 2007, Jens Axboe wrote:
> Your suggested:
> 
> >       if ((flags & SPLICE_F_NONBLOCK) && spd.nr_pages) {
> >               if (TestSetPageLocked(page))
> >                       break;
> >       } else
> >               lock_page(page);
> > 
> > should do that - always block for the first page and potentially return
> > a partial results for the remaining pages that read-ahead kicked into
> > gear.
> 
> would work, but I suspect that for a server, returning EAGAIN once is 
> actually the best option - if it has a select() loop, and something else 
> is running, the "return EAGAIN once" actually makes tons of sense (it 
> would basically boil down to the kernel effectively saying "ok, try 
> anything else you might have pending in your queues first, if you get back 
> to me, I'll block then").

May be we can settle with "return EAGAIN once" for *all* possible
waits, including I/O submitted by others:

          if ((flags & SPLICE_F_NONBLOCK) &&
                  TestSetPageLocked(page) &&
                  in->f_ra.prev_index != index)
                  break;
          else
                  lock_page(page);

However, the upper layer generic_file_splice_read() may keep
on calling __generic_file_splice_read(), so we need more code to
stop it with flag SPLICE_INTERNAL_WILLBLOCK:

---
(not tested yet; still not sure if it's the best solution.)

In non-block splicing, return EAGAIN once for all possible I/O waits.

It works by checking (ra.prev_index != index) in
__generic_file_splice_read().  Another reader on the same fd will at
worst cause a few more splice syscalls.

If keep calling generic_file_splice_read() in non-block mode, it will
return partial data before the first hole; then return EAGAIN at
second call; at last it will block on the I/O page.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 fs/splice.c               |   38 ++++++++++++++++++++++--------------
 include/linux/pipe_fs_i.h |    2 +
 2 files changed, 26 insertions(+), 14 deletions(-)

--- linux-2.6.22-rc3-mm1.orig/fs/splice.c
+++ linux-2.6.22-rc3-mm1/fs/splice.c
@@ -264,10 +264,11 @@ static ssize_t splice_to_pipe(struct pip
 static int
 __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   struct pipe_inode_info *pipe, size_t len,
-			   unsigned int flags)
+			   unsigned int *flags)
 {
 	struct address_space *mapping = in->f_mapping;
 	unsigned int loff, nr_pages;
+	unsigned int first_hole;
 	struct page *pages[PIPE_BUFFERS];
 	struct partial_page partial[PIPE_BUFFERS];
 	struct page *page;
@@ -278,7 +279,7 @@ __generic_file_splice_read(struct file *
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
-		.flags = flags,
+		.flags = *flags,
 		.ops = &page_cache_pipe_buf_ops,
 	};
 
@@ -299,12 +300,17 @@ __generic_file_splice_read(struct file *
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
 	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+	first_hole = spd.nr_pages;
+	index += spd.nr_pages;
 
 	/*
 	 * If find_get_pages_contig() returned fewer pages than we needed,
-	 * allocate the rest.
+	 * readahead/allocate the rest.
 	 */
-	index += spd.nr_pages;
+	if (spd.nr_pages < nr_pages)
+		page_cache_readahead_ondemand(mapping, &in->f_ra, in,
+				NULL, index, nr_pages - spd.nr_pages);
+
 	while (spd.nr_pages < nr_pages) {
 		/*
 		 * Page could be there, find_get_pages_contig() breaks on
@@ -312,9 +318,6 @@ __generic_file_splice_read(struct file *
 		 */
 		page = find_get_page(mapping, index);
 		if (!page) {
-			page_cache_readahead_ondemand(mapping, &in->f_ra, in,
-					NULL, index, nr_pages - spd.nr_pages);
-
 			/*
 			 * page didn't exist, allocate one.
 			 */
@@ -369,13 +372,13 @@ __generic_file_splice_read(struct file *
 		 */
 		if (!PageUptodate(page)) {
 			/*
-			 * If in nonblock mode then dont block on waiting
-			 * for an in-flight io page
+			 * In non-block mode, only block at the second try.
 			 */
-			if (flags & SPLICE_F_NONBLOCK) {
-				if (TestSetPageLocked(page))
-					break;
-			} else
+			if ((*flags & SPLICE_F_NONBLOCK) &&
+				TestSetPageLocked(page) &&
+				in->f_ra.prev_index != index)
+				break;
+			else
 				lock_page(page);
 
 			/*
@@ -454,6 +457,10 @@ fill_it:
 		page_cache_release(pages[page_nr++]);
 	in->f_ra.prev_index = index;
 
+	if ((*flags & SPLICE_F_NONBLOCK) && (spd.nr_pages > first_hole)) {
+		*flags |= SPLICE_INTERNAL_WILLBLOCK;
+		spd.nr_pages = first_hole;
+	}
 	if (spd.nr_pages)
 		return splice_to_pipe(pipe, &spd);
 
@@ -480,7 +487,7 @@ ssize_t generic_file_splice_read(struct 
 	spliced = 0;
 
 	while (len) {
-		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
+		ret = __generic_file_splice_read(in, ppos, pipe, len, &flags);
 
 		if (ret < 0)
 			break;
@@ -496,6 +503,9 @@ ssize_t generic_file_splice_read(struct 
 		*ppos += ret;
 		len -= ret;
 		spliced += ret;
+
+		if (flags & SPLICE_INTERNAL_WILLBLOCK)
+			break;
 	}
 
 	if (spliced)
--- linux-2.6.22-rc3-mm1.orig/include/linux/pipe_fs_i.h
+++ linux-2.6.22-rc3-mm1/include/linux/pipe_fs_i.h
@@ -82,6 +82,8 @@ int generic_pipe_buf_steal(struct pipe_i
 #define SPLICE_F_MORE	(0x04)	/* expect more data */
 #define SPLICE_F_GIFT	(0x08)	/* pages passed in are a gift */
 
+#define SPLICE_INTERNAL_WILLBLOCK  (0x100) /* read on next page will block */
+
 /*
  * Passed to the actors
  */


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
       [not found]                       ` <20070603142931.GA5916@mail.ustc.edu.cn>
@ 2007-06-03 14:29                         ` Fengguang Wu
       [not found]                         ` <20070604004647.GA8076@mail.ustc.edu.cn>
  1 sibling, 0 replies; 37+ messages in thread
From: Fengguang Wu @ 2007-06-03 14:29 UTC (permalink / raw)
  To: Linus Torvalds, Jens Axboe, H. Peter Anvin, Eric Dumazet,
	linux-kernel, cotte, hugh, neilb, zanussi, hch

On Sun, Jun 03, 2007 at 09:05:08PM +0800, Fengguang Wu wrote:
> In non-block splicing, return EAGAIN once for all possible I/O waits.
> 
> It works by checking (ra.prev_index != index) in
> __generic_file_splice_read().  Another reader on the same fd will at
> worst cause a few more splice syscalls.
> 
> If keep calling generic_file_splice_read() in non-block mode, it will
> return partial data before the first hole; then return EAGAIN at
> second call; at last it will block on the I/O page.

Here is a better one, comments are welcome.
---

In non-block splicing read, return EAGAIN once for all possible I/O waits.

It avoids busy waiting by checking (ra.prev_index != index) in
__generic_file_splice_read().  Possible other readers on the same fd
may temporarily cheat the logic, but will at worst cause a few more
splice syscalls.

If keep calling generic_file_splice_read() in non-blocking mode, it
will return partial data before the first wait point; then return
EAGAIN at second call; at last it will block on the I/O page. This
behavior should be compatible with old apps that are unaware of
EAGAIN.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 fs/splice.c               |   33 ++++++++++++++++++---------------
 include/linux/pipe_fs_i.h |    2 ++
 2 files changed, 20 insertions(+), 15 deletions(-)

--- linux-2.6.22-rc3-mm1.orig/fs/splice.c
+++ linux-2.6.22-rc3-mm1/fs/splice.c
@@ -264,7 +264,7 @@ static ssize_t splice_to_pipe(struct pip
 static int
 __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   struct pipe_inode_info *pipe, size_t len,
-			   unsigned int flags)
+			   unsigned int *flags)
 {
 	struct address_space *mapping = in->f_mapping;
 	unsigned int loff, nr_pages;
@@ -278,7 +278,7 @@ __generic_file_splice_read(struct file *
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
-		.flags = flags,
+		.flags = *flags,
 		.ops = &page_cache_pipe_buf_ops,
 	};
 
@@ -299,12 +299,16 @@ __generic_file_splice_read(struct file *
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
 	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+	index += spd.nr_pages;
 
 	/*
 	 * If find_get_pages_contig() returned fewer pages than we needed,
-	 * allocate the rest.
+	 * readahead/allocate the rest.
 	 */
-	index += spd.nr_pages;
+	if (spd.nr_pages < nr_pages)
+		page_cache_readahead_ondemand(mapping, &in->f_ra, in,
+				NULL, index, nr_pages - spd.nr_pages);
+
 	while (spd.nr_pages < nr_pages) {
 		/*
 		 * Page could be there, find_get_pages_contig() breaks on
@@ -312,9 +316,6 @@ __generic_file_splice_read(struct file *
 		 */
 		page = find_get_page(mapping, index);
 		if (!page) {
-			page_cache_readahead_ondemand(mapping, &in->f_ra, in,
-					NULL, index, nr_pages - spd.nr_pages);
-
 			/*
 			 * page didn't exist, allocate one.
 			 */
@@ -326,8 +327,6 @@ __generic_file_splice_read(struct file *
 					      GFP_KERNEL);
 			if (unlikely(error)) {
 				page_cache_release(page);
-				if (error == -EEXIST)
-					continue;
 				break;
 			}
 			/*
@@ -369,12 +368,13 @@ __generic_file_splice_read(struct file *
 		 */
 		if (!PageUptodate(page)) {
 			/*
-			 * If in nonblock mode then dont block on waiting
-			 * for an in-flight io page
+			 * In non-block mode, only block at the second try.
 			 */
-			if (flags & SPLICE_F_NONBLOCK) {
-				if (TestSetPageLocked(page))
-					break;
+			if ((*flags & SPLICE_F_NONBLOCK) &&
+				TestSetPageLocked(page) &&
+				in->f_ra.prev_index != index) {
+				*flags |= SPLICE_INTERNAL_WILLBLOCK;
+				break;
 			} else
 				lock_page(page);
 
@@ -480,7 +480,7 @@ ssize_t generic_file_splice_read(struct 
 	spliced = 0;
 
 	while (len) {
-		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
+		ret = __generic_file_splice_read(in, ppos, pipe, len, &flags);
 
 		if (ret < 0)
 			break;
@@ -496,6 +496,9 @@ ssize_t generic_file_splice_read(struct 
 		*ppos += ret;
 		len -= ret;
 		spliced += ret;
+
+		if (flags & SPLICE_INTERNAL_WILLBLOCK)
+			break;
 	}
 
 	if (spliced)
--- linux-2.6.22-rc3-mm1.orig/include/linux/pipe_fs_i.h
+++ linux-2.6.22-rc3-mm1/include/linux/pipe_fs_i.h
@@ -82,6 +82,8 @@ int generic_pipe_buf_steal(struct pipe_i
 #define SPLICE_F_MORE	(0x04)	/* expect more data */
 #define SPLICE_F_GIFT	(0x08)	/* pages passed in are a gift */
 
+#define SPLICE_INTERNAL_WILLBLOCK  (0x100) /* read on next page will block */
+
 /*
  * Passed to the actors
  */


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
       [not found]                         ` <20070604004647.GA8076@mail.ustc.edu.cn>
@ 2007-06-04  0:46                           ` Fengguang Wu
  2007-06-04  8:05                           ` Jens Axboe
  1 sibling, 0 replies; 37+ messages in thread
From: Fengguang Wu @ 2007-06-04  0:46 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Linus Torvalds, H. Peter Anvin, Eric Dumazet, linux-kernel, cotte,
	hugh, neilb, zanussi, hch

Hi Jens,

This is another try, still not in a comfortable state though.
//Busy waiting is possible for interleaved reads.

Fengguang
---

In non-block splicing read, return EAGAIN once for possible I/O waits.

It avoids busy waiting by checking (ra.prev_index != index) in
__generic_file_splice_read().  If there are other readers on the same
fd, busy waiting is still possible.

If keep calling generic_file_splice_read() in non-blocking mode, it
will return partial data before the first wait point; then return
EAGAIN at second call; at last it will block on the I/O page.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 fs/splice.c               |   41 +++++++++++++++++++++---------------
 include/linux/pipe_fs_i.h |    2 +
 2 files changed, 27 insertions(+), 16 deletions(-)

--- linux-2.6.22-rc3-mm1.orig/fs/splice.c
+++ linux-2.6.22-rc3-mm1/fs/splice.c
@@ -264,7 +264,7 @@ static ssize_t splice_to_pipe(struct pip
 static int
 __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   struct pipe_inode_info *pipe, size_t len,
-			   unsigned int flags)
+			   unsigned int *flags)
 {
 	struct address_space *mapping = in->f_mapping;
 	unsigned int loff, nr_pages;
@@ -278,7 +278,7 @@ __generic_file_splice_read(struct file *
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
-		.flags = flags,
+		.flags = *flags,
 		.ops = &page_cache_pipe_buf_ops,
 	};
 
@@ -299,12 +299,16 @@ __generic_file_splice_read(struct file *
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
 	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+	index += spd.nr_pages;
 
 	/*
 	 * If find_get_pages_contig() returned fewer pages than we needed,
-	 * allocate the rest.
+	 * readahead/allocate the rest.
 	 */
-	index += spd.nr_pages;
+	if (spd.nr_pages < nr_pages)
+		page_cache_readahead_ondemand(mapping, &in->f_ra, in,
+				NULL, index, nr_pages - spd.nr_pages);
+
 	while (spd.nr_pages < nr_pages) {
 		/*
 		 * Page could be there, find_get_pages_contig() breaks on
@@ -312,9 +316,6 @@ __generic_file_splice_read(struct file *
 		 */
 		page = find_get_page(mapping, index);
 		if (!page) {
-			page_cache_readahead_ondemand(mapping, &in->f_ra, in,
-					NULL, index, nr_pages - spd.nr_pages);
-
 			/*
 			 * page didn't exist, allocate one.
 			 */
@@ -326,8 +327,6 @@ __generic_file_splice_read(struct file *
 					      GFP_KERNEL);
 			if (unlikely(error)) {
 				page_cache_release(page);
-				if (error == -EEXIST)
-					continue;
 				break;
 			}
 			/*
@@ -369,12 +368,19 @@ __generic_file_splice_read(struct file *
 		 */
 		if (!PageUptodate(page)) {
 			/*
-			 * If in nonblock mode then dont block on waiting
-			 * for an in-flight io page
+			 * On retrying sequential reads in non-blocking mode:
+			 * 1. (prev_index <= index - 2)  => return partial data
+			 * 2. (prev_index == index - 1)  => return EAGAIN
+			 * 3. (prev_index == index)      => wait on I/O
+			 * This works nicely for page aligned reads.
 			 */
-			if (flags & SPLICE_F_NONBLOCK) {
-				if (TestSetPageLocked(page))
-					break;
+			if ((*flags & SPLICE_F_NONBLOCK) &&
+				TestSetPageLocked(page) &&
+				in->f_ra.prev_index != index) {
+				if (in->f_ra.prev_index != index - 1)
+					index--;
+				*flags |= SPLICE_INTERNAL_WILLBLOCK;
+				break;
 			} else
 				lock_page(page);
 
@@ -452,7 +458,6 @@ fill_it:
 	 */
 	while (page_nr < nr_pages)
 		page_cache_release(pages[page_nr++]);
-	in->f_ra.prev_index = index;
 
 	if (spd.nr_pages)
 		return splice_to_pipe(pipe, &spd);
@@ -480,7 +485,7 @@ ssize_t generic_file_splice_read(struct 
 	spliced = 0;
 
 	while (len) {
-		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
+		ret = __generic_file_splice_read(in, ppos, pipe, len, &flags);
 
 		if (ret < 0)
 			break;
@@ -496,8 +501,12 @@ ssize_t generic_file_splice_read(struct 
 		*ppos += ret;
 		len -= ret;
 		spliced += ret;
+
+		if (flags & SPLICE_INTERNAL_WILLBLOCK)
+			break;
 	}
 
+	in->f_ra.prev_index = (*ppos - 1) >> PAGE_CACHE_SHIFT;
 	if (spliced)
 		return spliced;
 
--- linux-2.6.22-rc3-mm1.orig/include/linux/pipe_fs_i.h
+++ linux-2.6.22-rc3-mm1/include/linux/pipe_fs_i.h
@@ -82,6 +82,8 @@ int generic_pipe_buf_steal(struct pipe_i
 #define SPLICE_F_MORE	(0x04)	/* expect more data */
 #define SPLICE_F_GIFT	(0x08)	/* pages passed in are a gift */
 
+#define SPLICE_INTERNAL_WILLBLOCK  (0x100) /* read on next page will block */
+
 /*
  * Passed to the actors
  */


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
       [not found]                         ` <20070604004647.GA8076@mail.ustc.edu.cn>
  2007-06-04  0:46                           ` Fengguang Wu
@ 2007-06-04  8:05                           ` Jens Axboe
       [not found]                             ` <20070604112214.GA7457@mail.ustc.edu.cn>
  1 sibling, 1 reply; 37+ messages in thread
From: Jens Axboe @ 2007-06-04  8:05 UTC (permalink / raw)
  To: Linus Torvalds, H. Peter Anvin, Eric Dumazet, linux-kernel, cotte,
	hugh, neilb, zanussi, hch

On Mon, Jun 04 2007, Fengguang Wu wrote:
> Hi Jens,
> 
> This is another try, still not in a comfortable state though.
> //Busy waiting is possible for interleaved reads.

A few random comments...

Adding an internal flag is fine, but please put it at the upper end of
the spectrum. So, use (1 << 31) for that flag.

And please work on the #splice branch of the block repo, not -mm. There
are quite a few things pending for inclusion in there, and I'm sure your
patch as-is wont apply.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH] sendfile removal
       [not found]                             ` <20070604112214.GA7457@mail.ustc.edu.cn>
@ 2007-06-04 11:22                               ` Fengguang Wu
  0 siblings, 0 replies; 37+ messages in thread
From: Fengguang Wu @ 2007-06-04 11:22 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Linus Torvalds, H. Peter Anvin, Eric Dumazet, linux-kernel, cotte,
	hugh, neilb, zanussi, hch

On Mon, Jun 04, 2007 at 10:05:35AM +0200, Jens Axboe wrote:
> On Mon, Jun 04 2007, Fengguang Wu wrote:
> > Hi Jens,
> > 
> > This is another try, still not in a comfortable state though.
> > //Busy waiting is possible for interleaved reads.
> 
> A few random comments...
> 
> Adding an internal flag is fine, but please put it at the upper end of
> the spectrum. So, use (1 << 31) for that flag.

OK.

> And please work on the #splice branch of the block repo, not -mm. There
> are quite a few things pending for inclusion in there, and I'm sure your
> patch as-is wont apply.

I'm afraid this patch cannot be moved over to your branch trivially.

The core of the algorithm reuses f_ra.prev_index to record the state.
It is OK for the on-demand readahead in the -mm tree. But the current
readahead code in 2.6.22-rc3 is sensible to the change. And it also
does not reliably tell if readahead I/O has been submitted.

We can either try other ways of doing non-blocking I/O, or just wait
until the merge of on-demand readahead?

The current patch should work perfect with single splice reader. In
the case of multiple readers on the same fd, we might simply err on
the side of I/O waiting, since busy EAGAIN looping is not acceptable.

Fengguang Wu


^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2007-06-04 11:22 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-05-31 10:33 [PATCH] sendfile removal Jens Axboe
2007-05-31 10:47 ` Jens Axboe
2007-05-31 10:47 ` Eric Dumazet
2007-05-31 10:53   ` Jens Axboe
2007-06-01  4:09     ` H. Peter Anvin
2007-06-01  5:41       ` Jens Axboe
2007-06-01  5:50         ` H. Peter Anvin
2007-06-01  7:22           ` Eric Dumazet
2007-06-01 15:52             ` H. Peter Anvin
2007-06-01 16:18               ` Linus Torvalds
2007-06-01 16:47                 ` Eric Dumazet
2007-06-01 16:53                 ` H. Peter Anvin
2007-06-02 15:02                   ` Jens Axboe
2007-06-02 15:01                 ` Jens Axboe
2007-06-02 15:40                   ` Linus Torvalds
2007-06-02 16:35                     ` Jens Axboe
     [not found]                     ` <20070603130507.GA11170@mail.ustc.edu.cn>
2007-06-03 13:05                       ` Fengguang Wu
     [not found]                       ` <20070603142931.GA5916@mail.ustc.edu.cn>
2007-06-03 14:29                         ` Fengguang Wu
     [not found]                         ` <20070604004647.GA8076@mail.ustc.edu.cn>
2007-06-04  0:46                           ` Fengguang Wu
2007-06-04  8:05                           ` Jens Axboe
     [not found]                             ` <20070604112214.GA7457@mail.ustc.edu.cn>
2007-06-04 11:22                               ` Fengguang Wu
2007-06-01 16:22               ` Pádraig Brady
2007-05-31 10:55 ` Christoph Hellwig
2007-05-31 11:05   ` Jens Axboe
2007-05-31 12:26     ` Neil Brown
2007-05-31 12:27       ` Jens Axboe
2007-06-01  2:44         ` [PATCH] sendfile removal (nfsd update) Neil Brown
2007-06-01  5:44           ` Jens Axboe
2007-06-01  8:01             ` Jens Axboe
2007-06-01  8:15     ` [PATCH] sendfile removal Jens Axboe
2007-05-31 11:04 ` Carsten Otte
2007-05-31 11:06   ` Jens Axboe
2007-05-31 15:33 ` Tom Zanussi
2007-05-31 19:01   ` Jens Axboe
2007-05-31 17:06 ` Hugh Dickins
2007-05-31 17:31   ` Christoph Hellwig
2007-05-31 19:03   ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox