public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] generic_file_sendpage
@ 2005-07-15 10:48 Jan Blunck
  2005-07-15 11:06 ` Andrew Morton
  0 siblings, 1 reply; 11+ messages in thread
From: Jan Blunck @ 2005-07-15 10:48 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Linux-Kernel Mailing List, Jörn Engel

[-- Attachment #1: Type: text/plain, Size: 643 bytes --]

This is a generic sendpage() for regular files.

With generic_file_sendpage() it is possible to use sendfile() on file 
targets, instead of only sending data to sockets.

This implementation is basically an extension of Joern's original patch 
(http://marc.theaimsgroup.com/?l=linux-kernel&m=109455958522766&w=2) but 
is honoring signals. I also removed some unnecessary code: no IOVs, no AIO.

qemu-debian:/home/root/sendfile_file# time ./fastcp 100mb test
real    0m11.037s
user    0m0.010s
sys     0m7.600s

qemu-debian:/home/root/sendfile_file# time cp 100mb test
real    0m13.342s
user    0m0.400s
sys     0m9.080s

Comments please,
Jan


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: generic_file_sendpage.diff --]
[-- Type: text/x-patch; name="generic_file_sendpage.diff", Size: 6778 bytes --]

Signed-off-by: Jörn Engel <joern@wohnheim.fh-wedel.de>
Signed-off-by: Jan Blunck <j.blunck@tu-harburg.de>

 fs/ext2/file.c     |    1 
 fs/ext3/file.c     |    1 
 include/linux/fs.h |    2 
 mm/filemap.c       |  170 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 174 insertions(+)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -1502,6 +1502,8 @@ extern ssize_t do_sync_write(struct file
 ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov,
 				unsigned long nr_segs, loff_t *ppos);
 extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
+extern ssize_t generic_file_sendpage(struct file *, struct page *, int, size_t,
+				     loff_t *, int);
 extern void do_generic_mapping_read(struct address_space *mapping,
 				    struct file_ra_state *, struct file *,
 				    loff_t *, read_descriptor_t *, read_actor_t);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -1086,6 +1086,27 @@ int file_send_actor(read_descriptor_t * 
 	return written;
 }
 
+/*
+ * Simple generic file sendpage. Just write a kernel buffer to the file.
+ */
+static ssize_t
+__generic_kernel_file_write(struct file *, const char *, size_t, loff_t *);
+
+ssize_t generic_file_sendpage(struct file *file, struct page *page,
+			      int offset, size_t size, loff_t *ppos, int more)
+{
+	ssize_t ret;
+	char *kaddr;
+
+	kaddr = kmap(page);
+	ret = __generic_kernel_file_write(file, kaddr + offset, size, ppos);
+	kunmap(page);
+
+	return ret;
+}
+
+EXPORT_SYMBOL(generic_file_sendpage);
+
 ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
 			 size_t count, read_actor_t actor, void *target)
 {
@@ -1715,6 +1736,19 @@ int remove_suid(struct dentry *dentry)
 }
 EXPORT_SYMBOL(remove_suid);
 
+static inline size_t
+filemap_copy_from_kernel(struct page *page, unsigned long offset,
+			 const char *buf, unsigned bytes)
+{
+	char *kaddr;
+
+	kaddr = kmap(page);
+	memcpy(kaddr + offset, buf, bytes);
+	kunmap(page);
+
+	return bytes;
+}
+
 size_t
 __filemap_copy_from_user_iovec(char *vaddr, 
 			const struct iovec *iov, size_t base, size_t bytes)
@@ -1862,6 +1896,142 @@ generic_file_direct_write(struct kiocb *
 }
 EXPORT_SYMBOL(generic_file_direct_write);
 
+/*
+ * TODO:
+ * This largely tries to copy generic_file_aio_write_nolock(), although it
+ * doesn't have to be nearly as generic.  A real cleanup should either
+ * merge this into generic_file_aio_write_nolock() as well or keep it special
+ * and remove as much code as possible.
+ *
+ * Check for pending signals here. Otherwise return -EINTR early.
+ *
+ * No iov, no kiocb. If you think this is a problem, use the source ;)
+ */
+static ssize_t
+__generic_kernel_file_write(struct file *file, const char *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct address_space * mapping = file->f_mapping;
+	struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode 	*inode = mapping->host;
+	long		status = 0;
+	loff_t		pos;
+	struct page	*page;
+	struct page	*cached_page = NULL;
+	const int	isblk = S_ISBLK(inode->i_mode);
+	ssize_t		written;
+	ssize_t		err;
+	size_t		bytes;
+	struct pagevec	lru_pvec;
+
+	/* There is no sane reason to use O_DIRECT */
+	BUG_ON(file->f_flags & O_DIRECT);
+
+	if (unlikely(signal_pending(current)))
+		return -EINTR;
+
+	if (unlikely(count < 0))
+		return -EINVAL;
+
+	down(&inode->i_sem);
+
+	pos = *ppos;
+	pagevec_init(&lru_pvec, 0);
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+	written = 0;
+
+	err = generic_write_checks(file, &pos, &count, isblk);
+	if (err)
+		goto out;
+
+	if (count == 0)
+		goto out;
+
+	remove_suid(file->f_dentry);
+	inode_update_time(inode, 1);
+
+	do {
+		unsigned long index;
+		unsigned long offset;
+		size_t copied;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
+		if (!page) {
+			status = -ENOMEM;
+			break;
+		}
+
+		status = a_ops->prepare_write(file, page, offset, offset+bytes);
+		if (unlikely(status)) {
+			loff_t isize = i_size_read(inode);
+			/*
+			 * prepare_write() may have instantiated a few blocks
+			 * outside i_size.  Trim these off again.
+			 */
+			unlock_page(page);
+			page_cache_release(page);
+			if (pos + bytes > isize)
+				vmtruncate(inode, isize);
+			break;
+		}
+
+		copied = filemap_copy_from_kernel(page, offset, buf, bytes);
+
+		flush_dcache_page(page);
+		status = a_ops->commit_write(file, page, offset, offset+bytes);
+		if (likely(copied > 0)) {
+			if (!status)
+				status = copied;
+
+			if (status >= 0) {
+				written += status;
+				count -= status;
+				pos += status;
+				buf += status;
+			}
+		}
+		if (unlikely(copied != bytes))
+			if (status >= 0)
+				status = -EFAULT;
+		unlock_page(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+		if (status < 0)
+			break;
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+	} while (count);
+	*ppos = pos;
+
+	if (cached_page)
+		page_cache_release(cached_page);
+
+	/*
+	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
+	 */
+	if (status >= 0) {
+		if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
+			status = generic_osync_inode(inode, mapping,
+					OSYNC_METADATA|OSYNC_DATA);
+	}
+
+	err = written ? written : status;
+out:
+	pagevec_lru_add(&lru_pvec);
+	current->backing_dev_info = 0;
+
+	up(&inode->i_sem);
+	return err;
+}
+
 ssize_t
 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos, loff_t *ppos,
Index: linux-2.6/fs/ext2/file.c
===================================================================
--- linux-2.6.orig/fs/ext2/file.c
+++ linux-2.6/fs/ext2/file.c
@@ -53,6 +53,7 @@ struct file_operations ext2_file_operati
 	.readv		= generic_file_readv,
 	.writev		= generic_file_writev,
 	.sendfile	= generic_file_sendfile,
+	.sendpage	= generic_file_sendpage,
 };
 
 #ifdef CONFIG_EXT2_FS_XIP
Index: linux-2.6/fs/ext3/file.c
===================================================================
--- linux-2.6.orig/fs/ext3/file.c
+++ linux-2.6/fs/ext3/file.c
@@ -119,6 +119,7 @@ struct file_operations ext3_file_operati
 	.release	= ext3_release_file,
 	.fsync		= ext3_sync_file,
 	.sendfile	= generic_file_sendfile,
+	.sendpage	= generic_file_sendpage,
 };
 
 struct inode_operations ext3_file_inode_operations = {

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] generic_file_sendpage
  2005-07-15 10:48 [PATCH] generic_file_sendpage Jan Blunck
@ 2005-07-15 11:06 ` Andrew Morton
  2005-07-15 11:22   ` Jörn Engel
                     ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Andrew Morton @ 2005-07-15 11:06 UTC (permalink / raw)
  To: Jan Blunck; +Cc: torvalds, linux-kernel, joern

Jan Blunck <j.blunck@tu-harburg.de> wrote:
>
> This is a generic sendpage() for regular files.
> 

> +static inline size_t
> +filemap_copy_from_kernel(struct page *page, unsigned long offset,
> +			 const char *buf, unsigned bytes)
> +{
> +	char *kaddr;
> +
> +	kaddr = kmap(page);
> +	memcpy(kaddr + offset, buf, bytes);
> +	kunmap(page);

Use kmap_atomic().

Move the flush_dcache_page() into here.

> +static ssize_t
> +__generic_kernel_file_write(struct file *file, const char *buf,
> +			    size_t count, loff_t *ppos)
> +{
> +	struct address_space * mapping = file->f_mapping;
> +	struct address_space_operations *a_ops = mapping->a_ops;
> +	struct inode 	*inode = mapping->host;
> +	long		status = 0;
> +	loff_t		pos;
> +	struct page	*page;
> +	struct page	*cached_page = NULL;
> +	const int	isblk = S_ISBLK(inode->i_mode);
> +	ssize_t		written;
> +	ssize_t		err;
> +	size_t		bytes;
> +	struct pagevec	lru_pvec;
> +
> +	/* There is no sane reason to use O_DIRECT */
> +	BUG_ON(file->f_flags & O_DIRECT);

err, this seems like an easy way for people to make the kernel go BUG.

> +	if (unlikely(signal_pending(current)))
> +		return -EINTR;

This doesn't help.  The reason we've avoided file-to-file sendfile() is
that it can cause applications to get uninterruptibly stuck in the kernel
for ages.  This code doesn't solve that problem.  It needs to handle
signal_pending() inside the main loop.

And it probably needs to return a sane value (number of bytes copied)
rather than -EINTR.

I don't know if we want to add this feature, really.  It's such a
specialised thing.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] generic_file_sendpage
  2005-07-15 11:06 ` Andrew Morton
@ 2005-07-15 11:22   ` Jörn Engel
  2005-07-15 13:13     ` Jan Engelhardt
  2005-07-21  5:34     ` Herbert Poetzl
  2005-07-15 12:34   ` [PATCH] generic_file_sendpage (updated patch) Jan Blunck
  2005-07-15 16:01   ` [PATCH] generic_file_sendpage Linus Torvalds
  2 siblings, 2 replies; 11+ messages in thread
From: Jörn Engel @ 2005-07-15 11:22 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Blunck, torvalds, linux-kernel

On Fri, 15 July 2005 04:06:11 -0700, Andrew Morton wrote:
> > +
> > +	/* There is no sane reason to use O_DIRECT */
> > +	BUG_ON(file->f_flags & O_DIRECT);
> 
> err, this seems like an easy way for people to make the kernel go BUG.

Is there a sane use for O_DIRECT in combination with sendfile()?

If not, I'd like to change sys_sendfile() and return -EINVAL for
O_DIRECT file descriptors.

> > +	if (unlikely(signal_pending(current)))
> > +		return -EINTR;
> 
> This doesn't help.  The reason we've avoided file-to-file sendfile() is
> that it can cause applications to get uninterruptibly stuck in the kernel
> for ages.  This code doesn't solve that problem.  It needs to handle
> signal_pending() inside the main loop.
> 
> And it probably needs to return a sane value (number of bytes copied)
> rather than -EINTR.

Makes sense.

> I don't know if we want to add this feature, really.  It's such a
> specialised thing.

With union mount and cowlink, there are two users already.  cp(1)
could use it as well, even if the improvement is quite minimal.

Jörn

-- 
All art is but imitation of nature.
-- Lucius Annaeus Seneca

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] generic_file_sendpage (updated patch)
  2005-07-15 11:06 ` Andrew Morton
  2005-07-15 11:22   ` Jörn Engel
@ 2005-07-15 12:34   ` Jan Blunck
  2005-07-15 16:01   ` [PATCH] generic_file_sendpage Linus Torvalds
  2 siblings, 0 replies; 11+ messages in thread
From: Jan Blunck @ 2005-07-15 12:34 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, linux-kernel, joern

[-- Attachment #1: Type: text/plain, Size: 1852 bytes --]

Andrew Morton wrote:
 >
 >>+static inline size_t
 >>+filemap_copy_from_kernel(struct page *page, unsigned long offset,
 >>+			 const char *buf, unsigned bytes)
 >>+{
 >>+	char *kaddr;
 >>+
 >>+	kaddr = kmap(page);
 >>+	memcpy(kaddr + offset, buf, bytes);
 >>+	kunmap(page);
 >
 >
 > Use kmap_atomic().
 >
 > Move the flush_dcache_page() into here.
 >

Ok

 >>+
 >>+	/* There is no sane reason to use O_DIRECT */
 >>+	BUG_ON(file->f_flags & O_DIRECT);
 >
 >
 > err, this seems like an easy way for people to make the kernel go BUG.
 >

Hmm, yeah ... -EINVAL should do it.

 >
 >>+	if (unlikely(signal_pending(current)))
 >>+		return -EINTR;
 >
 >
 > This doesn't help.  The reason we've avoided file-to-file sendfile() is
 > that it can cause applications to get uninterruptibly stuck in the kernel
 > for ages.  This code doesn't solve that problem.  It needs to handle
 > signal_pending() inside the main loop.

The sendfile() mainloop is in generic_file_sendfile().
generic_file_sendpage() is called via the file_send_actor() from
do_generic_file_read(). Therefore generic_file_sendpage() is called for
every page which is written to the file. Since signals are checked
before every page this is solving the uninterruptible problem. IMHO, the
sendpage() should not be interrupted in between a single page.

 >
 > And it probably needs to return a sane value (number of bytes copied)
 > rather than -EINTR.

The return value of sendfile() is not the same as the return value of
sendpage(). This is done in file_send_actor().

 >
 > I don't know if we want to add this feature, really.  It's such a
 > specialised thing.

This issue comes up every now and than. I know of at least two users: 
COW links and my union mounts. I think this solution is quite good. I 
tested it under qemu and it is working for me.

Jan

BTW: I'm out for the weekend now.


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: generic_file_sendpage.diff --]
[-- Type: text/x-patch; name="generic_file_sendpage.diff", Size: 6828 bytes --]

Signed-off-by: Jörn Engel <joern@wohnheim.fh-wedel.de>
Signed-off-by: Jan Blunck <j.blunck@tu-harburg.de>

 fs/ext2/file.c     |    1 
 fs/ext3/file.c     |    1 
 include/linux/fs.h |    2 
 mm/filemap.c       |  172 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 176 insertions(+)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -1502,6 +1502,8 @@ extern ssize_t do_sync_write(struct file
 ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov,
 				unsigned long nr_segs, loff_t *ppos);
 extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
+extern ssize_t generic_file_sendpage(struct file *, struct page *, int, size_t,
+				     loff_t *, int);
 extern void do_generic_mapping_read(struct address_space *mapping,
 				    struct file_ra_state *, struct file *,
 				    loff_t *, read_descriptor_t *, read_actor_t);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -1086,6 +1086,27 @@ int file_send_actor(read_descriptor_t * 
 	return written;
 }
 
+/*
+ * Simple generic file sendpage. Just write a kernel buffer to the file.
+ */
+static ssize_t
+__generic_kernel_file_write(struct file *, const char *, size_t, loff_t *);
+
+ssize_t generic_file_sendpage(struct file *file, struct page *page,
+			      int offset, size_t size, loff_t *ppos, int more)
+{
+	ssize_t ret;
+	char *kaddr;
+
+	kaddr = kmap(page);
+	ret = __generic_kernel_file_write(file, kaddr + offset, size, ppos);
+	kunmap(page);
+
+	return ret;
+}
+
+EXPORT_SYMBOL(generic_file_sendpage);
+
 ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
 			 size_t count, read_actor_t actor, void *target)
 {
@@ -1715,6 +1736,21 @@ int remove_suid(struct dentry *dentry)
 }
 EXPORT_SYMBOL(remove_suid);
 
+static inline size_t
+filemap_copy_from_kernel(struct page *page, unsigned long offset,
+			 const char *buf, unsigned bytes)
+{
+	char *kaddr;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memcpy(kaddr + offset, buf, bytes);
+	kunmap_atomic(page, KM_USER0);
+
+	flush_dcache_page(page);
+
+	return bytes;
+}
+
 size_t
 __filemap_copy_from_user_iovec(char *vaddr, 
 			const struct iovec *iov, size_t base, size_t bytes)
@@ -1862,6 +1898,142 @@ generic_file_direct_write(struct kiocb *
 }
 EXPORT_SYMBOL(generic_file_direct_write);
 
+/*
+ * TODO:
+ * This largely tries to copy generic_file_aio_write_nolock(), although it
+ * doesn't have to be nearly as generic.  A real cleanup should either
+ * merge this into generic_file_aio_write_nolock() as well or keep it special
+ * and remove as much code as possible.
+ *
+ * Check for pending signals here. Otherwise return -EINTR early.
+ *
+ * No iov, no kiocb. If you think this is a problem, use the source ;)
+ */
+static ssize_t
+__generic_kernel_file_write(struct file *file, const char *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct address_space * mapping = file->f_mapping;
+	struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode 	*inode = mapping->host;
+	long		status = 0;
+	loff_t		pos;
+	struct page	*page;
+	struct page	*cached_page = NULL;
+	const int	isblk = S_ISBLK(inode->i_mode);
+	ssize_t		written;
+	ssize_t		err;
+	size_t		bytes;
+	struct pagevec	lru_pvec;
+
+	/* There is no sane reason to use O_DIRECT */
+	if (file->f_flags & O_DIRECT)
+		return -EINVAL;
+
+	if (unlikely(signal_pending(current)))
+		return -EINTR;
+
+	if (unlikely(count < 0))
+		return -EINVAL;
+
+	down(&inode->i_sem);
+
+	pos = *ppos;
+	pagevec_init(&lru_pvec, 0);
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+	written = 0;
+
+	err = generic_write_checks(file, &pos, &count, isblk);
+	if (err)
+		goto out;
+
+	if (count == 0)
+		goto out;
+
+	remove_suid(file->f_dentry);
+	inode_update_time(inode, 1);
+
+	do {
+		unsigned long index;
+		unsigned long offset;
+		size_t copied;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
+		if (!page) {
+			status = -ENOMEM;
+			break;
+		}
+
+		status = a_ops->prepare_write(file, page, offset, offset+bytes);
+		if (unlikely(status)) {
+			loff_t isize = i_size_read(inode);
+			/*
+			 * prepare_write() may have instantiated a few blocks
+			 * outside i_size.  Trim these off again.
+			 */
+			unlock_page(page);
+			page_cache_release(page);
+			if (pos + bytes > isize)
+				vmtruncate(inode, isize);
+			break;
+		}
+
+		copied = filemap_copy_from_kernel(page, offset, buf, bytes);
+
+		status = a_ops->commit_write(file, page, offset, offset+bytes);
+		if (likely(copied > 0)) {
+			if (!status)
+				status = copied;
+
+			if (status >= 0) {
+				written += status;
+				count -= status;
+				pos += status;
+				buf += status;
+			}
+		}
+		if (unlikely(copied != bytes))
+			if (status >= 0)
+				status = -EFAULT;
+		unlock_page(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+		if (status < 0)
+			break;
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+	} while (count);
+	*ppos = pos;
+
+	if (cached_page)
+		page_cache_release(cached_page);
+
+	/*
+	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
+	 */
+	if (status >= 0) {
+		if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
+			status = generic_osync_inode(inode, mapping,
+					OSYNC_METADATA|OSYNC_DATA);
+	}
+
+	err = written ? written : status;
+out:
+	pagevec_lru_add(&lru_pvec);
+	current->backing_dev_info = 0;
+
+	up(&inode->i_sem);
+	return err;
+}
+
 ssize_t
 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos, loff_t *ppos,
Index: linux-2.6/fs/ext2/file.c
===================================================================
--- linux-2.6.orig/fs/ext2/file.c
+++ linux-2.6/fs/ext2/file.c
@@ -53,6 +53,7 @@ struct file_operations ext2_file_operati
 	.readv		= generic_file_readv,
 	.writev		= generic_file_writev,
 	.sendfile	= generic_file_sendfile,
+	.sendpage	= generic_file_sendpage,
 };
 
 #ifdef CONFIG_EXT2_FS_XIP
Index: linux-2.6/fs/ext3/file.c
===================================================================
--- linux-2.6.orig/fs/ext3/file.c
+++ linux-2.6/fs/ext3/file.c
@@ -119,6 +119,7 @@ struct file_operations ext3_file_operati
 	.release	= ext3_release_file,
 	.fsync		= ext3_sync_file,
 	.sendfile	= generic_file_sendfile,
+	.sendpage	= generic_file_sendpage,
 };
 
 struct inode_operations ext3_file_inode_operations = {

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] generic_file_sendpage
  2005-07-15 11:22   ` Jörn Engel
@ 2005-07-15 13:13     ` Jan Engelhardt
  2005-07-15 16:08       ` Linus Torvalds
  2005-07-21  5:34     ` Herbert Poetzl
  1 sibling, 1 reply; 11+ messages in thread
From: Jan Engelhardt @ 2005-07-15 13:13 UTC (permalink / raw)
  To: Jörn Engel; +Cc: Andrew Morton, Jan Blunck, torvalds, linux-kernel

>> I don't know if we want to add this feature, really.  It's such a
>> specialised thing.
>
>With union mount and cowlink, there are two users already.  cp(1)
>could use it as well, even if the improvement is quite minimal.

FTP PUT could use this too - currently, only FTPGETs can use sendfile because 
the target had to be a socket.
(With FTP PUT, the src is a sock, dst is a filedescriptor.)


Jan Engelhardt
-- 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] generic_file_sendpage
  2005-07-15 11:06 ` Andrew Morton
  2005-07-15 11:22   ` Jörn Engel
  2005-07-15 12:34   ` [PATCH] generic_file_sendpage (updated patch) Jan Blunck
@ 2005-07-15 16:01   ` Linus Torvalds
  2005-07-20 19:03     ` Jan Blunck
  2 siblings, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2005-07-15 16:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Blunck, linux-kernel, joern



On Fri, 15 Jul 2005, Andrew Morton wrote:
> 
> I don't know if we want to add this feature, really.  It's such a
> specialised thing.

It is, in this format, and I agree - I don't want to add it.

What I'd really like to see is somebody taking a look at my old "pipe as a
zero-copy buffer" patches, which as an interface allowed arbitrary data to
be copied between _any_ file descriptors, and allowed you to do things
like mix input sources quite naturally (ie you could write a header first
from a user-space buffer, then the contents of a file, and then push the
result out to a socket, all with zero-copy).

"sendfile()" in general I think has been a mistake. It's too specialized,
and the interface has always sucked. As Andrew pointed out, it actually
needs to limit the number of buffers in flight partly because otherwise 
you have uninterruptible kernel work etc etc.

But more importantly, sendfile() was always broken as a interface for
receiving data from anything but a page-cache based filesystem. That means 
that it's totally useless for a lot of things. The pipe-buffer thing ends 
up being a totally generic "in-kernel buffer" interface, and is a _lot_ 
more flexible. 

For anybody interested in zero-copy work, here's a LWN write-up of some of 
the original discussion:

	http://lwn.net/Articles/118750/

and my (very ugly) example patch can be found for example here:

	http://groups-beta.google.com/group/linux.kernel/msg/782bd9e5cb647207?hl=en&

(it's not a a complete implementation, but it shows how to go from a file 
_to_ a pipe buffer, but not back to a file again).

I really want to get _rid_ of sendfile, not make any more of it.

			Linus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] generic_file_sendpage
  2005-07-15 13:13     ` Jan Engelhardt
@ 2005-07-15 16:08       ` Linus Torvalds
  2005-07-15 19:02         ` Jan Engelhardt
  0 siblings, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2005-07-15 16:08 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: Jörn Engel, Andrew Morton, Jan Blunck, linux-kernel



On Fri, 15 Jul 2005, Jan Engelhardt wrote:
>
> >> I don't know if we want to add this feature, really.  It's such a
> >> specialised thing.
> >
> >With union mount and cowlink, there are two users already.  cp(1)
> >could use it as well, even if the improvement is quite minimal.
> 
> FTP PUT could use this too - currently, only FTPGETs can use sendfile because 
> the target had to be a socket.
> (With FTP PUT, the src is a sock, dst is a filedescriptor.)

No, FTP PUT _cannot_ use it, exactly because sendfile() can't do anything 
but file sources (and not even all file sources - it can only do regular 
filesystems that use the page cache).

This is why I want to get rid of sendfile(). It's a fundamentally broken
interface. Really. In contrast, the pipe buffers _can_ be used for direct
socket->file interfaces.

Now, even pipe buffers obviously won't actually be really "zero-copy":
you'll end up needing one copy to align the result, since the incoming
network packets will obviously not be nice page-sized and page-aligned
things. But they won't need the "copy to user space" and "copy back from
user space into kernel space", so it will be a question of _minimal_ copy
(and avoiding unnecessary user space VM work).

			Linus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] generic_file_sendpage
  2005-07-15 16:08       ` Linus Torvalds
@ 2005-07-15 19:02         ` Jan Engelhardt
  2005-07-15 19:16           ` Linus Torvalds
  0 siblings, 1 reply; 11+ messages in thread
From: Jan Engelhardt @ 2005-07-15 19:02 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jörn Engel, Andrew Morton, Jan Blunck, linux-kernel

>> >With union mount and cowlink, there are two users already.  cp(1)
>> >could use it as well, even if the improvement is quite minimal.
>> 
>> FTP PUT could use this too - ...
>
>No, FTP PUT _cannot_ use it, exactly because sendfile() can't do anything 
>but file sources (and not even all file sources - it can only do regular 
>filesystems that use the page cache).
>
>This is why I want to get rid of sendfile(). It's a fundamentally broken
>interface. Really. In contrast, the pipe buffers _can_ be used for direct
>socket->file interfaces.

How will userspace access these pipe buffers?



Jan Engelhardt
-- 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] generic_file_sendpage
  2005-07-15 19:02         ` Jan Engelhardt
@ 2005-07-15 19:16           ` Linus Torvalds
  0 siblings, 0 replies; 11+ messages in thread
From: Linus Torvalds @ 2005-07-15 19:16 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: Jörn Engel, Andrew Morton, Jan Blunck, linux-kernel



On Fri, 15 Jul 2005, Jan Engelhardt wrote:
> >
> >This is why I want to get rid of sendfile(). It's a fundamentally broken
> >interface. Really. In contrast, the pipe buffers _can_ be used for direct
> >socket->file interfaces.
> 
> How will userspace access these pipe buffers?

You can fill them from a user-space buffer with "write()", and you can
read them into a user-space buffer with "read()".

It's really a pipe.

The idea of the pipe buffers is that you can _also_ send them to other 
file descriptors, or fill them from other file descriptors, without having 
to copy the data to/from user space.

		Linus

---
See the example patch that I posted months ago (and that I referred to in 
my other email, here is that thing repeated in case you missed it):

> For anybody interested in zero-copy work, here's a LWN write-up of some of
> the original discussion:
> 
>         http://lwn.net/Articles/118750/
> 
> and my (very ugly) example patch can be found for example here:
> 
>         http://groups-beta.google.com/group/linux.kernel/msg/782bd9e5cb647207?hl=en&
> 
> (it's not a a complete implementation, but it shows how to go from a file
> _to_ a pipe buffer, but not back to a file again).


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] generic_file_sendpage
  2005-07-15 16:01   ` [PATCH] generic_file_sendpage Linus Torvalds
@ 2005-07-20 19:03     ` Jan Blunck
  0 siblings, 0 replies; 11+ messages in thread
From: Jan Blunck @ 2005-07-20 19:03 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, linux-kernel, joern

Linus Torvalds wrote:
> 
> "sendfile()" in general I think has been a mistake. It's too specialized,
> and the interface has always sucked.

Ok, you're right. I will have a look at the pipe buffer stuff.

> As Andrew pointed out, it actually
> needs to limit the number of buffers in flight partly because otherwise 
> you have uninterruptible kernel work etc etc.

Hmm, sendfile() uses do_generic_mapping_read() which is calling the 
file_send_actor() per page. The actor calls ->sendpage(). I don't see 
any situation how this could lead to uninterruptible kernel work when 
the sendpage() itself is checking for signals. There are only 2 pages in 
flight in this case.

Jan

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] generic_file_sendpage
  2005-07-15 11:22   ` Jörn Engel
  2005-07-15 13:13     ` Jan Engelhardt
@ 2005-07-21  5:34     ` Herbert Poetzl
  1 sibling, 0 replies; 11+ messages in thread
From: Herbert Poetzl @ 2005-07-21  5:34 UTC (permalink / raw)
  To: Jörn Engel; +Cc: Andrew Morton, Jan Blunck, torvalds, linux-kernel

On Fri, Jul 15, 2005 at 01:22:55PM +0200, Jörn Engel wrote:
> On Fri, 15 July 2005 04:06:11 -0700, Andrew Morton wrote:
> > > +
> > > +	/* There is no sane reason to use O_DIRECT */
> > > +	BUG_ON(file->f_flags & O_DIRECT);
> > 
> > err, this seems like an easy way for people to make the kernel go BUG.
> 
> Is there a sane use for O_DIRECT in combination with sendfile()?
> 
> If not, I'd like to change sys_sendfile() and return -EINVAL for
> O_DIRECT file descriptors.
> 
> > > +	if (unlikely(signal_pending(current)))
> > > +		return -EINTR;
> > 
> > This doesn't help.  The reason we've avoided file-to-file sendfile() is
> > that it can cause applications to get uninterruptibly stuck in the kernel
> > for ages.  This code doesn't solve that problem.  It needs to handle
> > signal_pending() inside the main loop.
> > 
> > And it probably needs to return a sane value (number of bytes copied)
> > rather than -EINTR.
> 
> Makes sense.
> 
> > I don't know if we want to add this feature, really.  It's such a
> > specialised thing.
> 
> With union mount and cowlink, there are two users already.  cp(1)
> could use it as well, even if the improvement is quite minimal.

you might soon add linux-vserver to the list, as
we will be using this for a special version of the
COW links (to break unified links on write) ...

best,
Herbert

> Jörn
> 
> -- 
> All art is but imitation of nature.
> -- Lucius Annaeus Seneca
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2005-07-21  5:37 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-07-15 10:48 [PATCH] generic_file_sendpage Jan Blunck
2005-07-15 11:06 ` Andrew Morton
2005-07-15 11:22   ` Jörn Engel
2005-07-15 13:13     ` Jan Engelhardt
2005-07-15 16:08       ` Linus Torvalds
2005-07-15 19:02         ` Jan Engelhardt
2005-07-15 19:16           ` Linus Torvalds
2005-07-21  5:34     ` Herbert Poetzl
2005-07-15 12:34   ` [PATCH] generic_file_sendpage (updated patch) Jan Blunck
2005-07-15 16:01   ` [PATCH] generic_file_sendpage Linus Torvalds
2005-07-20 19:03     ` Jan Blunck

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox