* Recvfile patch used for Samba.
From: Jeremy Allison
Date: 2013-07-22 21:57 UTC
To: Steve French, Jeff Layton, linux-cifs, LKML, linux-fsdevel
Cc: jra

[-- Attachment #1: Type: text/plain, Size: 404 bytes --]

Hi Steve and Jeff (and others).

Here is a patch that Samba vendors have been using
to implement recvfile (copy directly from socket
to file). It can improve write performance on boxes
by a significant amount (10% or more).

I'm not qualified to evaluate this code, can someone
who is (hi there Steve and Jeff :-) take a look at
this and see if it's worth shepherding into the kernel ?

Cheers,

Jeremy.

[-- Attachment #2: splice-from-socket-to-file-2.6.37.patch --]
[-- Type: text/x-diff, Size: 12295 bytes --]

diff -urp linux-2.6.37-rc5.orig/fs/splice.c linux-2.6.37-rc5/fs/splice.c
--- linux-2.6.37-rc5.orig/fs/splice.c	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/fs/splice.c	2010-12-07 16:16:48.000000000 -0800
@@ -31,6 +31,7 @@
 #include <linux/uio.h>
 #include <linux/security.h>
 #include <linux/gfp.h>
+#include <net/sock.h>

 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -1387,6 +1388,141 @@ static long do_splice(struct file *in, l
 	return -EINVAL;
 }

+static ssize_t do_splice_from_socket(struct file *file, struct socket *sock,
+				     loff_t __user *ppos, size_t count)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos;
+	int count_tmp;
+	int err = 0;
+	int cPagePtr = 0;
+	int cPagesAllocated = 0;
+	struct recvfile_ctl_blk rv_cb[MAX_PAGES_PER_RECVFILE];
+	struct kvec iov[MAX_PAGES_PER_RECVFILE];
+	struct msghdr msg;
+	long rcvtimeo;
+	int ret;
+
+	if(copy_from_user(&pos, ppos, sizeof(loff_t)))
+		return -EFAULT;
+
+	if(count > MAX_PAGES_PER_RECVFILE * PAGE_SIZE) {
+		printk("%s: count(%u) exceeds maxinum\n", __func__, count);
+		return -EINVAL;
+	}
+	mutex_lock(&inode->i_mutex);
+
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err != 0 || count == 0)
+		goto done;
+
+	file_remove_suid(file);
+	file_update_time(file);
+
+	count_tmp = count;
+	do {
+		unsigned long bytes;	/* Bytes to write to page */
+		unsigned long offset;	/* Offset into pagecache page */
+		struct page *pageP;
+		void *fsdata;
+
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count_tmp)
+			bytes = count_tmp;
+		ret = mapping->a_ops->write_begin(file, mapping, pos, bytes,
+						  AOP_FLAG_UNINTERRUPTIBLE,
+						  &pageP, &fsdata);
+
+		if (unlikely(ret)) {
+			err = ret;
+			for(cPagePtr = 0; cPagePtr < cPagesAllocated; cPagePtr++) {
+				kunmap(rv_cb[cPagePtr].rv_page);
+				ret = mapping->a_ops->write_end(file, mapping,
+						rv_cb[cPagePtr].rv_pos,
+						rv_cb[cPagePtr].rv_count,
+						rv_cb[cPagePtr].rv_count,
+						rv_cb[cPagePtr].rv_page,
+						rv_cb[cPagePtr].rv_fsdata);
+			}
+			goto done;
+		}
+		rv_cb[cPagesAllocated].rv_page = pageP;
+		rv_cb[cPagesAllocated].rv_pos = pos;
+		rv_cb[cPagesAllocated].rv_count = bytes;
+		rv_cb[cPagesAllocated].rv_fsdata = fsdata;
+		iov[cPagesAllocated].iov_base = kmap(pageP) + offset;
+		iov[cPagesAllocated].iov_len = bytes;
+		cPagesAllocated++;
+		count_tmp -= bytes;
+		pos += bytes;
+	} while (count_tmp);
+
+	/* IOV is ready, receive the date from socket now */
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = (struct iovec *)&iov[0];
+	msg.msg_iovlen = cPagesAllocated ;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = MSG_KERNSPACE;
+	rcvtimeo = sock->sk->sk_rcvtimeo;
+	sock->sk->sk_rcvtimeo = 8 * HZ;
+
+	ret = kernel_recvmsg(sock, &msg, &iov[0], cPagesAllocated, count,
+			     MSG_WAITALL | MSG_NOCATCHSIG);
+
+	sock->sk->sk_rcvtimeo = rcvtimeo;
+	if(ret != count)
+		err = -EPIPE;
+	else
+		err = 0;
+
+	if (unlikely(err < 0)) {
+		for(cPagePtr = 0; cPagePtr < cPagesAllocated; cPagePtr++) {
+			kunmap(rv_cb[cPagePtr].rv_page);
+			ret = mapping->a_ops->write_end(file, mapping,
+					rv_cb[cPagePtr].rv_pos,
+					rv_cb[cPagePtr].rv_count,
+					rv_cb[cPagePtr].rv_count,
+					rv_cb[cPagePtr].rv_page,
+					rv_cb[cPagePtr].rv_fsdata);
+		}
+		goto done;
+	}
+
+	for(cPagePtr=0,count=0;cPagePtr < cPagesAllocated;cPagePtr++) {
+		//flush_dcache_page(pageP);
+		kunmap(rv_cb[cPagePtr].rv_page);
+		ret = mapping->a_ops->write_end(file, mapping,
+				rv_cb[cPagePtr].rv_pos,
+				rv_cb[cPagePtr].rv_count,
+				rv_cb[cPagePtr].rv_count,
+				rv_cb[cPagePtr].rv_page,
+				rv_cb[cPagePtr].rv_fsdata);
+		if (unlikely(ret < 0))
+			printk("%s: write_end fail,ret = %d\n", __func__, ret);
+		count += rv_cb[cPagePtr].rv_count;
+		//cond_resched();
+	}
+	balance_dirty_pages_ratelimited_nr(mapping, cPagesAllocated);
+	copy_to_user(ppos,&pos,sizeof(loff_t));
+
+done:
+	current->backing_dev_info = NULL;
+	mutex_unlock(&inode->i_mutex);
+	if(err)
+		return err;
+	else
+		return count;
+}
+
 /*
  * Map an iov into an array of pages and offset/length tupples. With the
  * partial_page structure, we can map several non-contiguous ranges into
@@ -1698,11 +1834,33 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff
 	long error;
 	struct file *in, *out;
 	int fput_in, fput_out;
+	struct socket *sock = NULL;

 	if (unlikely(!len))
 		return 0;

 	error = -EBADF;
+
+	/* check if fd_in is a socket */
+	sock = sockfd_lookup(fd_in, &error);
+	if (sock) {
+		out = NULL;
+		if (!sock->sk)
+			goto done;
+		out = fget_light(fd_out, &fput_out);
+
+		if (out) {
+			if (!(out->f_mode & FMODE_WRITE))
+				goto done;
+			error = do_splice_from_socket(out, sock, off_out, len);
+		}
+done:
+		if(out)
+			fput_light(out, fput_out);
+		fput(sock->file);
+		return error;
+	}
+
 	in = fget_light(fd_in, &fput_in);
 	if (in) {
 		if (in->f_mode & FMODE_READ) {
diff -urp linux-2.6.37-rc5.orig/include/linux/fs.h linux-2.6.37-rc5/include/linux/fs.h
--- linux-2.6.37-rc5.orig/include/linux/fs.h	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/include/linux/fs.h	2010-12-07 15:58:26.000000000 -0800
@@ -372,6 +372,8 @@ struct inodes_stat_t {
 #define SYNC_FILE_RANGE_WRITE		2
 #define SYNC_FILE_RANGE_WAIT_AFTER	4

+#define MAX_PAGES_PER_RECVFILE		32
+
 #ifdef __KERNEL__

 #include <linux/linkage.h>
diff -urp linux-2.6.37-rc5.orig/include/linux/skbuff.h linux-2.6.37-rc5/include/linux/skbuff.h
--- linux-2.6.37-rc5.orig/include/linux/skbuff.h	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/include/linux/skbuff.h	2010-12-07 15:31:43.000000000 -0800
@@ -1817,6 +1817,9 @@ extern unsigned int    datagram_poll(str
 extern int	       skb_copy_datagram_iovec(const struct sk_buff *from,
					       int offset, struct iovec *to,
					       int size);
+extern int	       skb_copy_datagram_to_kernel_iovec(const struct sk_buff *from,
+					       int offset, struct iovec *to,
+					       int size);
 extern int	       skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
							int hlen,
							struct iovec *iov);
diff -urp linux-2.6.37-rc5.orig/include/linux/socket.h linux-2.6.37-rc5/include/linux/socket.h
--- linux-2.6.37-rc5.orig/include/linux/socket.h	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/include/linux/socket.h	2010-12-07 15:33:52.000000000 -0800
@@ -261,6 +261,8 @@ struct ucred {
 #define MSG_NOSIGNAL	0x4000	/* Do not generate SIGPIPE */
 #define MSG_MORE	0x8000	/* Sender will send more */
 #define MSG_WAITFORONE	0x10000	/* recvmmsg(): block until 1+ packets avail */
+#define MSG_KERNSPACE	0x20000
+#define MSG_NOCATCHSIG	0x40000

 #define MSG_EOF         MSG_FIN

@@ -326,6 +328,7 @@ extern int verify_iovec(struct msghdr *m
 extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len);
 extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
			     int offset, int len);
+extern void memcpy_tokerneliovec(struct iovec *iov, unsigned char *kdata, int len);
 extern int move_addr_to_kernel(void __user *uaddr, int ulen,
			       struct sockaddr *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
diff -urp linux-2.6.37-rc5.orig/include/linux/splice.h linux-2.6.37-rc5/include/linux/splice.h
--- linux-2.6.37-rc5.orig/include/linux/splice.h	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/include/linux/splice.h	2010-12-07 15:46:44.000000000 -0800
@@ -57,6 +57,14 @@ struct splice_pipe_desc {
	void (*spd_release)(struct splice_pipe_desc *, unsigned int);
 };

+struct recvfile_ctl_blk
+{
+	struct page *rv_page;
+	loff_t rv_pos;
+	size_t rv_count;
+	void *rv_fsdata;
+};
+
 typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
			   struct splice_desc *);
 typedef int (splice_direct_actor)(struct pipe_inode_info *,
diff -urp linux-2.6.37-rc5.orig/net/core/datagram.c linux-2.6.37-rc5/net/core/datagram.c
--- linux-2.6.37-rc5.orig/net/core/datagram.c	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/net/core/datagram.c	2010-12-07 16:01:36.000000000 -0800
@@ -128,6 +128,65 @@ out_noerr:
	goto out;
 }

+/*
+ * skb_copy_datagram_to_kernel_iovec - Copy a datagram to a kernel iovec structure.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: io vector to copy to
+ * @len: amount of data to copy from buffer to iovec
+ *
+ * Note: the iovec is modified during the copy.
+ */
+int skb_copy_datagram_to_kernel_iovec(const struct sk_buff *skb, int offset,
+				      struct iovec *to, int len)
+{
+	int i, fraglen, end = 0;
+	struct sk_buff *next = skb_shinfo(skb)->frag_list;
+
+	if (!len)
+		return 0;
+
+next_skb:
+	fraglen = skb_headlen(skb);
+	i = -1;
+
+	while (1) {
+		int start = end;
+
+		if ((end += fraglen) > offset) {
+			int copy = end - offset;
+			int o = offset - start;
+
+			if (copy > len)
+				copy = len;
+			if (i == -1)
+				memcpy_tokerneliovec(to, skb->data + o, copy);
+			else {
+				skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+				struct page *page = frag->page;
+				void *p = kmap(page) + frag->page_offset + o;
+				memcpy_tokerneliovec(to, p, copy);
+				kunmap(page);
+			}
+
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		if (++i >= skb_shinfo(skb)->nr_frags)
+			break;
+		fraglen = skb_shinfo(skb)->frags[i].size;
+	}
+	if (next) {
+		skb = next;
+		BUG_ON(skb_shinfo(skb)->frag_list);
+		next = skb->next;
+		goto next_skb;
+	}
+
+	return -EFAULT;
+}
+
 /**
  *	__skb_recv_datagram - Receive a datagram skbuff
  *	@sk: socket
diff -urp linux-2.6.37-rc5.orig/net/core/iovec.c linux-2.6.37-rc5/net/core/iovec.c
--- linux-2.6.37-rc5.orig/net/core/iovec.c	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/net/core/iovec.c	2010-12-07 16:03:46.000000000 -0800
@@ -124,6 +124,30 @@ int memcpy_toiovecend(const struct iovec
 }
 EXPORT_SYMBOL(memcpy_toiovecend);

+/* This was removed in 2.6. Re-add it for splice from socket to file. */
+/*
+ *	In kernel copy to iovec. Returns -EFAULT on error.
+ *
+ *	Note: this modifies the original iovec.
+ */
+
+void memcpy_tokerneliovec(struct iovec *iov, unsigned char *kdata, int len)
+{
+	while(len>0)
+	{
+		if(iov->iov_len)
+		{
+			int copy = min_t(unsigned int, iov->iov_len, len);
+			memcpy(iov->iov_base, kdata, copy);
+			len -= copy;
+			kdata += copy;
+			iov->iov_base += copy;
+			iov->iov_len -= copy;
+		}
+		iov++;
+	}
+}
+
 /*
  *	Copy iovec to kernel. Returns -EFAULT on error.
  *
diff -urp linux-2.6.37-rc5.orig/net/ipv4/tcp.c linux-2.6.37-rc5/net/ipv4/tcp.c
--- linux-2.6.37-rc5.orig/net/ipv4/tcp.c	2010-12-06 20:09:04.000000000 -0800
+++ linux-2.6.37-rc5/net/ipv4/tcp.c	2010-12-07 15:49:35.000000000 -0800
@@ -1460,8 +1460,23 @@ int tcp_recvmsg(struct kiocb *iocb, stru
	do {
		u32 offset;

+		if (flags & MSG_NOCATCHSIG) {
+			if (signal_pending(current)) {
+				if (sigismember(&current->pending.signal, SIGQUIT) ||
+				    sigismember(&current->pending.signal, SIGABRT) ||
+				    sigismember(&current->pending.signal, SIGKILL) ||
+				    sigismember(&current->pending.signal, SIGTERM) ||
+				    sigismember(&current->pending.signal, SIGSTOP)) {
+
+					if (copied)
+						break;
+					copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+					break;
+				}
+			}
+		}
		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
-		if (tp->urg_data && tp->urg_seq == *seq) {
+		else if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
@@ -1690,8 +1705,12 @@ do_prequeue:
		} else
 #endif
		{
-			err = skb_copy_datagram_iovec(skb, offset,
-					msg->msg_iov, used);
+			if(msg->msg_flags & MSG_KERNSPACE)
+				err = skb_copy_datagram_to_kernel_iovec(skb,
+						offset, msg->msg_iov, used);
+			else
+				err = skb_copy_datagram_iovec(skb, offset,
+						msg->msg_iov, used);
			if (err) {
				/* Exception. Bailout! */
				if (!copied)
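For reference, roughly how userspace would drive the patched syscall. This
sketch is not part of the thread; it simply follows the splice() hunk above,
where fd_in is the socket, fd_out is the file, and off_out carries the file
position in and out:

	/* recvfile-style copy via the patched splice(2); sock_fd is a
	 * connected TCP socket and file_fd is open for writing */
	#define _GNU_SOURCE
	#include <fcntl.h>

	static ssize_t recvfile(int sock_fd, int file_fd, loff_t *file_off,
				size_t count)
	{
		/* off_in is unused for a socket source; off_out must be
		 * non-NULL, because do_splice_from_socket() copies the
		 * position from userspace and writes it back on success */
		return splice(sock_fd, NULL, file_fd, file_off, count, 0);
	}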
* Re: Recvfile patch used for Samba.
From: Joe Perches
Date: 2013-07-22 23:26 UTC
To: Jeremy Allison
Cc: Steve French, Jeff Layton, linux-cifs, LKML, linux-fsdevel

On Mon, 2013-07-22 at 14:57 -0700, Jeremy Allison wrote:
> Here is a patch that Samba vendors have been using
> to implement recvfile (copy directly from socket
> to file). It can improve write performance on boxes
> by a significant amount (10% or more).
>
> I'm not qualified to evaluate this code,

Nor am I, really. This doesn't apply to current, btw.

Single comment (just on a brief look): do_splice_from_socket, when compiled
64-bit, appears to have a pretty large (~1500 byte) stack frame, which
doesn't seem very nice. Maybe use a malloc/free instead of the stack for:

	struct recvfile_ctl_blk rv_cb[MAX_PAGES_PER_RECVFILE];
	struct kvec iov[MAX_PAGES_PER_RECVFILE];
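A minimal sketch of that suggestion, assuming the patch's existing
declarations and its done: error path (this code is not from the thread;
it is just the standard kmalloc/kfree pattern):

	struct recvfile_ctl_blk *rv_cb;
	struct kvec *iov;

	/* heap-allocate the two large arrays instead of ~1.5k of stack */
	rv_cb = kmalloc(MAX_PAGES_PER_RECVFILE * sizeof(*rv_cb), GFP_KERNEL);
	iov = kmalloc(MAX_PAGES_PER_RECVFILE * sizeof(*iov), GFP_KERNEL);
	if (!rv_cb || !iov) {
		kfree(rv_cb);		/* kfree(NULL) is a no-op */
		kfree(iov);
		return -ENOMEM;
	}

	/* ... body of do_splice_from_socket() unchanged ... */

	/* and on every return path: */
	kfree(iov);
	kfree(rv_cb);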
* Re: Recvfile patch used for Samba.
From: Dave Chinner
Date: 2013-07-23 7:10 UTC
To: Jeremy Allison
Cc: Steve French, Jeff Layton, linux-cifs, LKML, linux-fsdevel

On Mon, Jul 22, 2013 at 02:57:38PM -0700, Jeremy Allison wrote:
> Hi Steve and Jeff (and others).
>
> Here is a patch that Samba vendors have been using
> to implement recvfile (copy directly from socket
> to file). It can improve write performance on boxes
> by a significant amount (10% or more).

Is that all? It's just moving the memcpy() into the kernel?

> I'm not qualified to evaluate this code, can someone
> who is (hi there Steve and Jeff :-) take a look at
> this and see if it's worth shepherding into the kernel ?

It's pretty nasty.

>
> Cheers,
>
> Jeremy.

> diff -urp linux-2.6.37-rc5.orig/fs/splice.c linux-2.6.37-rc5/fs/splice.c
> --- linux-2.6.37-rc5.orig/fs/splice.c	2010-12-06 20:09:04.000000000 -0800
> +++ linux-2.6.37-rc5/fs/splice.c	2010-12-07 16:16:48.000000000 -0800
> @@ -31,6 +31,7 @@
>  #include <linux/uio.h>
>  #include <linux/security.h>
>  #include <linux/gfp.h>
> +#include <net/sock.h>
>
>  /*
>   * Attempt to steal a page from a pipe buffer. This should perhaps go into
> @@ -1387,6 +1388,141 @@ static long do_splice(struct file *in, l
>  	return -EINVAL;
>  }
>
> +static ssize_t do_splice_from_socket(struct file *file, struct socket *sock,
> +				     loff_t __user *ppos, size_t count)
> +{
> +	struct address_space *mapping = file->f_mapping;
> +	struct inode *inode = mapping->host;
> +	loff_t pos;
> +	int count_tmp;
> +	int err = 0;
> +	int cPagePtr = 0;
> +	int cPagesAllocated = 0;

camel case.

> +	struct recvfile_ctl_blk rv_cb[MAX_PAGES_PER_RECVFILE];

MAX_PAGES_PER_RECVFILE = 32, and the structure is 32 bytes on 64
bit. So 1k of stack usage right there. Bad.

> +	struct kvec iov[MAX_PAGES_PER_RECVFILE];

And that's another 512 bytes of stack. Way too much.

> +	struct msghdr msg;

Another ~52 bytes of stack.

> +	long rcvtimeo;
> +	int ret;
> +
> +	if(copy_from_user(&pos, ppos, sizeof(loff_t)))
> +		return -EFAULT;
> +
> +	if(count > MAX_PAGES_PER_RECVFILE * PAGE_SIZE) {
> +		printk("%s: count(%u) exceeds maxinum\n", __func__, count);
> +		return -EINVAL;
> +	}
> +	mutex_lock(&inode->i_mutex);

Locking on write needs to be passed to filesystems, not done by
generic code. i.e. this isn't sufficient for XFS, and may also be
problematic for btrfs.

> +
> +	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

That doesn't exist anymore - it was removed over a year ago....

> +
> +	/* We can write back this queue in page reclaim */
> +	current->backing_dev_info = mapping->backing_dev_info;
> +
> +	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
> +	if (err != 0 || count == 0)
> +		goto done;
> +
> +	file_remove_suid(file);
> +	file_update_time(file);

error checking?
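For reference, the check being asked for would look something like this -
a sketch assuming the 2.6.37-era signatures, where file_remove_suid()
returns an int (which the patch drops) and file_update_time() returns void:

	/* propagate a suid-stripping failure instead of ignoring it */
	err = file_remove_suid(file);
	if (err)
		goto done;
	file_update_time(file);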
> +
> +	count_tmp = count;
> +	do {
> +		unsigned long bytes;	/* Bytes to write to page */
> +		unsigned long offset;	/* Offset into pagecache page */
> +		struct page *pageP;

camel case

> +		void *fsdata;
> +
> +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> +		bytes = PAGE_CACHE_SIZE - offset;
> +		if (bytes > count_tmp)
> +			bytes = count_tmp;
> +		ret = mapping->a_ops->write_begin(file, mapping, pos, bytes,
> +						  AOP_FLAG_UNINTERRUPTIBLE,
> +						  &pageP, &fsdata);
> +
> +		if (unlikely(ret)) {
> +			err = ret;
> +			for(cPagePtr = 0; cPagePtr < cPagesAllocated; cPagePtr++) {
> +				kunmap(rv_cb[cPagePtr].rv_page);
> +				ret = mapping->a_ops->write_end(file, mapping,
> +						rv_cb[cPagePtr].rv_pos,
> +						rv_cb[cPagePtr].rv_count,
> +						rv_cb[cPagePtr].rv_count,
> +						rv_cb[cPagePtr].rv_page,
> +						rv_cb[cPagePtr].rv_fsdata);
> +			}
> +			goto done;
> +		}
> +		rv_cb[cPagesAllocated].rv_page = pageP;
> +		rv_cb[cPagesAllocated].rv_pos = pos;
> +		rv_cb[cPagesAllocated].rv_count = bytes;
> +		rv_cb[cPagesAllocated].rv_fsdata = fsdata;
> +		iov[cPagesAllocated].iov_base = kmap(pageP) + offset;
> +		iov[cPagesAllocated].iov_len = bytes;
> +		cPagesAllocated++;
> +		count_tmp -= bytes;
> +		pos += bytes;
> +	} while (count_tmp);

So, we are nesting up to 32 page locks here. That's bad. And we are
nesting kmap() calls for all the pages individually - is that even
safe to do?

So, what happens when we've got 16 pages in, and the filesystem has
allocated space for those 16 blocks, and we get ENOSPC on the 17th?
Sure, you undo the state here, but what about the 16 blocks that the
filesystem has allocated to this file? There's no notification to
the filesystem that they need to be truncated away because the write
failed....

> +
> +	/* IOV is ready, receive the date from socket now */
> +	msg.msg_name = NULL;
> +	msg.msg_namelen = 0;
> +	msg.msg_iov = (struct iovec *)&iov[0];
> +	msg.msg_iovlen = cPagesAllocated ;
> +	msg.msg_control = NULL;
> +	msg.msg_controllen = 0;
> +	msg.msg_flags = MSG_KERNSPACE;
> +	rcvtimeo = sock->sk->sk_rcvtimeo;
> +	sock->sk->sk_rcvtimeo = 8 * HZ;

We can hold the inode and the pages locked for 8 seconds?

I'll stop there. This is fundamentally broken. It's an attempt to do
a multi-page write operation without any of the supporting
structures needed to handle the failure cases properly. The nested
page locking has "deadlock" written all over it, and the lack of
partial failure handling shouts "data corruption" and "stale data
exposure" to me. The fact it can block for up to 8 seconds waiting
for network shenanigans to be completed while holding lots of locks
is going to cause all sorts of problems under memory pressure.

Not to mention it means that all memory allocations in the msgrcv
path need to be done with GFP_NOFS, because GFP_KERNEL allocations
are almost guaranteed to deadlock on the locked pages this path
already holds....

Need I say more?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com
* Re: Recvfile patch used for Samba.
From: Jeff Layton
Date: 2013-07-23 13:31 UTC
To: Dave Chinner
Cc: Jeremy Allison, Steve French, linux-cifs, LKML, linux-fsdevel

On Tue, 23 Jul 2013 17:10:27 +1000
Dave Chinner <david@fromorbit.com> wrote:

> On Mon, Jul 22, 2013 at 02:57:38PM -0700, Jeremy Allison wrote:
> > Hi Steve and Jeff (and others).
> >
> > Here is a patch that Samba vendors have been using
> > to implement recvfile (copy directly from socket
> > to file). It can improve write performance on boxes
> > by a significant amount (10% or more).
>
> Is that all? It's just moving the memcpy() into the kernel?
>
> > I'm not qualified to evaluate this code, can someone
> > who is (hi there Steve and Jeff :-) take a look at
> > this and see if it's worth shepherding into the kernel ?
>
> It's pretty nasty.
>
> >
> > Cheers,
> >
> > Jeremy.
>
> > diff -urp linux-2.6.37-rc5.orig/fs/splice.c linux-2.6.37-rc5/fs/splice.c
> > --- linux-2.6.37-rc5.orig/fs/splice.c	2010-12-06 20:09:04.000000000 -0800
> > +++ linux-2.6.37-rc5/fs/splice.c	2010-12-07 16:16:48.000000000 -0800
> > @@ -31,6 +31,7 @@
> >  #include <linux/uio.h>
> >  #include <linux/security.h>
> >  #include <linux/gfp.h>
> > +#include <net/sock.h>
> >
> >  /*
> >   * Attempt to steal a page from a pipe buffer. This should perhaps go into
> > @@ -1387,6 +1388,141 @@ static long do_splice(struct file *in, l
> >  	return -EINVAL;
> >  }
> >
> > +static ssize_t do_splice_from_socket(struct file *file, struct socket *sock,
> > +				     loff_t __user *ppos, size_t count)
> > +{
> > +	struct address_space *mapping = file->f_mapping;
> > +	struct inode *inode = mapping->host;
> > +	loff_t pos;
> > +	int count_tmp;
> > +	int err = 0;
> > +	int cPagePtr = 0;
> > +	int cPagesAllocated = 0;
>
> camel case.
>
> > +	struct recvfile_ctl_blk rv_cb[MAX_PAGES_PER_RECVFILE];
>
> MAX_PAGES_PER_RECVFILE = 32, and the structure is 32 bytes on 64
> bit. So 1k of stack usage right there. Bad.
>
> > +	struct kvec iov[MAX_PAGES_PER_RECVFILE];
>
> And that's another 512 bytes of stack. Way too much.
>
> > +	struct msghdr msg;
>
> Another ~52 bytes of stack.
>
> > +	long rcvtimeo;
> > +	int ret;
> > +
> > +	if(copy_from_user(&pos, ppos, sizeof(loff_t)))
> > +		return -EFAULT;
> > +
> > +	if(count > MAX_PAGES_PER_RECVFILE * PAGE_SIZE) {
> > +		printk("%s: count(%u) exceeds maxinum\n", __func__, count);
> > +		return -EINVAL;
> > +	}
> > +	mutex_lock(&inode->i_mutex);
>
> Locking on write needs to be passed to filesystems, not done by
> generic code. i.e. this isn't sufficient for XFS, and may also be
> problematic for btrfs.
>
> > +
> > +	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
>
> That doesn't exist anymore - it was removed over a year ago....
>
> > +
> > +	/* We can write back this queue in page reclaim */
> > +	current->backing_dev_info = mapping->backing_dev_info;
> > +
> > +	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
> > +	if (err != 0 || count == 0)
> > +		goto done;
> > +
> > +	file_remove_suid(file);
> > +	file_update_time(file);
>
> error checking?
>
> > +
> > +	count_tmp = count;
> > +	do {
> > +		unsigned long bytes;	/* Bytes to write to page */
> > +		unsigned long offset;	/* Offset into pagecache page */
> > +		struct page *pageP;
>
> camel case
>
> > +		void *fsdata;
> > +
> > +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> > +		bytes = PAGE_CACHE_SIZE - offset;
> > +		if (bytes > count_tmp)
> > +			bytes = count_tmp;
> > +		ret = mapping->a_ops->write_begin(file, mapping, pos, bytes,
> > +						  AOP_FLAG_UNINTERRUPTIBLE,
> > +						  &pageP, &fsdata);
> > +
> > +		if (unlikely(ret)) {
> > +			err = ret;
> > +			for(cPagePtr = 0; cPagePtr < cPagesAllocated; cPagePtr++) {
> > +				kunmap(rv_cb[cPagePtr].rv_page);
> > +				ret = mapping->a_ops->write_end(file, mapping,
> > +						rv_cb[cPagePtr].rv_pos,
> > +						rv_cb[cPagePtr].rv_count,
> > +						rv_cb[cPagePtr].rv_count,
> > +						rv_cb[cPagePtr].rv_page,
> > +						rv_cb[cPagePtr].rv_fsdata);
> > +			}
> > +			goto done;
> > +		}
> > +		rv_cb[cPagesAllocated].rv_page = pageP;
> > +		rv_cb[cPagesAllocated].rv_pos = pos;
> > +		rv_cb[cPagesAllocated].rv_count = bytes;
> > +		rv_cb[cPagesAllocated].rv_fsdata = fsdata;
> > +		iov[cPagesAllocated].iov_base = kmap(pageP) + offset;
> > +		iov[cPagesAllocated].iov_len = bytes;
> > +		cPagesAllocated++;
> > +		count_tmp -= bytes;
> > +		pos += bytes;
> > +	} while (count_tmp);
>
> So, we are nesting up to 32 page locks here. That's bad. And we are
> nesting kmap() calls for all the pages individually - is that even
> safe to do?
>

You can do it, but if you run out of kmap space, it'll hang and you'll
be screwed. Wouldn't take much either -- just a few of these calls
running in parallel would be enough to deadlock all of them.

> So, what happens when we've got 16 pages in, and the filesystem has
> allocated space for those 16 blocks, and we get ENOSPC on the 17th?
> Sure, you undo the state here, but what about the 16 blocks that the
> filesystem has allocated to this file? There's no notification to
> the filesystem that they need to be truncated away because the write
> failed....
>
> > +
> > +	/* IOV is ready, receive the date from socket now */
> > +	msg.msg_name = NULL;
> > +	msg.msg_namelen = 0;
> > +	msg.msg_iov = (struct iovec *)&iov[0];
> > +	msg.msg_iovlen = cPagesAllocated ;
> > +	msg.msg_control = NULL;
> > +	msg.msg_controllen = 0;
> > +	msg.msg_flags = MSG_KERNSPACE;
> > +	rcvtimeo = sock->sk->sk_rcvtimeo;
> > +	sock->sk->sk_rcvtimeo = 8 * HZ;
>
> We can hold the inode and the pages locked for 8 seconds?
>
> I'll stop there. This is fundamentally broken. It's an attempt to do
> a multi-page write operation without any of the supporting
> structures needed to handle the failure cases properly. The nested
> page locking has "deadlock" written all over it, and the lack of
> partial failure handling shouts "data corruption" and "stale data
> exposure" to me. The fact it can block for up to 8 seconds waiting
> for network shenanigans to be completed while holding lots of locks
> is going to cause all sorts of problems under memory pressure.
>
> Not to mention it means that all memory allocations in the msgrcv
> path need to be done with GFP_NOFS, because GFP_KERNEL allocations
> are almost guaranteed to deadlock on the locked pages this path
> already holds....
>

Yep, pretty broken.

A better approach might be to hook this call up to the sk_write_space
callback on the socket. Have it block until there is something to
actually be read, and then do the loop over the pages one at a time at
that point until all of the ready data is read (non-blocking) into the
pages.

When you run out of data, unlock everything and go back to blocking
until there is more or until you hit the "len" passed into the
syscall.

Locking and error handling with that approach would still be pretty
tricky, however...

-- 
Jeff Layton
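A rough sketch of the loop shape Jeff is describing. None of this is code
from the thread: wait_for_socket_data() is a hypothetical stand-in for the
callback/wait mechanism, the declarations and msghdr setup are elided, and
all of the real locking and error handling is glossed over:

	while (copied < len) {
		/* hypothetical: sleep until the socket has data queued */
		wait_for_socket_data(sock);

		/* drain whatever is ready, one page at a time, so we never
		 * hold more than one page lock (or kmap) at once */
		do {
			ret = mapping->a_ops->write_begin(file, mapping, pos,
						bytes, 0, &page, &fsdata);
			if (ret)
				goto out;
			iov.iov_base = kmap(page) + offset;
			iov.iov_len = bytes;
			/* non-blocking read of whatever is in the rcv queue */
			ret = kernel_recvmsg(sock, &msg, &iov, 1, bytes,
					     MSG_DONTWAIT);
			kunmap(page);
			mapping->a_ops->write_end(file, mapping, pos, bytes,
						ret > 0 ? ret : 0, page, fsdata);
			if (ret > 0) {
				copied += ret;
				pos += ret;
			}
		} while (ret > 0 && copied < len);

		if (ret == -EAGAIN)
			continue;	/* out of ready data; block again */
		if (ret <= 0)
			break;		/* EOF or a real error */
	}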
* Re: Recvfile patch used for Samba.
From: Jeremy Allison
Date: 2013-07-23 21:58 UTC
To: Dave Chinner
Cc: Jeremy Allison, Steve French, Jeff Layton, linux-cifs, LKML, linux-fsdevel

On Tue, Jul 23, 2013 at 05:10:27PM +1000, Dave Chinner wrote:
> So, we are nesting up to 32 page locks here. That's bad. And we are
> nesting kmap() calls for all the pages individually - is that even
> safe to do?
>
> So, what happens when we've got 16 pages in, and the filesystem has
> allocated space for those 16 blocks, and we get ENOSPC on the 17th?
> Sure, you undo the state here, but what about the 16 blocks that the
> filesystem has allocated to this file? There's no notification to
> the filesystem that they need to be truncated away because the write
> failed....
>
> > +
> > +	/* IOV is ready, receive the date from socket now */
> > +	msg.msg_name = NULL;
> > +	msg.msg_namelen = 0;
> > +	msg.msg_iov = (struct iovec *)&iov[0];
> > +	msg.msg_iovlen = cPagesAllocated ;
> > +	msg.msg_control = NULL;
> > +	msg.msg_controllen = 0;
> > +	msg.msg_flags = MSG_KERNSPACE;
> > +	rcvtimeo = sock->sk->sk_rcvtimeo;
> > +	sock->sk->sk_rcvtimeo = 8 * HZ;
>
> We can hold the inode and the pages locked for 8 seconds?
>
> I'll stop there. This is fundamentally broken. It's an attempt to do
> a multi-page write operation without any of the supporting
> structures needed to handle the failure cases properly. The nested
> page locking has "deadlock" written all over it, and the lack of
> partial failure handling shouts "data corruption" and "stale data
> exposure" to me. The fact it can block for up to 8 seconds waiting
> for network shenanigans to be completed while holding lots of locks
> is going to cause all sorts of problems under memory pressure.
>
> Not to mention it means that all memory allocations in the msgrcv
> path need to be done with GFP_NOFS, because GFP_KERNEL allocations
> are almost guaranteed to deadlock on the locked pages this path
> already holds....
>
> Need I say more?

No, that's great ! :-).

Thanks for the analysis. I'd heard it wasn't
near production quality, but not being a kernel
engineer myself I wasn't able to make that assessment.

Having said that, the OEMs that are using it do
find it improves write speeds by a large amount (10%
or more), so it's showing there is room for improvement
here if the correct code can be created for recvfile.

Cheers,

Jeremy.
* Re: Recvfile patch used for Samba.
From: Dave Chinner
Date: 2013-07-24 2:47 UTC
To: Jeremy Allison
Cc: Steve French, Jeff Layton, linux-cifs, LKML, linux-fsdevel

On Tue, Jul 23, 2013 at 02:58:58PM -0700, Jeremy Allison wrote:
> On Tue, Jul 23, 2013 at 05:10:27PM +1000, Dave Chinner wrote:
> > So, we are nesting up to 32 page locks here. That's bad. And we are
> > nesting kmap() calls for all the pages individually - is that even
> > safe to do?
> >
> > So, what happens when we've got 16 pages in, and the filesystem has
> > allocated space for those 16 blocks, and we get ENOSPC on the 17th?
> > Sure, you undo the state here, but what about the 16 blocks that the
> > filesystem has allocated to this file? There's no notification to
> > the filesystem that they need to be truncated away because the write
> > failed....
> >
> > > +
> > > +	/* IOV is ready, receive the date from socket now */
> > > +	msg.msg_name = NULL;
> > > +	msg.msg_namelen = 0;
> > > +	msg.msg_iov = (struct iovec *)&iov[0];
> > > +	msg.msg_iovlen = cPagesAllocated ;
> > > +	msg.msg_control = NULL;
> > > +	msg.msg_controllen = 0;
> > > +	msg.msg_flags = MSG_KERNSPACE;
> > > +	rcvtimeo = sock->sk->sk_rcvtimeo;
> > > +	sock->sk->sk_rcvtimeo = 8 * HZ;
> >
> > We can hold the inode and the pages locked for 8 seconds?
> >
> > I'll stop there. This is fundamentally broken. It's an attempt to do
> > a multi-page write operation without any of the supporting
> > structures needed to handle the failure cases properly. The nested
> > page locking has "deadlock" written all over it, and the lack of
> > partial failure handling shouts "data corruption" and "stale data
> > exposure" to me. The fact it can block for up to 8 seconds waiting
> > for network shenanigans to be completed while holding lots of locks
> > is going to cause all sorts of problems under memory pressure.
> >
> > Not to mention it means that all memory allocations in the msgrcv
> > path need to be done with GFP_NOFS, because GFP_KERNEL allocations
> > are almost guaranteed to deadlock on the locked pages this path
> > already holds....
> >
> > Need I say more?
>
> No, that's great ! :-).
>
> Thanks for the analysis. I'd heard it wasn't
> near production quality, but not being a kernel
> engineer myself I wasn't able to make that assessment.
>
> Having said that, the OEMs that are using it do
> find it improves write speeds by a large amount (10%
> or more), so it's showing there is room for improvement
> here if the correct code can be created for recvfile.

10% is not very large gain given the complexity it adds, and I
question that the gain actually comes from moving the memcpy() into
the kernel. If this recvfile code enabled zero-copy behaviour into
the page cache, then it would be worth pursuing. But it doesn't, and
so IMO the complexity is not worth the gain right now.

Indeed, I suspect the 10% gain will be from the multi-page write
behaviour that was hacked into the code. I wrote a multi-page
write prototype ~3 years ago that showed write(2) performance gains
of roughly 10% on low CPU power machines running XFS.

$ git branch |grep multi
multipage-write
$ git checkout multipage-write
Checking out files: 100% (45114/45114), done.
Switched to branch 'multipage-write'
$ head -4 Makefile
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 37
EXTRAVERSION = -rc6
$

I should probably pick this up again and push it forwards. FWIW,
I've attached the first multipage-write infrastructure patch from
the above branch to show how this sort of operation needs to be done
from a filesystem and page-cache perspective to avoid locking
problems and have sane error handling.

I believe the version that Christoph implemented for a couple of
OEMs around that time de-multiplexed the ->iomap method....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

multipage-write: introduce iomap infrastructure

From: Dave Chinner <dchinner@redhat.com>

Add infrastructure for multipage writes by defining the mapping
interface that the multipage writes will use and the main multipage
write loop.

Signed-off-by: Dave Chinner <dchinner@redhat.com>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 76041b6..1196877 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -513,6 +513,7 @@ enum positive_aop_returns {
 struct page;
 struct address_space;
 struct writeback_control;
+struct iomap;

 struct iov_iter {
	const struct iovec *iov;
@@ -604,6 +605,9 @@ struct address_space_operations {
	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
					unsigned long);
	int (*error_remove_page)(struct address_space *, struct page *);
+
+	int (*iomap)(struct address_space *mapping, loff_t pos,
+			ssize_t length, struct iomap *iomap, int cmd);
 };

 /*
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
new file mode 100644
index 0000000..7708614
--- /dev/null
+++ b/include/linux/iomap.h
@@ -0,0 +1,45 @@
+#ifndef _IOMAP_H
+#define _IOMAP_H
+
+/* ->iomap a_op command types */
+#define IOMAP_READ	0x01	/* read the current mapping starting at the
+				   given position, trimmed to a maximum length.
+				   FS's should use this to obtain and lock
+				   resources within this range */
+#define IOMAP_RESERVE	0x02	/* reserve space for an allocation that spans
+				   the given iomap */
+#define IOMAP_ALLOCATE	0x03	/* allocate space in a given iomap - must have
+				   first been reserved */
+#define IOMAP_UNRESERVE	0x04	/* return unused reserved space for the given
+				   iomap and used space. This will always be
+				   called after a IOMAP_READ so as to allow the
+				   FS to release held resources. */
+
+/* types of block ranges for multipage write mappings. */
+#define IOMAP_HOLE	0x01	/* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC	0x02	/* delayed allocation blocks */
+#define IOMAP_MAPPED	0x03	/* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN	0x04	/* blocks allocated @blkno in unwritten state */
+
+struct iomap {
+	sector_t	blkno;	/* first sector of mapping */
+	loff_t		offset;	/* file offset of mapping, bytes */
+	ssize_t		length;	/* length of mapping, bytes */
+	int		type;	/* type of mapping */
+	void		*priv;	/* fs private data associated with map */
+};
+
+static inline bool
+iomap_needs_allocation(struct iomap *iomap)
+{
+	return iomap->type == IOMAP_HOLE;
+}
+
+/* multipage write interfaces use iomaps */
+typedef int (*mpw_actor_t)(struct address_space *mapping, void *src,
+			loff_t pos, ssize_t len, struct iomap *iomap);
+
+ssize_t multipage_write_segment(struct address_space *mapping, void *src,
+			loff_t pos, ssize_t length, mpw_actor_t actor);
+
+#endif /* _IOMAP_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44..27e2f7d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/uaccess.h>
 #include <linux/aio.h>
 #include <linux/capability.h>
@@ -2221,10 +2222,14 @@ repeat:
 }
 EXPORT_SYMBOL(grab_cache_page_write_begin);

-static ssize_t generic_perform_write(struct file *file,
-				struct iov_iter *i, loff_t pos)
+static ssize_t
+__generic_perform_write(
+	struct file		*file,
+	struct address_space	*mapping,
+	struct iov_iter		*i,
+	loff_t			pos,
+	void			*priv)
 {
-	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
@@ -2241,7 +2246,6 @@ static ssize_t generic_perform_write(struct file *file,
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
-		void *fsdata;

		offset = (pos & (PAGE_CACHE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
@@ -2265,7 +2269,7 @@ again:
		}

		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
-						&page, &fsdata);
+						&page, priv);
		if (unlikely(status))
			break;

@@ -2279,7 +2283,7 @@ again:
		mark_page_accessed(page);

		status = a_ops->write_end(file, mapping, pos, bytes, copied,
-						page, fsdata);
+						page, priv);
		if (unlikely(status < 0))
			break;
		copied = status;
@@ -2310,6 +2314,159 @@ again:
	return written ? written : status;
 }

+static int
+multipage_write_actor(
+	struct address_space	*mapping,
+	void			*src,
+	loff_t			pos,
+	ssize_t			length,
+	struct iomap		*iomap)
+{
+	struct iov_iter *i = src;
+
+	return __generic_perform_write(NULL, mapping, i, pos, iomap);
+}
+
+/*
+ * Execute a multipage write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the IOMAP_READ call, and release them in the
+ * IOMAP_COMPLETE call.
+ */
+ssize_t
+multipage_write_segment(
+	struct address_space	*mapping,
+	void			*src,
+	loff_t			pos,
+	ssize_t			length,
+	mpw_actor_t		actor)
+{
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	long			status;
+	bool			need_alloc;
+	struct iomap		iomap = { 0 };
+
+	/*
+	 * need to map a range from start position for count bytes. This can
+	 * span multiple pages - it is only guaranteed to return a range of a
+	 * single type of pages (e.g. all into a hole, all mapped or all
+	 * unwritten). Failure at this point has nothing to undo.
+	 *
+	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
+	 * to keep the chunks of work done where somewhat symmetric with the
+	 * work writeback does. This is a completely arbitrary number pulled
+	 * out of thin air as a best guess for initial testing.
+	 */
+	length = min_t(ssize_t, length, 1024 * PAGE_SIZE);
+	status = a_ops->iomap(mapping, pos, length, &iomap, IOMAP_READ);
+	if (status)
+		goto out;
+
+	/*
+	 * If allocation is required for this range, reserve the space now so
+	 * that the allocation is guaranteed to succeed later on. Once we copy
+	 * the data into the page cache pages, then we cannot fail otherwise we
+	 * expose transient stale data. If the reserve fails, we can safely
+	 * back out at this point as there is nothing to undo.
+	 */
+	need_alloc = iomap_needs_allocation(&iomap);
+	if (need_alloc) {
+		status = a_ops->iomap(mapping, pos, length, &iomap, IOMAP_RESERVE);
+		if (status)
+			goto out;
+	}
+
+	/*
+	 * because we have now guaranteed that the space allocation will
+	 * succeed, we can do the copy-in page by page without having to worry
+	 * about failures exposing transient data. Hence we can mostly reuse
+	 * the existing method of:
+	 *	- grab and lock page
+	 *	- set up page mapping structures (e.g. bufferheads)
+	 *	- copy data in
+	 *	- update page state and unlock page
+	 *
+	 * This avoids the need to hold MAX_WRITEBACK_PAGES locked at once
+	 * while we execute the copy-in. It does mean, however, that the
+	 * filesystem needs to avoid any attempts at writeback of pages in this
+	 * iomap until the allocation is completed after the copyin.
+	 *
+	 * XXX: needs to be done per-filesystem in ->writepage
+	 */
+
+	status = actor(mapping, src, pos, length, &iomap);
+	printk("mpws: pos %lld, len %lld, status %lld\n", pos, length, status);
+	if (status == -ERANGE)
+		status = 0;
+	if (status <= 0)
+		goto out_failed_write;
+
+	/* now the data has been copied, allocate the range we've copied */
+	if (need_alloc) {
+		int error;
+		/*
+		 * allocate should not fail unless the filesystem has had a
+		 * fatal error. Issue a warning here to track this situation.
+		 */
+		error = a_ops->iomap(mapping, pos, status, &iomap, IOMAP_ALLOCATE);
+		if (error) {
+			WARN_ON(0);
+			status = error;
+			/* XXX: mark all pages not-up-to-date? */
+			goto out_failed_write;
+		}
+	}
+
+out:
+	return status;
+	/*
+	 * if we copied less than we reserved, release any unused
+	 * reservation we hold so that we don't leak the space. Unreserving
+	 * space never fails.
+	 */
+out_failed_write:
+	if (need_alloc)
+		a_ops->iomap(mapping, pos, 0, &iomap, IOMAP_UNRESERVE);
+	return status;
+}
+EXPORT_SYMBOL(multipage_write_segment);
+
+/*
+ * Loop writing multiple pages in segments until we have run out
+ * of data to write in the iovec iterator.
+ */
+static ssize_t
+generic_perform_multipage_write(struct file *file,
+				struct iov_iter *i, loff_t pos)
+{
+	long status = 0;
+	ssize_t written = 0;
+
+	do {
+		status = multipage_write_segment(file->f_mapping, i, pos,
+				iov_iter_count(i), multipage_write_actor);
+		if (status <= 0)
+			break;
+		pos += status;
+		written += status;
+	} while (iov_iter_count(i));
+
+	return written ? written : status;
+}
+
+static ssize_t generic_perform_write(struct file *file,
+				struct iov_iter *i, loff_t pos)
+{
+	void *fs_data;
+
+	return __generic_perform_write(file, file->f_mapping, i, pos, &fs_data);
+}
+
 ssize_t
 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos, loff_t *ppos,
@@ -2320,13 +2477,17 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
	struct iov_iter i;

	iov_iter_init(&i, iov, nr_segs, count, written);
-	status = generic_perform_write(file, &i, pos);
+
+	if (file->f_mapping->a_ops->iomap)
+		status = generic_perform_multipage_write(file, &i, pos);
+	else
+		status = generic_perform_write(file, &i, pos);

	if (likely(status >= 0)) {
		written += status;
		*ppos = pos + status;
-	}
-
+	}
+
	return written ? written : status;
 }
 EXPORT_SYMBOL(generic_file_buffered_write);
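For context, a schematic of what the filesystem side of this interface
might look like. This is not from the thread: only the IOMAP_* command
values and the ->iomap signature come from the patch above, and every
example_* helper is hypothetical:

	static int example_iomap(struct address_space *mapping, loff_t pos,
				 ssize_t length, struct iomap *iomap, int cmd)
	{
		struct inode *inode = mapping->host;

		switch (cmd) {
		case IOMAP_READ:
			/* lock fs resources, map the extent at pos and fill in
			 * iomap->blkno/offset/length/type, trimmed to a single
			 * contiguous range of one mapping type */
			return example_map_extent(inode, pos, length, iomap);
		case IOMAP_RESERVE:
			/* reserve blocks/quota up front so a later allocation
			 * over this range is guaranteed to succeed */
			return example_reserve_space(inode, iomap);
		case IOMAP_ALLOCATE:
			/* convert the reservation into a real allocation
			 * covering the bytes that were actually copied in */
			return example_allocate_space(inode, pos, length, iomap);
		case IOMAP_UNRESERVE:
			/* give back any unused reservation and release the
			 * resources taken at IOMAP_READ */
			return example_unreserve_space(inode, pos, length, iomap);
		}
		return -EINVAL;
	}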
* Re: Recvfile patch used for Samba.
From: Steven Whitehouse
Date: 2013-07-25 8:17 UTC
To: Dave Chinner
Cc: Jeremy Allison, Steve French, Jeff Layton, linux-cifs, LKML, linux-fsdevel

Hi,

On Wed, 2013-07-24 at 12:47 +1000, Dave Chinner wrote:
> On Tue, Jul 23, 2013 at 02:58:58PM -0700, Jeremy Allison wrote:
> > On Tue, Jul 23, 2013 at 05:10:27PM +1000, Dave Chinner wrote:
> > > So, we are nesting up to 32 page locks here. That's bad. And we are
> > > nesting kmap() calls for all the pages individually - is that even
> > > safe to do?
> > >
> > > So, what happens when we've got 16 pages in, and the filesystem has
> > > allocated space for those 16 blocks, and we get ENOSPC on the 17th?
> > > Sure, you undo the state here, but what about the 16 blocks that the
> > > filesystem has allocated to this file? There's no notification to
> > > the filesystem that they need to be truncated away because the write
> > > failed....
> > >
> > > > +
> > > > +	/* IOV is ready, receive the date from socket now */
> > > > +	msg.msg_name = NULL;
> > > > +	msg.msg_namelen = 0;
> > > > +	msg.msg_iov = (struct iovec *)&iov[0];
> > > > +	msg.msg_iovlen = cPagesAllocated ;
> > > > +	msg.msg_control = NULL;
> > > > +	msg.msg_controllen = 0;
> > > > +	msg.msg_flags = MSG_KERNSPACE;
> > > > +	rcvtimeo = sock->sk->sk_rcvtimeo;
> > > > +	sock->sk->sk_rcvtimeo = 8 * HZ;
> > >
> > > We can hold the inode and the pages locked for 8 seconds?
> > >
> > > I'll stop there. This is fundamentally broken. It's an attempt to do
> > > a multi-page write operation without any of the supporting
> > > structures needed to handle the failure cases properly. The nested
> > > page locking has "deadlock" written all over it, and the lack of
> > > partial failure handling shouts "data corruption" and "stale data
> > > exposure" to me. The fact it can block for up to 8 seconds waiting
> > > for network shenanigans to be completed while holding lots of locks
> > > is going to cause all sorts of problems under memory pressure.
> > >
> > > Not to mention it means that all memory allocations in the msgrcv
> > > path need to be done with GFP_NOFS, because GFP_KERNEL allocations
> > > are almost guaranteed to deadlock on the locked pages this path
> > > already holds....
> > >
> > > Need I say more?
> >
> > No, that's great ! :-).
> >
> > Thanks for the analysis. I'd heard it wasn't
> > near production quality, but not being a kernel
> > engineer myself I wasn't able to make that assessment.
> >
> > Having said that, the OEMs that are using it do
> > find it improves write speeds by a large amount (10%
> > or more), so it's showing there is room for improvement
> > here if the correct code can be created for recvfile.
>
> 10% is not very large gain given the complexity it adds, and I
> question that the gain actually comes from moving the memcpy() into
> the kernel. If this recvfile code enabled zero-copy behaviour into
> the page cache, then it would be worth pursuing. But it doesn't, and
> so IMO the complexity is not worth the gain right now.
>
> Indeed, I suspect the 10% gain will be from the multi-page write
> behaviour that was hacked into the code. I wrote a multi-page
> write prototype ~3 years ago that showed write(2) performance gains
> of roughly 10% on low CPU power machines running XFS.
>
> $ git branch |grep multi
> multipage-write
> $ git checkout multipage-write
> Checking out files: 100% (45114/45114), done.
> Switched to branch 'multipage-write'
> $ head -4 Makefile
> VERSION = 2
> PATCHLEVEL = 6
> SUBLEVEL = 37
> EXTRAVERSION = -rc6
> $
>
> I should probably pick this up again and push it forwards. FWIW,
> I've attached the first multipage-write infrastructure patch from
> the above branch to show how this sort of operation needs to be done
> from a filesystem and page-cache perspective to avoid locking
> problems and have sane error handling.
>
> I believe the version that Christoph implemented for a couple of
> OEMs around that time de-multiplexed the ->iomap method....
>
> Cheers,
>
> Dave.

I have Christoph's version here and, between other tasks, I'm working on
figuring out how it all works and writing GFS2 support for it. I'd more
or less got that complete for your version, but there are a number of
differences in Christoph's code, and it is taking me a while to ensure
that I've not missed any corner cases and to figure out how to fit some
of GFS2's odd write modes into the framework.

Steve.
* Re: Recvfile patch used for Samba.
From: Dave Chinner
Date: 2013-07-26 4:42 UTC
To: Steven Whitehouse
Cc: Jeremy Allison, Steve French, Jeff Layton, linux-cifs, LKML, linux-fsdevel

On Thu, Jul 25, 2013 at 09:17:01AM +0100, Steven Whitehouse wrote:
> Hi,
>
> On Wed, 2013-07-24 at 12:47 +1000, Dave Chinner wrote:
> > On Tue, Jul 23, 2013 at 02:58:58PM -0700, Jeremy Allison wrote:
> > > Having said that, the OEMs that are using it do
> > > find it improves write speeds by a large amount (10%
> > > or more), so it's showing there is room for improvement
> > > here if the correct code can be created for recvfile.
> >
> > 10% is not very large gain given the complexity it adds, and I
> > question that the gain actually comes from moving the memcpy() into
> > the kernel. If this recvfile code enabled zero-copy behaviour into
> > the page cache, then it would be worth pursuing. But it doesn't, and
> > so IMO the complexity is not worth the gain right now.
> >
> > Indeed, I suspect the 10% gain will be from the multi-page write
> > behaviour that was hacked into the code. I wrote a multi-page
> > write prototype ~3 years ago that showed write(2) performance gains
> > of roughly 10% on low CPU power machines running XFS.
...
> > I should probably pick this up again and push it forwards. FWIW,
> > I've attached the first multipage-write infrastructure patch from
> > the above branch to show how this sort of operation needs to be done
> > from a filesystem and page-cache perspective to avoid locking
> > problems and have sane error handling.
> >
> > I believe the version that Christoph implemented for a couple of
> > OEMs around that time de-multiplexed the ->iomap method....
>
> I have Christoph's version here and, between other tasks, I'm working on
> figuring out how it all works and writing GFS2 support for it. I'd more
> or less got that complete for your version, but there are a number of
> differences in Christoph's code, and it is taking me a while to ensure
> that I've not missed any corner cases and to figure out how to fit some
> of GFS2's odd write modes into the framework.

Can you send me Christoph's version so I can have a look at the
differences? I'm pretty sure there isn't anything architecturally
different, but I've never seen it so I don't know exactly how it
differs...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com