Netdev List
 help / color / mirror / Atom feed
From: Askar Safin <safinaskar@gmail.com>
To: linux-fsdevel@vger.kernel.org,
	Christian Brauner <brauner@kernel.org>,
	Alexander Viro <viro@zeniv.linux.org.uk>, Jan Kara <jack@suse.cz>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-api@vger.kernel.org, netdev@vger.kernel.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Matthew Wilcox <willy@infradead.org>,
	Jens Axboe <axboe@kernel.dk>,
	Christoph Hellwig <hch@infradead.org>,
	David Howells <dhowells@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	David Hildenbrand <david@kernel.org>,
	Pedro Falcato <pfalcato@suse.de>,
	Miklos Szeredi <miklos@szeredi.hu>,
	patches@lists.linux.dev
Subject: [PATCH 2/3] vmsplice: make vmsplice a trivial wrapper for preadv2/pwritev2
Date: Sun, 31 May 2026 01:01:06 +0000	[thread overview]
Message-ID: <20260531010107.1953702-3-safinaskar@gmail.com> (raw)
In-Reply-To: <20260531010107.1953702-1-safinaskar@gmail.com>

vmsplice behavior on writable pipe became equivalent to pwritev2.
vmsplice behavior on readable pipe already was nearly
equivalent to preadv2, but I made this explicit. I. e. I made it
obvious from code that vmsplice now is equivalent to preadv2/pwritev2.

Also I moved vmsplice to fs/read_write.c, because now it arguably
belongs there.

Note that SPLICE_F_NONBLOCK behavior slightly changed: previously
vmsplice ignored whether the pipe was opened with O_NONBLOCK, and mode
of operation depended on whether SPLICE_F_NONBLOCK was passed only.
Now the operation will be non-blocking if O_NONBLOCK was passed when
opening *or* SPLICE_F_NONBLOCK was passed to vmsplice. Previous
behavior was arguably buggy, and new behavior is arguably better.

Now SPLICE_F_GIFT is always ignored by all 3 syscalls: splice, tee
and vmsplice.

Signed-off-by: Askar Safin <safinaskar@gmail.com>
---
 fs/read_write.c          |  23 +++++
 fs/splice.c              | 192 +--------------------------------------
 include/linux/skbuff.h   |   4 +-
 include/linux/splice.h   |   2 +-
 include/linux/syscalls.h |   4 +-
 5 files changed, 29 insertions(+), 196 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 50bff7edc91f..1e5444f4dab3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1213,6 +1213,29 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
 	return do_pwritev(fd, vec, vlen, pos, flags);
 }
 
+/*
+ * Legacy preadv2/pwritev2 wrapper.
+ */
+SYSCALL_DEFINE4(vmsplice, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned int, flags)
+{
+	if (unlikely(flags & ~SPLICE_F_ALL))
+		return -EINVAL;
+
+	CLASS(fd, f)(fd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	/* We do do_writev/do_readv, so it is okay to pass "false" here */
+	if (!get_pipe_info(fd_file(f), /* for_splice = */ false))
+		return -EBADF;
+
+	if (fd_file(f)->f_mode & FMODE_WRITE)
+		return do_writev(fd, vec, vlen, (flags & SPLICE_F_NONBLOCK) ? RWF_NOWAIT : 0);
+	else
+		return do_readv(fd, vec, vlen, (flags & SPLICE_F_NONBLOCK) ? RWF_NOWAIT : 0);
+}
+
 /*
  * Various compat syscalls.  Note that they all pretend to take a native
  * iovec - import_iovec will properly treat those as compat_iovecs based on
diff --git a/fs/splice.c b/fs/splice.c
index 59adbc2fa4d6..b1a4e3713bd6 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -159,22 +159,6 @@ const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.get		= generic_pipe_buf_get,
 };
 
-static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
-		struct pipe_buffer *buf)
-{
-	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
-		return false;
-
-	buf->flags |= PIPE_BUF_FLAG_LRU;
-	return generic_pipe_buf_try_steal(pipe, buf);
-}
-
-static const struct pipe_buf_operations user_page_pipe_buf_ops = {
-	.release	= page_cache_pipe_buf_release,
-	.try_steal	= user_page_pipe_buf_try_steal,
-	.get		= generic_pipe_buf_get,
-};
-
 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
 {
 	smp_mb();
@@ -589,8 +573,7 @@ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_des
  * Description:
  *    This function does little more than loop over the pipe and call
  *    @actor to do the actual moving of a single struct pipe_buffer to
- *    the desired destination. See pipe_to_file, pipe_to_sendmsg, or
- *    pipe_to_user.
+ *    the desired destination. See pipe_to_file or pipe_to_sendmsg.
  *
  */
 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
@@ -1440,179 +1423,6 @@ static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
 	return ret;
 }
 
-static ssize_t iter_to_pipe(struct iov_iter *from,
-			    struct pipe_inode_info *pipe,
-			    unsigned int flags)
-{
-	struct pipe_buffer buf = {
-		.ops = &user_page_pipe_buf_ops,
-		.flags = flags
-	};
-	size_t total = 0;
-	ssize_t ret = 0;
-
-	while (iov_iter_count(from)) {
-		struct page *pages[16];
-		ssize_t left;
-		size_t start;
-		int i, n;
-
-		left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
-		if (left <= 0) {
-			ret = left;
-			break;
-		}
-
-		n = DIV_ROUND_UP(left + start, PAGE_SIZE);
-		for (i = 0; i < n; i++) {
-			int size = umin(left, PAGE_SIZE - start);
-
-			buf.page = pages[i];
-			buf.offset = start;
-			buf.len = size;
-			ret = add_to_pipe(pipe, &buf);
-			if (unlikely(ret < 0)) {
-				iov_iter_revert(from, left);
-				// this one got dropped by add_to_pipe()
-				while (++i < n)
-					put_page(pages[i]);
-				goto out;
-			}
-			total += ret;
-			left -= size;
-			start = 0;
-		}
-	}
-out:
-	return total ? total : ret;
-}
-
-static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
-			struct splice_desc *sd)
-{
-	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
-	return n == sd->len ? n : -EFAULT;
-}
-
-/*
- * For lack of a better implementation, implement vmsplice() to userspace
- * as a simple copy of the pipe's pages to the user iov.
- */
-static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
-				unsigned int flags)
-{
-	struct pipe_inode_info *pipe = get_pipe_info(file, true);
-	struct splice_desc sd = {
-		.total_len = iov_iter_count(iter),
-		.flags = flags,
-		.u.data = iter
-	};
-	ssize_t ret = 0;
-
-	if (!pipe)
-		return -EBADF;
-
-	pipe_clear_nowait(file);
-
-	if (sd.total_len) {
-		pipe_lock(pipe);
-		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
-		pipe_unlock(pipe);
-	}
-
-	if (ret > 0)
-		fsnotify_access(file);
-
-	return ret;
-}
-
-/*
- * vmsplice splices a user address range into a pipe. It can be thought of
- * as splice-from-memory, where the regular splice is splice-from-file (or
- * to file). In both cases the output is a pipe, naturally.
- */
-static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
-				unsigned int flags)
-{
-	struct pipe_inode_info *pipe;
-	ssize_t ret = 0;
-	unsigned buf_flag = 0;
-
-	if (flags & SPLICE_F_GIFT)
-		buf_flag = PIPE_BUF_FLAG_GIFT;
-
-	pipe = get_pipe_info(file, true);
-	if (!pipe)
-		return -EBADF;
-
-	pipe_clear_nowait(file);
-
-	pipe_lock(pipe);
-	ret = wait_for_space(pipe, flags);
-	if (!ret)
-		ret = iter_to_pipe(iter, pipe, buf_flag);
-	pipe_unlock(pipe);
-	if (ret > 0) {
-		wakeup_pipe_readers(pipe);
-		fsnotify_modify(file);
-	}
-	return ret;
-}
-
-/*
- * Note that vmsplice only really supports true splicing _from_ user memory
- * to a pipe, not the other way around. Splicing from user memory is a simple
- * operation that can be supported without any funky alignment restrictions
- * or nasty vm tricks. We simply map in the user memory and fill them into
- * a pipe. The reverse isn't quite as easy, though. There are two possible
- * solutions for that:
- *
- *	- memcpy() the data internally, at which point we might as well just
- *	  do a regular read() on the buffer anyway.
- *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
- *	  has restriction limitations on both ends of the pipe).
- *
- * Currently we punt and implement it as a normal copy, see pipe_to_user().
- *
- */
-SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
-		unsigned long, nr_segs, unsigned int, flags)
-{
-	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov = iovstack;
-	struct iov_iter iter;
-	ssize_t error;
-	int type;
-
-	if (unlikely(flags & ~SPLICE_F_ALL))
-		return -EINVAL;
-
-	CLASS(fd, f)(fd);
-	if (fd_empty(f))
-		return -EBADF;
-	if (fd_file(f)->f_mode & FMODE_WRITE)
-		type = ITER_SOURCE;
-	else if (fd_file(f)->f_mode & FMODE_READ)
-		type = ITER_DEST;
-	else
-		return -EBADF;
-
-	error = import_iovec(type, uiov, nr_segs,
-			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (error < 0)
-		return error;
-
-	if (!iov_iter_count(&iter))
-		error = 0;
-	else if (type == ITER_SOURCE)
-		error = vmsplice_to_pipe(fd_file(f), &iter, flags);
-	else
-		error = vmsplice_to_user(fd_file(f), &iter, flags);
-
-	kfree(iov);
-	return error;
-}
-
 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 		int, fd_out, loff_t __user *, off_out,
 		size_t, len, unsigned int, flags)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2bcf78a4de7b..2961fee3e5cc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -505,7 +505,7 @@ enum {
 	SKBFL_ZEROCOPY_ENABLE = BIT(0),
 
 	/* This indicates at least one fragment might be overwritten
-	 * (as in vmsplice(), sendfile() ...)
+	 * (as in sendfile(), ...)
 	 * If we need to compute a TX checksum, we'll need to copy
 	 * all frags to avoid possible bad checksum
 	 */
@@ -4017,7 +4017,7 @@ static inline int skb_linearize(struct sk_buff *skb)
  * @skb: buffer to test
  *
  * Return: true if the skb has at least one frag that might be modified
- * by an external entity (as in vmsplice()/sendfile())
+ * by an external entity (as in sendfile())
  */
 static inline bool skb_has_shared_frag(const struct sk_buff *skb)
 {
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 9dec4861d09f..fb4f035aae83 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -19,7 +19,7 @@
 				 /* we may still block on the fd we splice */
 				 /* from/to, of course */
 #define SPLICE_F_MORE	(0x04)	/* expect more data */
-#define SPLICE_F_GIFT	(0x08)	/* pages passed in are a gift */
+#define SPLICE_F_GIFT	(0x08)	/* ignored */
 
 #define SPLICE_F_ALL (SPLICE_F_MOVE|SPLICE_F_NONBLOCK|SPLICE_F_MORE|SPLICE_F_GIFT)
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f5639d5ac331..a86a88207956 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -514,8 +514,8 @@ asmlinkage long sys_ppoll_time32(struct pollfd __user *, unsigned int,
 			  struct old_timespec32 __user *, const sigset_t __user *,
 			  size_t);
 asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags);
-asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
-			     unsigned long nr_segs, unsigned int flags);
+asmlinkage long sys_vmsplice(unsigned long fd, const struct iovec __user *vec,
+			     unsigned long vlen, unsigned int flags);
 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags);
-- 
2.47.3


  parent reply	other threads:[~2026-05-31  1:02 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-31  1:01 [PATCH 0/3] vmsplice: make vmsplice a trivial wrapper for preadv2/pwritev2 Askar Safin
2026-05-31  1:01 ` [PATCH 1/3] tee: fs/splice.c: remove unused parameter "flags" from "link_pipe" Askar Safin
2026-05-31  1:01 ` Askar Safin [this message]
2026-05-31  1:01 ` [PATCH 3/3] splice: remove PIPE_BUF_FLAG_GIFT Askar Safin
2026-05-31  8:54 ` [PATCH 0/3] vmsplice: make vmsplice a trivial wrapper for preadv2/pwritev2 Pedro Falcato

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260531010107.1953702-3-safinaskar@gmail.com \
    --to=safinaskar@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=axboe@kernel.dk \
    --cc=brauner@kernel.org \
    --cc=david@kernel.org \
    --cc=dhowells@redhat.com \
    --cc=hch@infradead.org \
    --cc=jack@suse.cz \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=miklos@szeredi.hu \
    --cc=netdev@vger.kernel.org \
    --cc=patches@lists.linux.dev \
    --cc=pfalcato@suse.de \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox