From: Askar Safin <safinaskar@gmail.com>
To: linux-fsdevel@vger.kernel.org,
Christian Brauner <brauner@kernel.org>,
Alexander Viro <viro@zeniv.linux.org.uk>, Jan Kara <jack@suse.cz>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
linux-api@vger.kernel.org, netdev@vger.kernel.org,
Linus Torvalds <torvalds@linux-foundation.org>,
Matthew Wilcox <willy@infradead.org>,
Jens Axboe <axboe@kernel.dk>,
Christoph Hellwig <hch@infradead.org>,
David Howells <dhowells@redhat.com>,
Andrew Morton <akpm@linux-foundation.org>,
David Hildenbrand <david@kernel.org>,
Pedro Falcato <pfalcato@suse.de>,
Miklos Szeredi <miklos@szeredi.hu>,
patches@lists.linux.dev
Subject: [PATCH 2/3] vmsplice: make vmsplice a trivial wrapper for preadv2/pwritev2
Date: Sun, 31 May 2026 01:01:06 +0000 [thread overview]
Message-ID: <20260531010107.1953702-3-safinaskar@gmail.com> (raw)
In-Reply-To: <20260531010107.1953702-1-safinaskar@gmail.com>
vmsplice behavior on a writable pipe is now equivalent to pwritev2.
vmsplice behavior on a readable pipe was already nearly equivalent
to preadv2, but I made this explicit, i.e. I made it obvious from
the code that vmsplice is now equivalent to preadv2/pwritev2.
I also moved vmsplice to fs/read_write.c, because it now arguably
belongs there.
Note that SPLICE_F_NONBLOCK behavior has changed slightly: previously
vmsplice ignored whether the pipe was opened with O_NONBLOCK, and the
mode of operation depended only on whether SPLICE_F_NONBLOCK was
passed. Now the operation is non-blocking if the pipe was opened with
O_NONBLOCK *or* SPLICE_F_NONBLOCK was passed to vmsplice. The previous
behavior was arguably buggy, and the new behavior is arguably better.
SPLICE_F_GIFT is now always ignored by all three syscalls: splice,
tee and vmsplice.
Signed-off-by: Askar Safin <safinaskar@gmail.com>
---
fs/read_write.c | 23 +++++
fs/splice.c | 192 +--------------------------------------
include/linux/skbuff.h | 4 +-
include/linux/splice.h | 2 +-
include/linux/syscalls.h | 4 +-
5 files changed, 29 insertions(+), 196 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 50bff7edc91f..1e5444f4dab3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1213,6 +1213,29 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
return do_pwritev(fd, vec, vlen, pos, flags);
}
+/*
+ * Legacy preadv2/pwritev2 wrapper.
+ */
+SYSCALL_DEFINE4(vmsplice, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned int, flags)
+{
+ if (unlikely(flags & ~SPLICE_F_ALL))
+ return -EINVAL;
+
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ /* We do do_writev/do_readv, so it is okay to pass "false" here */
+ if (!get_pipe_info(fd_file(f), /* for_splice = */ false))
+ return -EBADF;
+
+ if (fd_file(f)->f_mode & FMODE_WRITE)
+ return do_writev(fd, vec, vlen, (flags & SPLICE_F_NONBLOCK) ? RWF_NOWAIT : 0);
+ else
+ return do_readv(fd, vec, vlen, (flags & SPLICE_F_NONBLOCK) ? RWF_NOWAIT : 0);
+}
+
/*
* Various compat syscalls. Note that they all pretend to take a native
* iovec - import_iovec will properly treat those as compat_iovecs based on
diff --git a/fs/splice.c b/fs/splice.c
index 59adbc2fa4d6..b1a4e3713bd6 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -159,22 +159,6 @@ const struct pipe_buf_operations page_cache_pipe_buf_ops = {
.get = generic_pipe_buf_get,
};
-static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
- return false;
-
- buf->flags |= PIPE_BUF_FLAG_LRU;
- return generic_pipe_buf_try_steal(pipe, buf);
-}
-
-static const struct pipe_buf_operations user_page_pipe_buf_ops = {
- .release = page_cache_pipe_buf_release,
- .try_steal = user_page_pipe_buf_try_steal,
- .get = generic_pipe_buf_get,
-};
-
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
smp_mb();
@@ -589,8 +573,7 @@ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_des
* Description:
* This function does little more than loop over the pipe and call
* @actor to do the actual moving of a single struct pipe_buffer to
- * the desired destination. See pipe_to_file, pipe_to_sendmsg, or
- * pipe_to_user.
+ * the desired destination. See pipe_to_file or pipe_to_sendmsg.
*
*/
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
@@ -1440,179 +1423,6 @@ static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
return ret;
}
-static ssize_t iter_to_pipe(struct iov_iter *from,
- struct pipe_inode_info *pipe,
- unsigned int flags)
-{
- struct pipe_buffer buf = {
- .ops = &user_page_pipe_buf_ops,
- .flags = flags
- };
- size_t total = 0;
- ssize_t ret = 0;
-
- while (iov_iter_count(from)) {
- struct page *pages[16];
- ssize_t left;
- size_t start;
- int i, n;
-
- left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
- if (left <= 0) {
- ret = left;
- break;
- }
-
- n = DIV_ROUND_UP(left + start, PAGE_SIZE);
- for (i = 0; i < n; i++) {
- int size = umin(left, PAGE_SIZE - start);
-
- buf.page = pages[i];
- buf.offset = start;
- buf.len = size;
- ret = add_to_pipe(pipe, &buf);
- if (unlikely(ret < 0)) {
- iov_iter_revert(from, left);
- // this one got dropped by add_to_pipe()
- while (++i < n)
- put_page(pages[i]);
- goto out;
- }
- total += ret;
- left -= size;
- start = 0;
- }
- }
-out:
- return total ? total : ret;
-}
-
-static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
- struct splice_desc *sd)
-{
- int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
- return n == sd->len ? n : -EFAULT;
-}
-
-/*
- * For lack of a better implementation, implement vmsplice() to userspace
- * as a simple copy of the pipe's pages to the user iov.
- */
-static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
- unsigned int flags)
-{
- struct pipe_inode_info *pipe = get_pipe_info(file, true);
- struct splice_desc sd = {
- .total_len = iov_iter_count(iter),
- .flags = flags,
- .u.data = iter
- };
- ssize_t ret = 0;
-
- if (!pipe)
- return -EBADF;
-
- pipe_clear_nowait(file);
-
- if (sd.total_len) {
- pipe_lock(pipe);
- ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
- pipe_unlock(pipe);
- }
-
- if (ret > 0)
- fsnotify_access(file);
-
- return ret;
-}
-
-/*
- * vmsplice splices a user address range into a pipe. It can be thought of
- * as splice-from-memory, where the regular splice is splice-from-file (or
- * to file). In both cases the output is a pipe, naturally.
- */
-static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
- unsigned int flags)
-{
- struct pipe_inode_info *pipe;
- ssize_t ret = 0;
- unsigned buf_flag = 0;
-
- if (flags & SPLICE_F_GIFT)
- buf_flag = PIPE_BUF_FLAG_GIFT;
-
- pipe = get_pipe_info(file, true);
- if (!pipe)
- return -EBADF;
-
- pipe_clear_nowait(file);
-
- pipe_lock(pipe);
- ret = wait_for_space(pipe, flags);
- if (!ret)
- ret = iter_to_pipe(iter, pipe, buf_flag);
- pipe_unlock(pipe);
- if (ret > 0) {
- wakeup_pipe_readers(pipe);
- fsnotify_modify(file);
- }
- return ret;
-}
-
-/*
- * Note that vmsplice only really supports true splicing _from_ user memory
- * to a pipe, not the other way around. Splicing from user memory is a simple
- * operation that can be supported without any funky alignment restrictions
- * or nasty vm tricks. We simply map in the user memory and fill them into
- * a pipe. The reverse isn't quite as easy, though. There are two possible
- * solutions for that:
- *
- * - memcpy() the data internally, at which point we might as well just
- * do a regular read() on the buffer anyway.
- * - Lots of nasty vm tricks, that are neither fast nor flexible (it
- * has restriction limitations on both ends of the pipe).
- *
- * Currently we punt and implement it as a normal copy, see pipe_to_user().
- *
- */
-SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
- unsigned long, nr_segs, unsigned int, flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t error;
- int type;
-
- if (unlikely(flags & ~SPLICE_F_ALL))
- return -EINVAL;
-
- CLASS(fd, f)(fd);
- if (fd_empty(f))
- return -EBADF;
- if (fd_file(f)->f_mode & FMODE_WRITE)
- type = ITER_SOURCE;
- else if (fd_file(f)->f_mode & FMODE_READ)
- type = ITER_DEST;
- else
- return -EBADF;
-
- error = import_iovec(type, uiov, nr_segs,
- ARRAY_SIZE(iovstack), &iov, &iter);
- if (error < 0)
- return error;
-
- if (!iov_iter_count(&iter))
- error = 0;
- else if (type == ITER_SOURCE)
- error = vmsplice_to_pipe(fd_file(f), &iter, flags);
- else
- error = vmsplice_to_user(fd_file(f), &iter, flags);
-
- kfree(iov);
- return error;
-}
-
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
int, fd_out, loff_t __user *, off_out,
size_t, len, unsigned int, flags)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2bcf78a4de7b..2961fee3e5cc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -505,7 +505,7 @@ enum {
SKBFL_ZEROCOPY_ENABLE = BIT(0),
/* This indicates at least one fragment might be overwritten
- * (as in vmsplice(), sendfile() ...)
+ * (as in sendfile(), ...)
* If we need to compute a TX checksum, we'll need to copy
* all frags to avoid possible bad checksum
*/
@@ -4017,7 +4017,7 @@ static inline int skb_linearize(struct sk_buff *skb)
* @skb: buffer to test
*
* Return: true if the skb has at least one frag that might be modified
- * by an external entity (as in vmsplice()/sendfile())
+ * by an external entity (as in sendfile())
*/
static inline bool skb_has_shared_frag(const struct sk_buff *skb)
{
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 9dec4861d09f..fb4f035aae83 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -19,7 +19,7 @@
/* we may still block on the fd we splice */
/* from/to, of course */
#define SPLICE_F_MORE (0x04) /* expect more data */
-#define SPLICE_F_GIFT (0x08) /* pages passed in are a gift */
+#define SPLICE_F_GIFT (0x08) /* ignored */
#define SPLICE_F_ALL (SPLICE_F_MOVE|SPLICE_F_NONBLOCK|SPLICE_F_MORE|SPLICE_F_GIFT)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f5639d5ac331..a86a88207956 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -514,8 +514,8 @@ asmlinkage long sys_ppoll_time32(struct pollfd __user *, unsigned int,
struct old_timespec32 __user *, const sigset_t __user *,
size_t);
asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags);
-asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
- unsigned long nr_segs, unsigned int flags);
+asmlinkage long sys_vmsplice(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, unsigned int flags);
asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
int fd_out, loff_t __user *off_out,
size_t len, unsigned int flags);
--
2.47.3
next prev parent reply other threads:[~2026-05-31 1:02 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-31 1:01 [PATCH 0/3] vmsplice: make vmsplice a trivial wrapper for preadv2/pwritev2 Askar Safin
2026-05-31 1:01 ` [PATCH 1/3] tee: fs/splice.c: remove unused parameter "flags" from "link_pipe" Askar Safin
2026-05-31 1:01 ` Askar Safin [this message]
2026-05-31 1:01 ` [PATCH 3/3] splice: remove PIPE_BUF_FLAG_GIFT Askar Safin
2026-05-31 8:54 ` [PATCH 0/3] vmsplice: make vmsplice a trivial wrapper for preadv2/pwritev2 Pedro Falcato
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260531010107.1953702-3-safinaskar@gmail.com \
--to=safinaskar@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=axboe@kernel.dk \
--cc=brauner@kernel.org \
--cc=david@kernel.org \
--cc=dhowells@redhat.com \
--cc=hch@infradead.org \
--cc=jack@suse.cz \
--cc=linux-api@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=miklos@szeredi.hu \
--cc=netdev@vger.kernel.org \
--cc=patches@lists.linux.dev \
--cc=pfalcato@suse.de \
--cc=torvalds@linux-foundation.org \
--cc=viro@zeniv.linux.org.uk \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox