From: David Howells <dhowells@redhat.com>
To: Jens Axboe <axboe@kernel.dk>, Al Viro <viro@zeniv.linux.org.uk>,
Christoph Hellwig <hch@infradead.org>
Cc: dhowells@redhat.com, Matthew Wilcox <willy@infradead.org>,
Jan Kara <jack@suse.cz>, Jeff Layton <jlayton@kernel.org>,
David Hildenbrand <david@redhat.com>,
Jason Gunthorpe <jgg@nvidia.com>,
Logan Gunthorpe <logang@deltatee.com>,
Hillf Danton <hdanton@sina.com>,
Christian Brauner <brauner@kernel.org>,
Linus Torvalds <torvalds@linux-foundation.org>,
linux-fsdevel@vger.kernel.org, linux-block@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: [PATCH] iov_iter: Add automatic-alloc for ITER_BVEC and use in direct_splice_read()
Date: Fri, 19 May 2023 08:49:18 +0100 [thread overview]
Message-ID: <1740264.1684482558@warthog.procyon.org.uk> (raw)
In-Reply-To: <20230519074047.1739879-1-dhowells@redhat.com>
If it's a problem that direct_splice_read() always allocates as much memory as
is asked for and that will fit into the pipe when less could be allocated in
the case that, say, an O_DIRECT-read will hit a hole and do a short read or a
socket will return less than was asked for, something like the attached
modification to ITER_BVEC could be made.
David
---
iov_iter: Add automatic-alloc for ITER_BVEC and use in direct_splice_read()
Add a flag to the iov_iter struct that tells things that write to or allow
writing to a BVEC-type iterator that they should allocate pages to fill in
any slots in the bio_vec array that have null page pointers. This allows
the buffer space in the bvec to be allocated on demand.
Iterators of this type are initialised with iov_iter_bvec_autoalloc()
instead of iov_iter_bvec(). Only destination (ie. READ/ITER_DEST)
iterators may be used in this fashion.
An additional function, iov_iter_auto_alloc(), is provided to perform the
allocation in the case that the caller wishes to make use of the bio_vec
array directly and the block layer is modified to use it.
direct_splice_read() is then modified to make use of this. This is less
efficient if we know in advance that we want to allocate the full buffer, as
we can't so easily use bulk allocation, but it does mean that in cases where we
might not want the full buffer (say we hit a hole in DIO), we don't have to
allocate it.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Jens Axboe <axboe@kernel.dk>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: David Hildenbrand <david@redhat.com>
cc: John Hubbard <jhubbard@nvidia.com>
cc: linux-mm@kvack.org
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
---
block/bio.c | 2
fs/splice.c | 36 ++++++-----------
include/linux/uio.h | 13 ++++--
lib/iov_iter.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 133 insertions(+), 28 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 798cc4cf3bd2..72d5c1125df2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1330,6 +1330,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
int ret = 0;
if (iov_iter_is_bvec(iter)) {
+ if (!iov_iter_auto_alloc(iter, iov_iter_count(iter)))
+ return -ENOMEM;
bio_iov_bvec_set(bio, iter);
iov_iter_advance(iter, bio->bi_iter.bi_size);
return 0;
diff --git a/fs/splice.c b/fs/splice.c
index 56d9802729d0..30e7a31c5ada 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -310,10 +310,8 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos,
struct iov_iter to;
struct bio_vec *bv;
struct kiocb kiocb;
- struct page **pages;
ssize_t ret;
- size_t used, npages, chunk, remain, keep = 0;
- int i;
+ size_t used, npages, chunk, remain, keep = 0, i;
if (!len)
return 0;
@@ -334,30 +332,14 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos,
len = min_t(size_t, len, npages * PAGE_SIZE);
npages = DIV_ROUND_UP(len, PAGE_SIZE);
- bv = kzalloc(array_size(npages, sizeof(bv[0])) +
- array_size(npages, sizeof(struct page *)), GFP_KERNEL);
+ bv = kzalloc(array_size(npages, sizeof(bv[0])), GFP_KERNEL);
if (!bv)
return -ENOMEM;
- pages = (struct page **)(bv + npages);
- npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
- if (!npages) {
- kfree(bv);
- return -ENOMEM;
- }
-
remain = len = min_t(size_t, len, npages * PAGE_SIZE);
- for (i = 0; i < npages; i++) {
- chunk = min_t(size_t, PAGE_SIZE, remain);
- bv[i].bv_page = pages[i];
- bv[i].bv_offset = 0;
- bv[i].bv_len = chunk;
- remain -= chunk;
- }
-
/* Do the I/O */
- iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
+ iov_iter_bvec_autoalloc(&to, ITER_DEST, bv, npages, len);
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
@@ -376,8 +358,16 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos,
}
/* Free any pages that didn't get touched at all. */
- if (keep < npages)
- release_pages(pages + keep, npages - keep);
+ if (keep < npages) {
+ struct page **pages = (struct page **)&bv[keep];
+ size_t j = 0;
+
+ for (i = keep; i < npages; i++)
+ if (bv[i].bv_page)
+ pages[j++] = bv[i].bv_page;
+ if (j)
+ release_pages(pages, j);
+ }
/* Push the remaining pages into the pipe. */
remain = ret;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 60c342bb7ab8..6bc2287021d9 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -40,10 +40,11 @@ struct iov_iter_state {
struct iov_iter {
u8 iter_type;
- bool copy_mc;
- bool nofault;
- bool data_source;
- bool user_backed;
+ bool copy_mc:1;
+ bool nofault:1;
+ bool data_source:1;
+ bool user_backed:1;
+ bool auto_alloc:1; /* Automatically alloc pages into a bvec */
union {
size_t iov_offset;
int last_offset;
@@ -263,6 +264,7 @@ static inline bool iov_iter_is_copy_mc(const struct iov_iter *i)
}
#endif
+bool iov_iter_auto_alloc(struct iov_iter *iter, size_t count);
size_t iov_iter_zero(size_t bytes, struct iov_iter *);
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
unsigned len_mask);
@@ -274,6 +276,9 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec
unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
unsigned long nr_segs, size_t count);
+void iov_iter_bvec_autoalloc(struct iov_iter *i, unsigned int direction,
+ const struct bio_vec *bvec, unsigned long nr_segs,
+ size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
loff_t start, size_t count);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f18138e0292a..3643f9d80ecc 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -52,7 +52,11 @@
while (n) { \
unsigned offset = p->bv_offset + skip; \
unsigned left; \
- void *kaddr = kmap_local_page(p->bv_page + \
+ void *kaddr; \
+ \
+ if (!p->bv_page) \
+ break; \
+ kaddr = kmap_local_page(p->bv_page + \
offset / PAGE_SIZE); \
base = kaddr + offset % PAGE_SIZE; \
len = min(min(n, (size_t)(p->bv_len - skip)), \
@@ -159,6 +163,49 @@ __out: \
#define iterate_and_advance(i, n, base, len, off, I, K) \
__iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
+/*
+ * Preallocate pages into the bvec sufficient to store count bytes.
+ */
+static bool bvec_auto_alloc(struct iov_iter *iter, size_t count)
+{
+ struct bio_vec *bvec = (struct bio_vec *)iter->bvec;
+ int j;
+
+ if (!count)
+ return true;
+
+ count += iter->iov_offset;
+ for (j = 0; j < iter->nr_segs; j++) {
+ if (!bvec[j].bv_page) {
+ bvec[j].bv_page = alloc_page(GFP_KERNEL);
+ if (!bvec[j].bv_page)
+ return false;
+ }
+ if (bvec[j].bv_len >= count)
+ break;
+ count -= bvec[j].bv_len;
+ }
+
+ return true;
+}
+
+/**
+ * iov_iter_auto_alloc - Perform auto-alloc for an iterator
+ * @iter: The iterator to do the allocation for
+ * @count: The number of bytes we need to store
+ *
+ * Perform auto-allocation on an iterator. This only works with ITER_BVEC-type
+ * iterators. It will make sure that sufficient pages are allocated to store
+ * the specified number of bytes. Returns true if sufficient pages are present
+ * in the bvec and false if an allocation failure occurs.
+ */
+bool iov_iter_auto_alloc(struct iov_iter *iter, size_t count)
+{
+ return !iov_iter_is_bvec(iter) || !iter->auto_alloc ||
+ bvec_auto_alloc(iter, count);
+}
+EXPORT_SYMBOL_GPL(iov_iter_auto_alloc);
+
static int copyout(void __user *to, const void *from, size_t n)
{
if (should_fail_usercopy())
@@ -313,6 +360,8 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
return 0;
if (user_backed_iter(i))
might_fault();
+ if (!iov_iter_auto_alloc(i, bytes))
+ return 0;
iterate_and_advance(i, bytes, base, len, off,
copyout(base, addr + off, len),
memcpy(base, addr + off, len)
@@ -362,6 +411,8 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
return 0;
if (user_backed_iter(i))
might_fault();
+ if (!iov_iter_auto_alloc(i, bytes))
+ return 0;
__iterate_and_advance(i, bytes, base, len, off,
copyout_mc(base, addr + off, len),
copy_mc_to_kernel(base, addr + off, len)
@@ -503,6 +554,8 @@ size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t byte
return 0;
if (WARN_ON_ONCE(i->data_source))
return 0;
+ if (!iov_iter_auto_alloc(i, bytes))
+ return 0;
page += offset / PAGE_SIZE; // first subpage
offset %= PAGE_SIZE;
while (1) {
@@ -557,6 +610,8 @@ EXPORT_SYMBOL(copy_page_from_iter);
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
+ if (!iov_iter_auto_alloc(i, bytes))
+ return -ENOMEM;
iterate_and_advance(i, bytes, base, len, count,
clear_user(base, len),
memset(base, 0, len)
@@ -598,6 +653,7 @@ static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
size += i->iov_offset;
for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
+ BUG_ON(!bvec->bv_page);
if (likely(size < bvec->bv_len))
break;
size -= bvec->bv_len;
@@ -740,6 +796,51 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
}
EXPORT_SYMBOL(iov_iter_bvec);
+/**
+ * iov_iter_bvec_autoalloc - Initialise a BVEC-type I/O iterator with automatic alloc
+ * @i: The iterator to initialise.
+ * @direction: The direction of the transfer.
+ * @bvec: The array of bio_vecs listing the buffer segments
+ * @nr_segs: The number of segments in @bvec[].
+ * @count: The size of the I/O buffer in bytes.
+ *
+ * Set up an I/O iterator to insert pages into a bvec as data is written into
+ * it where NULL pointers exist in the bvec array (if a pointer isn't NULL, the
+ * page it points to will just be used). No more than @nr_segs pages will be
+ * filled in. Empty slots will have bv_offset set to 0 and bv_len to
+ * PAGE_SIZE.
+ *
+ * If the iterator is reverted, excess pages will be left for the
+ * caller to clean up.
+ */
+void iov_iter_bvec_autoalloc(struct iov_iter *i, unsigned int direction,
+ const struct bio_vec *bvec, unsigned long nr_segs,
+ size_t count)
+{
+ struct bio_vec *bv = (struct bio_vec *)bvec;
+ unsigned long j;
+
+ BUG_ON(direction != READ);
+ *i = (struct iov_iter){
+ .iter_type = ITER_BVEC,
+ .copy_mc = false,
+ .data_source = direction,
+ .auto_alloc = true,
+ .bvec = bvec,
+ .nr_segs = nr_segs,
+ .iov_offset = 0,
+ .count = count
+ };
+
+ for (j = 0; j < nr_segs; j++) {
+ if (!bv[j].bv_page) {
+ bv[j].bv_offset = 0;
+ bv[j].bv_len = PAGE_SIZE;
+ }
+ }
+}
+EXPORT_SYMBOL(iov_iter_bvec_autoalloc);
+
/**
* iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
* @i: The iterator to initialise.
@@ -1122,6 +1223,8 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
struct page **p;
struct page *page;
+ if (!iov_iter_auto_alloc(i, maxsize))
+ return -ENOMEM;
page = first_bvec_segment(i, &maxsize, start);
n = want_pages_array(pages, maxsize, *start, maxpages);
if (!n)
@@ -1226,6 +1329,8 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
csstate->off += bytes;
return bytes;
}
+ if (!iov_iter_auto_alloc(i, bytes))
+ return -ENOMEM;
sum = csum_shift(csstate->csum, csstate->off);
iterate_and_advance(i, bytes, base, len, off, ({
@@ -1664,6 +1769,9 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
size_t skip = i->iov_offset, offset;
int k;
+ if (!iov_iter_auto_alloc(i, maxsize))
+ return -ENOMEM;
+
for (;;) {
if (i->nr_segs == 0)
return 0;
next prev parent reply other threads:[~2023-05-19 7:52 UTC|newest]
Thread overview: 63+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-05-19 7:40 [PATCH v20 00/32] splice, block: Use page pinning and kill ITER_PIPE David Howells
2023-05-19 7:40 ` [PATCH v20 01/32] splice: Fix filemap of a blockdev David Howells
2023-05-19 8:06 ` Christoph Hellwig
2023-05-19 9:11 ` David Howells
2023-05-19 7:40 ` [PATCH v20 02/32] splice: Clean up direct_splice_read() a bit David Howells
2023-05-19 7:40 ` [PATCH v20 03/32] splice: Make direct_read_splice() limit to eof where appropriate David Howells
2023-05-19 8:09 ` Christoph Hellwig
2023-05-19 8:43 ` David Howells
2023-05-19 8:47 ` Christoph Hellwig
2023-05-19 22:27 ` David Howells
2023-05-20 3:54 ` Christoph Hellwig
2023-05-19 16:31 ` Linus Torvalds
2023-05-19 16:47 ` David Howells
2023-05-19 17:37 ` Linus Torvalds
2023-05-19 7:40 ` [PATCH v20 04/32] splice: Make do_splice_to() generic and export it David Howells
2023-05-19 7:40 ` [PATCH v20 05/32] splice: Make splice from a DAX file use direct_splice_read() David Howells
2023-05-19 8:10 ` Christoph Hellwig
2023-05-19 8:48 ` David Howells
2023-05-19 7:40 ` [PATCH v20 06/32] shmem: Implement splice-read David Howells
2023-05-19 7:40 ` [PATCH v20 07/32] overlayfs: " David Howells
2023-05-19 7:40 ` [PATCH v20 08/32] coda: " David Howells
2023-05-19 7:40 ` [PATCH v20 09/32] tty, proc, kernfs, random: Use direct_splice_read() David Howells
2023-05-19 8:12 ` Christoph Hellwig
2023-05-19 16:22 ` Linus Torvalds
2023-05-19 7:40 ` [PATCH v20 10/32] net: Make sock_splice_read() use direct_splice_read() by default David Howells
2023-05-19 7:40 ` [PATCH v20 11/32] 9p: Add splice_read stub David Howells
2023-05-19 7:40 ` [PATCH v20 12/32] afs: Provide a splice-read stub David Howells
2023-05-19 7:40 ` [PATCH v20 13/32] ceph: " David Howells
2023-05-19 8:40 ` Xiubo Li
2023-05-19 9:24 ` David Howells
2023-05-22 1:53 ` Xiubo Li
2023-05-19 7:40 ` [PATCH v20 14/32] ecryptfs: " David Howells
2023-05-19 7:40 ` [PATCH v20 15/32] ext4: " David Howells
2023-05-19 7:40 ` [PATCH v20 16/32] f2fs: " David Howells
2023-05-19 7:40 ` [PATCH v20 17/32] nfs: " David Howells
2023-05-19 7:40 ` [PATCH v20 18/32] ntfs3: " David Howells
2023-05-19 7:40 ` [PATCH v20 19/32] ocfs2: " David Howells
2023-05-19 7:40 ` [PATCH v20 20/32] orangefs: " David Howells
2023-05-19 7:40 ` [PATCH v20 21/32] xfs: " David Howells
2023-05-19 7:40 ` [PATCH v20 22/32] zonefs: " David Howells
2023-05-19 7:40 ` [PATCH v20 23/32] splice: Convert trace/seq to use direct_splice_read() David Howells
2023-05-22 14:29 ` Steven Rostedt
2023-05-22 14:50 ` David Howells
2023-05-22 17:42 ` Linus Torvalds
2023-05-22 18:38 ` Steven Rostedt
2023-05-19 7:40 ` [PATCH v20 24/32] splice: Do splice read from a file without using ITER_PIPE David Howells
2023-05-19 7:40 ` [PATCH v20 25/32] cifs: Use generic_file_splice_read() David Howells
2023-05-19 7:40 ` [PATCH v20 26/32] iov_iter: Kill ITER_PIPE David Howells
2023-05-19 7:40 ` [PATCH v20 27/32] iomap: Don't get an reference on ZERO_PAGE for direct I/O block zeroing David Howells
2023-05-19 7:40 ` [PATCH v20 28/32] block: Fix bio_flagged() so that gcc can better optimise it David Howells
2023-05-19 7:40 ` [PATCH v20 29/32] block: Replace BIO_NO_PAGE_REF with BIO_PAGE_REFFED with inverted logic David Howells
2023-05-20 1:26 ` Kent Overstreet
2023-05-20 3:56 ` Christoph Hellwig
2023-05-20 4:13 ` Kent Overstreet
2023-05-20 4:17 ` Christoph Hellwig
2023-05-20 5:52 ` Kent Overstreet
2023-05-20 8:40 ` David Howells
2023-05-19 7:40 ` [PATCH v20 30/32] block: Add BIO_PAGE_PINNED and associated infrastructure David Howells
2023-05-19 7:40 ` [PATCH v20 31/32] block: Convert bio_iov_iter_get_pages to use iov_iter_extract_pages David Howells
2023-05-19 7:40 ` [PATCH v20 32/32] block: convert bio_map_user_iov " David Howells
2023-05-19 7:49 ` David Howells [this message]
2023-05-19 8:18 ` [PATCH] iov_iter: Add automatic-alloc for ITER_BVEC and use in direct_splice_read() Christoph Hellwig
2023-05-19 8:06 ` [PATCH v20 00/32] splice, block: Use page pinning and kill ITER_PIPE Christoph Hellwig
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1740264.1684482558@warthog.procyon.org.uk \
--to=dhowells@redhat.com \
--cc=axboe@kernel.dk \
--cc=brauner@kernel.org \
--cc=david@redhat.com \
--cc=hch@infradead.org \
--cc=hdanton@sina.com \
--cc=jack@suse.cz \
--cc=jgg@nvidia.com \
--cc=jlayton@kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=logang@deltatee.com \
--cc=torvalds@linux-foundation.org \
--cc=viro@zeniv.linux.org.uk \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).