From: Ritesh Harjani <riteshh@linux.ibm.com>
To: tytso@mit.edu, jack@suse.cz, linux-ext4@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org, mbobrowski@mbobrowski.org,
riteshh@linux.ibm.com
Subject: [RFC 4/5] ext4: Add support for blocksize < pagesize in dioread_nolock
Date: Wed, 16 Oct 2019 13:07:10 +0530 [thread overview]
Message-ID: <20191016073711.4141-5-riteshh@linux.ibm.com> (raw)
In-Reply-To: <20191016073711.4141-1-riteshh@linux.ibm.com>
This patch adds the support for blocksize < pagesize for
dioread_nolock feature.
Since in case of blocksize < pagesize, we can have multiple
small buffers of page as unwritten extents, we need to
maintain a vector of these unwritten extents which needs
the conversion after the IO is complete. Thus, we maintain
a list of tuple <offset, size> pair (io_end_vec) for this &
traverse this list to do the unwritten to written conversion.
Signed-off-by: Ritesh Harjani <riteshh@linux.ibm.com>
---
fs/ext4/ext4.h | 11 ++++++++--
fs/ext4/extents.c | 11 ++++++++--
fs/ext4/inode.c | 37 +++++++++++++++++----------------
fs/ext4/page-io.c | 52 +++++++++++++++++++++++++++++++++++++++--------
4 files changed, 82 insertions(+), 29 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8d924bd19ca7..3616f1b0c987 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -198,6 +198,12 @@ struct ext4_system_blocks {
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
+struct ext4_io_end_vec {
+ struct list_head list; /* list of io_end_vec */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
+};
+
/*
* For converting unwritten extents on a work queue. 'handle' is used for
* buffered writeback.
@@ -211,8 +217,7 @@ typedef struct ext4_io_end {
* bios covering the extent */
unsigned int flag; /* unwritten or not */
atomic_t count; /* reference counter */
- loff_t offset; /* offset in the file */
- ssize_t size; /* size of the extent */
+ struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -3324,6 +3329,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
int len,
struct writeback_control *wbc,
bool keep_towrite);
+extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 731e67ccab22..cf6c5f64cb58 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5005,6 +5005,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
{
int ret, err = 0;
+ struct ext4_io_end_vec *io_end_vec;
/*
* This is somewhat ugly but the idea is clear: When transaction is
@@ -5018,8 +5019,14 @@ int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
return PTR_ERR(handle);
}
- ret = ext4_convert_unwritten_extents(handle, io_end->inode,
- io_end->offset, io_end->size);
+ list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
+ ret = ext4_convert_unwritten_extents(handle, io_end->inode,
+ io_end_vec->offset,
+ io_end_vec->size);
+ if (ret)
+ break;
+ }
+
if (handle)
err = ext4_journal_stop(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3204a7d35581..026666bdc058 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2364,6 +2364,9 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
ext4_lblk_t lblk = *m_lblk;
ext4_fsblk_t pblock = *m_pblk;
int err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ssize_t io_end_size = 0;
+ struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
bh = head = page_buffers(page);
do {
@@ -2376,17 +2379,16 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
*/
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
- /*
- * FIXME: If dioread_nolock supports
- * blocksize < pagesize, we need to make
- * sure we add size mapped so far to
- * io_end->size as the following call
- * can submit the page for IO.
- */
err = mpage_process_page_bufs(mpd, head, bh, lblk);
if (err > 0)
err = 0;
+ if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ io_end_vec->offset = mpd->map.m_lblk << blkbits;
+ }
*map_bh = true;
goto out;
}
@@ -2395,13 +2397,11 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
bh->b_blocknr = pblock++;
}
clear_buffer_unwritten(bh);
+ io_end_size += (1 << blkbits);
} while (lblk++, (bh = bh->b_this_page) != head);
- /*
- * FIXME: This is going to break if dioread_nolock
- * supports blocksize < pagesize as we will try to
- * convert potentially unmapped parts of inode.
- */
- io_end->size += PAGE_SIZE;
+
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
*map_bh = false;
out:
*m_lblk = lblk;
@@ -2551,9 +2551,10 @@ static int mpage_map_and_submit_extent(handle_t *handle,
int err;
loff_t disksize;
int progress = 0;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ struct ext4_io_end_vec *io_end_vec = ext4_alloc_io_end_vec(io_end);
- mpd->io_submit.io_end->offset =
- ((loff_t)map->m_lblk) << inode->i_blkbits;
+ io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
@@ -3654,6 +3655,7 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
ext4_io_end_t *io_end = private;
+ struct ext4_io_end_vec *io_end_vec;
/* if not async direct IO just return */
if (!io_end)
@@ -3671,8 +3673,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ext4_clear_io_unwritten_flag(io_end);
size = 0;
}
- io_end->offset = offset;
- io_end->size = size;
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ io_end_vec->offset = offset;
+ io_end_vec->size = size;
ext4_put_io_end(io_end);
return 0;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 3ccf54a380b2..893913d1c2fe 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -31,18 +31,56 @@
#include "acl.h"
static struct kmem_cache *io_end_cachep;
+static struct kmem_cache *io_end_vec_cachep;
int __init ext4_init_pageio(void)
{
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
if (io_end_cachep == NULL)
return -ENOMEM;
+
+ io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
+ if (io_end_vec_cachep == NULL) {
+ kmem_cache_destroy(io_end_cachep);
+ return -ENOMEM;
+ }
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
+ kmem_cache_destroy(io_end_vec_cachep);
+}
+
+struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec;
+
+ io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
+ if (!io_end_vec)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&io_end_vec->list);
+ list_add_tail(&io_end_vec->list, &io_end->list_vec);
+ return io_end_vec;
+}
+
+static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec, *tmp;
+
+ if (list_empty(&io_end->list_vec))
+ return;
+ list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
+ list_del(&io_end_vec->list);
+ kmem_cache_free(io_end_vec_cachep, io_end_vec);
+ }
+}
+
+struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
+{
+ BUG_ON(list_empty(&io_end->list_vec));
+ return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
}
/*
@@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
ext4_finish_bio(bio);
bio_put(bio);
}
+ ext4_free_io_end_vec(io_end);
kmem_cache_free(io_end_cachep, io_end);
}
@@ -139,8 +178,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
static int ext4_end_io_end(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
- loff_t offset = io_end->offset;
- ssize_t size = io_end->size;
handle_t *handle = io_end->handle;
int ret = 0;
@@ -154,8 +191,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
- "(inode %lu, offset %llu, size %zd, error %d)",
- inode->i_ino, offset, size, ret);
+ "(inode %lu, error %d)", inode->i_ino, ret);
}
ext4_clear_io_unwritten_flag(io_end);
ext4_release_io_end(io_end);
@@ -247,6 +283,7 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
if (io_end) {
io_end->inode = inode;
INIT_LIST_HEAD(&io_end->list);
+ INIT_LIST_HEAD(&io_end->list_vec);
atomic_set(&io_end->count, 1);
}
return io_end;
@@ -255,7 +292,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (atomic_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
+ list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
return;
}
@@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio)
struct inode *inode = io_end->inode;
ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
- "(offset %llu size %ld starting block %llu)",
+ "starting block %llu)",
bio->bi_status, inode->i_ino,
- (unsigned long long) io_end->offset,
- (long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
mapping_set_error(inode->i_mapping,
--
2.21.0
next prev parent reply other threads:[~2019-10-16 7:37 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-10-16 7:37 [RFC 0/5] Ext4: Add support for blocksize < pagesize for dioread_nolock Ritesh Harjani
2019-10-16 7:37 ` [RFC 1/5] ext4: keep uniform naming convention for io & io_end variables Ritesh Harjani
2019-10-25 13:07 ` Jan Kara
2019-10-16 7:37 ` [RFC 2/5] ext4: Add API to bring in support for unwritten io_end_vec conversion Ritesh Harjani
2019-10-16 7:37 ` [RFC 3/5] ext4: Refactor mpage_map_and_submit_buffers function Ritesh Harjani
2019-10-16 7:37 ` Ritesh Harjani [this message]
2019-10-16 7:37 ` [RFC 5/5] ext4: Enable blocksize < pagesize for dioread_nolock Ritesh Harjani
2019-10-23 23:26 ` [RFC 0/5] Ext4: Add support for " Theodore Y. Ts'o
2019-10-24 1:12 ` Ritesh Harjani
2019-10-29 7:19 ` Ritesh Harjani
2019-11-03 19:16 ` Theodore Y. Ts'o
2019-11-04 10:16 ` Matthew Bobrowski
2019-11-04 10:37 ` Ritesh Harjani
2019-11-04 10:49 ` Matthew Bobrowski
2019-11-04 16:08 ` Theodore Y. Ts'o
2019-11-04 10:43 ` Ritesh Harjani
2019-11-04 11:59 ` Ritesh Harjani
2019-11-06 17:23 ` Jan Kara
2019-11-07 11:15 ` Ritesh Harjani
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20191016073711.4141-5-riteshh@linux.ibm.com \
--to=riteshh@linux.ibm.com \
--cc=jack@suse.cz \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=mbobrowski@mbobrowski.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.