* [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper
@ 2017-06-23 0:19 Andreas Gruenbacher
2017-06-23 0:19 ` [PATCH 2/3] ext4: Switch to page_cache_seek_hole_data Andreas Gruenbacher
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Andreas Gruenbacher @ 2017-06-23 0:19 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Andreas Gruenbacher, linux-ext4, linux-xfs, Jan Kara
Both ext4 and xfs implement seeking for the next hole or piece of data
in unwritten extents by scanning the page cache, and both versions share
the same bug when iterating the buffers of a page: the start offset into
the page isn't taken into account, so when a page fits more than two
filesystem blocks, things will go wrong. For example, on a filesystem
with a block size of 1k, the following command will fail:
xfs_io -f -c "falloc 0 4k" \
-c "pwrite 1k 1k" \
-c "pwrite 3k 1k" \
-c "seek -a -r 0" foo
In this example, neither lseek(fd, 1024, SEEK_HOLE) nor
lseek(fd, 2048, SEEK_DATA) will return the correct result.
Introduce a generic vfs helper for seeking in the page cache that gets
this right. The next two commits replace the filesystem specific
implementations.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
fs/buffer.c | 125 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/buffer_head.h | 2 +
2 files changed, 127 insertions(+)
diff --git a/fs/buffer.c b/fs/buffer.c
index 161be58..2c58537 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3492,6 +3492,131 @@ int bh_submit_read(struct buffer_head *bh)
}
EXPORT_SYMBOL(bh_submit_read);
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
+ *
+ * Returns the offset within the file on success, and -ENOENT otherwise.
+ */
+static loff_t
+page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
+{
+ loff_t offset = page_offset(page);
+ struct buffer_head *bh, *head;
+ bool seek_data = whence == SEEK_DATA;
+
+ if (lastoff < offset)
+ lastoff = offset;
+
+ bh = head = page_buffers(page);
+ do {
+ offset += bh->b_size;
+ if (lastoff >= offset)
+ continue;
+
+ /*
+ * Unwritten extents that have data in the page cache covering
+ * them can be identified by the BH_Unwritten state flag.
+ * Pages with multiple buffers might have a mix of holes, data
+ * and unwritten extents - any buffer with valid data in it
+ * should have BH_Uptodate flag set on it.
+ */
+
+ if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
+ return lastoff;
+
+ lastoff = offset;
+ } while ((bh = bh->b_this_page) != head);
+ return -ENOENT;
+}
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ *
+ * Within unwritten extents, the page cache determines which parts are holes
+ * and which are data: unwritten and uptodate buffer heads count as data;
+ * everything else counts as a hole.
+ *
+ * Returns the resulting offset on successs, and -ENOENT otherwise.
+ */
+loff_t
+page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
+ int whence)
+{
+ pgoff_t index = offset >> PAGE_SHIFT;
+ pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+ loff_t lastoff = offset;
+ struct pagevec pvec;
+
+ if (length <= 0)
+ return -ENOENT;
+
+ pagevec_init(&pvec, 0);
+
+ do {
+ unsigned want, nr_pages, i;
+
+ want = min_t(unsigned, end - index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ *
+ * If current page offset is beyond where we've ended,
+ * we've found a hole.
+ */
+ if (whence == SEEK_HOLE &&
+ lastoff < page_offset(page))
+ goto check_range;
+
+ /* Searching done if the page index is out of range. */
+ if (page->index >= end)
+ goto not_found;
+
+ lock_page(page);
+ if (likely(page->mapping == inode->i_mapping) &&
+ page_has_buffers(page)) {
+ lastoff = page_seek_hole_data(page, lastoff, whence);
+ if (lastoff >= 0) {
+ unlock_page(page);
+ goto check_range;
+ }
+ }
+ unlock_page(page);
+ lastoff = page_offset(page) + PAGE_SIZE;
+ }
+
+ /* Searching done if fewer pages returned than wanted. */
+ if (nr_pages < want)
+ break;
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index < end);
+
+ /* When no page at lastoff and we are not done, we found a hole. */
+ if (whence != SEEK_HOLE)
+ goto not_found;
+
+check_range:
+ if (lastoff < offset + length)
+ goto out;
+not_found:
+ lastoff = -ENOENT;
+out:
+ pagevec_release(&pvec);
+ return lastoff;
+}
+EXPORT_SYMBOL_GPL(page_cache_seek_hole_data);
+
void __init buffer_init(void)
{
unsigned long nrpages;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index bd029e52..ad4e024 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -201,6 +201,8 @@ void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize);
int bh_uptodate_or_lock(struct buffer_head *bh);
int bh_submit_read(struct buffer_head *bh);
+loff_t page_cache_seek_hole_data(struct inode *inode, loff_t offset,
+ loff_t length, int whence);
extern int buffer_heads_over_limit;
--
2.7.5
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 2/3] ext4: Switch to page_cache_seek_hole_data
2017-06-23 0:19 [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper Andreas Gruenbacher
@ 2017-06-23 0:19 ` Andreas Gruenbacher
2017-06-23 0:19 ` [PATCH 3/3] xfs: " Andreas Gruenbacher
2017-06-26 10:41 ` [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper Christoph Hellwig
2 siblings, 0 replies; 5+ messages in thread
From: Andreas Gruenbacher @ 2017-06-23 0:19 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Andreas Gruenbacher, linux-ext4, linux-xfs, Jan Kara
Fix SEEK_HOLE / SEEK_DATA when a page fits more than two filesystem
blocks (see the commit that introduces page_cache_seek_hole_data).
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
fs/ext4/file.c | 144 ++++++---------------------------------------------------
1 file changed, 13 insertions(+), 131 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 02ce7e7..d0b0862 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -439,129 +439,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
}
/*
- * Here we use ext4_map_blocks() to get a block mapping for a extent-based
- * file rather than ext4_ext_walk_space() because we can introduce
- * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same
- * function. When extent status tree has been fully implemented, it will
- * track all extent status for a file and we can directly use it to
- * retrieve the offset for SEEK_DATA/SEEK_HOLE.
- */
-
-/*
- * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to
- * lookup page cache to check whether or not there has some data between
- * [startoff, endoff] because, if this range contains an unwritten extent,
- * we determine this extent as a data or a hole according to whether the
- * page cache has data or not.
- */
-static int ext4_find_unwritten_pgoff(struct inode *inode,
- int whence,
- ext4_lblk_t end_blk,
- loff_t *offset)
-{
- struct pagevec pvec;
- unsigned int blkbits;
- pgoff_t index;
- pgoff_t end;
- loff_t endoff;
- loff_t startoff;
- loff_t lastoff;
- int found = 0;
-
- blkbits = inode->i_sb->s_blocksize_bits;
- startoff = *offset;
- lastoff = startoff;
- endoff = (loff_t)end_blk << blkbits;
-
- index = startoff >> PAGE_SHIFT;
- end = (endoff - 1) >> PAGE_SHIFT;
-
- pagevec_init(&pvec, 0);
- do {
- int i, num;
- unsigned long nr_pages;
-
- num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
- (pgoff_t)num);
- if (nr_pages == 0)
- break;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
-
- /*
- * If current offset is smaller than the page offset,
- * there is a hole at this offset.
- */
- if (whence == SEEK_HOLE && lastoff < endoff &&
- lastoff < page_offset(pvec.pages[i])) {
- found = 1;
- *offset = lastoff;
- goto out;
- }
-
- if (page->index > end)
- goto out;
-
- lock_page(page);
-
- if (unlikely(page->mapping != inode->i_mapping)) {
- unlock_page(page);
- continue;
- }
-
- if (!page_has_buffers(page)) {
- unlock_page(page);
- continue;
- }
-
- if (page_has_buffers(page)) {
- lastoff = page_offset(page);
- bh = head = page_buffers(page);
- do {
- if (buffer_uptodate(bh) ||
- buffer_unwritten(bh)) {
- if (whence == SEEK_DATA)
- found = 1;
- } else {
- if (whence == SEEK_HOLE)
- found = 1;
- }
- if (found) {
- *offset = max_t(loff_t,
- startoff, lastoff);
- unlock_page(page);
- goto out;
- }
- lastoff += bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
- }
-
- lastoff = page_offset(page) + PAGE_SIZE;
- unlock_page(page);
- }
-
- /* The no. of pages is less than our desired, we are done. */
- if (nr_pages < num)
- break;
-
- index = pvec.pages[i - 1]->index + 1;
- pagevec_release(&pvec);
- } while (index <= end);
-
- if (whence == SEEK_HOLE && lastoff < endoff) {
- found = 1;
- *offset = lastoff;
- }
-out:
- pagevec_release(&pvec);
- return found;
-}
-
-/*
* ext4_seek_data() retrieves the offset for SEEK_DATA.
*/
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
@@ -569,7 +446,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
struct inode *inode = file->f_mapping->host;
struct extent_status es;
ext4_lblk_t start, last, end;
- loff_t dataoff, isize;
+ loff_t dataoff, length, isize;
int blkbits;
int ret;
@@ -608,8 +485,10 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
* it will be as a data or a hole according to page
* cache that has data or not.
*/
- if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
- es.es_lblk + es.es_len, &dataoff))
+ length = ((loff_t)(last + es.es_len) << blkbits) - dataoff;
+ dataoff = page_cache_seek_hole_data(inode, dataoff,
+ length, SEEK_DATA);
+ if (dataoff >= 0)
break;
last += es.es_len;
dataoff = (loff_t)last << blkbits;
@@ -632,7 +511,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
struct inode *inode = file->f_mapping->host;
struct extent_status es;
ext4_lblk_t start, last, end;
- loff_t holeoff, isize;
+ loff_t holeoff, length, isize;
int blkbits;
int ret;
@@ -667,10 +546,13 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
* it will be as a data or a hole according to page
* cache that has data or not.
*/
- if (ext4_es_is_unwritten(&es) &&
- ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
- last + es.es_len, &holeoff))
- break;
+ if (ext4_es_is_unwritten(&es)) {
+ length = ((loff_t)(last + es.es_len) << blkbits) - holeoff;
+ holeoff = page_cache_seek_hole_data(inode, holeoff,
+ length, SEEK_HOLE);
+ if (holeoff >= 0)
+ break;
+ }
last += es.es_len;
holeoff = (loff_t)last << blkbits;
--
2.7.5
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 3/3] xfs: Switch to page_cache_seek_hole_data
2017-06-23 0:19 [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper Andreas Gruenbacher
2017-06-23 0:19 ` [PATCH 2/3] ext4: Switch to page_cache_seek_hole_data Andreas Gruenbacher
@ 2017-06-23 0:19 ` Andreas Gruenbacher
2017-06-26 10:41 ` [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper Christoph Hellwig
2 siblings, 0 replies; 5+ messages in thread
From: Andreas Gruenbacher @ 2017-06-23 0:19 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Andreas Gruenbacher, linux-ext4, linux-xfs, Jan Kara
Fix SEEK_HOLE / SEEK_DATA when a page fits more than two filesystem
blocks (see the commit that introduces page_cache_seek_hole_data).
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
fs/xfs/xfs_file.c | 197 +++---------------------------------------------------
1 file changed, 9 insertions(+), 188 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5fb5a09..fff9aba 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -954,191 +954,6 @@ xfs_file_readdir(
}
/*
- * This type is designed to indicate the type of offset we would like
- * to search from page cache for xfs_seek_hole_data().
- */
-enum {
- HOLE_OFF = 0,
- DATA_OFF,
-};
-
-/*
- * Lookup the desired type of offset from the given page.
- *
- * On success, return true and the offset argument will point to the
- * start of the region that was found. Otherwise this function will
- * return false and keep the offset argument unchanged.
- */
-STATIC bool
-xfs_lookup_buffer_offset(
- struct page *page,
- loff_t *offset,
- unsigned int type)
-{
- loff_t lastoff = page_offset(page);
- bool found = false;
- struct buffer_head *bh, *head;
-
- bh = head = page_buffers(page);
- do {
- /*
- * Unwritten extents that have data in the page
- * cache covering them can be identified by the
- * BH_Unwritten state flag. Pages with multiple
- * buffers might have a mix of holes, data and
- * unwritten extents - any buffer with valid
- * data in it should have BH_Uptodate flag set
- * on it.
- */
- if (buffer_unwritten(bh) ||
- buffer_uptodate(bh)) {
- if (type == DATA_OFF)
- found = true;
- } else {
- if (type == HOLE_OFF)
- found = true;
- }
-
- if (found) {
- *offset = lastoff;
- break;
- }
- lastoff += bh->b_size;
- } while ((bh = bh->b_this_page) != head);
-
- return found;
-}
-
-/*
- * This routine is called to find out and return a data or hole offset
- * from the page cache for unwritten extents according to the desired
- * type for xfs_seek_hole_data().
- *
- * The argument offset is used to tell where we start to search from the
- * page cache. Map is used to figure out the end points of the range to
- * lookup pages.
- *
- * Return true if the desired type of offset was found, and the argument
- * offset is filled with that address. Otherwise, return false and keep
- * offset unchanged.
- */
-STATIC bool
-xfs_find_get_desired_pgoff(
- struct inode *inode,
- struct xfs_bmbt_irec *map,
- unsigned int type,
- loff_t *offset)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct pagevec pvec;
- pgoff_t index;
- pgoff_t end;
- loff_t endoff;
- loff_t startoff = *offset;
- loff_t lastoff = startoff;
- bool found = false;
-
- pagevec_init(&pvec, 0);
-
- index = startoff >> PAGE_SHIFT;
- endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
- end = (endoff - 1) >> PAGE_SHIFT;
- do {
- int want;
- unsigned nr_pages;
- unsigned int i;
-
- want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
- want);
- if (nr_pages == 0)
- break;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- loff_t b_offset;
-
- /*
- * At this point, the page may be truncated or
- * invalidated (changing page->mapping to NULL),
- * or even swizzled back from swapper_space to tmpfs
- * file mapping. However, page->index will not change
- * because we have a reference on the page.
- *
- * If current page offset is beyond where we've ended,
- * we've found a hole.
- */
- if (type == HOLE_OFF && lastoff < endoff &&
- lastoff < page_offset(pvec.pages[i])) {
- found = true;
- *offset = lastoff;
- goto out;
- }
- /* Searching done if the page index is out of range. */
- if (page->index > end)
- goto out;
-
- lock_page(page);
- /*
- * Page truncated or invalidated(page->mapping == NULL).
- * We can freely skip it and proceed to check the next
- * page.
- */
- if (unlikely(page->mapping != inode->i_mapping)) {
- unlock_page(page);
- continue;
- }
-
- if (!page_has_buffers(page)) {
- unlock_page(page);
- continue;
- }
-
- found = xfs_lookup_buffer_offset(page, &b_offset, type);
- if (found) {
- /*
- * The found offset may be less than the start
- * point to search if this is the first time to
- * come here.
- */
- *offset = max_t(loff_t, startoff, b_offset);
- unlock_page(page);
- goto out;
- }
-
- /*
- * We either searching data but nothing was found, or
- * searching hole but found a data buffer. In either
- * case, probably the next page contains the desired
- * things, update the last offset to it so.
- */
- lastoff = page_offset(page) + PAGE_SIZE;
- unlock_page(page);
- }
-
- /*
- * The number of returned pages less than our desired, search
- * done.
- */
- if (nr_pages < want)
- break;
-
- index = pvec.pages[i - 1]->index + 1;
- pagevec_release(&pvec);
- } while (index <= end);
-
- /* No page at lastoff and we are not done - we found a hole. */
- if (type == HOLE_OFF && lastoff < endoff) {
- *offset = lastoff;
- found = true;
- }
-out:
- pagevec_release(&pvec);
- return found;
-}
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper
2017-06-23 0:19 [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper Andreas Gruenbacher
2017-06-23 0:19 ` [PATCH 2/3] ext4: Switch to page_cache_seek_hole_data Andreas Gruenbacher
2017-06-23 0:19 ` [PATCH 3/3] xfs: " Andreas Gruenbacher
@ 2017-06-26 10:41 ` Christoph Hellwig
2017-06-26 10:51 ` Andreas Gruenbacher
2 siblings, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2017-06-26 10:41 UTC (permalink / raw)
To: Andreas Gruenbacher; +Cc: linux-fsdevel, linux-ext4, linux-xfs, Jan Kara
On Fri, Jun 23, 2017 at 02:19:31AM +0200, Andreas Gruenbacher wrote:
> Both ext4 and xfs implement seeking for the next hole or piece of data
> in unwritten extents by scanning the page cache, and both versions share
> the same bug when iterating the buffers of a page: the start offset into
> the page isn't taken into account, so when a page fits more than two
> filesystem blocks, things will go wrong. For example, on a filesystem
> with a block size of 1k, the following command will fail:
>
> xfs_io -f -c "falloc 0 4k" \
> -c "pwrite 1k 1k" \
> -c "pwrite 3k 1k" \
> -c "seek -a -r 0" foo
Can you wire this up for xfstests, please?
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper
2017-06-26 10:41 ` [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper Christoph Hellwig
@ 2017-06-26 10:51 ` Andreas Gruenbacher
0 siblings, 0 replies; 5+ messages in thread
From: Andreas Gruenbacher @ 2017-06-26 10:51 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, linux-ext4, linux-xfs, Jan Kara
On Mon, Jun 26, 2017 at 12:41 PM, Christoph Hellwig <hch@infradead.org> wrote:
> On Fri, Jun 23, 2017 at 02:19:31AM +0200, Andreas Gruenbacher wrote:
>> Both ext4 and xfs implement seeking for the next hole or piece of data
>> in unwritten extents by scanning the page cache, and both versions share
>> the same bug when iterating the buffers of a page: the start offset into
>> the page isn't taken into account, so when a page fits more than two
>> filesystem blocks, things will go wrong. For example, on a filesystem
>> with a block size of 1k, the following command will fail:
>>
>> xfs_io -f -c "falloc 0 4k" \
>> -c "pwrite 1k 1k" \
>> -c "pwrite 3k 1k" \
>> -c "seek -a -r 0" foo
>
> Can you wire this up for xfstests, please?
I did:
https://marc.info/?l=fstests&m=149817668119033
Andreas
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2017-06-26 10:51 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-06-23 0:19 [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper Andreas Gruenbacher
2017-06-23 0:19 ` [PATCH 2/3] ext4: Switch to page_cache_seek_hole_data Andreas Gruenbacher
2017-06-23 0:19 ` [PATCH 3/3] xfs: " Andreas Gruenbacher
2017-06-26 10:41 ` [PATCH 1/3] vfs: Add page_cache_seek_hole_data helper Christoph Hellwig
2017-06-26 10:51 ` Andreas Gruenbacher
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).