* Introduce SEEK_DATA/SEEK_HOLE to XFS V5
@ 2012-01-06 13:28 Jeff Liu
2012-01-10 17:18 ` Ben Myers
2012-01-11 15:43 ` Ben Myers
0 siblings, 2 replies; 20+ messages in thread
From: Jeff Liu @ 2012-01-06 13:28 UTC (permalink / raw)
To: xfs; +Cc: Christoph Hellwig, Chris Mason
Hello,
This is a revised patch according to Christoph's comments at V4.
Changes to V5:
--------------
* Revise xfs_has_unwritten_buffer() to lookup pages match tag.
* For unwritten extents, in both xfs_seek_data() and xfs_seek_hole(), call xfs_has_unwritten_buffer() to search
DIRTY pages firstly, if no dirty data found, call it again to search WRITEBACK pages.
* In xfs_seek_hole(), if dirty data was found in page cache for an unwritten extents, but its start offset past the start block
of the map, treat it as a hole, returns the offset if possible(data_buffer_offset > max(seek_offset, start_block_of_map)).
Tests:
------
seek sanity tester:
http://patchwork.xfs.org/patch/3108/
seek copy tester:
http://patchwork.xfs.org/patch/3109/
Thanks,
-Jeff
Signed-off-by: Jie Liu <jeff.liu@oracle.com>
---
fs/xfs/xfs_file.c | 466 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 465 insertions(+), 1 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 753ed9b..24ae40a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
#include <linux/dcache.h>
#include <linux/falloc.h>
+#include <linux/pagevec.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite(
return block_page_mkwrite(vma, vmf, xfs_get_blocks);
}
+/*
+ * Probe the data buffer offset in page cache for unwritten extents.
+ * Fetch all the pages match @tag, and iterate each page to find out
+ * if a buffer head state has BH_Unwritten or BH_Uptodate set.
+ */
+STATIC bool
+xfs_has_unwritten_buffer(
+ struct inode *inode,
+ struct xfs_bmbt_irec *map,
+ int tag,
+ loff_t *offset)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct pagevec pvec;
+ pgoff_t index;
+ pgoff_t end;
+ bool found = false;
+
+ pagevec_init(&pvec, 0);
+
+ index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
+ end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
+ >> PAGE_CACHE_SHIFT;
+
+ do {
+ unsigned int i;
+ unsigned nr_pages;
+ int want = min_t(pgoff_t, end - index,
+ (pgoff_t)PAGEVEC_SIZE - 1) + 1;
+ nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
+ &index, tag, want);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh;
+ struct buffer_head *head;
+ xfs_fileoff_t last;
+
+ if (!page_has_buffers(page))
+ continue;
+
+ /*
+ * There is no need to check the following pages
+ * if the current page offset is out of range.
+ */
+ if (page->index > end)
+ goto out;
+
+ last = XFS_B_TO_FSBT(mp,
+ page->index << PAGE_CACHE_SHIFT);
+
+ bh = head = page_buffers(page);
+ do {
+ /*
+ * An extent in XFS_EXT_UNWRITTEN have disk
+ * blocks already mapped to it, but no data
+ * has been committed to them yet. If it has
+ * dirty data in the page cache it can be
+ * identified by having BH_Unwritten set in
+ * each buffer. Also, the buffer head state
+ * might be in BH_Uptodate too if the buffer
+ * writeback procedure was fired, we need to
+ * examine it as well.
+ */
+ if (buffer_unwritten(bh) ||
+ buffer_uptodate(bh)) {
+ found = true;
+ *offset = XFS_FSB_TO_B(mp, last);
+ goto out;
+ }
+ last++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+
+ /*
+ * If the number of probed pages less than our desired,
+ * there should no more pages mapped, search done.
+ */
+ if (nr_pages < want)
+ break;
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index < end);
+
+out:
+ pagevec_release(&pvec);
+ if (!found)
+ *offset = 0;
+
+ return found;
+}
+
+STATIC loff_t
+xfs_seek_data(
+ struct file *file,
+ loff_t start)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t isize = i_size_read(inode);
+ loff_t offset = 0;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t fsbno;
+ xfs_filblks_t len;
+ int lock;
+ int error;
+
+ lock = xfs_ilock_map_shared(ip);
+
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_lock;
+ }
+
+ fsbno = XFS_B_TO_FSBT(mp, start);
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ len = XFS_B_TO_FSB(mp, isize);
+
+ for (;;) {
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ loff_t seekoff;
+
+ error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_lock;
+
+ /* No extents at given offset, must be beyond EOF */
+ if (!nmap) {
+ error = ENXIO;
+ goto out_lock;
+ }
+
+ seekoff = XFS_FSB_TO_B(mp, fsbno);
+ /*
+ * Landed in a hole, skip to check the next extent.
+ * If the next extent landed in an in-memory data extent,
+ * or it is a normal extent, its fine to return.
+ * If the next extent landed in a hole extent, calculate
+ * the start file system block number for the next scan.
+ * If the next extent landed in an unwritten extent, we
+ * need to lookup the page cache to examine the data
+ * buffer offset, if nothing found, treat it as a hole
+ * extent too.
+ */
+ if (map[0].br_startblock == HOLESTARTBLOCK) {
+ /*
+ * Return ENXIO if no data extent behind
+ * the given offset. In this case, the seek
+ * offset should be landed in a hole.
+ */
+ if (nmap == 1) {
+ error = ENXIO;
+ break;
+ }
+
+ if (map[1].br_state == XFS_EXT_NORM ||
+ map[1].br_startblock == DELAYSTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+
+ break;
+ } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ offset = max_t(loff_t, seekoff, offset);
+ break;
+ }
+ }
+
+ fsbno = map[1].br_startoff + map[1].br_blockcount;
+ }
+
+ /*
+ * Landed in an unwritten extent, try to find out the data
+ * buffer offset from page cache firstly. If nothing was
+ * found, treat it as a hole, and skip to check the next
+ * extent, something just like above.
+ */
+ if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ offset = max_t(loff_t, seekoff, offset);
+ break;
+ }
+
+ /* No data extent at the given offset */
+ if (nmap == 1) {
+ error = ENXIO;
+ break;
+ }
+
+ if (map[1].br_state == XFS_EXT_NORM ||
+ map[1].br_startblock == DELAYSTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ break;
+ } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ offset = max_t(loff_t, seekoff, offset);
+ break;
+ }
+ }
+
+ fsbno = map[1].br_startoff + map[1].br_blockcount;
+ }
+
+ /* Landed in a delay allocated extent or a real data extent */
+ if (map[0].br_startblock == DELAYSTARTBLOCK ||
+ map[0].br_state == XFS_EXT_NORM) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[0].br_startoff));
+ break;
+ }
+
+ /* Return ENXIO if beyond eof */
+ if (XFS_FSB_TO_B(mp, fsbno) > isize) {
+ error = ENXIO;
+ goto out_lock;
+ }
+ }
+
+ if (offset < start)
+ offset = start;
+
+ if (offset != file->f_pos)
+ file->f_pos = offset;
+
+out_lock:
+ xfs_iunlock_map_shared(ip, lock);
+ if (error)
+ return -error;
+
+ return offset;
+}
+
+STATIC loff_t
+xfs_seek_hole(
+ struct file *file,
+ loff_t start)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t isize = i_size_read(inode);
+ loff_t offset = 0;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t fsbno;
+ xfs_filblks_t len;
+ int lock;
+ int error;
+
+ lock = xfs_ilock_map_shared(ip);
+
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_lock;
+ }
+
+ fsbno = XFS_B_TO_FSBT(mp, start);
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ len = XFS_B_TO_FSB(mp, isize);
+
+ for (;;) {
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ loff_t seekoff;
+
+ error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_lock;
+
+ /* No extents at given offset, must be beyond EOF */
+ if (!nmap) {
+ error = ENXIO;
+ goto out_lock;
+ }
+
+ seekoff = XFS_FSB_TO_B(mp, fsbno);
+ /*
+ * Landed in an unwritten extent, try to lookup the page
+ * cache to find out if there is dirty data or not. If
+ * nothing was found, treate it as a hole. If there has
+ * dirty data and its offset starts past both the start
+ * block of the map and the current seek offset, it should
+ * be treated as hole too. Otherwise, go through the next
+ * extent to fetch holes.
+ */
+ if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ if (offset > max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[0].br_startoff))) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[0].br_startoff));
+ break;
+ }
+ } else {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[0].br_startoff));
+ break;
+ }
+
+ /*
+ * No more extent at the given offst, return the total
+ * file size.
+ */
+ if (nmap == 1) {
+ offset = isize;
+ break;
+ }
+
+ if (map[1].br_startblock == HOLESTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ break;
+ } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ if (offset > max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[1].br_startoff))) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[1].br_startoff));
+ break;
+ }
+ } else {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ break;
+ }
+ }
+
+ fsbno = map[1].br_startoff + map[1].br_blockcount;
+ }
+
+ /*
+ * Landed in a delay allocated extent or a real data extent,
+ * if the next extent is landed in a hole or in an unwritten
+ * extent but without data committed in the page cache, return
+ * its offset. If the next extent has dirty data in page cache,
+ * but its offset starts past both the start block of the map
+ * and the seek offset, it still be a hole.
+ */
+ if (map[0].br_startblock == DELAYSTARTBLOCK ||
+ map[0].br_state == XFS_EXT_NORM) {
+ /*
+ * No more extent at the give offset, return the
+ * total file size.
+ */
+ if (nmap == 1) {
+ offset = isize;
+ break;
+ }
+
+ if (map[1].br_startblock == HOLESTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ break;
+ } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ if (offset > max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[1].br_startoff))) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[1].br_startoff));
+ break;
+ }
+ } else {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ break;
+ }
+ }
+
+ fsbno = map[1].br_startoff + map[1].br_blockcount;
+ }
+
+ /* Landed in a hole, its fine to return */
+ if (map[0].br_startblock == HOLESTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[0].br_startoff));
+ break;
+ }
+
+ /* Return ENXIO if beyond eof */
+ if (XFS_FSB_TO_B(mp, fsbno) > isize) {
+ error = ENXIO;
+ goto out_lock;
+ }
+ }
+
+ if (offset < start)
+ offset = start;
+
+ if (offset != file->f_pos)
+ file->f_pos = offset;
+
+out_lock:
+ xfs_iunlock_map_shared(ip, lock);
+ if (error)
+ return -error;
+
+ return offset;
+}
+
+STATIC loff_t
+xfs_file_llseek(
+ struct file *file,
+ loff_t offset,
+ int origin)
+{
+ switch (origin) {
+ case SEEK_END:
+ case SEEK_CUR:
+ case SEEK_SET:
+ return generic_file_llseek(file, offset, origin);
+ case SEEK_DATA:
+ return xfs_seek_data(file, offset);
+ case SEEK_HOLE:
+ return xfs_seek_hole(file, offset);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
const struct file_operations xfs_file_operations = {
- .llseek = generic_file_llseek,
+ .llseek = xfs_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = xfs_file_aio_read,
--
1.7.4.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-06 13:28 Introduce SEEK_DATA/SEEK_HOLE to XFS V5 Jeff Liu
@ 2012-01-10 17:18 ` Ben Myers
2012-01-11 5:45 ` Jeff Liu
2012-01-11 15:43 ` Ben Myers
1 sibling, 1 reply; 20+ messages in thread
From: Ben Myers @ 2012-01-10 17:18 UTC (permalink / raw)
To: Jeff Liu; +Cc: Christoph Hellwig, Chris Mason, xfs
Hey Jeff,
On Fri, Jan 06, 2012 at 09:28:58PM +0800, Jeff Liu wrote:
> This is a revised patch according to Christoph's comments at V4.
I got caught up on versions 1-4, and am looking at this now.
Thanks for your excellent contribution!
I have some inital comments here, and then I'll get into it a little
more deeply.
> Changes to V5:
> --------------
> * Revise xfs_has_unwritten_buffer() to lookup pages match tag.
> * For unwritten extents, in both xfs_seek_data() and xfs_seek_hole(), call xfs_has_unwritten_buffer() to search
> DIRTY pages firstly, if no dirty data found, call it again to search WRITEBACK pages.
> * In xfs_seek_hole(), if dirty data was found in page cache for an unwritten extents, but its start offset past the start block
> of the map, treat it as a hole, returns the offset if possible(data_buffer_offset > max(seek_offset, start_block_of_map)).
>
> Tests:
> ------
> seek sanity tester:
> http://patchwork.xfs.org/patch/3108/
> seek copy tester:
> http://patchwork.xfs.org/patch/3109/
>
>
> Thanks,
> -Jeff
>
> Signed-off-by: Jie Liu <jeff.liu@oracle.com>
>
> ---
> fs/xfs/xfs_file.c | 466 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 465 insertions(+), 1 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 753ed9b..24ae40a 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -38,6 +38,7 @@
>
> #include <linux/dcache.h>
> #include <linux/falloc.h>
> +#include <linux/pagevec.h>
>
> static const struct vm_operations_struct xfs_file_vm_ops;
>
> @@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite(
> return block_page_mkwrite(vma, vmf, xfs_get_blocks);
> }
>
> +/*
> + * Probe the data buffer offset in page cache for unwritten extents.
> + * Fetch all the pages match @tag, and iterate each page to find out
> + * if a buffer head state has BH_Unwritten or BH_Uptodate set.
> + */
> +STATIC bool
> +xfs_has_unwritten_buffer(
> + struct inode *inode,
> + struct xfs_bmbt_irec *map,
> + int tag,
> + loff_t *offset)
> +{
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + struct pagevec pvec;
> + pgoff_t index;
> + pgoff_t end;
> + bool found = false;
> +
> + pagevec_init(&pvec, 0);
> +
> + index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
> + end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
> + >> PAGE_CACHE_SHIFT;
> +
> + do {
> + unsigned int i;
> + unsigned nr_pages;
> + int want = min_t(pgoff_t, end - index,
> + (pgoff_t)PAGEVEC_SIZE - 1) + 1;
> + nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
> + &index, tag, want);
> + if (nr_pages == 0)
> + break;
> +
> + for (i = 0; i < nr_pages; i++) {
> + struct page *page = pvec.pages[i];
> + struct buffer_head *bh;
> + struct buffer_head *head;
> + xfs_fileoff_t last;
You should take the page lock here, before looking at PagePrivate and
the buffers. This is out of order with respect to the ilock. (The
page_lock is held when xfs_vm_writepage calls xfs_map_blocks which will
go after the ilock, and the same is the case when __xfs_get_blocks is
called when writing into the page.) So... here you need to use trylock
to avoid deadlocks, and assume that there is data on the page if you
don't get the lock. Take a look at xfs_cluster_write and
xfs_convert_page for an example.
I think that this also means that you can check for PageDirty and
PageWriteback under lock together... and so the upside is that you don't
have to call this once for each tag anymore. Just use pagevec_lookup.
> +
> + if (!page_has_buffers(page))
> + continue;
> +
> + /*
> + * There is no need to check the following pages
> + * if the current page offset is out of range.
> + */
> + if (page->index > end)
> + goto out;
> +
> + last = XFS_B_TO_FSBT(mp,
> + page->index << PAGE_CACHE_SHIFT);
> +
> + bh = head = page_buffers(page);
> + do {
> + /*
> + * An extent in XFS_EXT_UNWRITTEN have disk
> + * blocks already mapped to it, but no data
> + * has been committed to them yet. If it has
> + * dirty data in the page cache it can be
> + * identified by having BH_Unwritten set in
> + * each buffer. Also, the buffer head state
> + * might be in BH_Uptodate too if the buffer
> + * writeback procedure was fired, we need to
> + * examine it as well.
> + */
> + if (buffer_unwritten(bh) ||
> + buffer_uptodate(bh)) {
> + found = true;
> + *offset = XFS_FSB_TO_B(mp, last);
> + goto out;
> + }
> + last++;
> + } while ((bh = bh->b_this_page) != head);
> + }
> +
> + /*
> + * If the number of probed pages less than our desired,
> + * there should no more pages mapped, search done.
> + */
> + if (nr_pages < want)
> + break;
> +
> + index = pvec.pages[i - 1]->index + 1;
> + pagevec_release(&pvec);
> + } while (index < end);
> +
> +out:
> + pagevec_release(&pvec);
> + if (!found)
> + *offset = 0;
> +
> + return found;
> +}
> +
> +STATIC loff_t
> +xfs_seek_data(
> + struct file *file,
> + loff_t start)
> +{
> + struct inode *inode = file->f_mapping->host;
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + xfs_fsize_t isize = i_size_read(inode);
> + loff_t offset = 0;
> + struct xfs_ifork *ifp;
> + xfs_fileoff_t fsbno;
> + xfs_filblks_t len;
> + int lock;
> + int error;
> +
> + lock = xfs_ilock_map_shared(ip);
> +
> + if (start >= isize) {
> + error = ENXIO;
> + goto out_lock;
> + }
> +
> + fsbno = XFS_B_TO_FSBT(mp, start);
> + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
Nit: calculation of start_fsb and end_fsb belong together. Just move
ifp up a line. ;)
That's all I have for right now. I'll be looking at it in greater
detail today.
Thanks,
Ben
> + len = XFS_B_TO_FSB(mp, isize);
> +
> + for (;;) {
> + struct xfs_bmbt_irec map[2];
> + int nmap = 2;
> + loff_t seekoff;
> +
> + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
> + XFS_BMAPI_ENTIRE);
> + if (error)
> + goto out_lock;
> +
> + /* No extents at given offset, must be beyond EOF */
> + if (!nmap) {
> + error = ENXIO;
> + goto out_lock;
> + }
> +
> + seekoff = XFS_FSB_TO_B(mp, fsbno);
> + /*
> + * Landed in a hole, skip to check the next extent.
> + * If the next extent landed in an in-memory data extent,
> + * or it is a normal extent, its fine to return.
> + * If the next extent landed in a hole extent, calculate
> + * the start file system block number for the next scan.
> + * If the next extent landed in an unwritten extent, we
> + * need to lookup the page cache to examine the data
> + * buffer offset, if nothing found, treat it as a hole
> + * extent too.
> + */
> + if (map[0].br_startblock == HOLESTARTBLOCK) {
> + /*
> + * Return ENXIO if no data extent behind
> + * the given offset. In this case, the seek
> + * offset should be landed in a hole.
> + */
> + if (nmap == 1) {
> + error = ENXIO;
> + break;
> + }
> +
> + if (map[1].br_state == XFS_EXT_NORM ||
> + map[1].br_startblock == DELAYSTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> +
> + break;
> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + offset = max_t(loff_t, seekoff, offset);
> + break;
> + }
> + }
> +
> + fsbno = map[1].br_startoff + map[1].br_blockcount;
> + }
> +
> + /*
> + * Landed in an unwritten extent, try to find out the data
> + * buffer offset from page cache firstly. If nothing was
> + * found, treat it as a hole, and skip to check the next
> + * extent, something just like above.
> + */
> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + offset = max_t(loff_t, seekoff, offset);
> + break;
> + }
> +
> + /* No data extent at the given offset */
> + if (nmap == 1) {
> + error = ENXIO;
> + break;
> + }
> +
> + if (map[1].br_state == XFS_EXT_NORM ||
> + map[1].br_startblock == DELAYSTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> + break;
> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + offset = max_t(loff_t, seekoff, offset);
> + break;
> + }
> + }
> +
> + fsbno = map[1].br_startoff + map[1].br_blockcount;
> + }
> +
> + /* Landed in a delay allocated extent or a real data extent */
> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
> + map[0].br_state == XFS_EXT_NORM) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[0].br_startoff));
> + break;
> + }
> +
> + /* Return ENXIO if beyond eof */
> + if (XFS_FSB_TO_B(mp, fsbno) > isize) {
> + error = ENXIO;
> + goto out_lock;
> + }
> + }
> +
> + if (offset < start)
> + offset = start;
> +
> + if (offset != file->f_pos)
> + file->f_pos = offset;
> +
> +out_lock:
> + xfs_iunlock_map_shared(ip, lock);
> + if (error)
> + return -error;
> +
> + return offset;
> +}
> +
> +STATIC loff_t
> +xfs_seek_hole(
> + struct file *file,
> + loff_t start)
> +{
> + struct inode *inode = file->f_mapping->host;
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + xfs_fsize_t isize = i_size_read(inode);
> + loff_t offset = 0;
> + struct xfs_ifork *ifp;
> + xfs_fileoff_t fsbno;
> + xfs_filblks_t len;
> + int lock;
> + int error;
> +
> + lock = xfs_ilock_map_shared(ip);
> +
> + if (start >= isize) {
> + error = ENXIO;
> + goto out_lock;
> + }
> +
> + fsbno = XFS_B_TO_FSBT(mp, start);
> + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> + len = XFS_B_TO_FSB(mp, isize);
> +
> + for (;;) {
> + struct xfs_bmbt_irec map[2];
> + int nmap = 2;
> + loff_t seekoff;
> +
> + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
> + XFS_BMAPI_ENTIRE);
> + if (error)
> + goto out_lock;
> +
> + /* No extents at given offset, must be beyond EOF */
> + if (!nmap) {
> + error = ENXIO;
> + goto out_lock;
> + }
> +
> + seekoff = XFS_FSB_TO_B(mp, fsbno);
> + /*
> + * Landed in an unwritten extent, try to lookup the page
> + * cache to find out if there is dirty data or not. If
> + * nothing was found, treate it as a hole. If there has
> + * dirty data and its offset starts past both the start
> + * block of the map and the current seek offset, it should
> + * be treated as hole too. Otherwise, go through the next
> + * extent to fetch holes.
> + */
> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + if (offset > max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[0].br_startoff))) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[0].br_startoff));
> + break;
> + }
> + } else {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[0].br_startoff));
> + break;
> + }
> +
> + /*
> + * No more extent at the given offst, return the total
> + * file size.
> + */
> + if (nmap == 1) {
> + offset = isize;
> + break;
> + }
> +
> + if (map[1].br_startblock == HOLESTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> + break;
> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + if (offset > max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[1].br_startoff))) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[1].br_startoff));
> + break;
> + }
> + } else {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> + break;
> + }
> + }
> +
> + fsbno = map[1].br_startoff + map[1].br_blockcount;
> + }
> +
> + /*
> + * Landed in a delay allocated extent or a real data extent,
> + * if the next extent is landed in a hole or in an unwritten
> + * extent but without data committed in the page cache, return
> + * its offset. If the next extent has dirty data in page cache,
> + * but its offset starts past both the start block of the map
> + * and the seek offset, it still be a hole.
> + */
> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
> + map[0].br_state == XFS_EXT_NORM) {
> + /*
> + * No more extent at the give offset, return the
> + * total file size.
> + */
> + if (nmap == 1) {
> + offset = isize;
> + break;
> + }
> +
> + if (map[1].br_startblock == HOLESTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> + break;
> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + if (offset > max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[1].br_startoff))) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[1].br_startoff));
> + break;
> + }
> + } else {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> + break;
> + }
> + }
> +
> + fsbno = map[1].br_startoff + map[1].br_blockcount;
> + }
> +
> + /* Landed in a hole, its fine to return */
> + if (map[0].br_startblock == HOLESTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[0].br_startoff));
> + break;
> + }
> +
> + /* Return ENXIO if beyond eof */
> + if (XFS_FSB_TO_B(mp, fsbno) > isize) {
> + error = ENXIO;
> + goto out_lock;
> + }
> + }
> +
> + if (offset < start)
> + offset = start;
> +
> + if (offset != file->f_pos)
> + file->f_pos = offset;
> +
> +out_lock:
> + xfs_iunlock_map_shared(ip, lock);
> + if (error)
> + return -error;
> +
> + return offset;
> +}
> +
> +STATIC loff_t
> +xfs_file_llseek(
> + struct file *file,
> + loff_t offset,
> + int origin)
> +{
> + switch (origin) {
> + case SEEK_END:
> + case SEEK_CUR:
> + case SEEK_SET:
> + return generic_file_llseek(file, offset, origin);
> + case SEEK_DATA:
> + return xfs_seek_data(file, offset);
> + case SEEK_HOLE:
> + return xfs_seek_hole(file, offset);
> + default:
> + return -EOPNOTSUPP;
> + }
> +}
> +
> const struct file_operations xfs_file_operations = {
> - .llseek = generic_file_llseek,
> + .llseek = xfs_file_llseek,
> .read = do_sync_read,
> .write = do_sync_write,
> .aio_read = xfs_file_aio_read,
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-10 17:18 ` Ben Myers
@ 2012-01-11 5:45 ` Jeff Liu
2012-01-11 21:06 ` Mark Tinguely
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Jeff Liu @ 2012-01-11 5:45 UTC (permalink / raw)
To: Ben Myers; +Cc: Christoph Hellwig, Chris Mason, xfs
Hi Ben,
On 01/11/2012 01:18 AM, Ben Myers wrote:
> Hey Jeff,
>
> On Fri, Jan 06, 2012 at 09:28:58PM +0800, Jeff Liu wrote:
>> This is a revised patch according to Christoph's comments at V4.
>
> I got caught up on versions 1-4, and am looking at this now.
> Thanks for your excellent contribution!
>
> I have some inital comments here, and then I'll get into it a little
> more deeply.
>
>> Changes to V5:
>> --------------
>> * Revise xfs_has_unwritten_buffer() to lookup pages match tag.
>> * For unwritten extents, in both xfs_seek_data() and xfs_seek_hole(), call xfs_has_unwritten_buffer() to search
>> DIRTY pages firstly, if no dirty data found, call it again to search WRITEBACK pages.
>> * In xfs_seek_hole(), if dirty data was found in page cache for an unwritten extents, but its start offset past the start block
>> of the map, treat it as a hole, returns the offset if possible(data_buffer_offset > max(seek_offset, start_block_of_map)).
>>
>> Tests:
>> ------
>> seek sanity tester:
>> http://patchwork.xfs.org/patch/3108/
>> seek copy tester:
>> http://patchwork.xfs.org/patch/3109/
>>
>>
>> Thanks,
>> -Jeff
>>
>> Signed-off-by: Jie Liu <jeff.liu@oracle.com>
>>
>> ---
>> fs/xfs/xfs_file.c | 466 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 1 files changed, 465 insertions(+), 1 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index 753ed9b..24ae40a 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -38,6 +38,7 @@
>>
>> #include <linux/dcache.h>
>> #include <linux/falloc.h>
>> +#include <linux/pagevec.h>
>>
>> static const struct vm_operations_struct xfs_file_vm_ops;
>>
>> @@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite(
>> return block_page_mkwrite(vma, vmf, xfs_get_blocks);
>> }
>>
>> +/*
>> + * Probe the data buffer offset in page cache for unwritten extents.
>> + * Fetch all the pages match @tag, and iterate each page to find out
>> + * if a buffer head state has BH_Unwritten or BH_Uptodate set.
>> + */
>> +STATIC bool
>> +xfs_has_unwritten_buffer(
>> + struct inode *inode,
>> + struct xfs_bmbt_irec *map,
>> + int tag,
>> + loff_t *offset)
>> +{
>> + struct xfs_inode *ip = XFS_I(inode);
>> + struct xfs_mount *mp = ip->i_mount;
>> + struct pagevec pvec;
>> + pgoff_t index;
>> + pgoff_t end;
>> + bool found = false;
>> +
>> + pagevec_init(&pvec, 0);
>> +
>> + index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
>> + end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
>> + >> PAGE_CACHE_SHIFT;
>> +
>> + do {
>> + unsigned int i;
>> + unsigned nr_pages;
>> + int want = min_t(pgoff_t, end - index,
>> + (pgoff_t)PAGEVEC_SIZE - 1) + 1;
>> + nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
>> + &index, tag, want);
>> + if (nr_pages == 0)
>> + break;
>> +
>> + for (i = 0; i < nr_pages; i++) {
>> + struct page *page = pvec.pages[i];
>> + struct buffer_head *bh;
>> + struct buffer_head *head;
>> + xfs_fileoff_t last;
>
> You should take the page lock here, before looking at PagePrivate and
> the buffers. This is out of order with respect to the ilock. (The
> page_lock is held when xfs_vm_writepage calls xfs_map_blocks which will
> go after the ilock, and the same is the case when __xfs_get_blocks is
> called when writing into the page.) So... here you need to use trylock
> to avoid deadlocks, and assume that there is data on the page if you
> don't get the lock. Take a look at xfs_cluster_write and
> xfs_convert_page for an example.
Thanks for your timely response and pointing this out!
I am suffering with a data loss when improving seek copy tester to produce a file with
around thousands of extents these days, looks the root cause is most likely related to miss the page lock stuff.
>
> I think that this also means that you can check for PageDirty and
> PageWriteback under lock together... and so the upside is that you don't
> have to call this once for each tag anymore. Just use pagevec_lookup.
Cool. originally, I have prepared to fix the below stupid code blocks at V5:
if (offset > max_t(loff_t, seekoff, XFS_FSB_TO_B(mp, map[0].br_startoff))) {
offset = max_t(loff_t, seekoff, XFS_FSB_TO_B(mp, map[0].br_startoff));
break;
}
to:
max_off = max_t(loff_t, seekoff, XFS_FSB_TO_B(mp, map[0].br_startoff));
if (offset > max_off) {
offset = max_off;
break;
}
With your comments, this issue will gone :-P.
>
>> +
>> + if (!page_has_buffers(page))
>> + continue;
>> +
>> + /*
>> + * There is no need to check the following pages
>> + * if the current page offset is out of range.
>> + */
>> + if (page->index > end)
>> + goto out;
>> +
>> + last = XFS_B_TO_FSBT(mp,
>> + page->index << PAGE_CACHE_SHIFT);
>> +
>> + bh = head = page_buffers(page);
>> + do {
>> + /*
>> + * An extent in XFS_EXT_UNWRITTEN have disk
>> + * blocks already mapped to it, but no data
>> + * has been committed to them yet. If it has
>> + * dirty data in the page cache it can be
>> + * identified by having BH_Unwritten set in
>> + * each buffer. Also, the buffer head state
>> + * might be in BH_Uptodate too if the buffer
>> + * writeback procedure was fired, we need to
>> + * examine it as well.
>> + */
>> + if (buffer_unwritten(bh) ||
>> + buffer_uptodate(bh)) {
>> + found = true;
>> + *offset = XFS_FSB_TO_B(mp, last);
>> + goto out;
>> + }
>> + last++;
>> + } while ((bh = bh->b_this_page) != head);
>> + }
>> +
>> + /*
>> + * If the number of probed pages less than our desired,
>> + * there should no more pages mapped, search done.
>> + */
>> + if (nr_pages < want)
>> + break;
>> +
>> + index = pvec.pages[i - 1]->index + 1;
>> + pagevec_release(&pvec);
>> + } while (index < end);
>> +
>> +out:
>> + pagevec_release(&pvec);
>> + if (!found)
>> + *offset = 0;
>> +
>> + return found;
>> +}
>> +
>> +STATIC loff_t
>> +xfs_seek_data(
>> + struct file *file,
>> + loff_t start)
>> +{
>> + struct inode *inode = file->f_mapping->host;
>> + struct xfs_inode *ip = XFS_I(inode);
>> + struct xfs_mount *mp = ip->i_mount;
>> + xfs_fsize_t isize = i_size_read(inode);
>> + loff_t offset = 0;
>> + struct xfs_ifork *ifp;
>> + xfs_fileoff_t fsbno;
>> + xfs_filblks_t len;
>> + int lock;
>> + int error;
>> +
>> + lock = xfs_ilock_map_shared(ip);
>> +
>> + if (start >= isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + fsbno = XFS_B_TO_FSBT(mp, start);
>> + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>
> Nit: calculation of start_fsb and end_fsb belong together. Just move
> ifp up a line. ;)
Hmm, looks we can safely remove ifp line since XFS_BMAPI_ENTIRE is used for xfs_bmapi_read().
>
> That's all I have for right now. I'll be looking at it in greater
> detail today.
So I'll collect all your comments and submit it again.
Thanks,
-Jeff
>
> Thanks,
> Ben
>
>> + len = XFS_B_TO_FSB(mp, isize);
>> +
>> + for (;;) {
>> + struct xfs_bmbt_irec map[2];
>> + int nmap = 2;
>> + loff_t seekoff;
>> +
>> + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
>> + XFS_BMAPI_ENTIRE);
>> + if (error)
>> + goto out_lock;
>> +
>> + /* No extents at given offset, must be beyond EOF */
>> + if (!nmap) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + seekoff = XFS_FSB_TO_B(mp, fsbno);
>> + /*
>> + * Landed in a hole, skip to check the next extent.
>> + * If the next extent landed in an in-memory data extent,
>> + * or it is a normal extent, its fine to return.
>> + * If the next extent landed in a hole extent, calculate
>> + * the start file system block number for the next scan.
>> + * If the next extent landed in an unwritten extent, we
>> + * need to lookup the page cache to examine the data
>> + * buffer offset, if nothing found, treat it as a hole
>> + * extent too.
>> + */
>> + if (map[0].br_startblock == HOLESTARTBLOCK) {
>> + /*
>> + * Return ENXIO if no data extent behind
>> + * the given offset. In this case, the seek
>> + * offset should be landed in a hole.
>> + */
>> + if (nmap == 1) {
>> + error = ENXIO;
>> + break;
>> + }
>> +
>> + if (map[1].br_state == XFS_EXT_NORM ||
>> + map[1].br_startblock == DELAYSTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> +
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + offset = max_t(loff_t, seekoff, offset);
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /*
>> + * Landed in an unwritten extent, try to find out the data
>> + * buffer offset from page cache firstly. If nothing was
>> + * found, treat it as a hole, and skip to check the next
>> + * extent, something just like above.
>> + */
>> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + offset = max_t(loff_t, seekoff, offset);
>> + break;
>> + }
>> +
>> + /* No data extent at the given offset */
>> + if (nmap == 1) {
>> + error = ENXIO;
>> + break;
>> + }
>> +
>> + if (map[1].br_state == XFS_EXT_NORM ||
>> + map[1].br_startblock == DELAYSTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + offset = max_t(loff_t, seekoff, offset);
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /* Landed in a delay allocated extent or a real data extent */
>> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
>> + map[0].br_state == XFS_EXT_NORM) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[0].br_startoff));
>> + break;
>> + }
>> +
>> + /* Return ENXIO if beyond eof */
>> + if (XFS_FSB_TO_B(mp, fsbno) > isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> + }
>> +
>> + if (offset < start)
>> + offset = start;
>> +
>> + if (offset != file->f_pos)
>> + file->f_pos = offset;
>> +
>> +out_lock:
>> + xfs_iunlock_map_shared(ip, lock);
>> + if (error)
>> + return -error;
>> +
>> + return offset;
>> +}
>> +
>> +STATIC loff_t
>> +xfs_seek_hole(
>> + struct file *file,
>> + loff_t start)
>> +{
>> + struct inode *inode = file->f_mapping->host;
>> + struct xfs_inode *ip = XFS_I(inode);
>> + struct xfs_mount *mp = ip->i_mount;
>> + xfs_fsize_t isize = i_size_read(inode);
>> + loff_t offset = 0;
>> + struct xfs_ifork *ifp;
>> + xfs_fileoff_t fsbno;
>> + xfs_filblks_t len;
>> + int lock;
>> + int error;
>> +
>> + lock = xfs_ilock_map_shared(ip);
>> +
>> + if (start >= isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + fsbno = XFS_B_TO_FSBT(mp, start);
>> + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>> + len = XFS_B_TO_FSB(mp, isize);
>> +
>> + for (;;) {
>> + struct xfs_bmbt_irec map[2];
>> + int nmap = 2;
>> + loff_t seekoff;
>> +
>> + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
>> + XFS_BMAPI_ENTIRE);
>> + if (error)
>> + goto out_lock;
>> +
>> + /* No extents at given offset, must be beyond EOF */
>> + if (!nmap) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + seekoff = XFS_FSB_TO_B(mp, fsbno);
>> + /*
>> + * Landed in an unwritten extent, try to lookup the page
>> + * cache to find out if there is dirty data or not. If
>> + * nothing was found, treate it as a hole. If there has
>> + * dirty data and its offset starts past both the start
>> + * block of the map and the current seek offset, it should
>> + * be treated as hole too. Otherwise, go through the next
>> + * extent to fetch holes.
>> + */
>> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + if (offset > max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[0].br_startoff))) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[0].br_startoff));
>> + break;
>> + }
>> + } else {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[0].br_startoff));
>> + break;
>> + }
>> +
>> + /*
>> + * No more extent at the given offst, return the total
>> + * file size.
>> + */
>> + if (nmap == 1) {
>> + offset = isize;
>> + break;
>> + }
>> +
>> + if (map[1].br_startblock == HOLESTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + if (offset > max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff))) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff));
>> + break;
>> + }
>> + } else {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /*
>> + * Landed in a delay allocated extent or a real data extent,
>> + * if the next extent is landed in a hole or in an unwritten
>> + * extent but without data committed in the page cache, return
>> + * its offset. If the next extent has dirty data in page cache,
>> + * but its offset starts past both the start block of the map
>> + * and the seek offset, it still be a hole.
>> + */
>> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
>> + map[0].br_state == XFS_EXT_NORM) {
>> + /*
>> + * No more extent at the give offset, return the
>> + * total file size.
>> + */
>> + if (nmap == 1) {
>> + offset = isize;
>> + break;
>> + }
>> +
>> + if (map[1].br_startblock == HOLESTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + if (offset > max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff))) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff));
>> + break;
>> + }
>> + } else {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /* Landed in a hole, its fine to return */
>> + if (map[0].br_startblock == HOLESTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[0].br_startoff));
>> + break;
>> + }
>> +
>> + /* Return ENXIO if beyond eof */
>> + if (XFS_FSB_TO_B(mp, fsbno) > isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> + }
>> +
>> + if (offset < start)
>> + offset = start;
>> +
>> + if (offset != file->f_pos)
>> + file->f_pos = offset;
>> +
>> +out_lock:
>> + xfs_iunlock_map_shared(ip, lock);
>> + if (error)
>> + return -error;
>> +
>> + return offset;
>> +}
>> +
>> +STATIC loff_t
>> +xfs_file_llseek(
>> + struct file *file,
>> + loff_t offset,
>> + int origin)
>> +{
>> + switch (origin) {
>> + case SEEK_END:
>> + case SEEK_CUR:
>> + case SEEK_SET:
>> + return generic_file_llseek(file, offset, origin);
>> + case SEEK_DATA:
>> + return xfs_seek_data(file, offset);
>> + case SEEK_HOLE:
>> + return xfs_seek_hole(file, offset);
>> + default:
>> + return -EOPNOTSUPP;
>> + }
>> +}
>> +
>> const struct file_operations xfs_file_operations = {
>> - .llseek = generic_file_llseek,
>> + .llseek = xfs_file_llseek,
>> .read = do_sync_read,
>> .write = do_sync_write,
>> .aio_read = xfs_file_aio_read,
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-06 13:28 Introduce SEEK_DATA/SEEK_HOLE to XFS V5 Jeff Liu
2012-01-10 17:18 ` Ben Myers
@ 2012-01-11 15:43 ` Ben Myers
2012-01-11 22:28 ` Ben Myers
2012-01-12 12:53 ` Jeff Liu
1 sibling, 2 replies; 20+ messages in thread
From: Ben Myers @ 2012-01-11 15:43 UTC (permalink / raw)
To: Jeff Liu; +Cc: Christoph Hellwig, Chris Mason, xfs
Hey Jeff,
Here are a few additional minor comments from yesterday.
I'm looking forward to seeing your next version, and I'm still working
through this one.
I would like to suggest that you split this into two patches. The first
patch should be the 'simple' implementation that that you began with
that only looks at extents, and assumes that unwritten extents contain
data. The second patch can remove the assumption that unwritten extents
contain data, and go over pages over the extent to determine if it is
clean. I feel we have a better chance of coming to consensus that the
first patch is correct in the near term, and then we can move on to the
more complicated matter of whether the unwritten extent can be treated
as a hole safe in the knowledge that the initial implementation was
awesome.
Regards,
Ben
On Fri, Jan 06, 2012 at 09:28:58PM +0800, Jeff Liu wrote:
> This is a revised patch according to Christoph's comments at V4.
>
> Changes to V5:
> --------------
> * Revise xfs_has_unwritten_buffer() to lookup pages match tag.
> * For unwritten extents, in both xfs_seek_data() and xfs_seek_hole(), call xfs_has_unwritten_buffer() to search
> DIRTY pages firstly, if no dirty data found, call it again to search WRITEBACK pages.
> * In xfs_seek_hole(), if dirty data was found in page cache for an unwritten extents, but its start offset past the start block
> of the map, treat it as a hole, returns the offset if possible(data_buffer_offset > max(seek_offset, start_block_of_map)).
>
> Tests:
> ------
> seek sanity tester:
> http://patchwork.xfs.org/patch/3108/
> seek copy tester:
> http://patchwork.xfs.org/patch/3109/
>
>
> Thanks,
> -Jeff
>
> Signed-off-by: Jie Liu <jeff.liu@oracle.com>
>
> ---
> fs/xfs/xfs_file.c | 466 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 465 insertions(+), 1 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 753ed9b..24ae40a 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -38,6 +38,7 @@
>
> #include <linux/dcache.h>
> #include <linux/falloc.h>
> +#include <linux/pagevec.h>
>
> static const struct vm_operations_struct xfs_file_vm_ops;
>
> @@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite(
> return block_page_mkwrite(vma, vmf, xfs_get_blocks);
> }
>
> +/*
> + * Probe the data buffer offset in page cache for unwritten extents.
> + * Fetch all the pages match @tag, and iterate each page to find out
> + * if a buffer head state has BH_Unwritten or BH_Uptodate set.
> + */
> +STATIC bool
> +xfs_has_unwritten_buffer(
> + struct inode *inode,
> + struct xfs_bmbt_irec *map,
> + int tag,
> + loff_t *offset)
> +{
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + struct pagevec pvec;
> + pgoff_t index;
> + pgoff_t end;
> + bool found = false;
> +
> + pagevec_init(&pvec, 0);
> +
> + index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
> + end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
> + >> PAGE_CACHE_SHIFT;
> +
> + do {
> + unsigned int i;
> + unsigned nr_pages;
> + int want = min_t(pgoff_t, end - index,
> + (pgoff_t)PAGEVEC_SIZE - 1) + 1;
> + nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
> + &index, tag, want);
> + if (nr_pages == 0)
> + break;
> +
> + for (i = 0; i < nr_pages; i++) {
> + struct page *page = pvec.pages[i];
> + struct buffer_head *bh;
> + struct buffer_head *head;
> + xfs_fileoff_t last;
> +
> + if (!page_has_buffers(page))
> + continue;
> +
> + /*
> + * There is no need to check the following pages
> + * if the current page offset is out of range.
> + */
> + if (page->index > end)
> + goto out;
> +
> + last = XFS_B_TO_FSBT(mp,
> + page->index << PAGE_CACHE_SHIFT);
> +
> + bh = head = page_buffers(page);
> + do {
> + /*
> + * An extent in XFS_EXT_UNWRITTEN have disk
> + * blocks already mapped to it, but no data
> + * has been committed to them yet. If it has
> + * dirty data in the page cache it can be
> + * identified by having BH_Unwritten set in
> + * each buffer. Also, the buffer head state
> + * might be in BH_Uptodate too if the buffer
> + * writeback procedure was fired, we need to
> + * examine it as well.
> + */
> + if (buffer_unwritten(bh) ||
> + buffer_uptodate(bh)) {
> + found = true;
> + *offset = XFS_FSB_TO_B(mp, last);
> + goto out;
> + }
> + last++;
> + } while ((bh = bh->b_this_page) != head);
> + }
> +
> + /*
> + * If the number of probed pages less than our desired,
> + * there should no more pages mapped, search done.
> + */
> + if (nr_pages < want)
> + break;
> +
> + index = pvec.pages[i - 1]->index + 1;
> + pagevec_release(&pvec);
> + } while (index < end);
> +
> +out:
> + pagevec_release(&pvec);
> + if (!found)
> + *offset = 0;
> +
> + return found;
> +}
> +
> +STATIC loff_t
> +xfs_seek_data(
> + struct file *file,
> + loff_t start)
> +{
> + struct inode *inode = file->f_mapping->host;
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + xfs_fsize_t isize = i_size_read(inode);
> + loff_t offset = 0;
> + struct xfs_ifork *ifp;
> + xfs_fileoff_t fsbno;
> + xfs_filblks_t len;
> + int lock;
> + int error;
> +
> + lock = xfs_ilock_map_shared(ip);
> +
> + if (start >= isize) {
> + error = ENXIO;
> + goto out_lock;
> + }
In Christoph's v3 review he asked you to move this check to after the
lock is taken, which you've done. Note that you've read from ip->i_size
using i_size_read before taking the lock, so isize could be stale. Call
i_size_read only after taking the ilock shared.
> +
> + fsbno = XFS_B_TO_FSBT(mp, start);
> + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> + len = XFS_B_TO_FSB(mp, isize);
Put calculation of start_fsb and end_fsb next to each other.
> +
> + for (;;) {
> + struct xfs_bmbt_irec map[2];
> + int nmap = 2;
> + loff_t seekoff;
> +
> + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
> + XFS_BMAPI_ENTIRE);
> + if (error)
> + goto out_lock;
> +
> + /* No extents at given offset, must be beyond EOF */
> + if (!nmap) {
> + error = ENXIO;
> + goto out_lock;
> + }
> +
> + seekoff = XFS_FSB_TO_B(mp, fsbno);
> + /*
> + * Landed in a hole, skip to check the next extent.
> + * If the next extent landed in an in-memory data extent,
> + * or it is a normal extent, its fine to return.
> + * If the next extent landed in a hole extent, calculate
> + * the start file system block number for the next scan.
> + * If the next extent landed in an unwritten extent, we
> + * need to lookup the page cache to examine the data
> + * buffer offset, if nothing found, treat it as a hole
> + * extent too.
> + */
> + if (map[0].br_startblock == HOLESTARTBLOCK) {
> + /*
> + * Return ENXIO if no data extent behind
> + * the given offset. In this case, the seek
> + * offset should be landed in a hole.
> + */
> + if (nmap == 1) {
> + error = ENXIO;
> + break;
> + }
> +
> + if (map[1].br_state == XFS_EXT_NORM ||
> + map[1].br_startblock == DELAYSTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> +
> + break;
> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + offset = max_t(loff_t, seekoff, offset);
> + break;
> + }
> + }
> +
> + fsbno = map[1].br_startoff + map[1].br_blockcount;
> + }
> +
> + /*
> + * Landed in an unwritten extent, try to find out the data
> + * buffer offset from page cache firstly. If nothing was
> + * found, treat it as a hole, and skip to check the next
> + * extent, something just like above.
> + */
> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + offset = max_t(loff_t, seekoff, offset);
> + break;
> + }
> +
> + /* No data extent at the given offset */
> + if (nmap == 1) {
> + error = ENXIO;
> + break;
> + }
> +
> + if (map[1].br_state == XFS_EXT_NORM ||
> + map[1].br_startblock == DELAYSTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> + break;
> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + offset = max_t(loff_t, seekoff, offset);
> + break;
> + }
> + }
> +
> + fsbno = map[1].br_startoff + map[1].br_blockcount;
> + }
> +
> + /* Landed in a delay allocated extent or a real data extent */
> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
> + map[0].br_state == XFS_EXT_NORM) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[0].br_startoff));
> + break;
> + }
> +
> + /* Return ENXIO if beyond eof */
> + if (XFS_FSB_TO_B(mp, fsbno) > isize) {
> + error = ENXIO;
> + goto out_lock;
> + }
> + }
> +
> + if (offset < start)
> + offset = start;
> +
> + if (offset != file->f_pos)
> + file->f_pos = offset;
> +
> +out_lock:
> + xfs_iunlock_map_shared(ip, lock);
> + if (error)
> + return -error;
> +
> + return offset;
> +}
> +
> +STATIC loff_t
> +xfs_seek_hole(
> + struct file *file,
> + loff_t start)
> +{
> + struct inode *inode = file->f_mapping->host;
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + xfs_fsize_t isize = i_size_read(inode);
Call i_size_read under ilock.
> + loff_t offset = 0;
> + struct xfs_ifork *ifp;
> + xfs_fileoff_t fsbno;
> + xfs_filblks_t len;
> + int lock;
lock should be a uint
> + int error;
> +
> + lock = xfs_ilock_map_shared(ip);
> +
> + if (start >= isize) {
> + error = ENXIO;
> + goto out_lock;
> + }
> +
> + fsbno = XFS_B_TO_FSBT(mp, start);
> + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> + len = XFS_B_TO_FSB(mp, isize);
Calculation of start_fsb and end_fsb look nicer next to each other.
> +
> + for (;;) {
> + struct xfs_bmbt_irec map[2];
> + int nmap = 2;
> + loff_t seekoff;
> +
> + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
> + XFS_BMAPI_ENTIRE);
> + if (error)
> + goto out_lock;
> +
> + /* No extents at given offset, must be beyond EOF */
> + if (!nmap) {
> + error = ENXIO;
> + goto out_lock;
> + }
> +
> + seekoff = XFS_FSB_TO_B(mp, fsbno);
> + /*
> + * Landed in an unwritten extent, try to lookup the page
> + * cache to find out if there is dirty data or not. If
> + * nothing was found, treate it as a hole. If there has
> + * dirty data and its offset starts past both the start
> + * block of the map and the current seek offset, it should
> + * be treated as hole too. Otherwise, go through the next
> + * extent to fetch holes.
> + */
> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + if (offset > max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[0].br_startoff))) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[0].br_startoff));
> + break;
> + }
> + } else {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[0].br_startoff));
> + break;
> + }
> +
> + /*
> + * No more extent at the given offst, return the total
> + * file size.
> + */
> + if (nmap == 1) {
> + offset = isize;
> + break;
> + }
> +
> + if (map[1].br_startblock == HOLESTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> + break;
> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + if (offset > max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[1].br_startoff))) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[1].br_startoff));
> + break;
> + }
> + } else {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> + break;
> + }
> + }
> +
> + fsbno = map[1].br_startoff + map[1].br_blockcount;
> + }
> +
> + /*
> + * Landed in a delay allocated extent or a real data extent,
> + * if the next extent is landed in a hole or in an unwritten
> + * extent but without data committed in the page cache, return
> + * its offset. If the next extent has dirty data in page cache,
> + * but its offset starts past both the start block of the map
> + * and the seek offset, it still be a hole.
> + */
> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
> + map[0].br_state == XFS_EXT_NORM) {
> + /*
> + * No more extent at the give offset, return the
> + * total file size.
> + */
> + if (nmap == 1) {
> + offset = isize;
> + break;
> + }
> +
> + if (map[1].br_startblock == HOLESTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> + break;
> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[1],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + if (offset > max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[1].br_startoff))) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[1].br_startoff));
> + break;
> + }
> + } else {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[1].br_startoff));
> + break;
> + }
> + }
> +
> + fsbno = map[1].br_startoff + map[1].br_blockcount;
> + }
> +
> + /* Landed in a hole, its fine to return */
> + if (map[0].br_startblock == HOLESTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[0].br_startoff));
> + break;
> + }
> +
> + /* Return ENXIO if beyond eof */
> + if (XFS_FSB_TO_B(mp, fsbno) > isize) {
> + error = ENXIO;
> + goto out_lock;
> + }
> + }
> +
> + if (offset < start)
> + offset = start;
> +
> + if (offset != file->f_pos)
> + file->f_pos = offset;
> +
> +out_lock:
name this out_unlock
> + xfs_iunlock_map_shared(ip, lock);
> + if (error)
> + return -error;
> +
> + return offset;
> +}
> +
> +STATIC loff_t
> +xfs_file_llseek(
> + struct file *file,
> + loff_t offset,
> + int origin)
> +{
> + switch (origin) {
> + case SEEK_END:
> + case SEEK_CUR:
> + case SEEK_SET:
> + return generic_file_llseek(file, offset, origin);
> + case SEEK_DATA:
> + return xfs_seek_data(file, offset);
> + case SEEK_HOLE:
> + return xfs_seek_hole(file, offset);
> + default:
> + return -EOPNOTSUPP;
I suggest -EINVAL here, as per http://linux.die.net/man/2/lseek
> + }
> +}
> +
> const struct file_operations xfs_file_operations = {
> - .llseek = generic_file_llseek,
> + .llseek = xfs_file_llseek,
> .read = do_sync_read,
> .write = do_sync_write,
> .aio_read = xfs_file_aio_read,
> --
> 1.7.4.1
>
>
>
>
>
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-11 5:45 ` Jeff Liu
@ 2012-01-11 21:06 ` Mark Tinguely
2012-01-12 16:22 ` Christoph Hellwig
2012-01-11 21:07 ` Mark Tinguely
2012-01-11 21:12 ` Mark Tinguely
2 siblings, 1 reply; 20+ messages in thread
From: Mark Tinguely @ 2012-01-11 21:06 UTC (permalink / raw)
To: jeff.liu; +Cc: Christoph Hellwig, Ben Myers, Chris Mason, xfs
Good work. I know this is an important addition.
The test in the for loop of the xfs_seek_data() and xfs_seek_hole()
routines,
are independent of each other except in order of them being called.
One could add a "continue" statement after computing the offset for the next
iteration of the loop.
This allows the addition of an error condition if no conditions are matched.
Rough patch relative to your patch of the idea.
--- fs/xfs/xfs_file.c.orig 2012-01-11 08:50:50.000000000 -0600
+++ fs/xfs/xfs_file.c 2012-01-11 09:12:47.000000000 -0600
@@ -1265,7 +1265,7 @@ xfs_seek_data(
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
len = XFS_B_TO_FSB(mp, isize);
- for (;;) {
+ do {
struct xfs_bmbt_irec map[2];
int nmap = 2;
loff_t seekoff;
@@ -1323,6 +1323,7 @@ xfs_seek_data(
}
fsbno = map[1].br_startoff + map[1].br_blockcount;
+ continue;
}
/*
@@ -1366,6 +1367,7 @@ xfs_seek_data(
}
fsbno = map[1].br_startoff + map[1].br_blockcount;
+ continue;
}
/* Landed in a delay allocated extent or a real data extent */
@@ -1376,11 +1378,14 @@ xfs_seek_data(
break;
}
+ error = ENXIO;
+ goto out_lock;
+ } while (XFS_FSB_TO_B(mp, fsbno) < isize);
+
/* Return ENXIO if beyond eof */
- if (XFS_FSB_TO_B(mp, fsbno) > isize) {
- error = ENXIO;
- goto out_lock;
- }
+ if (XFS_FSB_TO_B(mp, fsbno) > isize) {
+ error = ENXIO;
+ goto out_lock;
}
if (offset < start)
@@ -1424,7 +1429,7 @@ xfs_seek_hole(
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
len = XFS_B_TO_FSB(mp, isize);
- for (;;) {
+ do {
struct xfs_bmbt_irec map[2];
int nmap = 2;
loff_t seekoff;
@@ -1507,6 +1512,7 @@ xfs_seek_hole(
}
fsbno = map[1].br_startoff + map[1].br_blockcount;
+ continue;
}
/*
@@ -1555,6 +1561,7 @@ xfs_seek_hole(
}
fsbno = map[1].br_startoff + map[1].br_blockcount;
+ continue;
}
/* Landed in a hole, its fine to return */
@@ -1564,11 +1571,14 @@ xfs_seek_hole(
break;
}
+ error = ENXIO;
+ goto out_lock;
+ } while (XFS_FSB_TO_B(mp, fsbno) < isize);
+
/* Return ENXIO if beyond eof */
- if (XFS_FSB_TO_B(mp, fsbno) > isize) {
- error = ENXIO;
- goto out_lock;
- }
+ if (XFS_FSB_TO_B(mp, fsbno) > isize) {
+ error = ENXIO;
+ goto out_lock;
}
if (offset < start)
--Mark Tinguely
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-11 5:45 ` Jeff Liu
2012-01-11 21:06 ` Mark Tinguely
@ 2012-01-11 21:07 ` Mark Tinguely
2012-01-12 13:29 ` Jeff Liu
2012-01-12 16:39 ` Christoph Hellwig
2012-01-11 21:12 ` Mark Tinguely
2 siblings, 2 replies; 20+ messages in thread
From: Mark Tinguely @ 2012-01-11 21:07 UTC (permalink / raw)
To: jeff.liu; +Cc: Christoph Hellwig, Ben Myers, Chris Mason, xfs
xfs_bmapi_read() returns the br_state == XFS_EXT_NORM for a hole.
There are a couple places that a hole can trigger a data test.
BTW, I could not generate a large enough hole that xfs_bmapi_read()
would return as more than one hole entry, so I will ignore those
situations and just list the couple places that a hole may be match
a data rule:
in xfs_seek_data():
+ /*
+ * Landed in an unwritten extent, try to find out the data
+ * buffer offset from page cache firstly. If nothing was
+ * found, treat it as a hole, and skip to check the next
+ * extent, something just like above.
+ */
+ if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ offset = max_t(loff_t, seekoff, offset);
+ break;
+ }
+
+ /* No data extent at the given offset */
+ if (nmap == 1) {
+ error = ENXIO;
+ break;
+ }
+
+ if (map[1].br_state == XFS_EXT_NORM ||
^^^ could be a hole and not data^^^
I think you need to add back the br_startblock test:
+ if ((map[1].br_state == XFS_EXT_NORM &&
+ map[1].br_startblock != HOLESTARTBLOCK) ||
in xfs_seek_hole():
+ /*
+ * Landed in a delay allocated extent or a real data extent,
+ * if the next extent is landed in a hole or in an unwritten
+ * extent but without data committed in the page cache, return
+ * its offset. If the next extent has dirty data in page cache,
+ * but its offset starts past both the start block of the map
+ * and the seek offset, it still be a hole.
+ */
+ if (map[0].br_startblock == DELAYSTARTBLOCK ||
+ map[0].br_state == XFS_EXT_NORM) {
^^^ could be a hole ^^^
and this only matters because this test is checked before the next test:
+
+ /* Landed in a hole, its fine to return */
+ if (map[0].br_startblock == HOLESTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[0].br_startoff));
+ break;
+ }
Switching the order of these two tests would return the immediate offset
starting a hole seek at the offset of a hole.
None of these conditions will result in data corruption, only earlier
detection of a hole.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-11 5:45 ` Jeff Liu
2012-01-11 21:06 ` Mark Tinguely
2012-01-11 21:07 ` Mark Tinguely
@ 2012-01-11 21:12 ` Mark Tinguely
2012-01-12 13:52 ` Jeff Liu
2 siblings, 1 reply; 20+ messages in thread
From: Mark Tinguely @ 2012-01-11 21:12 UTC (permalink / raw)
To: jeff.liu; +Cc: Christoph Hellwig, Ben Myers, Chris Mason, xfs
xfs_has_unwritten_buffer() always returns the offset of the first
dirty unwritten page. This can cause xfs_seek_data() and xfs_seek_hole()
to give the wrong results in certain circumstances.
In xfs_seek_data(), every page past first dirty/unwritten page in the
unwritten extent will be reported as data.
in xfs_seek_data():
+ /*
+ * Landed in an unwritten extent, try to find out the data
+ * buffer offset from page cache firstly. If nothing was
+ * found, treat it as a hole, and skip to check the next
+ * extent, something just like above.
+ */
+ if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ offset = max_t(loff_t, seekoff, offset);
+ break;
+ }
Since the xfs_has_unwritten_buffer() returns the offset of the first
dirty/unwritten page (the first page in this example), the max_t()
comparison will say that every page after the first dirty page has
data.
-----
xfs_seek_hole() can only find a hole if it precedes the first dirty
page.
in xfs_seek_hole():
+ /*
+ * Landed in an unwritten extent, try to lookup the page
+ * cache to find out if there is dirty data or not. If
+ * nothing was found, treate it as a hole. If there has
+ * dirty data and its offset starts past both the start
+ * block of the map and the current seek offset, it should
+ * be treated as hole too. Otherwise, go through the next
+ * extent to fetch holes.
+ */
+ if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ if (offset > max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[0].br_startoff))) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[0].br_startoff));
+ break;
+ }
+ } else {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[0].br_startoff));
+ break;
+ }
--Mark Tinguely.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-11 15:43 ` Ben Myers
@ 2012-01-11 22:28 ` Ben Myers
2012-01-12 13:21 ` Jeff Liu
2012-01-12 12:53 ` Jeff Liu
1 sibling, 1 reply; 20+ messages in thread
From: Ben Myers @ 2012-01-11 22:28 UTC (permalink / raw)
To: Jeff Liu; +Cc: Christoph Hellwig, Chris Mason, xfs
Hey Jeff,
On Wed, Jan 11, 2012 at 09:43:18AM -0600, Ben Myers wrote:
> Here are a few additional minor comments from yesterday.
>
> I'm looking forward to seeing your next version, and I'm still working
> through this one.
>
> I would like to suggest that you split this into two patches. The first
> patch should be the 'simple' implementation that that you began with
> that only looks at extents, and assumes that unwritten extents contain
> data. The second patch can remove the assumption that unwritten extents
> contain data, and go over pages over the extent to determine if it is
> clean. I feel we have a better chance of coming to consensus that the
> first patch is correct in the near term, and then we can move on to the
> more complicated matter of whether the unwritten extent can be treated
> as a hole safe in the knowledge that the initial implementation was
> awesome.
Ok, since I'm the jackass who is asking you to do the extra work I'll
try to be of assistance. Understand that at this point I'm trying to
make sure that I understand your code fully. I'm not trying to give you
a hard time or make your life miserable.
Here I am assuming that we'll treat unwritten extents as containing data
and leaving the enhancement of probing unwritten extents for later.
This is a table of some of the results from xfs_bmapi_read, and what
should be done in each situation.
SEEK_DATA:
Where nmap = 0:
return ENXIO. * maybe not possible, unless len = 0?
Where nmap = 1:
map[0]
written data @ offset
delay data @ offset
unwritten data @ offset
hole return ENXIO? * empty file?
Where nmap = 2:
map[0] map[1]
written written data @ offset
written delay data @ offset
written unwritten data @ offset
written hole data @ offset
delay written data @ offset
delay delay data @ offset * maybe not possible?
delay unwritten data @ offset
delay hole data @ offset
unwritten written data @ offset
unwritten delay data @ offset
unwritten unwritten data @ offset
unwritten hole data @ offset
hole written data @ map[1].br_startoff
hole delay data @ map[1].br_startoff
hole unwritten data @ map[1].br_startoff
hole hole * not possible
(DELAYSTARTBLOCK and HOLESTARTBLOCK are both 'isnullstartblock')
written:
(!isnullstartblock(map.br_startblock) && map.br_state == XFS_EXT_NORMAL)
delay:
map.br_startblock == DELAYSTARTBLOCK
unwritten:
map.br_state == XFS_EXT_UNWRITTEN
hole:
map.br_startblock == HOLESTARTBLOCK
xfs_seek_data(file, startoff)
{
loff_t offset;
int error;
take ilock
isize = i_size_read
start_fsb = XFS_B_TO_FSBT(startoff)
end_fsb = XFS_B_TO_FSB(i_size) # inode size
error = xfs_bmapi_read(map, &nmap)
if (error)
goto out_unlock;
if (nmap == 0) {
/*
* return an error. I'm not sure that this necessarily
* means we're reading after EOF, since it looks like
* xfs_bmapi_read would return one hole in that case.
*/
error = ERROR /* EIO? */
goto out_unlock
}
/* check map[0] first */
if (map[0].br_state == XFS_EXT_NORMAL &&
!isnullstartblock(map[0].br_startblock) {
/*
* startoff is already within data. remember
* that it can anywhere within start_fsb
*/
offset = startoff
} else if (map[0].br_startblock == DELAYSTARTBLOCK) {
offset = startoff
} else if (map[0].br_state == XFS_EXT_UNWRITTEN) {
offset = startoff;
} else if (map[0].br_startblock == HOLESTARTBLOCK) {
if (nmap == 1) {
/*
* finding a hole in map[0] and nothing in
* map[1] probably means that we are reading
* after eof
*/
ASSERT(startoff >= isize)
error = ENXIO
goto out_unlock
}
/*
* we have two mappings, and need to check map[1] to see
* if there is data.
*/
if (map[1].br_state == XFS_EXT_NORMAL &&
!isnullstartblock(map[1].br_startblock)) {
offset = XFS_FSB_TO_B(map[1].br_startoff);
} else if (map[1].br_startblock == DELAYSTARTBLOCK) {
offset = XFS_FSB_TO_B(map[1].br_startoff);
} else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
offset = XFS_FSB_TO_B(map[1].br_startoff);
} else if (map[1].br_startblock == HOLESTARTBLOCK) {
/*
* this should never happen, but we could
*/
ASSERT(startoff >= isize);
error = ENXIO
/* BUG(); */
} else {
offset = startoff
/* BUG(); */
}
} else {
offset = startoff
/* BUG(); */
}
out_unlock:
drop ilock
if (error)
return -error;
return offset;
}
I think that is sufficiently straightforward that even I can understand
it, or am I off my rocker? IMO it's not that bad that we have to write
the if/else to determine extent type twice and that there is some
duplication when setting the offset. When you come back to enhance it
further by probing unwritten extents I think a goto would probably be
more readable than trying to shoehorn this into a for/do, but that's
just me.
Jeff, I hope that doesn't ruffle any feathers. I know I came to the
party a bit late. After a break I am going to go look at your code for
xfs_seek_data again. I think I'll understand it better now. After that
I am going to look into SEEK_HOLE...
Regards,
Ben
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-11 15:43 ` Ben Myers
2012-01-11 22:28 ` Ben Myers
@ 2012-01-12 12:53 ` Jeff Liu
1 sibling, 0 replies; 20+ messages in thread
From: Jeff Liu @ 2012-01-12 12:53 UTC (permalink / raw)
To: Ben Myers; +Cc: Christoph Hellwig, Chris Mason, xfs
Hi Ben,
On 01/11/2012 11:43 PM, Ben Myers wrote:
> Hey Jeff,
>
> Here are a few additional minor comments from yesterday.
>
> I'm looking forward to seeing your next version, and I'm still working
> through this one.
>
> I would like to suggest that you split this into two patches. The first
> patch should be the 'simple' implementation that that you began with
> that only looks at extents, and assumes that unwritten extents contain
> data. The second patch can remove the assumption that unwritten extents
> contain data, and go over pages over the extent to determine if it is
> clean. I feel we have a better chance of coming to consensus that the
> first patch is correct in the near term, and then we can move on to the
> more complicated matter of whether the unwritten extent can be treated
> as a hole safe in the knowledge that the initial implementation was
> awesome.
>
> Regards,
> Ben
>
> On Fri, Jan 06, 2012 at 09:28:58PM +0800, Jeff Liu wrote:
>> This is a revised patch according to Christoph's comments at V4.
>>
>> Changes to V5:
>> --------------
>> * Revise xfs_has_unwritten_buffer() to lookup pages match tag.
>> * For unwritten extents, in both xfs_seek_data() and xfs_seek_hole(), call xfs_has_unwritten_buffer() to search
>> DIRTY pages firstly, if no dirty data found, call it again to search WRITEBACK pages.
>> * In xfs_seek_hole(), if dirty data was found in page cache for an unwritten extents, but its start offset past the start block
>> of the map, treat it as a hole, returns the offset if possible(data_buffer_offset > max(seek_offset, start_block_of_map)).
>>
>> Tests:
>> ------
>> seek sanity tester:
>> http://patchwork.xfs.org/patch/3108/
>> seek copy tester:
>> http://patchwork.xfs.org/patch/3109/
>>
>>
>> Thanks,
>> -Jeff
>>
>> Signed-off-by: Jie Liu <jeff.liu@oracle.com>
>>
>> ---
>> fs/xfs/xfs_file.c | 466 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 1 files changed, 465 insertions(+), 1 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index 753ed9b..24ae40a 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -38,6 +38,7 @@
>>
>> #include <linux/dcache.h>
>> #include <linux/falloc.h>
>> +#include <linux/pagevec.h>
>>
>> static const struct vm_operations_struct xfs_file_vm_ops;
>>
>> @@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite(
>> return block_page_mkwrite(vma, vmf, xfs_get_blocks);
>> }
>>
>> +/*
>> + * Probe the data buffer offset in page cache for unwritten extents.
>> + * Fetch all the pages match @tag, and iterate each page to find out
>> + * if a buffer head state has BH_Unwritten or BH_Uptodate set.
>> + */
>> +STATIC bool
>> +xfs_has_unwritten_buffer(
>> + struct inode *inode,
>> + struct xfs_bmbt_irec *map,
>> + int tag,
>> + loff_t *offset)
>> +{
>> + struct xfs_inode *ip = XFS_I(inode);
>> + struct xfs_mount *mp = ip->i_mount;
>> + struct pagevec pvec;
>> + pgoff_t index;
>> + pgoff_t end;
>> + bool found = false;
>> +
>> + pagevec_init(&pvec, 0);
>> +
>> + index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
>> + end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
>> + >> PAGE_CACHE_SHIFT;
>> +
>> + do {
>> + unsigned int i;
>> + unsigned nr_pages;
>> + int want = min_t(pgoff_t, end - index,
>> + (pgoff_t)PAGEVEC_SIZE - 1) + 1;
>> + nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
>> + &index, tag, want);
>> + if (nr_pages == 0)
>> + break;
>> +
>> + for (i = 0; i < nr_pages; i++) {
>> + struct page *page = pvec.pages[i];
>> + struct buffer_head *bh;
>> + struct buffer_head *head;
>> + xfs_fileoff_t last;
>> +
>> + if (!page_has_buffers(page))
>> + continue;
>> +
>> + /*
>> + * There is no need to check the following pages
>> + * if the current page offset is out of range.
>> + */
>> + if (page->index > end)
>> + goto out;
>> +
>> + last = XFS_B_TO_FSBT(mp,
>> + page->index << PAGE_CACHE_SHIFT);
>> +
>> + bh = head = page_buffers(page);
>> + do {
>> + /*
>> + * An extent in XFS_EXT_UNWRITTEN have disk
>> + * blocks already mapped to it, but no data
>> + * has been committed to them yet. If it has
>> + * dirty data in the page cache it can be
>> + * identified by having BH_Unwritten set in
>> + * each buffer. Also, the buffer head state
>> + * might be in BH_Uptodate too if the buffer
>> + * writeback procedure was fired, we need to
>> + * examine it as well.
>> + */
>> + if (buffer_unwritten(bh) ||
>> + buffer_uptodate(bh)) {
>> + found = true;
>> + *offset = XFS_FSB_TO_B(mp, last);
>> + goto out;
>> + }
>> + last++;
>> + } while ((bh = bh->b_this_page) != head);
>> + }
>> +
>> + /*
>> + * If the number of probed pages less than our desired,
>> + * there should no more pages mapped, search done.
>> + */
>> + if (nr_pages < want)
>> + break;
>> +
>> + index = pvec.pages[i - 1]->index + 1;
>> + pagevec_release(&pvec);
>> + } while (index < end);
>> +
>> +out:
>> + pagevec_release(&pvec);
>> + if (!found)
>> + *offset = 0;
>> +
>> + return found;
>> +}
>> +
>> +STATIC loff_t
>> +xfs_seek_data(
>> + struct file *file,
>> + loff_t start)
>> +{
>> + struct inode *inode = file->f_mapping->host;
>> + struct xfs_inode *ip = XFS_I(inode);
>> + struct xfs_mount *mp = ip->i_mount;
>> + xfs_fsize_t isize = i_size_read(inode);
>> + loff_t offset = 0;
>> + struct xfs_ifork *ifp;
>> + xfs_fileoff_t fsbno;
>> + xfs_filblks_t len;
>> + int lock;
>> + int error;
>> +
>> + lock = xfs_ilock_map_shared(ip);
>> +
>> + if (start >= isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>
> In Christoph's v3 review he asked you to move this check to after the
> lock is taken, which you've done. Note that you've read from ip->i_size
> using i_size_read before taking the lock, so isize could be stale. Call
> i_size_read only after taking the ilock shared.
Thank you for pointing this out!
>
>> +
>> + fsbno = XFS_B_TO_FSBT(mp, start);
>> + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>> + len = XFS_B_TO_FSB(mp, isize);
>
> Put calculation of start_fsb and end_fsb next to each other.
ok. I will take care the same issues below too.
>
>> +
>> + for (;;) {
>> + struct xfs_bmbt_irec map[2];
>> + int nmap = 2;
>> + loff_t seekoff;
>> +
>> + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
>> + XFS_BMAPI_ENTIRE);
>> + if (error)
>> + goto out_lock;
>> +
>> + /* No extents at given offset, must be beyond EOF */
>> + if (!nmap) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + seekoff = XFS_FSB_TO_B(mp, fsbno);
>> + /*
>> + * Landed in a hole, skip to check the next extent.
>> + * If the next extent landed in an in-memory data extent,
>> + * or it is a normal extent, its fine to return.
>> + * If the next extent landed in a hole extent, calculate
>> + * the start file system block number for the next scan.
>> + * If the next extent landed in an unwritten extent, we
>> + * need to lookup the page cache to examine the data
>> + * buffer offset, if nothing found, treat it as a hole
>> + * extent too.
>> + */
>> + if (map[0].br_startblock == HOLESTARTBLOCK) {
>> + /*
>> + * Return ENXIO if no data extent behind
>> + * the given offset. In this case, the seek
>> + * offset should be landed in a hole.
>> + */
>> + if (nmap == 1) {
>> + error = ENXIO;
>> + break;
>> + }
>> +
>> + if (map[1].br_state == XFS_EXT_NORM ||
>> + map[1].br_startblock == DELAYSTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> +
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + offset = max_t(loff_t, seekoff, offset);
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /*
>> + * Landed in an unwritten extent, try to find out the data
>> + * buffer offset from page cache firstly. If nothing was
>> + * found, treat it as a hole, and skip to check the next
>> + * extent, something just like above.
>> + */
>> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + offset = max_t(loff_t, seekoff, offset);
>> + break;
>> + }
>> +
>> + /* No data extent at the given offset */
>> + if (nmap == 1) {
>> + error = ENXIO;
>> + break;
>> + }
>> +
>> + if (map[1].br_state == XFS_EXT_NORM ||
>> + map[1].br_startblock == DELAYSTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + offset = max_t(loff_t, seekoff, offset);
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /* Landed in a delay allocated extent or a real data extent */
>> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
>> + map[0].br_state == XFS_EXT_NORM) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[0].br_startoff));
>> + break;
>> + }
>> +
>> + /* Return ENXIO if beyond eof */
>> + if (XFS_FSB_TO_B(mp, fsbno) > isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> + }
>> +
>> + if (offset < start)
>> + offset = start;
>> +
>> + if (offset != file->f_pos)
>> + file->f_pos = offset;
>> +
>> +out_lock:
>> + xfs_iunlock_map_shared(ip, lock);
>> + if (error)
>> + return -error;
>> +
>> + return offset;
>> +}
>> +
>> +STATIC loff_t
>> +xfs_seek_hole(
>> + struct file *file,
>> + loff_t start)
>> +{
>> + struct inode *inode = file->f_mapping->host;
>> + struct xfs_inode *ip = XFS_I(inode);
>> + struct xfs_mount *mp = ip->i_mount;
>> + xfs_fsize_t isize = i_size_read(inode);
>
> Call i_size_read under ilock.
>
>> + loff_t offset = 0;
>> + struct xfs_ifork *ifp;
>> + xfs_fileoff_t fsbno;
>> + xfs_filblks_t len;
>> + int lock;
>
> lock should be a uint
>
>> + int error;
>> +
>> + lock = xfs_ilock_map_shared(ip);
>> +
>> + if (start >= isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + fsbno = XFS_B_TO_FSBT(mp, start);
>> + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>> + len = XFS_B_TO_FSB(mp, isize);
>
> Calculation of start_fsb and end_fsb look nicer next to each other.
>
>> +
>> + for (;;) {
>> + struct xfs_bmbt_irec map[2];
>> + int nmap = 2;
>> + loff_t seekoff;
>> +
>> + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
>> + XFS_BMAPI_ENTIRE);
>> + if (error)
>> + goto out_lock;
>> +
>> + /* No extents at given offset, must be beyond EOF */
>> + if (!nmap) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + seekoff = XFS_FSB_TO_B(mp, fsbno);
>> + /*
>> + * Landed in an unwritten extent, try to lookup the page
>> + * cache to find out if there is dirty data or not. If
>> + * nothing was found, treate it as a hole. If there has
>> + * dirty data and its offset starts past both the start
>> + * block of the map and the current seek offset, it should
>> + * be treated as hole too. Otherwise, go through the next
>> + * extent to fetch holes.
>> + */
>> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + if (offset > max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[0].br_startoff))) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[0].br_startoff));
>> + break;
>> + }
>> + } else {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[0].br_startoff));
>> + break;
>> + }
>> +
>> + /*
>> + * No more extent at the given offst, return the total
>> + * file size.
>> + */
>> + if (nmap == 1) {
>> + offset = isize;
>> + break;
>> + }
>> +
>> + if (map[1].br_startblock == HOLESTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + if (offset > max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff))) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff));
>> + break;
>> + }
>> + } else {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /*
>> + * Landed in a delay allocated extent or a real data extent,
>> + * if the next extent is landed in a hole or in an unwritten
>> + * extent but without data committed in the page cache, return
>> + * its offset. If the next extent has dirty data in page cache,
>> + * but its offset starts past both the start block of the map
>> + * and the seek offset, it still be a hole.
>> + */
>> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
>> + map[0].br_state == XFS_EXT_NORM) {
>> + /*
>> + * No more extent at the give offset, return the
>> + * total file size.
>> + */
>> + if (nmap == 1) {
>> + offset = isize;
>> + break;
>> + }
>> +
>> + if (map[1].br_startblock == HOLESTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + if (offset > max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff))) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff));
>> + break;
>> + }
>> + } else {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /* Landed in a hole, its fine to return */
>> + if (map[0].br_startblock == HOLESTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[0].br_startoff));
>> + break;
>> + }
>> +
>> + /* Return ENXIO if beyond eof */
>> + if (XFS_FSB_TO_B(mp, fsbno) > isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> + }
>> +
>> + if (offset < start)
>> + offset = start;
>> +
>> + if (offset != file->f_pos)
>> + file->f_pos = offset;
>> +
>> +out_lock:
>
> name this out_unlock
ok. :)
>
>> + xfs_iunlock_map_shared(ip, lock);
>> + if (error)
>> + return -error;
>> +
>> + return offset;
>> +}
>> +
>> +STATIC loff_t
>> +xfs_file_llseek(
>> + struct file *file,
>> + loff_t offset,
>> + int origin)
>> +{
>> + switch (origin) {
>> + case SEEK_END:
>> + case SEEK_CUR:
>> + case SEEK_SET:
>> + return generic_file_llseek(file, offset, origin);
>> + case SEEK_DATA:
>> + return xfs_seek_data(file, offset);
>> + case SEEK_HOLE:
>> + return xfs_seek_hole(file, offset);
>> + default:
>> + return -EOPNOTSUPP;
>
> I suggest -EINVAL here, as per http://linux.die.net/man/2/lseek
Definitely! I have gone through other file systems have SEEK_XXX stuff
support, OCFS2 returns -EINVAL in this case.
Btrfs will return -EINVAL too.
Thanks,
-Jeff
>
>> + }
>> +}
>> +
>> const struct file_operations xfs_file_operations = {
>> - .llseek = generic_file_llseek,
>> + .llseek = xfs_file_llseek,
>> .read = do_sync_read,
>> .write = do_sync_write,
>> .aio_read = xfs_file_aio_read,
>> --
>> 1.7.4.1
>>
>>
>>
>>
>>
>>
>> _______________________________________________
>> xfs mailing list
>> xfs@oss.sgi.com
>> http://oss.sgi.com/mailman/listinfo/xfs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-11 22:28 ` Ben Myers
@ 2012-01-12 13:21 ` Jeff Liu
0 siblings, 0 replies; 20+ messages in thread
From: Jeff Liu @ 2012-01-12 13:21 UTC (permalink / raw)
To: Ben Myers; +Cc: Christoph Hellwig, Chris Mason, xfs
Hi Ben,
Thanks a lot for your so much detailed info!
On 01/12/2012 06:28 AM, Ben Myers wrote:
> Hey Jeff,
>
> On Wed, Jan 11, 2012 at 09:43:18AM -0600, Ben Myers wrote:
>> Here are a few additional minor comments from yesterday.
>>
>> I'm looking forward to seeing your next version, and I'm still working
>> through this one.
>>
>> I would like to suggest that you split this into two patches. The first
>> patch should be the 'simple' implementation that that you began with
>> that only looks at extents, and assumes that unwritten extents contain
>> data. The second patch can remove the assumption that unwritten extents
>> contain data, and go over pages over the extent to determine if it is
>> clean. I feel we have a better chance of coming to consensus that the
>> first patch is correct in the near term, and then we can move on to the
>> more complicated matter of whether the unwritten extent can be treated
>> as a hole safe in the knowledge that the initial implementation was
>> awesome.
>
> Ok, since I'm the jackass who is asking you to do the extra work I'll
> try to be of assistance. Understand that at this point I'm trying to
> make sure that I understand your code fully. I'm not trying to give you
> a hard time or make your life miserable.
>
> Here I am assuming that we'll treat unwritten extents as containing data
> and leaving the enhancement of probing unwritten extents for later.
Do you means I only need to post a patch to treat unwritten extents as
data next time, and then try to work out another patch for probing
unwritten extents until the first one became stable?
>
> This is a table of some of the results from xfs_bmapi_read, and what
> should be done in each situation.
>
> SEEK_DATA:
>
> Where nmap = 0:
> return ENXIO. * maybe not possible, unless len = 0?
Per my previous tryout, this situation can be triggered when no extent
behind the seek offset for SEEK_HOLE; for SEEK_DATA, it will be caught
by the following checking:
if (start >= isize)
return -ENXIO;
>
> Where nmap = 1:
> map[0]
> written data @ offset
> delay data @ offset
> unwritten data @ offset
> hole return ENXIO? * empty file?
>
> Where nmap = 2:
> map[0] map[1]
> written written data @ offset
> written delay data @ offset
> written unwritten data @ offset
> written hole data @ offset
> delay written data @ offset
> delay delay data @ offset * maybe not possible?
Hmm, maybe we can design a case to trigger it out later. :-P.
I'm going to write the patch by referring to the following codes.
> delay unwritten data @ offset
> delay hole data @ offset
> unwritten written data @ offset
> unwritten delay data @ offset
> unwritten unwritten data @ offset
> unwritten hole data @ offset
> hole written data @ map[1].br_startoff
> hole delay data @ map[1].br_startoff
> hole unwritten data @ map[1].br_startoff
> hole hole * not possible
>
> (DELAYSTARTBLOCK and HOLESTARTBLOCK are both 'isnullstartblock')
>
> written:
> (!isnullstartblock(map.br_startblock) && map.br_state == XFS_EXT_NORMAL)
> delay:
> map.br_startblock == DELAYSTARTBLOCK
>
> unwritten:
> map.br_state == XFS_EXT_UNWRITTEN
>
> hole:
> map.br_startblock == HOLESTARTBLOCK
>
> xfs_seek_data(file, startoff)
> {
> loff_t offset;
> int error;
>
> take ilock
>
> isize = i_size_read
>
> start_fsb = XFS_B_TO_FSBT(startoff)
> end_fsb = XFS_B_TO_FSB(i_size) # inode size
>
> error = xfs_bmapi_read(map, &nmap)
> if (error)
> goto out_unlock;
>
> if (nmap == 0) {
> /*
> * return an error. I'm not sure that this necessarily
> * means we're reading after EOF, since it looks like
> * xfs_bmapi_read would return one hole in that case.
> */
>
> error = ERROR /* EIO? */
> goto out_unlock
> }
>
> /* check map[0] first */
> if (map[0].br_state == XFS_EXT_NORMAL &&
> !isnullstartblock(map[0].br_startblock) {
> /*
> * startoff is already within data. remember
> * that it can anywhere within start_fsb
> */
> offset = startoff
> } else if (map[0].br_startblock == DELAYSTARTBLOCK) {
> offset = startoff
> } else if (map[0].br_state == XFS_EXT_UNWRITTEN) {
> offset = startoff;
> } else if (map[0].br_startblock == HOLESTARTBLOCK) {
> if (nmap == 1) {
> /*
> * finding a hole in map[0] and nothing in
> * map[1] probably means that we are reading
> * after eof
> */
> ASSERT(startoff >= isize)
> error = ENXIO
> goto out_unlock
> }
>
> /*
> * we have two mappings, and need to check map[1] to see
> * if there is data.
> */
> if (map[1].br_state == XFS_EXT_NORMAL &&
> !isnullstartblock(map[1].br_startblock)) {
> offset = XFS_FSB_TO_B(map[1].br_startoff);
> } else if (map[1].br_startblock == DELAYSTARTBLOCK) {
> offset = XFS_FSB_TO_B(map[1].br_startoff);
> } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> offset = XFS_FSB_TO_B(map[1].br_startoff);
> } else if (map[1].br_startblock == HOLESTARTBLOCK) {
> /*
> * this should never happen, but we could
> */
> ASSERT(startoff >= isize);
> error = ENXIO
> /* BUG(); */
> } else {
> offset = startoff
> /* BUG(); */
> }
> } else {
> offset = startoff
> /* BUG(); */
> }
> out_unlock:
> drop ilock
> if (error)
> return -error;
>
> return offset;
> }
>
> I think that is sufficiently straightforward that even I can understand
> it, or am I off my rocker? IMO it's not that bad that we have to write
> the if/else to determine extent type twice and that there is some
> duplication when setting the offset. When you come back to enhance it
> further by probing unwritten extents I think a goto would probably be
> more readable than trying to shoehorn this into a for/do, but that's
> just me.
>
> Jeff, I hope that doesn't ruffle any feathers. I know I came to the
> party a bit late. After a break I am going to go look at your code for
> xfs_seek_data again. I think I'll understand it better now. After that
> I am going to look into SEEK_HOLE...
Thanks you!
-Jeff
>
> Regards,
> Ben
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-11 21:07 ` Mark Tinguely
@ 2012-01-12 13:29 ` Jeff Liu
2012-01-12 16:39 ` Christoph Hellwig
1 sibling, 0 replies; 20+ messages in thread
From: Jeff Liu @ 2012-01-12 13:29 UTC (permalink / raw)
To: Mark Tinguely; +Cc: Christoph Hellwig, Ben Myers, Chris Mason, xfs
Hi Mark,
Thanks for your comments!
On 01/12/2012 05:07 AM, Mark Tinguely wrote:
>
> xfs_bmapi_read() returns the br_state == XFS_EXT_NORM for a hole.
Yes, this is key point I have missed before.
> There are a couple places that a hole can trigger a data test.
> BTW, I could not generate a large enough hole that xfs_bmapi_read()
> would return as more than one hole entry, so I will ignore those
> situations and just list the couple places that a hole may be match
> a data rule:
>
> in xfs_seek_data():
> + /*
> + * Landed in an unwritten extent, try to find out the data
> + * buffer offset from page cache firstly. If nothing was
> + * found, treat it as a hole, and skip to check the next
> + * extent, something just like above.
> + */
> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + offset = max_t(loff_t, seekoff, offset);
> + break;
> + }
> +
> + /* No data extent at the given offset */
> + if (nmap == 1) {
> + error = ENXIO;
> + break;
> + }
> +
> + if (map[1].br_state == XFS_EXT_NORM ||
> ^^^ could be a hole and not data^^^
>
> I think you need to add back the br_startblock test:
>
> + if ((map[1].br_state == XFS_EXT_NORM &&
> + map[1].br_startblock != HOLESTARTBLOCK) ||
Ok, I'll add !isnullstartblock() test for normal extents test.
>
>
> in xfs_seek_hole():
> + /*
> + * Landed in a delay allocated extent or a real data extent,
> + * if the next extent is landed in a hole or in an unwritten
> + * extent but without data committed in the page cache, return
> + * its offset. If the next extent has dirty data in page cache,
> + * but its offset starts past both the start block of the map
> + * and the seek offset, it still be a hole.
> + */
> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
> + map[0].br_state == XFS_EXT_NORM) {
> ^^^ could be a hole ^^^
>
> and this only matters because this test is checked before the next test:
>
> +
> + /* Landed in a hole, its fine to return */
> + if (map[0].br_startblock == HOLESTARTBLOCK) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[0].br_startoff));
> + break;
> + }
>
>
>
> Switching the order of these two tests would return the immediate offset
> starting a hole seek at the offset of a hole.
looks this issue is caused by missing hole test for extents at
XFS_EXT_NORM state. I'll fix them later.
Thanks,
-Jeff
>
>
> None of these conditions will result in data corruption, only earlier
> detection of a hole.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-11 21:12 ` Mark Tinguely
@ 2012-01-12 13:52 ` Jeff Liu
2012-01-12 15:01 ` Mark Tinguely
0 siblings, 1 reply; 20+ messages in thread
From: Jeff Liu @ 2012-01-12 13:52 UTC (permalink / raw)
To: Mark Tinguely; +Cc: Christoph Hellwig, Ben Myers, Chris Mason, xfs
Hi Mark,
On 01/12/2012 05:12 AM, Mark Tinguely wrote:
> xfs_has_unwritten_buffer() always returns the offset of the first
> dirty unwritten page. This can cause xfs_seek_data() and xfs_seek_hole()
> to give the wrong results in certain circumstances.
Sorry, am was well understood your opinions in this point for now.
IMHO, we can only find and return the data buffer offset at a dirty or
unwritten page once the first page was probed.
>
>
> In xfs_seek_data(), every page past first dirty/unwritten page in the
> unwritten extent will be reported as data.
Hmm, consider the user level utility that make use of SEEK_XXX stuff to
copy data from an offset in source file:
Generally, it will call xfs_seek_data() firstly,
if we read an unwritten extent and there is data buffer was probed in
xfs_seek_data(), it only means we can read file data starting from the
returned offset of xfs_has_unwritten_buffer().
Then it will call xfs_seek_hole() to calculate this extent length.
next, a couple of read()/write() will be called in a loop depending on
the extent length.
[ page 1 ] | [ page 2 ] | [ page 3 ] | .... [ page N ]
|data offset at page 2|
If we got the data offset from page2, and there is no data at page 3,
the user utility call read(2) will returns ZERO, and it will break
immediately.
>
>
> in xfs_seek_data():
> + /*
> + * Landed in an unwritten extent, try to find out the data
> + * buffer offset from page cache firstly. If nothing was
> + * found, treat it as a hole, and skip to check the next
> + * extent, something just like above.
> + */
> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + offset = max_t(loff_t, seekoff, offset);
> + break;
> + }
>
> Since the xfs_has_unwritten_buffer() returns the offset of the first
> dirty/unwritten page (the first page in this example), the max_t()
> comparison will say that every page after the first dirty page has
> data.
>
> -----
>
> xfs_seek_hole() can only find a hole if it precedes the first dirty
> page.
Yes, for my current implementation, it is the case.
Anyway, I need a careful consideration for probing unwritten extent. :-)
Thanks,
-Jeff
>
> in xfs_seek_hole():
> + /*
> + * Landed in an unwritten extent, try to lookup the page
> + * cache to find out if there is dirty data or not. If
> + * nothing was found, treate it as a hole. If there has
> + * dirty data and its offset starts past both the start
> + * block of the map and the current seek offset, it should
> + * be treated as hole too. Otherwise, go through the next
> + * extent to fetch holes.
> + */
> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
> + if (xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_DIRTY,
> + &offset) ||
> + xfs_has_unwritten_buffer(inode, &map[0],
> + PAGECACHE_TAG_WRITEBACK,
> + &offset)) {
> + if (offset > max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[0].br_startoff))) {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp,
> + map[0].br_startoff));
> + break;
> + }
> + } else {
> + offset = max_t(loff_t, seekoff,
> + XFS_FSB_TO_B(mp, map[0].br_startoff));
> + break;
> + }
>
> --Mark Tinguely.
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-12 13:52 ` Jeff Liu
@ 2012-01-12 15:01 ` Mark Tinguely
2012-01-12 16:41 ` Christoph Hellwig
2012-01-13 2:41 ` Jeff Liu
0 siblings, 2 replies; 20+ messages in thread
From: Mark Tinguely @ 2012-01-12 15:01 UTC (permalink / raw)
To: jeff.liu; +Cc: Christoph Hellwig, Ben Myers, Chris Mason, xfs
On 01/12/12 07:52, Jeff Liu wrote:
> Hi Mark,
>
> On 01/12/2012 05:12 AM, Mark Tinguely wrote:
>
>> xfs_has_unwritten_buffer() always returns the offset of the first
>> dirty unwritten page. This can cause xfs_seek_data() and xfs_seek_hole()
>> to give the wrong results in certain circumstances.
>
> Sorry, am was well understood your opinions in this point for now.
> IMHO, we can only find and return the data buffer offset at a dirty or
> unwritten page once the first page was probed.
>
From my tests, xfs_bmapi_read() can only find holes if they cross or
start on a 64KB boundary. It would be nice if unwritten extents were at
least that good at finding holes.
In xfs_has_unwritten_buffer(), could you start searching from the seek
offset? The variable *offset could pass in that seek address and us that
offset as the starting "index" rather than the beginning of the extent?
You start:
index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
Could we do?:
index = XFS_FSB_TO_B(mp, *offset) >> PAGE_CACHE_SHIFT;
And before calling xfs_has_unwritten_buffer():
offset = seekoff;
Also, my idea to find the next data/hole requires that
xfs_has_unwritten_buffer() finds the smallest PAGECACHE_TAG_DIRTY or
PAGECACHE_TAG_WRITEBACK page if any starting at the seek offset.
>>
>> In xfs_seek_data(), every page past first dirty/unwritten page in the
>> unwritten extent will be reported as data.
>
> Hmm, consider the user level utility that make use of SEEK_XXX stuff to
> copy data from an offset in source file:
>
> Generally, it will call xfs_seek_data() firstly,
> if we read an unwritten extent and there is data buffer was probed in
> xfs_seek_data(), it only means we can read file data starting from the
> returned offset of xfs_has_unwritten_buffer().
>
> Then it will call xfs_seek_hole() to calculate this extent length.
> next, a couple of read()/write() will be called in a loop depending on
> the extent length.
>
> [ page 1 ] | [ page 2 ] | [ page 3 ] | .... [ page N ]
> |data offset at page 2|
>
> If we got the data offset from page2, and there is no data at page 3,
> the user utility call read(2) will returns ZERO, and it will break
> immediately.
>
Something like:
loop
s = lseek(fd, off, SEEK_DATA);
if (s == -1)
if we errno == ENXIO
return done /* eof */
else
return errno
e = lseek(fd, s, SEEK_HOLE);
if (e == -1)
return errno
dest = copy from s to e
off = e
end loop (if not eof or other condition)
You will seek for next hole at the found data position. Even if
xfs_has_unwritten_buffer() does the right thing and returns the
dirty/unwritten page starting from seekoff, we need go a page past the
current page (which has data) to look for the next hole.
Something like (again psuedo-code)
loop
offset1 = offset2 = seekoff
xfs_has_unwritten_buffer(seekoff, &offset1, DIRTY)
xfs_has_unwritten_buffer(seekoff, &offset2, WRITEBACK)
d = min(offset1, offset2)
if (d > seekoff OR d == NULL)
return found a hole at seekoff
if (d == seekoff) /* standard case assuming how we
* use SEEK_DATA/SEEK_HOLE
* This is the step your code
* does not perform. It jumps
* to the next extent
*/
seekoff += page size of dirty/writeback **
end while the seekoff < extent size
** here we could jump to the next 64KB boundary and be as accurate as
xfs_bmapi_read().
Good job. This is an important feature.
--Mark Tinguely.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-11 21:06 ` Mark Tinguely
@ 2012-01-12 16:22 ` Christoph Hellwig
2012-01-12 17:50 ` Ben Myers
0 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2012-01-12 16:22 UTC (permalink / raw)
To: Mark Tinguely; +Cc: jeff.liu, Ben Myers, Chris Mason, xfs
With all the complications that we got compared to the initial version,
namely multiple hole extents, dirty unwritten extent detection and
so on I think it's time to stop using xfs_bmapi_read against Dave's
initial suggestion, and switch to using xfs_bmap_search_extents
directly.
The rationale for that is that
a) using xfs_bmapi_read makes hole detection more complex, given
that it has to fill potentially multiple xfs_bmbt_irec structures
instead of skipping over them
b) reading two extents at a time means we have to duplicate all the
detection code.
if we use xfs_bmap_search_extents we need a bit of boilerplate code,
but xfs_seek_data becomes really simple - we just loop over
xfs_bmap_search_extents until we either find an extent or EOF.
If we find an extent and it's unwritten we might have to probe for
dirty areas from one single point, or just skip it but the code is
still simple. xfs_seek_hole is just as simple - if
xfs_bmap_search_extents fits the condition for a hole as written
down in xfs_bmapi_read we've found it, if not we might again have
to do the unwritten extent probing, but just from a single place
instead of duplicating it twice.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-11 21:07 ` Mark Tinguely
2012-01-12 13:29 ` Jeff Liu
@ 2012-01-12 16:39 ` Christoph Hellwig
2012-01-13 2:14 ` Jeff Liu
1 sibling, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2012-01-12 16:39 UTC (permalink / raw)
To: Mark Tinguely; +Cc: Christoph Hellwig, jeff.liu, Ben Myers, Chris Mason, xfs
On Wed, Jan 11, 2012 at 03:07:29PM -0600, Mark Tinguely wrote:
>
> xfs_bmapi_read() returns the br_state == XFS_EXT_NORM for a hole.
> There are a couple places that a hole can trigger a data test.
> BTW, I could not generate a large enough hole that xfs_bmapi_read()
> would return as more than one hole entry, so I will ignore those
> situations and just list the couple places that a hole may be match
> a data rule:
We've been through this before, you need to overflow the 32-bit extent
length counter to get there. Jeff, did you manage to create a test
case for that particular scenario?
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-12 15:01 ` Mark Tinguely
@ 2012-01-12 16:41 ` Christoph Hellwig
2012-01-12 17:39 ` Ben Myers
2012-01-13 2:41 ` Jeff Liu
1 sibling, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2012-01-12 16:41 UTC (permalink / raw)
To: Mark Tinguely; +Cc: Christoph Hellwig, jeff.liu, Ben Myers, Chris Mason, xfs
On Thu, Jan 12, 2012 at 09:01:48AM -0600, Mark Tinguely wrote:
> >Sorry, am was well understood your opinions in this point for now.
> >IMHO, we can only find and return the data buffer offset at a dirty or
> >unwritten page once the first page was probed.
> >
>
> From my tests, xfs_bmapi_read() can only find holes if they cross or
> start on a 64KB boundary. It would be nice if unwritten extents were
> at least that good at finding holes.
Are you testing on ia64 with 64k blocks? :) xfs_bmapi_read will
find holes down to block granularity, that's how it's implemented.
However recent XFS does fairly aggressive preallocation, so you probably
won't find small holes unless you explicitly punch them out using
XFS_IOC_UNRESVSP or fallocate with the hole punch flag.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-12 16:41 ` Christoph Hellwig
@ 2012-01-12 17:39 ` Ben Myers
0 siblings, 0 replies; 20+ messages in thread
From: Ben Myers @ 2012-01-12 17:39 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: jeff.liu, Mark Tinguely, Chris Mason, xfs
On Thu, Jan 12, 2012 at 11:41:48AM -0500, Christoph Hellwig wrote:
> On Thu, Jan 12, 2012 at 09:01:48AM -0600, Mark Tinguely wrote:
> > >Sorry, am was well understood your opinions in this point for now.
> > >IMHO, we can only find and return the data buffer offset at a dirty or
> > >unwritten page once the first page was probed.
> > >
> >
> > From my tests, xfs_bmapi_read() can only find holes if they cross or
> > start on a 64KB boundary. It would be nice if unwritten extents were
> > at least that good at finding holes.
>
> Are you testing on ia64 with 64k blocks? :) xfs_bmapi_read will
> find holes down to block granularity, that's how it's implemented.
> However recent XFS does fairly aggressive preallocation, so you probably
^^^^
it is speculative delay in this case, I think.
-Ben
> won't find small holes unless you explicitly punch them out using
> XFS_IOC_UNRESVSP or fallocate with the hole punch flag.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-12 16:22 ` Christoph Hellwig
@ 2012-01-12 17:50 ` Ben Myers
0 siblings, 0 replies; 20+ messages in thread
From: Ben Myers @ 2012-01-12 17:50 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: jeff.liu, Mark Tinguely, Chris Mason, xfs
Hey,
On Thu, Jan 12, 2012 at 11:22:10AM -0500, Christoph Hellwig wrote:
> With all the complications that we got compared to the initial version,
> namely multiple hole extents, dirty unwritten extent detection and
> so on I think it's time to stop using xfs_bmapi_read against Dave's
> initial suggestion, and switch to using xfs_bmap_search_extents
> directly.
>
> The rationale for that is that
>
> a) using xfs_bmapi_read makes hole detection more complex, given
> that it has to fill potentially multiple xfs_bmbt_irec structures
> instead of skipping over them
> b) reading two extents at a time means we have to duplicate all the
> detection code.
c) having a cursor here means that Jeff can always get the job done
with a single btree search, which could be an important
optimisation for heavily preallocated workloads.
> if we use xfs_bmap_search_extents we need a bit of boilerplate code,
> but xfs_seek_data becomes really simple - we just loop over
> xfs_bmap_search_extents until we either find an extent or EOF.
> If we find an extent and it's unwritten we might have to probe for
> dirty areas from one single point, or just skip it but the code is
> still simple. xfs_seek_hole is just as simple - if
> xfs_bmap_search_extents fits the condition for a hole as written
> down in xfs_bmapi_read we've found it, if not we might again have
> to do the unwritten extent probing, but just from a single place
> instead of duplicating it twice.
I agree that this is a good idea. I would like to reiterate my
suggestion that Jeff go for the 'simple' implementation (assume
unwritten extents contain data) before going about scanning unwritten
extents for holes/data.
Regards,
Ben
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-12 16:39 ` Christoph Hellwig
@ 2012-01-13 2:14 ` Jeff Liu
0 siblings, 0 replies; 20+ messages in thread
From: Jeff Liu @ 2012-01-13 2:14 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Ben Myers, Mark Tinguely, Chris Mason, xfs
On 01/13/2012 12:39 AM, Christoph Hellwig wrote:
> On Wed, Jan 11, 2012 at 03:07:29PM -0600, Mark Tinguely wrote:
>>
>> xfs_bmapi_read() returns the br_state == XFS_EXT_NORM for a hole.
>> There are a couple places that a hole can trigger a data test.
>> BTW, I could not generate a large enough hole that xfs_bmapi_read()
>> would return as more than one hole entry, so I will ignore those
>> situations and just list the couple places that a hole may be match
>> a data rule:
>
> We've been through this before, you need to overflow the 32-bit extent
> length counter to get there. Jeff, did you manage to create a test
> case for that particular scenario?
Ok, I'll try it out.
Thanks,
-Jeff
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
2012-01-12 15:01 ` Mark Tinguely
2012-01-12 16:41 ` Christoph Hellwig
@ 2012-01-13 2:41 ` Jeff Liu
1 sibling, 0 replies; 20+ messages in thread
From: Jeff Liu @ 2012-01-13 2:41 UTC (permalink / raw)
To: Mark Tinguely; +Cc: Christoph Hellwig, Ben Myers, Chris Mason, xfs
On 01/12/2012 11:01 PM, Mark Tinguely wrote:
> On 01/12/12 07:52, Jeff Liu wrote:
>> Hi Mark,
>>
>> On 01/12/2012 05:12 AM, Mark Tinguely wrote:
>>
>>> xfs_has_unwritten_buffer() always returns the offset of the first
>>> dirty unwritten page. This can cause xfs_seek_data() and xfs_seek_hole()
>>> to give the wrong results in certain circumstances.
>>
>> Sorry, am was well understood your opinions in this point for now.
>> IMHO, we can only find and return the data buffer offset at a dirty or
>> unwritten page once the first page was probed.
>>
>
> From my tests, xfs_bmapi_read() can only find holes if they cross or
> start on a 64KB boundary. It would be nice if unwritten extents were at
> least that good at finding holes.
>
>
> In xfs_has_unwritten_buffer(), could you start searching from the seek
> offset? The variable *offset could pass in that seek address and us that
> offset as the starting "index" rather than the beginning of the extent?
>
> You start:
>
> index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
>
> Could we do?:
>
> index = XFS_FSB_TO_B(mp, *offset) >> PAGE_CACHE_SHIFT;
>
> And before calling xfs_has_unwritten_buffer():
> offset = seekoff;
Good catch!
Looks we need to examine the max value between seekoff and
map->br_startoff, before passing it to xfs_has_unwritten_buffer().
For SEEK_DATA, if the seekoff is less than map->br_startoff, IMHO, we
need to pass the map->br_startoff to it.
>
> Also, my idea to find the next data/hole requires that
> xfs_has_unwritten_buffer() finds the smallest PAGECACHE_TAG_DIRTY or
> PAGECACHE_TAG_WRITEBACK page if any starting at the seek offset.
By combining with all your comments below, now I feel a bits clear about
your opinions. :)
I think it is definitely needed if we continue to use the current idea,
i.e, probing the unwritten extent twice(DIRTY, WRITEBACK).
Thanks,
-Jeff
>
>
>>>
>>> In xfs_seek_data(), every page past first dirty/unwritten page in the
>>> unwritten extent will be reported as data.
>>
>> Hmm, consider the user level utility that make use of SEEK_XXX stuff to
>> copy data from an offset in source file:
>>
>> Generally, it will call xfs_seek_data() firstly,
>> if we read an unwritten extent and there is data buffer was probed in
>> xfs_seek_data(), it only means we can read file data starting from the
>> returned offset of xfs_has_unwritten_buffer().
>>
>> Then it will call xfs_seek_hole() to calculate this extent length.
>> next, a couple of read()/write() will be called in a loop depending on
>> the extent length.
>>
>> [ page 1 ] | [ page 2 ] | [ page 3 ] | .... [ page N ]
>> |data offset at page 2|
>>
>> If we got the data offset from page2, and there is no data at page 3,
>> the user utility call read(2) will returns ZERO, and it will break
>> immediately.
>>
>
> Something like:
> loop
> s = lseek(fd, off, SEEK_DATA);
> if (s == -1)
> if we errno == ENXIO
> return done /* eof */
> else
> return errno
>
> e = lseek(fd, s, SEEK_HOLE);
> if (e == -1)
> return errno
>
> dest = copy from s to e
> off = e
> end loop (if not eof or other condition)
>
> You will seek for next hole at the found data position. Even if
> xfs_has_unwritten_buffer() does the right thing and returns the
> dirty/unwritten page starting from seekoff, we need go a page past the
> current page (which has data) to look for the next hole.
>
>
>
> Something like (again psuedo-code)
> loop
> offset1 = offset2 = seekoff
> xfs_has_unwritten_buffer(seekoff, &offset1, DIRTY)
> xfs_has_unwritten_buffer(seekoff, &offset2, WRITEBACK)
> d = min(offset1, offset2)
>
> if (d > seekoff OR d == NULL)
> return found a hole at seekoff
>
> if (d == seekoff) /* standard case assuming how we
> * use SEEK_DATA/SEEK_HOLE
> * This is the step your code
> * does not perform. It jumps
> * to the next extent
> */
> seekoff += page size of dirty/writeback **
> end while the seekoff < extent size
>
> ** here we could jump to the next 64KB boundary and be as accurate as
> xfs_bmapi_read().
>
> Good job. This is an important feature.
>
> --Mark Tinguely.
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2012-01-13 2:42 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-01-06 13:28 Introduce SEEK_DATA/SEEK_HOLE to XFS V5 Jeff Liu
2012-01-10 17:18 ` Ben Myers
2012-01-11 5:45 ` Jeff Liu
2012-01-11 21:06 ` Mark Tinguely
2012-01-12 16:22 ` Christoph Hellwig
2012-01-12 17:50 ` Ben Myers
2012-01-11 21:07 ` Mark Tinguely
2012-01-12 13:29 ` Jeff Liu
2012-01-12 16:39 ` Christoph Hellwig
2012-01-13 2:14 ` Jeff Liu
2012-01-11 21:12 ` Mark Tinguely
2012-01-12 13:52 ` Jeff Liu
2012-01-12 15:01 ` Mark Tinguely
2012-01-12 16:41 ` Christoph Hellwig
2012-01-12 17:39 ` Ben Myers
2012-01-13 2:41 ` Jeff Liu
2012-01-11 15:43 ` Ben Myers
2012-01-11 22:28 ` Ben Myers
2012-01-12 13:21 ` Jeff Liu
2012-01-12 12:53 ` Jeff Liu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox