* [PATCH 01/15] dax: export a low-level __dax_zero_page_range helper
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
This allows XFS to perform zeroing using the iomap infrastructure and
avoid buffer heads.
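For context (an illustrative sketch, not part of the patch): combined with the struct iomap introduced later in this series, the exported helper lets a caller zero a sub-page range without touching buffer heads, along the lines of the iomap_dax_zero() helper that patch 04 adds. The function name zero_via_dax below is invented for illustration:

	/* sketch: zero 'bytes' bytes at byte 'offset' of the page containing 'pos' */
	static int zero_via_dax(loff_t pos, unsigned offset, unsigned bytes,
			struct iomap *iomap)
	{
		/* translate the page-aligned file offset into a 512-byte sector */
		sector_t sector = iomap->blkno +
			(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);

		return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
	}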
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/dax.c | 35 ++++++++++++++++++++---------------
include/linux/dax.h | 7 +++++++
2 files changed, 27 insertions(+), 15 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 90322eb..6d5d744 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1082,6 +1082,23 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+ unsigned int offset, unsigned int length)
+{
+ struct blk_dax_ctl dax = {
+ .sector = sector,
+ .size = PAGE_CACHE_SIZE,
+ };
+
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
+ clear_pmem(dax.addr + offset, length);
+ wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__dax_zero_page_range);
+
/**
* dax_zero_page_range - zero a range within a page of a DAX file
* @inode: The file being truncated
@@ -1117,23 +1134,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
bh.b_bdev = inode->i_sb->s_bdev;
bh.b_size = PAGE_CACHE_SIZE;
err = get_block(inode, index, &bh, 0);
- if (err < 0)
+ if (err < 0 || !buffer_written(&bh))
return err;
- if (buffer_written(&bh)) {
- struct block_device *bdev = bh.b_bdev;
- struct blk_dax_ctl dax = {
- .sector = to_sector(&bh, inode),
- .size = PAGE_CACHE_SIZE,
- };
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
- clear_pmem(dax.addr + offset, length);
- wmb_pmem();
- dax_unmap_atomic(bdev, &dax);
- }
-
- return 0;
+ return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
+ offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 636dd59..8155b81 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -17,12 +17,19 @@ int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
#ifdef CONFIG_FS_DAX
struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+ unsigned int offset, unsigned int length);
#else
static inline struct page *read_dax_sector(struct block_device *bdev,
sector_t n)
{
return ERR_PTR(-ENXIO);
}
+static inline int __dax_zero_page_range(struct block_device *bdev,
+ sector_t sector, unsigned int offset, unsigned int length)
+{
+ return -ENXIO;
+}
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
--
2.1.4
* Re: [PATCH 01/15] dax: export a low-level __dax_zero_page_range helper
From: Verma, Vishal L @ 2016-05-11 22:43 UTC (permalink / raw)
To: hch@lst.de, xfs@oss.sgi.com
Cc: rpeterso@redhat.com, linux-fsdevel@vger.kernel.org
On Mon, 2016-05-09 at 10:47 +0200, Christoph Hellwig wrote:
> This allows XFS to perform zeroing using the iomap infrastructure and
> avoid buffer heads.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/dax.c | 35 ++++++++++++++++++++---------------
> include/linux/dax.h | 7 +++++++
> 2 files changed, 27 insertions(+), 15 deletions(-)
This looks good to me.
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
* [PATCH 02/15] fs: move struct iomap from exportfs.h to a separate header
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bob Peterson <rpeterso@redhat.com>
---
fs/nfsd/blocklayout.c | 1 +
fs/nfsd/blocklayoutxdr.c | 1 +
fs/xfs/xfs_pnfs.c | 1 +
include/linux/exportfs.h | 16 +---------------
include/linux/iomap.h | 21 +++++++++++++++++++++
5 files changed, 25 insertions(+), 15 deletions(-)
create mode 100644 include/linux/iomap.h
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index e55b524..4df16ae 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -2,6 +2,7 @@
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/exportfs.h>
+#include <linux/iomap.h>
#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/pr.h>
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6c3b316..4ebaaf4 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -3,6 +3,7 @@
*/
#include <linux/sunrpc/svc.h>
#include <linux/exportfs.h>
+#include <linux/iomap.h>
#include <linux/nfs4.h>
#include "nfsd.h"
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 3332bae..0a56787 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
+#include <linux/iomap.h>
#include "xfs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index d841450..b03c062 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -6,6 +6,7 @@
struct dentry;
struct iattr;
struct inode;
+struct iomap;
struct super_block;
struct vfsmount;
@@ -187,21 +188,6 @@ struct fid {
* get_name is not (which is possibly inconsistent)
*/
-/* types of block ranges for multipage write mappings. */
-#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
-#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
-#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
-#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
-
-#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
-
-struct iomap {
- sector_t blkno; /* first sector of mapping */
- loff_t offset; /* file offset of mapping, bytes */
- u64 length; /* length of mapping, bytes */
- int type; /* type of mapping */
-};
-
struct export_operations {
int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
struct inode *parent);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
new file mode 100644
index 0000000..1b22197
--- /dev/null
+++ b/include/linux/iomap.h
@@ -0,0 +1,21 @@
+#ifndef LINUX_IOMAP_H
+#define LINUX_IOMAP_H 1
+
+#include <linux/types.h>
+
+/* types of block ranges for multipage write mappings. */
+#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
+#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
+
+#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
+
+struct iomap {
+ sector_t blkno; /* first sector of mapping */
+ loff_t offset; /* file offset of mapping, bytes */
+ u64 length; /* length of mapping, bytes */
+ int type; /* type of mapping */
+};
+
+#endif /* LINUX_IOMAP_H */
--
2.1.4
* [PATCH 03/15] fs: introduce iomap infrastructure
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
Add infrastructure for multipage buffered writes. This is implemented
using a main iterator that applies an actor function to a range that
can be written.
This infrastructure is used to implement a buffered write helper, one
to zero file ranges and one to implement the ->page_mkwrite VM
operation. All of them borrow a fair amount of code from fs/buffer.c
for now by using an internal version of __block_write_begin that
gets passed an iomap and builds the corresponding buffer head.
The file system gets a set of paired ->iomap_begin and ->iomap_end
calls which allow it to map/reserve a range and get a notification
once the write code is finished with it.
Based on earlier code from Dave Chinner.
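To make the calling convention concrete, here is a minimal sketch of the ops pair a filesystem would implement; the name foofs and the placeholder extent values are invented for illustration and are not part of this patch:

	/* hypothetical implementation of the paired calls described above */
	static int foofs_iomap_begin(struct inode *inode, loff_t pos,
			loff_t length, unsigned flags, struct iomap *iomap)
	{
		/* lock/reserve as needed, then describe one contiguous range */
		iomap->type = IOMAP_MAPPED;	/* or HOLE/DELALLOC/UNWRITTEN */
		iomap->offset = pos;		/* must not start after 'pos' */
		iomap->length = length;		/* may be shorter than requested */
		iomap->blkno = 0;		/* first sector (placeholder) */
		iomap->bdev = inode->i_sb->s_bdev;
		return 0;
	}

	static int foofs_iomap_end(struct inode *inode, loff_t pos,
			loff_t length, ssize_t written, unsigned flags,
			struct iomap *iomap)
	{
		/* commit the 'written' bytes, unreserve the remainder */
		return 0;
	}

	static struct iomap_ops foofs_iomap_ops = {
		.iomap_begin	= foofs_iomap_begin,
		.iomap_end	= foofs_iomap_end,
	};

A buffered write then boils down to iomap_file_buffered_write(iocb, from, &foofs_iomap_ops), with iomap_apply() calling the pair once per contiguous mapping.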
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bob Peterson <rpeterso@redhat.com>
---
fs/Kconfig | 3 +
fs/Makefile | 1 +
fs/buffer.c | 76 +++++++++-
fs/internal.h | 3 +
fs/iomap.c | 394 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/iomap.h | 56 ++++++-
6 files changed, 523 insertions(+), 10 deletions(-)
create mode 100644 fs/iomap.c
diff --git a/fs/Kconfig b/fs/Kconfig
index 6725f59..276fcfb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
if BLOCK
+config FS_IOMAP
+ bool
+
source "fs/ext2/Kconfig"
source "fs/ext4/Kconfig"
source "fs/jbd2/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 85b6e13..ed2b632 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o
obj-$(CONFIG_SYSCTL) += drop_caches.o
obj-$(CONFIG_FHANDLE) += fhandle.o
+obj-$(CONFIG_FS_IOMAP) += iomap.o
obj-y += quota/
diff --git a/fs/buffer.c b/fs/buffer.c
index 33be296..d4429c3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -21,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
@@ -1891,8 +1892,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
}
EXPORT_SYMBOL(page_zero_new_buffers);
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block)
+static void
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+ struct iomap *iomap)
+{
+ loff_t offset = block << inode->i_blkbits;
+
+ bh->b_bdev = iomap->bdev;
+
+ /*
+ * Block points to offset in file we need to map, iomap contains
+ * the offset at which the map starts. If the map ends before the
+ * current block, then do not map the buffer and let the caller
+ * handle it.
+ */
+ BUG_ON(offset >= iomap->offset + iomap->length);
+
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ /*
+ * If the buffer is not up to date or beyond the current EOF,
+ * we need to mark it as new to ensure sub-block zeroing is
+ * executed if necessary.
+ */
+ if (!buffer_uptodate(bh) ||
+ (offset >= i_size_read(inode)))
+ set_buffer_new(bh);
+ break;
+ case IOMAP_DELALLOC:
+ if (!buffer_uptodate(bh) ||
+ (offset >= i_size_read(inode)))
+ set_buffer_new(bh);
+ set_buffer_uptodate(bh);
+ set_buffer_mapped(bh);
+ set_buffer_delay(bh);
+ break;
+ case IOMAP_UNWRITTEN:
+ /*
+ * For unwritten regions, we always need to ensure that
+ * sub-block writes zero the regions of the block that we are not
+ * writing to. Set the buffer as new to ensure this.
+ */
+ set_buffer_new(bh);
+ set_buffer_unwritten(bh);
+ /* FALLTHRU */
+ case IOMAP_MAPPED:
+ if (offset >= i_size_read(inode))
+ set_buffer_new(bh);
+ bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+ ((offset - iomap->offset) >> inode->i_blkbits);
+ set_buffer_mapped(bh);
+ break;
+ }
+}
+
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block, struct iomap *iomap)
{
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
unsigned to = from + len;
@@ -1928,9 +1983,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
- if (err)
- break;
+ if (get_block) {
+ err = get_block(inode, block, bh, 1);
+ if (err)
+ break;
+ } else {
+ iomap_to_bh(inode, block, bh, iomap);
+ }
+
if (buffer_new(bh)) {
unmap_underlying_metadata(bh->b_bdev,
bh->b_blocknr);
@@ -1971,6 +2031,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
page_zero_new_buffers(page, from, to);
return err;
}
+
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block)
+{
+ return __block_write_begin_int(page, pos, len, get_block, NULL);
+}
EXPORT_SYMBOL(__block_write_begin);
static int __block_commit_write(struct inode *inode, struct page *page,
diff --git a/fs/internal.h b/fs/internal.h
index b71deee..c0c6f49 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
struct super_block;
struct file_system_type;
+struct iomap;
struct linux_binprm;
struct path;
struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
* buffer.c
*/
extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block, struct iomap *iomap);
/*
* char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644
index 0000000..fac9285
--- /dev/null
+++ b/fs/iomap.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+ void *data, struct iomap *iomap);
+
+/*
+ * Execute an iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ */
+static loff_t
+iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
+ struct iomap_ops *ops, void *data, iomap_actor_t actor)
+{
+ struct iomap iomap = { 0 };
+ loff_t written = 0, ret;
+
+ /*
+ * Need to map a range from start position for count bytes. This can
+ * span multiple pages - it is only guaranteed to return a range of a
+ * single type of pages (e.g. all into a hole, all mapped or all
+ * unwritten). Failure at this point has nothing to undo.
+ *
+ * If allocation is required for this range, reserve the space now so
+ * that the allocation is guaranteed to succeed later on. Once we copy
+ * the data into the page cache pages, then we cannot fail otherwise we
+ * expose transient stale data. If the reserve fails, we can safely
+ * back out at this point as there is nothing to undo.
+ */
+ ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+ if (ret)
+ return ret;
+ if (WARN_ON(iomap.offset > pos))
+ return -EIO;
+
+ /*
+ * Cut down the length to the one actually provided by the filesystem,
+ * as it might not be able to give us the whole size that we requested.
+ */
+ if (iomap.offset + iomap.length < pos + length)
+ length = iomap.offset + iomap.length - pos;
+
+ /*
+ * Now that we have guaranteed that the space allocation will succeed,
+ * we can do the copy-in page by page without having to worry about
+ * failures exposing transient data.
+ */
+ written = actor(inode, pos, length, data, &iomap);
+
+ /*
+ * Now the data has been copied, commit the range we've copied. This
+ * should not fail unless the filesystem has had a fatal error.
+ */
+ ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
+ flags, &iomap);
+
+ return written ? written : ret;
+}
+
+static void
+iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
+{
+ loff_t i_size = i_size_read(inode);
+
+ /*
+ * Only truncate newly allocated pages beyond EOF, even if the
+ * write started inside the existing inode size.
+ */
+ if (pos + len > i_size)
+ truncate_pagecache_range(inode, max(pos, i_size), pos + len);
+}
+
+static int
+iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, struct iomap *iomap)
+{
+ pgoff_t index = pos >> PAGE_SHIFT;
+ struct page *page;
+ int status = 0;
+
+ BUG_ON(pos + len > iomap->offset + iomap->length);
+
+ page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+
+ status = __block_write_begin_int(page, pos, len, NULL, iomap);
+ if (unlikely(status)) {
+ unlock_page(page);
+ page_cache_release(page);
+ page = NULL;
+
+ iomap_write_failed(inode, pos, len);
+ }
+
+ *pagep = page;
+ return status;
+}
+
+static int
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct page *page)
+{
+ int ret;
+
+ ret = generic_write_end(NULL, inode->i_mapping, pos, len,
+ copied, page, NULL);
+ if (ret < len)
+ iomap_write_failed(inode, pos, len);
+ return ret;
+}
+
+static loff_t
+iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ struct iov_iter *i = data;
+ long status = 0;
+ ssize_t written = 0;
+ unsigned int flags = AOP_FLAG_NOFS;
+
+ /*
+ * Copies from kernel address space cannot fail (NFSD is a big user).
+ */
+ if (!iter_is_iovec(i))
+ flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ do {
+ struct page *page;
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+ size_t copied; /* Bytes copied from user */
+
+ offset = (pos & (PAGE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_count(i));
+again:
+ if (bytes > length)
+ bytes = length;
+
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+
+ status = iomap_write_begin(inode, pos, bytes, flags, &page,
+ iomap);
+ if (unlikely(status))
+ break;
+
+ if (mapping_writably_mapped(inode->i_mapping))
+ flush_dcache_page(page);
+
+ pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
+
+ flush_dcache_page(page);
+ mark_page_accessed(page);
+
+ status = iomap_write_end(inode, pos, bytes, copied, page);
+ if (unlikely(status < 0))
+ break;
+ copied = status;
+
+ cond_resched();
+
+ iov_iter_advance(i, copied);
+ if (unlikely(copied == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fall back here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+ pos += copied;
+ written += copied;
+ length -= copied;
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ } while (iov_iter_count(i) && length);
+
+ return written ? written : status;
+}
+
+ssize_t
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct iomap_ops *ops)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ loff_t pos = iocb->ki_pos, ret = 0, written = 0;
+
+ while (iov_iter_count(iter)) {
+ ret = iomap_apply(inode, pos, iov_iter_count(iter),
+ IOMAP_WRITE, ops, iter, iomap_write_actor);
+ if (ret <= 0)
+ break;
+ pos += ret;
+ written += ret;
+ }
+
+ return written ? written : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+
+static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
+ unsigned bytes, struct iomap *iomap)
+{
+ struct page *page;
+ int status;
+
+ status = iomap_write_begin(inode, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
+ if (status)
+ return status;
+
+ zero_user(page, offset, bytes);
+ mark_page_accessed(page);
+
+ return iomap_write_end(inode, pos, bytes, bytes, page);
+}
+
+static loff_t
+iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
+ void *data, struct iomap *iomap)
+{
+ bool *did_zero = data;
+ loff_t written = 0;
+ int status;
+
+ /* already zeroed? we're done. */
+ if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+ return count;
+
+ do {
+ unsigned offset, bytes;
+
+ offset = pos & (PAGE_SIZE - 1); /* Within page */
+ bytes = min_t(unsigned, PAGE_SIZE - offset, count);
+
+ status = iomap_zero(inode, pos, offset, bytes, iomap);
+ if (status < 0)
+ return status;
+
+ pos += bytes;
+ count -= bytes;
+ written += bytes;
+ if (did_zero)
+ *did_zero = true;
+ } while (count > 0);
+
+ return written;
+}
+
+int
+iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+ struct iomap_ops *ops)
+{
+ loff_t ret;
+
+ while (len > 0) {
+ ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
+ ops, did_zero, iomap_zero_range_actor);
+ if (ret <= 0)
+ return ret;
+
+ pos += ret;
+ len -= ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_zero_range);
+
+int
+iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ struct iomap_ops *ops)
+{
+ unsigned blocksize = (1 << inode->i_blkbits);
+ unsigned off = pos & (blocksize - 1);
+
+ /* Block boundary? Nothing to do */
+ if (!off)
+ return 0;
+ return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+}
+EXPORT_SYMBOL_GPL(iomap_truncate_page);
+
+static loff_t
+iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ struct page *page = data;
+ int ret;
+
+ ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
+ NULL, iomap);
+ if (ret)
+ return ret;
+
+ block_commit_write(page, 0, length);
+ return length;
+}
+
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ struct iomap_ops *ops)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ unsigned long length;
+ loff_t offset, size;
+ ssize_t ret;
+
+ lock_page(page);
+ size = i_size_read(inode);
+ if ((page->mapping != inode->i_mapping) ||
+ (page_offset(page) > size)) {
+ /* We overload EFAULT to mean page got truncated */
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ /* page is wholly or partially inside EOF */
+ if (((page->index + 1) << PAGE_SHIFT) > size)
+ length = size & ~PAGE_MASK;
+ else
+ length = PAGE_SIZE;
+
+ offset = page_offset(page);
+ while (length > 0) {
+ ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
+ ops, page, iomap_page_mkwrite_actor);
+ if (unlikely(ret <= 0))
+ goto out_unlock;
+ offset += ret;
+ length -= ret;
+ }
+
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+ return 0;
+out_unlock:
+ unlock_page(page);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 1b22197..854766f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -3,19 +3,65 @@
#include <linux/types.h>
-/* types of block ranges for multipage write mappings. */
+struct inode;
+struct iov_iter;
+struct kiocb;
+struct vm_area_struct;
+struct vm_fault;
+
+/*
+ * Types of block ranges for iomap mappings:
+ */
#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
+/*
+ * Magic value for blkno:
+ */
#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
struct iomap {
- sector_t blkno; /* first sector of mapping */
- loff_t offset; /* file offset of mapping, bytes */
- u64 length; /* length of mapping, bytes */
- int type; /* type of mapping */
+ sector_t blkno; /* first sector of mapping, fs blocks */
+ loff_t offset; /* file offset of mapping, bytes */
+ u64 length; /* length of mapping, bytes */
+ int type; /* type of mapping */
+ struct block_device *bdev; /* block device for I/O */
+};
+
+/*
+ * Flags for iomap_begin / iomap_end. No flag implies a read.
+ */
+#define IOMAP_WRITE (1 << 0)
+#define IOMAP_ZERO (1 << 1)
+
+struct iomap_ops {
+ /*
+ * Return the existing mapping at pos, or reserve space starting at
+ * pos for up to length, as long as we can do it as a single mapping.
+ * The actual length is returned in iomap->length.
+ */
+ int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap *iomap);
+
+ /*
+ * Commit and/or unreserve space previously allocated using iomap_begin.
+ * Written indicates the length of the successful write operation which
+ * needs to be committed, while the rest needs to be unreserved.
+ * Written might be zero if no data was written.
+ */
+ int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned flags, struct iomap *iomap);
};
+ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
+ struct iomap_ops *ops);
+int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
+ bool *did_zero, struct iomap_ops *ops);
+int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ struct iomap_ops *ops);
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ struct iomap_ops *ops);
+
#endif /* LINUX_IOMAP_H */
--
2.1.4
* [PATCH 04/15] fs: support DAX based iomap zeroing
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
This avoids the need for a separate, inefficient get_block based DAX
zero_range implementation in file systems.
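A worked example of the sector arithmetic in iomap_dax_zero() below, assuming 4096-byte pages: for an extent with iomap->offset = 8192 (bytes) and iomap->blkno = 100 (512-byte sectors), zeroing at pos = 13000 rounds pos down to the page boundary 12288, giving sector = 100 + ((12288 - 8192) >> 9) = 108, the first sector of the page containing pos; offset and bytes then select the sub-page range to clear.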
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap.c | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/fs/iomap.c b/fs/iomap.c
index fac9285..f84c6eb 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -24,6 +24,7 @@
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
+#include <linux/dax.h>
#include "internal.h"
typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
@@ -268,6 +269,15 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
return iomap_write_end(inode, pos, bytes, bytes, page);
}
+static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
+ struct iomap *iomap)
+{
+ sector_t sector = iomap->blkno +
+ (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
+
+ return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+}
+
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
void *data, struct iomap *iomap)
@@ -286,7 +296,10 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
offset = pos & (PAGE_SIZE - 1); /* Within page */
bytes = min_t(unsigned, PAGE_SIZE - offset, count);
- status = iomap_zero(inode, pos, offset, bytes, iomap);
+ if (IS_DAX(inode))
+ status = iomap_dax_zero(pos, offset, bytes, iomap);
+ else
+ status = iomap_zero(inode, pos, offset, bytes, iomap);
if (status < 0)
return status;
--
2.1.4
* [PATCH 05/15] xfs: make xfs_bmbt_to_iomap available outside of xfs_pnfs.c
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
And ensure it works for RT subvolume files and set the block device,
both of which are needed to use the function in the buffered write
path.
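The RT subvolume support comes from switching XFS_FSB_TO_DADDR() to xfs_fsb_to_db(), which picks the translation based on the inode; roughly (paraphrasing the existing XFS helper for context, not part of this patch):

	/* paraphrase of the existing xfs_fsb_to_db() helper */
	xfs_daddr_t
	xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
	{
		if (XFS_IS_REALTIME_INODE(ip))
			return XFS_FSB_TO_BB(ip->i_mount, fsb);	/* RT device */
		return XFS_FSB_TO_DADDR(ip->i_mount, fsb);	/* data device */
	}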
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bob Peterson <rpeterso@redhat.com>
---
fs/xfs/xfs_iomap.c | 27 +++++++++++++++++++++++++++
fs/xfs/xfs_iomap.h | 4 ++++
fs/xfs/xfs_pnfs.c | 26 --------------------------
3 files changed, 31 insertions(+), 26 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5839135..2f37194 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -15,6 +15,7 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <linux/iomap.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
@@ -940,3 +941,29 @@ error_on_bmapi_transaction:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
+
+void
+xfs_bmbt_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (imap->br_startblock == HOLESTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_DELALLOC;
+ } else {
+ iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+ if (imap->br_state == XFS_EXT_UNWRITTEN)
+ iomap->type = IOMAP_UNWRITTEN;
+ else
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+ iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+ iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8688e66..718f07c 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,6 +18,7 @@
#ifndef __XFS_IOMAP_H__
#define __XFS_IOMAP_H__
+struct iomap;
struct xfs_inode;
struct xfs_bmbt_irec;
@@ -29,4 +30,7 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
struct xfs_bmbt_irec *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
+void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+ struct xfs_bmbt_irec *);
+
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 0a56787..dc1a71f 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -80,32 +80,6 @@ xfs_fs_get_uuid(
return 0;
}
-static void
-xfs_bmbt_to_iomap(
- struct xfs_inode *ip,
- struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
-{
- struct xfs_mount *mp = ip->i_mount;
-
- if (imap->br_startblock == HOLESTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_DELALLOC;
- } else {
- iomap->blkno =
- XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
- if (imap->br_state == XFS_EXT_UNWRITTEN)
- iomap->type = IOMAP_UNWRITTEN;
- else
- iomap->type = IOMAP_MAPPED;
- }
- iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
- iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-}
-
/*
* Get a layout for the pNFS client.
*/
--
2.1.4
* [PATCH 06/15] xfs: reorder zeroing and flushing sequence in truncate
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
Currently zeroing out blocks and waiting for writeout is a bit of a mess in
truncate. This patch gives it a clear order in preparation for the iomap
path:
(1) we first wait for any direct I/O to complete to prevent any races
for it
(2) we then perform the actual zeroing, and only use the truncate_page
helpers for truncating down. The truncate up case is already
handled by the separate call to xfs_zero_eof.
(3) only then we write back dirty data, as zeroing blocks may cause
dirty pages when using either xfs_zero_eof or the new iomap
infrastructure.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bob Peterson <rpeterso@redhat.com>
---
fs/xfs/xfs_iops.c | 33 +++++++++++++++++++--------------
1 file changed, 19 insertions(+), 14 deletions(-)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c5d4eba..1a5ca4b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -801,20 +801,35 @@ xfs_setattr_size(
return error;
/*
+ * Wait for all direct I/O to complete.
+ */
+ inode_dio_wait(inode);
+
+ /*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
* the transaction because the inode cannot be unlocked once it is a
* part of the transaction.
*
- * Start with zeroing any data block beyond EOF that we may expose on
- * file extension.
+ * Start with zeroing any data beyond EOF that we may expose on file
+ * extension, or zeroing out the rest of the block on a downward
+ * truncate.
*/
if (newsize > oldsize) {
error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
- if (error)
- return error;
+ } else {
+ if (IS_DAX(inode)) {
+ error = dax_truncate_page(inode, newsize,
+ xfs_get_blocks_direct);
+ } else {
+ error = block_truncate_page(inode->i_mapping, newsize,
+ xfs_get_blocks);
+ }
}
+ if (error)
+ return error;
+
/*
* We are going to log the inode size change in this transaction so
* any previous writes that are beyond the on disk EOF and the new
@@ -831,9 +846,6 @@ xfs_setattr_size(
return error;
}
- /* Now wait for all direct I/O to complete. */
- inode_dio_wait(inode);
-
/*
* We've already locked out new page faults, so now we can safely remove
* pages from the page cache knowing they won't get refaulted until we
@@ -851,13 +863,6 @@ xfs_setattr_size(
* to hope that the caller sees ENOMEM and retries the truncate
* operation.
*/
- if (IS_DAX(inode))
- error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
- else
- error = block_truncate_page(inode->i_mapping, newsize,
- xfs_get_blocks);
- if (error)
- return error;
truncate_setsize(inode, newsize);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
--
2.1.4
* [PATCH 07/15] xfs: implement iomap based buffered write path
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
Convert XFS to use the new iomap based multipage write path. This involves
implementing the ->iomap_begin and ->iomap_end methods, and switching the
buffered file write, page_mkwrite and xfs_iozero paths to the new iomap
helpers.
With this change __xfs_get_blocks will never be used for buffered writes,
and the code handling them can be removed.
Based on earlier code from Dave Chinner.
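A worked example of the delalloc cleanup in xfs_file_iomap_end_delalloc() below, assuming 4096-byte filesystem blocks: if ->iomap_begin reserved a 1 MB delalloc range at offset = 0 but the copy-in only completed written = 4096 bytes, then start_fsb = XFS_B_TO_FSB(mp, 4096) = 1 and end_fsb = XFS_B_TO_FSB(mp, 1048576) = 256, so blocks [1, 256) are punched back out and only the single block that was actually written stays reserved.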
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bob Peterson <rpeterso@redhat.com>
---
fs/xfs/Kconfig | 1 +
fs/xfs/xfs_aops.c | 212 -----------------------------------------------------
fs/xfs/xfs_file.c | 71 ++++++++----------
fs/xfs/xfs_iomap.c | 144 ++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_iomap.h | 5 +-
fs/xfs/xfs_iops.c | 9 ++-
fs/xfs/xfs_trace.h | 3 +
7 files changed, 187 insertions(+), 258 deletions(-)
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5d47b4d..35faf12 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -4,6 +4,7 @@ config XFS_FS
depends on (64BIT || LBDAF)
select EXPORTFS
select LIBCRC32C
+ select FS_IOMAP
help
XFS is a high performance journaling filesystem which originated
on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 40645a4..e481c80 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1429,216 +1429,6 @@ xfs_vm_direct_IO(
xfs_get_blocks_direct, endio, NULL, flags);
}
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
- struct inode *inode,
- loff_t start,
- loff_t end)
-{
- struct xfs_inode *ip = XFS_I(inode);
- xfs_fileoff_t start_fsb;
- xfs_fileoff_t end_fsb;
- int error;
-
- start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
- end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
- if (end_fsb <= start_fsb)
- return;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- end_fsb - start_fsb);
- if (error) {
- /* something screwed, just bail */
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_alert(ip->i_mount,
- "xfs_vm_write_failed: unable to clean up ino %lld",
- ip->i_ino);
- }
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
- struct inode *inode,
- struct page *page,
- loff_t pos,
- unsigned len)
-{
- loff_t block_offset;
- loff_t block_start;
- loff_t block_end;
- loff_t from = pos & (PAGE_CACHE_SIZE - 1);
- loff_t to = from + len;
- struct buffer_head *bh, *head;
- struct xfs_mount *mp = XFS_I(inode)->i_mount;
-
- /*
- * The request pos offset might be 32 or 64 bit, this is all fine
- * on 64-bit platform. However, for 64-bit pos request on 32-bit
- * platform, the high 32-bit will be masked off if we evaluate the
- * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
- * 0xfffff000 as an unsigned long, hence the result is incorrect
- * which could cause the following ASSERT failed in most cases.
- * In order to avoid this, we can evaluate the block_offset of the
- * start of the page by using shifts rather than masks the mismatch
- * problem.
- */
- block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
-
- ASSERT(block_offset + from == pos);
-
- head = page_buffers(page);
- block_start = 0;
- for (bh = head; bh != head || !block_start;
- bh = bh->b_this_page, block_start = block_end,
- block_offset += bh->b_size) {
- block_end = block_start + bh->b_size;
-
- /* skip buffers before the write */
- if (block_end <= from)
- continue;
-
- /* if the buffer is after the write, we're done */
- if (block_start >= to)
- break;
-
- /*
- * Process delalloc and unwritten buffers beyond EOF. We can
- * encounter unwritten buffers in the event that a file has
- * post-EOF unwritten extents and an extending write happens to
- * fail (e.g., an unaligned write that also involves a delalloc
- * to the same page).
- */
- if (!buffer_delay(bh) && !buffer_unwritten(bh))
- continue;
-
- if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
- block_offset < i_size_read(inode))
- continue;
-
- if (buffer_delay(bh))
- xfs_vm_kill_delalloc_range(inode, block_offset,
- block_offset + bh->b_size);
-
- /*
- * This buffer does not contain data anymore. make sure anyone
- * who finds it knows that for certain.
- */
- clear_buffer_delay(bh);
- clear_buffer_uptodate(bh);
- clear_buffer_mapped(bh);
- clear_buffer_new(bh);
- clear_buffer_dirty(bh);
- clear_buffer_unwritten(bh);
- }
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
- struct file *file,
- struct address_space *mapping,
- loff_t pos,
- unsigned len,
- unsigned flags,
- struct page **pagep,
- void **fsdata)
-{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- struct page *page;
- int status;
- struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
-
- ASSERT(len <= PAGE_CACHE_SIZE);
-
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
-
- status = __block_write_begin(page, pos, len, xfs_get_blocks);
- if (xfs_mp_fail_writes(mp))
- status = -EIO;
- if (unlikely(status)) {
- struct inode *inode = mapping->host;
- size_t isize = i_size_read(inode);
-
- xfs_vm_write_failed(inode, page, pos, len);
- unlock_page(page);
-
- /*
- * If the write is beyond EOF, we only want to kill blocks
- * allocated in this write, not blocks that were previously
- * written successfully.
- */
- if (xfs_mp_fail_writes(mp))
- isize = 0;
- if (pos + len > isize) {
- ssize_t start = max_t(ssize_t, pos, isize);
-
- truncate_pagecache_range(inode, start, pos + len);
- }
-
- page_cache_release(page);
- page = NULL;
- }
-
- *pagep = page;
- return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
- struct file *file,
- struct address_space *mapping,
- loff_t pos,
- unsigned len,
- unsigned copied,
- struct page *page,
- void *fsdata)
-{
- int ret;
-
- ASSERT(len <= PAGE_CACHE_SIZE);
-
- ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (unlikely(ret < len)) {
- struct inode *inode = mapping->host;
- size_t isize = i_size_read(inode);
- loff_t to = pos + len;
-
- if (to > isize) {
- /* only kill blocks in this write beyond EOF */
- if (pos > isize)
- isize = pos;
- xfs_vm_kill_delalloc_range(inode, isize, to);
- truncate_pagecache_range(inode, isize, to);
- }
- }
- return ret;
-}
-
STATIC sector_t
xfs_vm_bmap(
struct address_space *mapping,
@@ -1749,8 +1539,6 @@ const struct address_space_operations xfs_address_space_operations = {
.set_page_dirty = xfs_vm_set_page_dirty,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
- .write_begin = xfs_vm_write_begin,
- .write_end = xfs_vm_write_end,
.bmap = xfs_vm_bmap,
.direct_IO = xfs_vm_direct_IO,
.migratepage = buffer_migrate_page,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 98bbd8f..bcedd80 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
+#include "xfs_iomap.h"
#include <linux/dcache.h>
#include <linux/falloc.h>
@@ -79,57 +80,27 @@ xfs_rw_ilock_demote(
inode_unlock(VFS_I(ip));
}
-/*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
- */
-int
-xfs_iozero(
- struct xfs_inode *ip, /* inode */
- loff_t pos, /* offset in file */
- size_t count) /* size of data to zero */
+static int
+xfs_dax_zero_range(
+ struct inode *inode,
+ loff_t pos,
+ size_t count)
{
- struct page *page;
- struct address_space *mapping;
int status = 0;
-
- mapping = VFS_I(ip)->i_mapping;
do {
unsigned offset, bytes;
- void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count)
bytes = count;
- if (IS_DAX(VFS_I(ip))) {
- status = dax_zero_page_range(VFS_I(ip), pos, bytes,
- xfs_get_blocks_direct);
- if (status)
- break;
- } else {
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
-
- zero_user(page, offset, bytes);
+ status = dax_zero_page_range(inode, pos, bytes,
+ xfs_get_blocks_direct);
+ if (status)
+ break;
- status = pagecache_write_end(NULL, mapping, pos, bytes,
- bytes, page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
- status = 0;
- }
pos += bytes;
count -= bytes;
} while (count);
@@ -137,6 +108,24 @@ xfs_iozero(
return status;
}
+/*
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is as they already are zeroed.
+ */
+int
+xfs_iozero(
+ struct xfs_inode *ip,
+ loff_t pos,
+ size_t count)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (IS_DAX(VFS_I(ip)))
+ return xfs_dax_zero_range(inode, pos, count);
+ else
+ return iomap_zero_range(inode, pos, count, NULL, &xfs_iomap_ops);
+}
+
int
xfs_update_prealloc_flags(
struct xfs_inode *ip,
@@ -842,7 +831,7 @@ xfs_file_buffered_aio_write(
write_retry:
trace_xfs_file_buffered_write(ip, iov_iter_count(from),
iocb->ki_pos, 0);
- ret = generic_perform_write(file, from, iocb->ki_pos);
+ ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
if (likely(ret >= 0))
iocb->ki_pos += ret;
@@ -1558,7 +1547,7 @@ xfs_filemap_page_mkwrite(
if (IS_DAX(inode)) {
ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
} else {
- ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2f37194..620fc91 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -967,3 +967,147 @@ xfs_bmbt_to_iomap(
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
}
+
+static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+{
+ return !nimaps ||
+ imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_startblock == DELAYSTARTBLOCK;
+}
+
+static int
+xfs_file_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int nimaps = 1, error = 0;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+ length = mp->m_super->s_maxbytes - offset;
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, XFS_BMAPI_ENTIRE);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+ }
+
+ if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done here somewhat symmetric
+ * with the work writeback does. This is a completely arbitrary
+ * number pulled out of thin air as a best guess for initial
+ * testing.
+ *
+ * Note that the value needs to be less than 32 bits wide until
+ * the lower level functions are updated.
+ */
+ length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+ if (xfs_get_extsz_hint(ip)) {
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
+ */
+ xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+ error = xfs_iomap_write_direct(ip, offset, length, &imap,
+ nimaps);
+ } else {
+ error = xfs_iomap_write_delay(ip, offset, length, &imap);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ if (error)
+ return error;
+
+ trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
+ } else if (nimaps) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
+ } else {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ iomap->offset = offset;
+ iomap->length = length;
+ }
+
+ return 0;
+}
+
+static int
+xfs_file_iomap_end_delalloc(
+ struct xfs_inode *ip,
+ loff_t offset,
+ loff_t length,
+ ssize_t written)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+
+ start_fsb = XFS_B_TO_FSB(mp, offset + written);
+ end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+ /*
+ * Trim back delalloc blocks if we didn't manage to write the whole
+ * range reserved.
+ *
+ * We don't need to care about racing delalloc as we hold i_mutex
+ * across the reserve/allocate/unreserve calls. If there are delalloc
+ * blocks in the range, they are ours.
+ */
+ if (start_fsb < end_fsb) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ end_fsb - start_fsb);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_alert(mp, "%s: unable to clean up ino %lld",
+ __func__, ip->i_ino);
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+static int
+xfs_file_iomap_end(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ ssize_t written,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
+ return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
+ length, written);
+ return 0;
+}
+
+struct iomap_ops xfs_iomap_ops = {
+ .iomap_begin = xfs_file_iomap_begin,
+ .iomap_end = xfs_file_iomap_end,
+};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 718f07c..e066d04 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,7 +18,8 @@
#ifndef __XFS_IOMAP_H__
#define __XFS_IOMAP_H__
-struct iomap;
+#include <linux/iomap.h>
+
struct xfs_inode;
struct xfs_bmbt_irec;
@@ -33,4 +34,6 @@ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
struct xfs_bmbt_irec *);
+extern struct iomap_ops xfs_iomap_ops;
+
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1a5ca4b..5d1fdae 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
#include "xfs_dir2.h"
#include "xfs_trans_space.h"
#include "xfs_pnfs.h"
+#include "xfs_iomap.h"
#include <linux/capability.h>
#include <linux/xattr.h>
@@ -822,8 +823,8 @@ xfs_setattr_size(
error = dax_truncate_page(inode, newsize,
xfs_get_blocks_direct);
} else {
- error = block_truncate_page(inode->i_mapping, newsize,
- xfs_get_blocks);
+ error = iomap_truncate_page(inode, newsize,
+ &did_zeroing, &xfs_iomap_ops);
}
}
@@ -838,8 +839,8 @@ xfs_setattr_size(
* problem. Note that this includes any block zeroing we did above;
* otherwise those blocks may not be zeroed after a crash.
*/
- if (newsize > ip->i_d.di_size &&
- (oldsize != ip->i_d.di_size || did_zeroing)) {
+ if (did_zeroing ||
+ (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ip->i_d.di_size, newsize);
if (error)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 840d52e..86fb345 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1296,6 +1296,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
--
2.1.4
* [PATCH 08/15] xfs: remove buffered write support from __xfs_get_blocks
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bob Peterson <rpeterso@redhat.com>
---
fs/xfs/xfs_aops.c | 77 +++++++++++++++----------------------------------------
1 file changed, 21 insertions(+), 56 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e481c80..cb4f75c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1130,7 +1130,6 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
- bool direct,
bool dax_fault)
{
struct xfs_inode *ip = XFS_I(inode);
@@ -1151,22 +1150,14 @@ __xfs_get_blocks(
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
size = bh_result->b_size;
- if (!create && direct && offset >= i_size_read(inode))
+ if (!create && offset >= i_size_read(inode))
return 0;
/*
* Direct I/O is usually done on preallocated files, so try getting
- * a block mapping without an exclusive lock first. For buffered
- * writes we already have the exclusive iolock anyway, so avoiding
- * a lock roundtrip here by taking the ilock exclusive from the
- * beginning is a useful micro optimization.
+ * a block mapping without an exclusive lock first.
*/
- if (create && !direct) {
- lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, lockmode);
- } else {
- lockmode = xfs_ilock_data_map_shared(ip);
- }
+ lockmode = xfs_ilock_data_map_shared(ip);
ASSERT(offset <= mp->m_super->s_maxbytes);
if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1176,19 @@ __xfs_get_blocks(
(imap.br_startblock == HOLESTARTBLOCK ||
imap.br_startblock == DELAYSTARTBLOCK) ||
(IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
- if (direct || xfs_get_extsz_hint(ip)) {
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
-
- error = xfs_iomap_write_direct(ip, offset, size,
- &imap, nimaps);
- if (error)
- return error;
- new = 1;
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
+ */
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
- } else {
- /*
- * Delalloc reservations do not require a transaction,
- * we can go on without dropping the lock here. If we
- * are allocating a new delalloc block, make sure that
- * we set the new flag so that we mark the buffer new so
- * that we know that it is newly allocated if the write
- * fails.
- */
- if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
- new = 1;
- error = xfs_iomap_write_delay(ip, offset, size, &imap);
- if (error)
- goto out_unlock;
+ error = xfs_iomap_write_direct(ip, offset, size,
+ &imap, nimaps);
+ if (error)
+ return error;
+ new = 1;
- xfs_iunlock(ip, lockmode);
- }
trace_xfs_get_blocks_alloc(ip, offset, size,
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
: XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1209,7 @@ __xfs_get_blocks(
}
/* trim mapping down to size requested */
- if (direct || size > (1 << inode->i_blkbits))
- xfs_map_trim_size(inode, iblock, bh_result,
- &imap, offset, size);
+ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
/*
* For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1222,7 @@ __xfs_get_blocks(
if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
- if (create && direct) {
+ if (create) {
if (dax_fault)
ASSERT(!ISUNWRITTEN(&imap));
else
@@ -1280,14 +1251,7 @@ __xfs_get_blocks(
(new || ISUNWRITTEN(&imap))))
set_buffer_new(bh_result);
- if (imap.br_startblock == DELAYSTARTBLOCK) {
- BUG_ON(direct);
- if (create) {
- set_buffer_uptodate(bh_result);
- set_buffer_mapped(bh_result);
- set_buffer_delay(bh_result);
- }
- }
+ BUG_ON(imap.br_startblock == DELAYSTARTBLOCK);
return 0;
@@ -1303,7 +1267,8 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
+ BUG_ON(create);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}
int
@@ -1313,7 +1278,7 @@ xfs_get_blocks_direct(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}
int
@@ -1323,7 +1288,7 @@ xfs_get_blocks_dax_fault(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true);
}
/*
--
2.1.4
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 09/15] fs: iomap based fiemap implementation
2016-05-09 8:47 iomap infrastructure and multipage writes V4 Christoph Hellwig
` (7 preceding siblings ...)
2016-05-09 8:47 ` [PATCH 08/15] xfs: remove buffered write support from __xfs_get_blocks Christoph Hellwig
@ 2016-05-09 8:47 ` Christoph Hellwig
2016-05-23 20:09 ` Bob Peterson
2016-05-09 8:47 ` [PATCH 10/15] xfs: use iomap " Christoph Hellwig
` (7 subsequent siblings)
16 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
Add a simple fiemap implementation based on iomap_ops, partially based
on a previous implementation from Bob Peterson <rpeterso@redhat.com>.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/iomap.h | 3 ++
2 files changed, 93 insertions(+)
diff --git a/fs/iomap.c b/fs/iomap.c
index f84c6eb..7e639bf 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -405,3 +405,93 @@ out_unlock:
return ret;
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
+
+struct fiemap_ctx {
+ struct fiemap_extent_info *fi;
+ struct iomap prev;
+};
+
+static int iomap_to_fiemap(struct fiemap_extent_info *fi,
+ struct iomap *iomap, u32 flags)
+{
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ /* skip holes */
+ return 0;
+ case IOMAP_DELALLOC:
+ flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
+ break;
+ case IOMAP_UNWRITTEN:
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+ break;
+ case IOMAP_MAPPED:
+ break;
+ }
+
+ return fiemap_fill_next_extent(fi, iomap->offset,
+ iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
+ iomap->length, flags | FIEMAP_EXTENT_MERGED);
+
+}
+
+static loff_t
+iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ struct fiemap_ctx *ctx = data;
+ loff_t ret = length;
+
+ if (iomap->type == IOMAP_HOLE)
+ return length;
+
+ ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
+ ctx->prev = *iomap;
+ switch (ret) {
+ case 0: /* success */
+ return length;
+ case 1: /* extent array full */
+ return 0;
+ default:
+ return ret;
+ }
+}
+
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
+ loff_t start, loff_t len, struct iomap_ops *ops)
+{
+ struct fiemap_ctx ctx;
+ loff_t ret;
+
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.fi = fi;
+ ctx.prev.type = IOMAP_HOLE;
+
+ ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ return ret;
+
+ while (len > 0) {
+ ret = iomap_apply(inode, start, len, 0, ops, &ctx,
+ iomap_fiemap_actor);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ break;
+
+ start += ret;
+ len -= ret;
+ }
+
+ if (ctx.prev.type != IOMAP_HOLE) {
+ ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_fiemap);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 854766f..b3deee1 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -3,6 +3,7 @@
#include <linux/types.h>
+struct fiemap_extent_info;
struct inode;
struct iov_iter;
struct kiocb;
@@ -63,5 +64,7 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
struct iomap_ops *ops);
int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
struct iomap_ops *ops);
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ loff_t start, loff_t len, struct iomap_ops *ops);
#endif /* LINUX_IOMAP_H */
--
2.1.4
^ permalink raw reply related [flat|nested] 24+ messages in thread
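For anyone wanting to exercise the new path, the standard FIEMAP ioctl
drives it from userspace; nothing below is specific to this series, it
is plain uapi usage:

	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	int main(int argc, char **argv)
	{
		unsigned int i, count = 32;
		struct fiemap *fm;
		int fd;

		if (argc != 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;

		/* struct fiemap is followed by an array of extents */
		fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
		if (!fm)
			return 1;
		fm->fm_start = 0;
		fm->fm_length = FIEMAP_MAX_OFFSET;	/* whole file */
		fm->fm_flags = FIEMAP_FLAG_SYNC;	/* writeback first */
		fm->fm_extent_count = count;

		if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
			perror("FS_IOC_FIEMAP");
			return 1;
		}

		for (i = 0; i < fm->fm_mapped_extents; i++) {
			struct fiemap_extent *fe = &fm->fm_extents[i];

			printf("logical %llu physical %llu length %llu flags 0x%x\n",
			       (unsigned long long)fe->fe_logical,
			       (unsigned long long)fe->fe_physical,
			       (unsigned long long)fe->fe_length,
			       fe->fe_flags);
		}
		free(fm);
		close(fd);
		return 0;
	}

With the series applied, an XFS file with delalloc or unwritten extents
should report FIEMAP_EXTENT_DELALLOC/FIEMAP_EXTENT_UNWRITTEN in fe_flags,
matching the translation done in iomap_to_fiemap() above.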
* Re: [PATCH 09/15] fs: iomap based fiemap implementation
2016-05-09 8:47 ` [PATCH 09/15] fs: iomap based fiemap implementation Christoph Hellwig
@ 2016-05-23 20:09 ` Bob Peterson
2016-05-24 13:10 ` Christoph Hellwig
0 siblings, 1 reply; 24+ messages in thread
From: Bob Peterson @ 2016-05-23 20:09 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs, vishal l verma, linux-fsdevel
----- Original Message -----
| Add a simple fiemap implementation based on iomap_ops, partially based
| on a previous implementation from Bob Peterson <rpeterso@redhat.com>.
|
| Signed-off-by: Christoph Hellwig <hch@lst.de>
| ---
| fs/iomap.c | 90
| +++++++++++++++++++++++++++++++++++++++++++++++++++
| include/linux/iomap.h | 3 ++
| 2 files changed, 93 insertions(+)
|
| diff --git a/fs/iomap.c b/fs/iomap.c
| index f84c6eb..7e639bf 100644
| --- a/fs/iomap.c
| +++ b/fs/iomap.c
| @@ -405,3 +405,93 @@ out_unlock:
| return ret;
| }
| EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
| +
| +struct fiemap_ctx {
| + struct fiemap_extent_info *fi;
| + struct iomap prev;
| +};
| +
| +static int iomap_to_fiemap(struct fiemap_extent_info *fi,
| + struct iomap *iomap, u32 flags)
| +{
| + switch (iomap->type) {
| + case IOMAP_HOLE:
| + /* skip holes */
| + return 0;
| + case IOMAP_DELALLOC:
| + flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
| + break;
| + case IOMAP_UNWRITTEN:
| + flags |= FIEMAP_EXTENT_UNWRITTEN;
| + break;
| + case IOMAP_MAPPED:
| + break;
| + }
| +
| + return fiemap_fill_next_extent(fi, iomap->offset,
| + iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
| + iomap->length, flags | FIEMAP_EXTENT_MERGED);
| +
| +}
| +
| +static loff_t
| +iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void
| *data,
| + struct iomap *iomap)
| +{
| + struct fiemap_ctx *ctx = data;
| + loff_t ret = length;
| +
| + if (iomap->type == IOMAP_HOLE)
| + return length;
| +
| + ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
| + ctx->prev = *iomap;
| + switch (ret) {
| + case 0: /* success */
| + return length;
| + case 1: /* extent array full */
| + return 0;
| + default:
| + return ret;
| + }
| +}
| +
| +int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
| + loff_t start, loff_t len, struct iomap_ops *ops)
| +{
| + struct fiemap_ctx ctx;
| + loff_t ret;
| +
| + memset(&ctx, 0, sizeof(ctx));
| + ctx.fi = fi;
| + ctx.prev.type = IOMAP_HOLE;
| +
| + ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
| + if (ret)
| + return ret;
| +
| + ret = filemap_write_and_wait(inode->i_mapping);
| + if (ret)
| + return ret;
| +
| + while (len > 0) {
| + ret = iomap_apply(inode, start, len, 0, ops, &ctx,
| + iomap_fiemap_actor);
| + if (ret < 0)
| + return ret;
| + if (ret == 0)
| + break;
| +
| + start += ret;
| + len -= ret;
| + }
| +
| + if (ctx.prev.type != IOMAP_HOLE) {
| + ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
| + if (ret < 0)
| + return ret;
| + }
| +
| + return 0;
| +}
| +EXPORT_SYMBOL_GPL(iomap_fiemap);
| diff --git a/include/linux/iomap.h b/include/linux/iomap.h
| index 854766f..b3deee1 100644
| --- a/include/linux/iomap.h
| +++ b/include/linux/iomap.h
| @@ -3,6 +3,7 @@
|
| #include <linux/types.h>
|
| +struct fiemap_extent_info;
| struct inode;
| struct iov_iter;
| struct kiocb;
| @@ -63,5 +64,7 @@ int iomap_truncate_page(struct inode *inode, loff_t pos,
| bool *did_zero,
| struct iomap_ops *ops);
| int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
| struct iomap_ops *ops);
| +int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
| + loff_t start, loff_t len, struct iomap_ops *ops);
|
| #endif /* LINUX_IOMAP_H */
| --
| 2.1.4
|
|
Hi Christoph,
I've been looking at this again. Where are the calls to the fs-specific bits
for fiemap? It looks like iomap_fiemap calls iomap_apply, which calls
iomap_fiemap_actor, but that doesn't call any ops->iomap_get_iomap or similar.
It calls iomap_begin (which, BTW, has a comment that says "Execute a
iomap write"; that comment is probably wrong and should be more generic,
to cover cases like fiemap) and it calls iomap_end. But it never calls
an fs-specific actor
anywhere. Am I missing something? My earlier version passed in the actor
function, as per Dave Chinner's request, but yours doesn't.
Regards,
Bob Peterson
Red Hat File Systems
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 09/15] fs: iomap based fiemap implementation
2016-05-23 20:09 ` Bob Peterson
@ 2016-05-24 13:10 ` Christoph Hellwig
2016-05-26 18:19 ` Bob Peterson
0 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2016-05-24 13:10 UTC (permalink / raw)
To: Bob Peterson; +Cc: Christoph Hellwig, xfs, vishal l verma, linux-fsdevel
On Mon, May 23, 2016 at 04:09:26PM -0400, Bob Peterson wrote:
> I've been looking at this again. Where are the calls to the fs-specific bits
> for fiemap?
In the iomap_ops structure passed to iomap_fiemap.
> It looks like iomap_fiemap calls iomap_apply, which calls
> iomap_fiemap_actor, but that doesn't call any ops->iomap_get_iomap or similar.
> It calls the iomap_begin (which BTW has a comment that says "Execute a iomap
> write" which is probably wrong and should be more generic, as for cases like
> fiemap) and it calls iomap_end. But it never calls an fs-specific actor
> anywhere. Am I missing something? My earlier version passed in the actor
> function, as per Dave Chinner's request, but yours doesn't.
The iomap_begin callback is where you do the mapping. The iomap_end
callback does any required cleanup, which in the case of GFS2 would
probably be dropping the cluster lock protecting the mapping.
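Concretely the contract looks something like the sketch below. The
gfs2-flavoured helper names are hypothetical placeholders, not real
GFS2 interfaces:

	static int gfs2_iomap_begin_sketch(struct inode *inode, loff_t offset,
			loff_t length, unsigned flags, struct iomap *iomap)
	{
		int error;

		/* take the cluster lock protecting the mapping (placeholder) */
		error = gfs2_hold_mapping_lock(inode);
		if (error)
			return error;

		/* fill in iomap->type/offset/length/blkno for the extent
		 * covering (offset, length) -- this is "the mapping" */
		return gfs2_fill_iomap(inode, offset, length, iomap);
	}

	static int gfs2_iomap_end_sketch(struct inode *inode, loff_t offset,
			loff_t length, ssize_t written, unsigned flags,
			struct iomap *iomap)
	{
		/* cleanup only: drop the lock taken in ->iomap_begin */
		gfs2_drop_mapping_lock(inode);
		return 0;
	}

	struct iomap_ops gfs2_iomap_ops_sketch = {
		.iomap_begin	= gfs2_iomap_begin_sketch,
		.iomap_end	= gfs2_iomap_end_sketch,
	};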
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 09/15] fs: iomap based fiemap implementation
2016-05-24 13:10 ` Christoph Hellwig
@ 2016-05-26 18:19 ` Bob Peterson
2016-05-26 22:57 ` Dave Chinner
0 siblings, 1 reply; 24+ messages in thread
From: Bob Peterson @ 2016-05-26 18:19 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs, vishal l verma, linux-fsdevel
----- Original Message -----
| On Mon, May 23, 2016 at 04:09:26PM -0400, Bob Peterson wrote:
| > I've been looking at this again. Where are the calls to the fs-specific
| > bits
| > for fiemap?
|
| In the iomap_ops structure passed to iomap_fiemap.
|
| > It looks like iomap_fiemap calls iomap_apply, which calls
| > iomap_fiemap_actor, but that doesn't call any ops->iomap_get_iomap or
| > similar.
| > It calls the iomap_begin (which BTW has a comment that says "Execute a
| > iomap
| > write" which is probably wrong and should be more generic, as for cases
| > like
| > fiemap) and it calls iomap_end. But it never calls an fs-specific actor
| > anywhere. Am I missing something? My earlier version passed in the actor
| > function, as per Dave Chinner's request, but yours doesn't.
|
| The iomap_begin callback is where you do the mapping. the iomap_end
| callback does any required cleanup, which in case of GFS2 probably
| would be dropping the cluster lock protecting the mapping.
|
Okay, got it. So a couple things:
1. I verified that the vfs bits of the patch set work properly for GFS2
using a modified iomap-based fiemap. And it's fast.
2. I'm not sure I like the fact that instead of begin->main->end it is
essentially begin->end, with begin doing all the work. It works, and
it's better than what we have today. But I'd prefer either renaming the
first function from "iomap_begin" to something that indicates it's
more than just a precursor to the actual function, or else splitting it
into begin->main->end. I was kinda hoping to pass in the iomap_actor
somehow. It's not a tragic loss, but the way I've got the gfs2
function coded, the begin function does the locking plus the main
functionality, and all the fiemap_end function basically does is unlock.
3. I had to do something like this to get "make menuconfig" to work:
index 276fcfb..daa129c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -11,7 +11,7 @@ config DCACHE_WORD_ACCESS
if BLOCK
config FS_IOMAP
- bool
+ bool "File IOMAP Support"
4. I don't suppose you could split this patch set up so that the vfs
bits are independent, so someone like Al Viro could grab them
rather than having them pulled in alongside the xfs-specific bits?
(Which is essentially what I've done for GFS2; I did not drag in
all the xfs bits).
Regards,
Bob Peterson
Red Hat File Systems
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH 09/15] fs: iomap based fiemap implementation
2016-05-26 18:19 ` Bob Peterson
@ 2016-05-26 22:57 ` Dave Chinner
0 siblings, 0 replies; 24+ messages in thread
From: Dave Chinner @ 2016-05-26 22:57 UTC (permalink / raw)
To: Bob Peterson; +Cc: Christoph Hellwig, linux-fsdevel, vishal l verma, xfs
On Thu, May 26, 2016 at 02:19:57PM -0400, Bob Peterson wrote:
> ----- Original Message -----
> | On Mon, May 23, 2016 at 04:09:26PM -0400, Bob Peterson wrote:
> | > I've been looking at this again. Where are the calls to the fs-specific
> | > bits
> | > for fiemap?
> |
> | In the iomap_ops structure passed to iomap_fiemap.
> |
> | > It looks like iomap_fiemap calls iomap_apply, which calls
> | > iomap_fiemap_actor, but that doesn't call any ops->iomap_get_iomap or
> | > similar.
> | > It calls the iomap_begin (which BTW has a comment that says "Execute a
> | > iomap
> | > write" which is probably wrong and should be more generic, as for cases
> | > like
> | > fiemap) and it calls iomap_end. But it never calls an fs-specific actor
> | > anywhere. Am I missing something? My earlier version passed in the actor
> | > function, as per Dave Chinner's request, but yours doesn't.
> |
> | The iomap_begin callback is where you do the mapping. the iomap_end
> | callback does any required cleanup, which in case of GFS2 probably
> | would be dropping the cluster lock protecting the mapping.
> |
>
> Okay, got it. So a couple things:
>
> 1. I verified that the vfs bits of the patch set work properly for GFS2
> using a modified iomap-based fiemap. And it's fast.
> 2. I'm not sure I like the fact that instead of begin->main->end it is
> essentially begin->end with begin doing all the work. It works, and
> it's better than we have today. But I'd prefer either renaming the
> first function from "iomap_begin" to something that indicates it's
> more than just a precursor to the actual function?
<shrug> Same can be said for the existing write_begin/end interface
we have for block mapping right now. I suppose iomap_prepare/finish
or iomap_get/put might be more obvious. Pink or purple? :P
> Or else split it
> into begin->main->end? I was kinda hoping to pass in the iomap_actor
> somehow.
Can you explain what for? What are you wanting to put in the fiemap
that you can't pass through the iomap as a map flag?
> It's not a tragic loss, but the way I've got the gfs2
> function coded, the begin function does: locking plus the main
> functionality, and all the fiemap_end function basically does is unlock.
Yup, that's pretty much how it was intended to work.
> 4. I don't suppose you could split this patch set up so that the vfs
> bits are independent, so someone like Al Viro could grab them
> rather than getting pulled along size the xfs-specific bits?
> (Which is essentially what I've done for GFS2; I did not drag in
> all the xfs bits).
Once the merge window is over and the VFS bits are stable and
reviewed, I'll put the code into two branches in the XFS tree - the
first will just have the VFS iomap bits and the second will contain
the XFS bits. Once committed, they'll be stable branches, so you
should be able to merge it into the GFS2 tree and work from there.
The duplication of the branch in different trees will resolve
automatically when Linus pulls the trees. That way I can carry the
XFS changes in the XFS tree, and you don't have to see any of them
in the GFS2 tree...
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH 10/15] xfs: use iomap fiemap implementation
2016-05-09 8:47 iomap infrastructure and multipage writes V4 Christoph Hellwig
` (8 preceding siblings ...)
2016-05-09 8:47 ` [PATCH 09/15] fs: iomap based fiemap implementation Christoph Hellwig
@ 2016-05-09 8:47 ` Christoph Hellwig
2016-05-09 8:47 ` [PATCH 11/15] xfs: use iomap infrastructure for DAX zeroing Christoph Hellwig
` (6 subsequent siblings)
16 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
Note that this removes support for the untested FIEMAP_FLAG_XATTR. It
could be added relatively easily with iomap ops for the attr fork, but
without test coverage I don't feel safe doing this.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_iops.c | 80 ++++---------------------------------------------------
1 file changed, 5 insertions(+), 75 deletions(-)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 5d1fdae..985a263 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -44,7 +44,7 @@
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
-#include <linux/fiemap.h>
+#include <linux/iomap.h>
#include <linux/slab.h>
/*
@@ -1004,51 +1004,6 @@ xfs_vn_update_time(
return xfs_trans_commit(tp);
}
-#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
-
-/*
- * Call fiemap helper to fill in user data.
- * Returns positive errors to xfs_getbmap.
- */
-STATIC int
-xfs_fiemap_format(
- void **arg,
- struct getbmapx *bmv,
- int *full)
-{
- int error;
- struct fiemap_extent_info *fieinfo = *arg;
- u32 fiemap_flags = 0;
- u64 logical, physical, length;
-
- /* Do nothing for a hole */
- if (bmv->bmv_block == -1LL)
- return 0;
-
- logical = BBTOB(bmv->bmv_offset);
- physical = BBTOB(bmv->bmv_block);
- length = BBTOB(bmv->bmv_length);
-
- if (bmv->bmv_oflags & BMV_OF_PREALLOC)
- fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
- else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
- fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- physical = 0; /* no block yet */
- }
- if (bmv->bmv_oflags & BMV_OF_LAST)
- fiemap_flags |= FIEMAP_EXTENT_LAST;
-
- error = fiemap_fill_next_extent(fieinfo, logical, physical,
- length, fiemap_flags);
- if (error > 0) {
- error = 0;
- *full = 1; /* user array now full */
- }
-
- return error;
-}
-
STATIC int
xfs_vn_fiemap(
struct inode *inode,
@@ -1056,38 +1011,13 @@ xfs_vn_fiemap(
u64 start,
u64 length)
{
- xfs_inode_t *ip = XFS_I(inode);
- struct getbmapx bm;
int error;
- error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
- if (error)
- return error;
-
- /* Set up bmap header for xfs internal routine */
- bm.bmv_offset = BTOBBT(start);
- /* Special case for whole file */
- if (length == FIEMAP_MAX_OFFSET)
- bm.bmv_length = -1LL;
- else
- bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
-
- /* We add one because in getbmap world count includes the header */
- bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
- fieinfo->fi_extents_max + 1;
- bm.bmv_count = min_t(__s32, bm.bmv_count,
- (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
- bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
- if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
- bm.bmv_iflags |= BMV_IF_ATTRFORK;
- if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
- bm.bmv_iflags |= BMV_IF_DELALLOC;
-
- error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
- if (error)
- return error;
+ xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
+ error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+ xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
- return 0;
+ return error;
}
STATIC int
--
2.1.4
^ permalink raw reply related [flat|nested] 24+ messages in thread
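If FIEMAP_FLAG_XATTR were brought back the way the changelog suggests,
the shape would presumably be along the lines below. xfs_xattr_iomap_ops
is hypothetical here; it does not exist anywhere in this series:

	STATIC int
	xfs_vn_fiemap_xattr_sketch(
		struct inode		*inode,
		struct fiemap_extent_info *fieinfo,
		u64			start,
		u64			length)
	{
		int			error;

		xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
		if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
			fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
			/* hypothetical ops mapping the attr fork */
			error = iomap_fiemap(inode, fieinfo, start, length,
					&xfs_xattr_iomap_ops);
		} else {
			error = iomap_fiemap(inode, fieinfo, start, length,
					&xfs_iomap_ops);
		}
		xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);

		return error;
	}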
* [PATCH 11/15] xfs: use iomap infrastructure for DAX zeroing
2016-05-09 8:47 iomap infrastructure and multipage writes V4 Christoph Hellwig
` (9 preceding siblings ...)
2016-05-09 8:47 ` [PATCH 10/15] xfs: use iomap " Christoph Hellwig
@ 2016-05-09 8:47 ` Christoph Hellwig
2016-05-09 8:47 ` [PATCH 12/15] xfs: handle 64-bit length in xfs_iozero Christoph Hellwig
` (5 subsequent siblings)
16 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_file.c | 35 +----------------------------------
fs/xfs/xfs_iops.c | 9 ++-------
2 files changed, 3 insertions(+), 41 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index bcedd80..bd07913 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -80,34 +80,6 @@ xfs_rw_ilock_demote(
inode_unlock(VFS_I(ip));
}
-static int
-xfs_dax_zero_range(
- struct inode *inode,
- loff_t pos,
- size_t count)
-{
- int status = 0;
-
- do {
- unsigned offset, bytes;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- status = dax_zero_page_range(inode, pos, bytes,
- xfs_get_blocks_direct);
- if (status)
- break;
-
- pos += bytes;
- count -= bytes;
- } while (count);
-
- return status;
-}
-
/*
* Clear the specified ranges to zero through either the pagecache or DAX.
* Holes and unwritten extents will be left as-is as they already are zeroed.
@@ -118,12 +90,7 @@ xfs_iozero(
loff_t pos,
size_t count)
{
- struct inode *inode = VFS_I(ip);
-
- if (IS_DAX(VFS_I(ip)))
- return xfs_dax_zero_range(inode, pos, count);
- else
- return iomap_zero_range(inode, pos, count, NULL, &xfs_iomap_ops);
+ return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
}
int
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 985a263..ab820f8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -819,13 +819,8 @@ xfs_setattr_size(
if (newsize > oldsize) {
error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
} else {
- if (IS_DAX(inode)) {
- error = dax_truncate_page(inode, newsize,
- xfs_get_blocks_direct);
- } else {
- error = iomap_truncate_page(inode, newsize,
- &did_zeroing, &xfs_iomap_ops);
- }
+ error = iomap_truncate_page(inode, newsize, &did_zeroing,
+ &xfs_iomap_ops);
}
if (error)
--
2.1.4
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 12/15] xfs: handle 64-bit length in xfs_iozero
2016-05-09 8:47 iomap infrastructure and multipage writes V4 Christoph Hellwig
` (10 preceding siblings ...)
2016-05-09 8:47 ` [PATCH 11/15] xfs: use iomap infrastructure for DAX zeroing Christoph Hellwig
@ 2016-05-09 8:47 ` Christoph Hellwig
2016-05-09 8:47 ` [PATCH 13/15] xfs: use xfs_zero_range in xfs_zero_eof Christoph Hellwig
` (4 subsequent siblings)
16 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
We'll want to use this code for large offsets now that we're skipping holes
and unwritten extents efficiently. Also rename it to xfs_zero_range to be
a bit more descriptive, and tell the caller if we actually did any zeroing.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_file.c | 11 ++++++-----
fs/xfs/xfs_inode.h | 3 ++-
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index bd07913..85369a9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -85,10 +85,11 @@ xfs_rw_ilock_demote(
* Holes and unwritten extents will be left as-is as they already are zeroed.
*/
int
-xfs_iozero(
+xfs_zero_range(
struct xfs_inode *ip,
- loff_t pos,
- size_t count)
+ xfs_off_t pos,
+ xfs_off_t count,
+ bool *did_zero)
{
return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
}
@@ -419,7 +420,7 @@ xfs_zero_last_block(
if (isize + zero_len > offset)
zero_len = offset - isize;
*did_zeroing = true;
- return xfs_iozero(ip, isize, zero_len);
+ return xfs_zero_range(ip, isize, zero_len, NULL);
}
/*
@@ -518,7 +519,7 @@ xfs_zero_eof(
if ((zero_off + zero_len) > offset)
zero_len = offset - zero_off;
- error = xfs_iozero(ip, zero_off, zero_len);
+ error = xfs_zero_range(ip, zero_off, zero_len, NULL);
if (error)
return error;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e52d7c7..dbb0bcf 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -434,7 +434,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
enum xfs_prealloc_flags flags);
int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
xfs_fsize_t isize, bool *did_zeroing);
-int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
+ bool *did_zero);
loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
loff_t eof, int whence);
--
2.1.4
^ permalink raw reply related [flat|nested] 24+ messages in thread
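A caller that cares whether zeroing actually happened passes a bool and
checks it afterwards, the way xfs_setattr_size() does with
iomap_truncate_page(). A sketch of the intended usage; note that the
body posted above still forwards NULL to iomap_zero_range(), so the
did_zero reporting presumably gets wired through once a caller needs it:

	/* zero_tail_sketch() is a made-up caller, not code from this series */
	static int
	zero_tail_sketch(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count)
	{
		bool	did_zero = false;
		int	error;

		error = xfs_zero_range(ip, pos, count, &did_zero);
		if (error)
			return error;

		/* only push the range to disk if something was written */
		if (did_zero)
			error = filemap_write_and_wait_range(
					VFS_I(ip)->i_mapping,
					pos, pos + count - 1);
		return error;
	}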
* [PATCH 13/15] xfs: use xfs_zero_range in xfs_zero_eof
2016-05-09 8:47 iomap infrastructure and multipage writes V4 Christoph Hellwig
` (11 preceding siblings ...)
2016-05-09 8:47 ` [PATCH 12/15] xfs: handle 64-bit length in xfs_iozero Christoph Hellwig
@ 2016-05-09 8:47 ` Christoph Hellwig
2016-05-09 8:47 ` [PATCH 14/15] xfs: split xfs_free_file_space in manageable pieces Christoph Hellwig
` (3 subsequent siblings)
16 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
xfs_zero_range now skips holes itself, so there is no need to have the
caller do it as well.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_file.c | 128 +-----------------------------------------------------
1 file changed, 1 insertion(+), 127 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 85369a9..6ad30dd 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -381,49 +381,6 @@ out:
}
/*
- * This routine is called to handle zeroing any space in the last block of the
- * file that is beyond the EOF. We do this since the size is being increased
- * without writing anything to that block and we don't want to read the
- * garbage on the disk.
- */
-STATIC int /* error (positive) */
-xfs_zero_last_block(
- struct xfs_inode *ip,
- xfs_fsize_t offset,
- xfs_fsize_t isize,
- bool *did_zeroing)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
- int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
- int zero_len;
- int nimaps = 1;
- int error = 0;
- struct xfs_bmbt_irec imap;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
- ASSERT(nimaps > 0);
-
- /*
- * If the block underlying isize is just a hole, then there
- * is nothing to zero.
- */
- if (imap.br_startblock == HOLESTARTBLOCK)
- return 0;
-
- zero_len = mp->m_sb.sb_blocksize - zero_offset;
- if (isize + zero_len > offset)
- zero_len = offset - isize;
- *did_zeroing = true;
- return xfs_zero_range(ip, isize, zero_len, NULL);
-}
-
-/*
* Zero any on disk space between the current EOF and the new, larger EOF.
*
* This handles the normal case of zeroing the remainder of the last block in
@@ -441,94 +398,11 @@ xfs_zero_eof(
xfs_fsize_t isize, /* current inode size */
bool *did_zeroing)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t start_zero_fsb;
- xfs_fileoff_t end_zero_fsb;
- xfs_fileoff_t zero_count_fsb;
- xfs_fileoff_t last_fsb;
- xfs_fileoff_t zero_off;
- xfs_fsize_t zero_len;
- int nimaps;
- int error = 0;
- struct xfs_bmbt_irec imap;
-
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
trace_xfs_zero_eof(ip, isize, offset - isize);
-
- /*
- * First handle zeroing the block on which isize resides.
- *
- * We only zero a part of that block so it is handled specially.
- */
- if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
- error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
- if (error)
- return error;
- }
-
- /*
- * Calculate the range between the new size and the old where blocks
- * needing to be zeroed may exist.
- *
- * To get the block where the last byte in the file currently resides,
- * we need to subtract one from the size and truncate back to a block
- * boundary. We subtract 1 in case the size is exactly on a block
- * boundary.
- */
- last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
- start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
- ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
- if (last_fsb == end_zero_fsb) {
- /*
- * The size was only incremented on its last block.
- * We took care of that above, so just return.
- */
- return 0;
- }
-
- ASSERT(start_zero_fsb <= end_zero_fsb);
- while (start_zero_fsb <= end_zero_fsb) {
- nimaps = 1;
- zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
- &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
- ASSERT(nimaps > 0);
-
- if (imap.br_state == XFS_EXT_UNWRITTEN ||
- imap.br_startblock == HOLESTARTBLOCK) {
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- continue;
- }
-
- /*
- * There are blocks we need to zero.
- */
- zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
- zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
- if ((zero_off + zero_len) > offset)
- zero_len = offset - zero_off;
-
- error = xfs_zero_range(ip, zero_off, zero_len, NULL);
- if (error)
- return error;
-
- *did_zeroing = true;
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- }
-
- return 0;
+ return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
}
/*
--
2.1.4
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 14/15] xfs: split xfs_free_file_space in manageable pieces
2016-05-09 8:47 iomap infrastructure and multipage writes V4 Christoph Hellwig
` (12 preceding siblings ...)
2016-05-09 8:47 ` [PATCH 13/15] xfs: use xfs_zero_range in xfs_zero_eof Christoph Hellwig
@ 2016-05-09 8:47 ` Christoph Hellwig
2016-05-09 8:47 ` [PATCH 15/15] xfs: kill xfs_zero_remaining_bytes Christoph Hellwig
` (2 subsequent siblings)
16 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_bmap_util.c | 252 +++++++++++++++++++++++++++----------------------
1 file changed, 137 insertions(+), 115 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 3246ebc..5d030b0 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1191,30 +1191,132 @@ xfs_zero_remaining_bytes(
return error;
}
+static int
+xfs_unmap_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t startoffset_fsb,
+ xfs_filblks_t len_fsb,
+ int *done)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_bmap_free free_list;
+ xfs_fsblock_t firstfsb;
+ uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ if (error) {
+ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
+ ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_trans_ijoin(tp, ip, 0);
+
+ xfs_bmap_init(&free_list, &firstfsb);
+ error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
+ &free_list, done);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_trans_commit(tp);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+}
+
+static int
+xfs_adjust_extent_unmap_boundaries(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *startoffset_fsb,
+ xfs_fileoff_t *endoffset_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ int nimap, error;
+ xfs_extlen_t mod = 0;
+
+ nimap = 1;
+ error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
+ if (error)
+ return error;
+
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ xfs_daddr_t block;
+
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ block = imap.br_startblock;
+ mod = do_div(block, mp->m_sb.sb_rextsize);
+ if (mod)
+ *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+ }
+
+ nimap = 1;
+ error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
+ if (error)
+ return error;
+
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ mod++;
+ if (mod && mod != mp->m_sb.sb_rextsize)
+ *endoffset_fsb -= mod;
+ }
+
+ return 0;
+}
+
+static int
+xfs_flush_unmap_range(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ xfs_off_t rounding, start, end;
+ int error;
+
+ /* wait for the completion of any pending DIOs */
+ inode_dio_wait(inode);
+
+ rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
+ start = round_down(offset, rounding);
+ end = round_up(offset + len, rounding) - 1;
+
+ error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (error)
+ return error;
+ truncate_pagecache_range(inode, start, end);
+ return 0;
+}
+
int
xfs_free_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t len)
{
- int done;
- xfs_fileoff_t endoffset_fsb;
- int error;
- xfs_fsblock_t firstfsb;
- xfs_bmap_free_t free_list;
- xfs_bmbt_irec_t imap;
- xfs_off_t ioffset;
- xfs_off_t iendoffset;
- xfs_extlen_t mod=0;
- xfs_mount_t *mp;
- int nimap;
- uint resblks;
- xfs_off_t rounding;
- int rt;
+ struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t startoffset_fsb;
- xfs_trans_t *tp;
-
- mp = ip->i_mount;
+ xfs_fileoff_t endoffset_fsb;
+ int done, error;
trace_xfs_free_file_space(ip);
@@ -1222,60 +1324,30 @@ xfs_free_file_space(
if (error)
return error;
- error = 0;
if (len <= 0) /* if nothing being freed */
- return error;
- rt = XFS_IS_REALTIME_INODE(ip);
- startoffset_fsb = XFS_B_TO_FSB(mp, offset);
- endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
-
- /* wait for the completion of any pending DIOs */
- inode_dio_wait(VFS_I(ip));
+ return 0;
- rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
- ioffset = round_down(offset, rounding);
- iendoffset = round_up(offset + len, rounding) - 1;
- error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
- iendoffset);
+ error = xfs_flush_unmap_range(ip, offset, len);
if (error)
- goto out;
- truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
+ return error;
+
+ startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+ endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/*
- * Need to zero the stuff we're not freeing, on disk.
- * If it's a realtime file & can't use unwritten extents then we
- * actually need to zero the extent edges. Otherwise xfs_bunmapi
- * will take care of it for us.
+ * Need to zero the stuff we're not freeing, on disk. If it's a RT file
+ * and we can't use unwritten extents then we actually need to ensure
+ * we zero the whole extent, otherwise we just need to take care of
+ * the block boundaries, and xfs_bunmapi will handle the rest.
*/
- if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
- nimap = 1;
- error = xfs_bmapi_read(ip, startoffset_fsb, 1,
- &imap, &nimap, 0);
+ if (XFS_IS_REALTIME_INODE(ip) &&
+ !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
+ &endoffset_fsb);
if (error)
- goto out;
- ASSERT(nimap == 0 || nimap == 1);
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- xfs_daddr_t block;
-
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- block = imap.br_startblock;
- mod = do_div(block, mp->m_sb.sb_rextsize);
- if (mod)
- startoffset_fsb += mp->m_sb.sb_rextsize - mod;
- }
- nimap = 1;
- error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
- &imap, &nimap, 0);
- if (error)
- goto out;
- ASSERT(nimap == 0 || nimap == 1);
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- mod++;
- if (mod && (mod != mp->m_sb.sb_rextsize))
- endoffset_fsb -= mod;
- }
+ return error;
}
+
if ((done = (endoffset_fsb <= startoffset_fsb)))
/*
* One contiguous piece to clear
@@ -1295,62 +1367,12 @@ xfs_free_file_space(
offset + len - 1);
}
- /*
- * free file space until done or until there is an error
- */
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
while (!error && !done) {
-
- /*
- * allocate and setup the transaction. Allow this
- * transaction to dip into the reserve blocks to ensure
- * the freeing of the space succeeds at ENOSPC.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
- &tp);
- if (error) {
- ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- break;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp,
- ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
- resblks, 0, XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto error1;
-
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * issue the bunmapi() call to free the blocks
- */
- xfs_bmap_init(&free_list, &firstfsb);
- error = xfs_bunmapi(tp, ip, startoffset_fsb,
- endoffset_fsb - startoffset_fsb,
- 0, 2, &firstfsb, &free_list, &done);
- if (error)
- goto error0;
-
- /*
- * complete the transaction
- */
- error = xfs_bmap_finish(&tp, &free_list, NULL);
- if (error)
- goto error0;
-
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = xfs_unmap_extent(ip, startoffset_fsb,
+ endoffset_fsb - startoffset_fsb, &done);
}
- out:
return error;
-
- error0:
- xfs_bmap_cancel(&free_list);
- error1:
- xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto out;
}
/*
--
2.1.4
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 15/15] xfs: kill xfs_zero_remaining_bytes
2016-05-09 8:47 iomap infrastructure and multipage writes V4 Christoph Hellwig
` (13 preceding siblings ...)
2016-05-09 8:47 ` [PATCH 14/15] xfs: split xfs_free_file_space in manageable pieces Christoph Hellwig
@ 2016-05-09 8:47 ` Christoph Hellwig
2016-05-11 22:42 ` iomap infrastructure and multipage writes V4 Verma, Vishal L
2016-06-01 6:35 ` Dave Chinner
16 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2016-05-09 8:47 UTC (permalink / raw)
To: xfs; +Cc: rpeterso, vishal.l.verma, linux-fsdevel
Instead punch out the whole blocks first, and then use our zeroing helper
to zero out the partial edge blocks.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_bmap_util.c | 133 ++++++-------------------------------------------
1 file changed, 14 insertions(+), 119 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 5d030b0..0efabc1 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1096,101 +1096,6 @@ error1: /* Just cancel transaction */
return error;
}
-/*
- * Zero file bytes between startoff and endoff inclusive.
- * The iolock is held exclusive and no blocks are buffered.
- *
- * This function is used by xfs_free_file_space() to zero
- * partial blocks when the range to free is not block aligned.
- * When unreserving space with boundaries that are not block
- * aligned we round up the start and round down the end
- * boundaries and then use this function to zero the parts of
- * the blocks that got dropped during the rounding.
- */
-STATIC int
-xfs_zero_remaining_bytes(
- xfs_inode_t *ip,
- xfs_off_t startoff,
- xfs_off_t endoff)
-{
- xfs_bmbt_irec_t imap;
- xfs_fileoff_t offset_fsb;
- xfs_off_t lastoffset;
- xfs_off_t offset;
- xfs_buf_t *bp;
- xfs_mount_t *mp = ip->i_mount;
- int nimap;
- int error = 0;
-
- /*
- * Avoid doing I/O beyond eof - it's not necessary
- * since nothing can read beyond eof. The space will
- * be zeroed when the file is extended anyway.
- */
- if (startoff >= XFS_ISIZE(ip))
- return 0;
-
- if (endoff > XFS_ISIZE(ip))
- endoff = XFS_ISIZE(ip);
-
- for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
- uint lock_mode;
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- nimap = 1;
-
- lock_mode = xfs_ilock_data_map_shared(ip);
- error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
- xfs_iunlock(ip, lock_mode);
-
- if (error || nimap < 1)
- break;
- ASSERT(imap.br_blockcount >= 1);
- ASSERT(imap.br_startoff == offset_fsb);
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-
- if (imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_state == XFS_EXT_UNWRITTEN) {
- /* skip the entire extent */
- lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
- imap.br_blockcount) - 1;
- continue;
- }
-
- lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
- if (lastoffset > endoff)
- lastoffset = endoff;
-
- /* DAX can just zero the backing device directly */
- if (IS_DAX(VFS_I(ip))) {
- error = dax_zero_page_range(VFS_I(ip), offset,
- lastoffset - offset + 1,
- xfs_get_blocks_direct);
- if (error)
- return error;
- continue;
- }
-
- error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp,
- xfs_fsb_to_db(ip, imap.br_startblock),
- BTOBB(mp->m_sb.sb_blocksize),
- 0, &bp, NULL);
- if (error)
- return error;
-
- memset(bp->b_addr +
- (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
- 0, lastoffset - offset + 1);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- return error;
- }
- return error;
-}
-
static int
xfs_unmap_extent(
struct xfs_inode *ip,
@@ -1316,7 +1221,7 @@ xfs_free_file_space(
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t startoffset_fsb;
xfs_fileoff_t endoffset_fsb;
- int done, error;
+ int done = 0, error;
trace_xfs_free_file_space(ip);
@@ -1348,31 +1253,21 @@ xfs_free_file_space(
return error;
}
- if ((done = (endoffset_fsb <= startoffset_fsb)))
- /*
- * One contiguous piece to clear
- */
- error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
- else {
- /*
- * Some full blocks, possibly two pieces to clear
- */
- if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
- error = xfs_zero_remaining_bytes(ip, offset,
- XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
- if (!error &&
- XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
- error = xfs_zero_remaining_bytes(ip,
- XFS_FSB_TO_B(mp, endoffset_fsb),
- offset + len - 1);
- }
-
- while (!error && !done) {
- error = xfs_unmap_extent(ip, startoffset_fsb,
- endoffset_fsb - startoffset_fsb, &done);
+ if (endoffset_fsb > startoffset_fsb) {
+ while (!done) {
+ error = xfs_unmap_extent(ip, startoffset_fsb,
+ endoffset_fsb - startoffset_fsb, &done);
+ if (error)
+ return error;
+ }
}
- return error;
+ /*
+ * Now that we've unmapped all full blocks we'll have to zero out any
+ * partial block at the beginning and/or end. xfs_zero_range is
+ * smart enough to skip any holes, including those we just created.
+ */
+ return xfs_zero_range(ip, offset, len, NULL);
}
/*
--
2.1.4
^ permalink raw reply related [flat|nested] 24+ messages in thread
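The split between unmapping and zeroing rests on the two rounding
conversions in xfs_free_file_space(): XFS_B_TO_FSB() rounds a byte
offset up to the next filesystem block, XFS_B_TO_FSBT() truncates.
Only whole blocks inside the range get unmapped; the partial edges are
zeroed. A standalone userspace illustration, assuming a 4096-byte
block size:

	#include <stdio.h>
	#include <stdint.h>

	#define BLKSIZE 4096ULL

	/* round up: first block fully inside the range (XFS_B_TO_FSB) */
	static uint64_t b_to_fsb(uint64_t bytes)
	{
		return (bytes + BLKSIZE - 1) / BLKSIZE;
	}

	/* truncate: block containing the byte (XFS_B_TO_FSBT) */
	static uint64_t b_to_fsbt(uint64_t bytes)
	{
		return bytes / BLKSIZE;
	}

	int main(void)
	{
		uint64_t offset = 1000, len = 10000;
		uint64_t start_fsb = b_to_fsb(offset);		/* 1 */
		uint64_t end_fsb = b_to_fsbt(offset + len);	/* 2 */

		/* block 1 (bytes 4096..8191) is unmapped; the edges
		 * 1000..4095 and 8192..10999 are zeroed instead */
		printf("unmap blocks [%llu, %llu)\n",
		       (unsigned long long)start_fsb,
		       (unsigned long long)end_fsb);
		return 0;
	}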
* Re: iomap infrastructure and multipage writes V4
2016-05-09 8:47 iomap infrastructure and multipage writes V4 Christoph Hellwig
` (14 preceding siblings ...)
2016-05-09 8:47 ` [PATCH 15/15] xfs: kill xfs_zero_remaining_bytes Christoph Hellwig
@ 2016-05-11 22:42 ` Verma, Vishal L
2016-06-01 6:35 ` Dave Chinner
16 siblings, 0 replies; 24+ messages in thread
From: Verma, Vishal L @ 2016-05-11 22:42 UTC (permalink / raw)
To: hch@lst.de, xfs@oss.sgi.com
Cc: rpeterso@redhat.com, linux-fsdevel@vger.kernel.org
On Mon, 2016-05-09 at 10:47 +0200, Christoph Hellwig wrote:
> This series add a new file system I/O path that uses the iomap
> structure
> introduced for the pNFS support and support multi-page buffered
> writes.
>
> This was first started by Dave Chinner a long time ago, then I did
> beat
> it into shape for production runs in a very constrained ARM NAS
> enviroment for Tuxera almost as long ago, and now half a dozen
> rewrites
> later it's back.
>
> The basic idea is to avoid the per-block get_blocks overhead
> and make use of extents in the buffered write path by iterating over
> them instead.
>
> Note that patch 1 conflicts with Vishals dax error handling series.
> It would be great to have a stable branch with it so that both the
> XFS and nvdimm tree could pull it in before the other changes in this
> area.
I'm looking to post a stable branch with my patches soon. My patches
depend on Jan's fixes for ext4 and DAX, and as soon as they land in a
stable ext4 branch, I'll create one for dax error handling. In the
meanwhile, v7 of my patches includes your patch 1 updated for the error
handling series.
>
> Changes since V3:
> - fix DAX based zeroing
> - Reviews and trivial fixes from Bob
>
> Changes since V2:
> - fix the range for delalloc punches after failed writes
> - updated some changelogs
>
> Chances since V1:
> - add support for fiemap
> - fix a test fail on 1k block sizes
> - prepare for 64-bit length, this will be used in a follow on
> patchset
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: iomap infrastructure and multipage writes V4
2016-05-09 8:47 iomap infrastructure and multipage writes V4 Christoph Hellwig
` (15 preceding siblings ...)
2016-05-11 22:42 ` iomap infrastructure and multipage writes V4 Verma, Vishal L
@ 2016-06-01 6:35 ` Dave Chinner
2016-06-01 12:31 ` Christoph Hellwig
16 siblings, 1 reply; 24+ messages in thread
From: Dave Chinner @ 2016-06-01 6:35 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs, rpeterso, vishal.l.verma, linux-fsdevel
On Mon, May 09, 2016 at 10:47:03AM +0200, Christoph Hellwig wrote:
> This series add a new file system I/O path that uses the iomap structure
> introduced for the pNFS support and support multi-page buffered writes.
>
> This was first started by Dave Chinner a long time ago, then I did beat
> it into shape for production runs in a very constrained ARM NAS
> enviroment for Tuxera almost as long ago, and now half a dozen rewrites
> later it's back.
>
> The basic idea is to avoid the per-block get_blocks overhead
> and make use of extents in the buffered write path by iterating over
> them instead.
>
> Note that patch 1 conflicts with Vishals dax error handling series.
> It would be great to have a stable branch with it so that both the
> XFS and nvdimm tree could pull it in before the other changes in this
> area.
I just pulled this forward to 4.7-rc1, and I get an immediate
failure in generic/346:
[ 70.701300] ------------[ cut here ]------------
[ 70.702029] kernel BUG at fs/xfs/xfs_aops.c:1253!
[ 70.702778] invalid opcode: 0000 [#1] PREEMPT SMP
[ 70.703484] Modules linked in:
[ 70.703952] CPU: 2 PID: 5374 Comm: holetest Not tainted 4.7.0-rc1-dgc+ #812
[ 70.704991] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Debian-1.8.2-1 04/01/2014
[ 70.706285] task: ffff8801365e23c0 ti: ffff8800b0698000 task.ti: ffff8800b0698000
[ 70.707395] RIP: 0010:[<ffffffff814f5ba7>] [<ffffffff814f5ba7>] __xfs_get_blocks+0x597/0x6b0
[ 70.708768] RSP: 0000:ffff8800b069b990 EFLAGS: 00010246
[ 70.709518] RAX: ffff88013ac283c0 RBX: 000000000005c000 RCX: 000000000000000c
[ 70.710527] RDX: 000000000005d000 RSI: 0000000000000008 RDI: ffff8800b3fc1b90
[ 70.711579] RBP: ffff8800b069ba18 R08: 000000000000006b R09: ffff8800b069b914
[ 70.712626] R10: 0000000000000000 R11: 000000000000006b R12: ffff8800b3fc1ce0
[ 70.713656] R13: 0000000000001000 R14: ffff8800b069bb38 R15: ffff8800b9442000
[ 70.714653] FS: 00007ff002a27700(0000) GS:ffff88013fd00000(0000) knlGS:0000000000000000
[ 70.715820] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 70.716669] CR2: 00007ff00436ec00 CR3: 00000000ae8c1000 CR4: 00000000000006e0
[ 70.717656] Stack:
[ 70.717940] ffff8800b3fc1b40 ffff8800b3fc1b60 ffff880000000000 000000000000005c
[ 70.719062] ffff880100000000 ffff8800b3fc1b00 0000000000000000 00000001b069b9d8
[ 70.720199] 0000000000000000 ffffffffffffffff 000000000000005d 0000000000000000
[ 70.721294] Call Trace:
[ 70.721644] [<ffffffff814f5cd7>] xfs_get_blocks+0x17/0x20
[ 70.722401] [<ffffffff812368f4>] do_mpage_readpage+0x3d4/0x710
[ 70.723250] [<ffffffff811ab61e>] ? lru_cache_add+0xe/0x10
[ 70.724013] [<ffffffff81236d28>] mpage_readpages+0xf8/0x150
[ 70.724828] [<ffffffff814f5cc0>] ? __xfs_get_blocks+0x6b0/0x6b0
[ 70.725654] [<ffffffff814f5cc0>] ? __xfs_get_blocks+0x6b0/0x6b0
[ 70.726504] [<ffffffff811e544c>] ? alloc_pages_current+0x8c/0x110
[ 70.727365] [<ffffffff814f38d8>] xfs_vm_readpages+0x38/0xa0
[ 70.728177] [<ffffffff811a97f2>] __do_page_cache_readahead+0x192/0x230
[ 70.729107] [<ffffffff8119e030>] filemap_fault+0x440/0x4b0
[ 70.729881] [<ffffffff81e39080>] ? down_read+0x20/0x40
[ 70.730616] [<ffffffff815007cf>] xfs_filemap_fault+0x5f/0x110
[ 70.731456] [<ffffffff811c2907>] __do_fault+0x67/0xf0
[ 70.732205] [<ffffffff811c6aa9>] handle_mm_fault+0x239/0x1460
[ 70.733015] [<ffffffff810a2403>] __do_page_fault+0x1c3/0x4f0
[ 70.733821] [<ffffffff810a27f3>] trace_do_page_fault+0x43/0x140
[ 70.734654] [<ffffffff8109cc8a>] do_async_page_fault+0x1a/0xa0
[ 70.735493] [<ffffffff81e3d018>] async_page_fault+0x28/0x30
[ 70.736500] Code: 41 ff d2 4d 8b 16 4d 85 d2 75 dd 4c 8b 65 98 4c 8b 75 80 65 ff 0d 82 69 b1 7e 74 11 e9 e6 fb ff ff e8 76 c4 b0 ff e9 e9 fd ff ff <0f> 0b e8 6a c4
[ 70.740283] RIP [<ffffffff814f5ba7>] __xfs_get_blocks+0x597/0x6b0
[ 70.741157] RSP <ffff8800b069b990>
[ 70.742097] ---[ end trace aeed47f2452ca28a ]---
Maybe I screwed up the forward merge sorting out all the bits that
conflicted with what went into 4.7-rc1. Perhaps it would be best if
you rebased and reposted, Christoph?
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: iomap infrastructure and multipage writes V4
2016-06-01 6:35 ` Dave Chinner
@ 2016-06-01 12:31 ` Christoph Hellwig
0 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2016-06-01 12:31 UTC (permalink / raw)
To: Dave Chinner
Cc: Christoph Hellwig, xfs, rpeterso, vishal.l.verma, linux-fsdevel
On Wed, Jun 01, 2016 at 04:35:53PM +1000, Dave Chinner wrote:
> I just pulled this forward to 4.7-rc1, and I get an immediate
> failure in generic/346:
> Maybe I screwed up the forward merge sorting out all the bits that
> conflicted with what went into 4.7-rc1. Perhaps it would be best if
> you rebased and reposted, Christoph?
No, I screwed up and enabled that BUG_ON for the buffered read
case accidentally - it was only supposed to be there for direct I/O.
I've rebased the tree and fixed it up. I'll resend it as soon as the
xfstests run completes.
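For context, the assertion in question is the
BUG_ON(imap.br_startblock == DELAYSTARTBLOCK) added in patch 08.
Buffered reads can legitimately find delalloc extents, so one plausible
shape of the fix, as a sketch rather than the actual respin, is to
limit the check to write mappings:

	/*
	 * Sketch only: delayed allocation extents are fine to see on
	 * a buffered read, but must never reach a direct I/O write
	 * mapping.
	 */
	if (create)
		BUG_ON(imap.br_startblock == DELAYSTARTBLOCK);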
^ permalink raw reply [flat|nested] 24+ messages in thread