* [PATCH 1/7] fs: move struct iomap from exportfs.h to a separate header
2016-03-14 21:02 [RFC] iomap infrastructure and multipage writes Christoph Hellwig
@ 2016-03-14 21:02 ` Christoph Hellwig
2016-03-14 21:02 ` [PATCH 2/7] fs: introduce iomap infrastructure Christoph Hellwig
` (5 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2016-03-14 21:02 UTC (permalink / raw)
To: xfs
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/nfsd/blocklayout.c | 1 +
fs/nfsd/blocklayoutxdr.c | 1 +
fs/xfs/xfs_pnfs.c | 1 +
include/linux/exportfs.h | 16 +---------------
include/linux/iomap.h | 21 +++++++++++++++++++++
5 files changed, 25 insertions(+), 15 deletions(-)
create mode 100644 include/linux/iomap.h
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c29d942..c49adba 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -2,6 +2,7 @@
* Copyright (c) 2014 Christoph Hellwig.
*/
#include <linux/exportfs.h>
+#include <linux/iomap.h>
#include <linux/genhd.h>
#include <linux/slab.h>
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6d834dc..19c8168 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -3,6 +3,7 @@
*/
#include <linux/sunrpc/svc.h>
#include <linux/exportfs.h>
+#include <linux/iomap.h>
#include <linux/nfs4.h>
#include "nfsd.h"
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index ade236e..8b6f20f 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
+#include <linux/iomap.h>
#include "xfs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index fa05e04..c96f47b 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -6,6 +6,7 @@
struct dentry;
struct iattr;
struct inode;
+struct iomap;
struct super_block;
struct vfsmount;
@@ -181,21 +182,6 @@ struct fid {
* get_name is not (which is possibly inconsistent)
*/
-/* types of block ranges for multipage write mappings. */
-#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
-#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
-#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
-#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
-
-#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
-
-struct iomap {
- sector_t blkno; /* first sector of mapping */
- loff_t offset; /* file offset of mapping, bytes */
- u64 length; /* length of mapping, bytes */
- int type; /* type of mapping */
-};
-
struct export_operations {
int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
struct inode *parent);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
new file mode 100644
index 0000000..1b22197
--- /dev/null
+++ b/include/linux/iomap.h
@@ -0,0 +1,21 @@
+#ifndef LINUX_IOMAP_H
+#define LINUX_IOMAP_H 1
+
+#include <linux/types.h>
+
+/* types of block ranges for multipage write mappings. */
+#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
+#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
+
+#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
+
+struct iomap {
+ sector_t blkno; /* first sector of mapping */
+ loff_t offset; /* file offset of mapping, bytes */
+ u64 length; /* length of mapping, bytes */
+ int type; /* type of mapping */
+};
+
+#endif /* LINUX_IOMAP_H */
--
2.1.4
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 2/7] fs: introduce iomap infrastructure
2016-03-14 21:02 [RFC] iomap infrastructure and multipage writes Christoph Hellwig
2016-03-14 21:02 ` [PATCH 1/7] fs: move struct iomap from exportfs.h to a separate header Christoph Hellwig
@ 2016-03-14 21:02 ` Christoph Hellwig
2016-04-04 1:28 ` Dave Chinner
2016-03-14 21:02 ` [PATCH 3/7] xfs: make xfs_find_bdev_for_inode available outside of xfs_aops.c Christoph Hellwig
` (4 subsequent siblings)
6 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2016-03-14 21:02 UTC (permalink / raw)
To: xfs
Add infrastructure for multipage buffered writes. This is implemented
using a main iterator that applies an actor function to a range that
can be written.
This infrastructure is used to implement a buffered write helper, one
to zero file ranges and one to implement the ->page_mkwrite VM
operations. All of them borrow a fair amount of code from fs/buffer.c
for now by using an internal version of __block_write_begin that
gets passed an iomap and builds the corresponding buffer head.
The file system gets a set of paired ->iomap_begin and ->iomap_end
calls which allow it to map/reserve a range and get a notification
once the write code is finished with it.
Based on earlier code from Dave Chinner.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/Makefile | 2 +-
fs/buffer.c | 77 +++++++++-
fs/internal.h | 3 +
fs/iomap.c | 381 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/iomap.h | 55 +++++++-
5 files changed, 507 insertions(+), 11 deletions(-)
create mode 100644 fs/iomap.c
diff --git a/fs/Makefile b/fs/Makefile
index 79f5225..d522fd2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o super.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
ifeq ($(CONFIG_BLOCK),y)
-obj-y += buffer.o block_dev.o direct-io.o mpage.o
+obj-y += buffer.o block_dev.o direct-io.o mpage.o iomap.o
else
obj-y += no-block.o
endif
diff --git a/fs/buffer.c b/fs/buffer.c
index e1632ab..03379de 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -21,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
@@ -1893,8 +1894,63 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
}
EXPORT_SYMBOL(page_zero_new_buffers);
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block)
+static void
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+ struct iomap *iomap)
+{
+ loff_t offset = block << inode->i_blkbits;
+
+ bh->b_bdev = iomap->bdev;
+
+ /*
+ * Block points to offset in file we need to map, iomap contains
+ * the offset at which the map starts. If the map ends before the
+ * current block, then do not map the buffer and let the caller
+ * handle it.
+ */
+ BUG_ON(offset >= iomap->offset + iomap->length);
+
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ /*
+ * If the buffer is not up to date or beyond the current EOF,
+ * we need to mark it as new to ensure sub-block zeroing is
+ * executed if necessary.
+ */
+ if (!buffer_uptodate(bh) ||
+ (offset >= i_size_read(inode)))
+ set_buffer_new(bh);
+ break;
+ case IOMAP_DELALLOC:
+ if (!buffer_uptodate(bh) ||
+ (offset >= i_size_read(inode)))
+ set_buffer_new(bh);
+ set_buffer_uptodate(bh);
+ set_buffer_mapped(bh);
+ set_buffer_delay(bh);
+ break;
+ case IOMAP_UNWRITTEN:
+ /*
+ * For unwritten regions, we always need to ensure that
+ * sub-block writes cause the regions in the block we are not
+ * writing to are zeroed. Set the buffer as new to ensre this.
+ */
+ set_buffer_new(bh);
+ set_buffer_unwritten(bh);
+ /* FALLTHRU */
+ case IOMAP_MAPPED:
+ if (offset >= i_size_read(inode))
+ set_buffer_new(bh);
+ bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+ ((offset - iomap->offset) >> inode->i_blkbits);
+ set_buffer_mapped(bh);
+ break;
+ }
+
+}
+
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block, struct iomap *iomap)
{
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
unsigned to = from + len;
@@ -1930,9 +1986,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
- if (err)
- break;
+ if (get_block) {
+ err = get_block(inode, block, bh, 1);
+ if (err)
+ break;
+ } else {
+ iomap_to_bh(inode, block, bh, iomap);
+ }
+
if (buffer_new(bh)) {
unmap_underlying_metadata(bh->b_bdev,
bh->b_blocknr);
@@ -1973,6 +2034,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
page_zero_new_buffers(page, from, to);
return err;
}
+
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block)
+{
+ return __block_write_begin_int(page, pos, len, get_block, NULL);
+}
EXPORT_SYMBOL(__block_write_begin);
static int __block_commit_write(struct inode *inode, struct page *page,
diff --git a/fs/internal.h b/fs/internal.h
index b71deee..c0c6f49 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
struct super_block;
struct file_system_type;
+struct iomap;
struct linux_binprm;
struct path;
struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
* buffer.c
*/
extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block, struct iomap *iomap);
/*
* char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644
index 0000000..d4528cb
--- /dev/null
+++ b/fs/iomap.c
@@ -0,0 +1,381 @@
+
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+
+typedef ssize_t (*write_actor_t)(struct inode *inode, loff_t pos, ssize_t len,
+ void *data, struct iomap *iomap);
+
+/*
+ * Execute a iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ */
+static ssize_t
+iomap_write_segment(struct inode *inode, loff_t pos, ssize_t length,
+ unsigned flags, struct iomap_ops *ops, void *data,
+ write_actor_t actor)
+{
+ struct iomap iomap = { 0 };
+ ssize_t written;
+ int error;
+
+ /*
+ * Need to map a range from start position for count bytes. This can
+ * span multiple pages - it is only guaranteed to return a range of a
+ * single type of pages (e.g. all into a hole, all mapped or all
+ * unwritten). Failure at this point has nothing to undo.
+ *
+ * If allocation is required for this range, reserve the space now so
+ * that the allocation is guaranteed to succeed later on. Once we copy
+ * the data into the page cache pages, then we cannot fail otherwise we
+ * expose transient stale data. If the reserve fails, we can safely
+ * back out at this point as there is nothing to undo.
+ *
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
+ * to keep the chunks of work done where somewhat symmetric with the
+ * work writeback does. This is a completely arbitrary number pulled
+ * out of thin air as a best guess for initial testing.
+ */
+ length = min_t(size_t, length, 1024 * PAGE_SIZE);
+
+ error = ops->iomap_begin(inode, pos, length, flags, &iomap);
+ if (error)
+ return error;
+ if (WARN_ON(iomap.offset > pos))
+ return -EIO;
+
+ /*
+ * Cut down the length to the one actually provided by the filesystem,
+ * as it might not be able to give us the whole size that we requested.
+ */
+ if (iomap.offset + iomap.length < pos + length)
+ length = iomap.offset + iomap.length - pos;
+
+ /*
+ * Now that we have guaranteed that the space allocation will succeed.
+ * we can do the copy-in page by page without having to worry about
+ * failures exposing transient data.
+ */
+ written = actor(inode, pos, length, data, &iomap);
+
+ /*
+ * Now the data has been copied, commit the range we've copied. This
+ * should not fail unless the filesystem has had a fatal error.
+ */
+ error = ops->iomap_end(inode, pos, length,
+ written > 0 ? written : 0, &iomap);
+
+ return written > 0 ? written : error;
+}
+
+static void
+iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
+{
+ loff_t i_size = i_size_read(inode);
+
+ /*
+ * Only truncate newly allocated pages beyoned EOF, even if the
+ * write started inside the existing inode size.
+ */
+ if (pos + len > i_size)
+ truncate_pagecache_range(inode, max(pos, i_size), pos + len);
+}
+
+static int
+iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, struct iomap *iomap)
+{
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ int status = 0;
+
+ BUG_ON(pos + len > iomap->offset + iomap->length);
+
+ page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+
+ status = __block_write_begin_int(page, pos, len, NULL, iomap);
+ if (unlikely(status)) {
+ unlock_page(page);
+ page_cache_release(page);
+ page = NULL;
+
+ iomap_write_failed(inode, pos, len);
+ }
+
+ *pagep = page;
+ return status;
+}
+
+static int
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct page *page)
+{
+ int ret;
+
+ ret = generic_write_end(NULL, inode->i_mapping, pos, len,
+ copied, page, NULL);
+ if (ret < len)
+ iomap_write_failed(inode, pos, len);
+ return ret;
+}
+
+static ssize_t
+iomap_write_actor(struct inode *inode, loff_t pos, ssize_t length, void *data,
+ struct iomap *iomap)
+{
+ struct iov_iter *i = data;
+ long status = 0;
+ ssize_t written = 0;
+ unsigned int flags = AOP_FLAG_NOFS;
+
+ /*
+ * Copies from kernel address space cannot fail (NFSD is a big user).
+ */
+ if (!iter_is_iovec(i))
+ flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ do {
+ struct page *page;
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+ size_t copied; /* Bytes copied from user */
+
+ offset = (pos & (PAGE_CACHE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_count(i));
+again:
+ if (bytes > length)
+ bytes = length;
+
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+
+ status = iomap_write_begin(inode, pos, bytes, flags, &page,
+ iomap);
+ if (unlikely(status))
+ break;
+
+ if (mapping_writably_mapped(inode->i_mapping))
+ flush_dcache_page(page);
+
+ pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
+
+ flush_dcache_page(page);
+ mark_page_accessed(page);
+
+ status = iomap_write_end(inode, pos, bytes, copied, page);
+ if (unlikely(status < 0))
+ break;
+ copied = status;
+
+ cond_resched();
+
+ iov_iter_advance(i, copied);
+ if (unlikely(copied == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fallback here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+ pos += copied;
+ written += copied;
+ length -= copied;
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ } while (iov_iter_count(i) && length);
+
+ return written ? written : status;
+}
+
+ssize_t
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct iomap_ops *ops)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ loff_t pos = iocb->ki_pos;
+ ssize_t ret = 0, written = 0;
+
+ while (iov_iter_count(iter)) {
+ ret = iomap_write_segment(inode, pos, iov_iter_count(iter),
+ IOMAP_ALLOCATE, ops, iter, iomap_write_actor);
+ if (ret <= 0)
+ break;
+ pos += ret;
+ written += ret;
+ }
+
+ return written ? written : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+
+static ssize_t
+iomap_zero_range_actor(struct inode *inode, loff_t pos, ssize_t count,
+ void *data, struct iomap *iomap)
+{
+ bool *did_zero = data;
+ struct page *page;
+ int status;
+ ssize_t written = 0;
+
+ /*
+ * No need to zero anything if we fall into a hole or unwritten extent.
+ */
+ if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+ return count;
+
+ do {
+ unsigned offset, bytes;
+
+ offset = pos & (PAGE_CACHE_SIZE - 1); /* Within page */
+ bytes = min_t(unsigned, PAGE_CACHE_SIZE - offset, count);
+
+ status = iomap_write_begin(inode, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS,
+ &page, iomap);
+ if (status)
+ break;
+
+ zero_user(page, offset, bytes);
+ mark_page_accessed(page);
+
+ status = iomap_write_end(inode, pos, bytes, bytes, page);
+ if (status)
+ break;
+
+ pos += bytes;
+ count -= bytes;
+ written += bytes;
+ if (did_zero)
+ *did_zero = true;
+ } while (count > 0);
+
+ return status ? status : written;
+}
+
+int
+iomap_zero_range(struct inode *inode, loff_t pos, u64 len, bool *did_zero,
+ struct iomap_ops *ops)
+{
+ ssize_t ret;
+
+ while (len > 0) {
+ ssize_t chunk_size = min_t(u64, len, INT_MAX);
+
+ ret = iomap_write_segment(inode, pos, chunk_size, 0, ops,
+ did_zero, iomap_zero_range_actor);
+ if (ret <= 0)
+ return ret;
+
+ pos += ret;
+ len -= ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_zero_range);
+
+int
+iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ struct iomap_ops *ops)
+{
+ unsigned blocksize = (1 << inode->i_blkbits);
+ unsigned off = pos & (blocksize - 1);
+
+ /* Block boundary? Nothing to do */
+ if (!off)
+ return 0;
+ return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+}
+EXPORT_SYMBOL_GPL(iomap_truncate_page);
+
+static ssize_t
+iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, ssize_t length,
+ void *data, struct iomap *iomap)
+{
+ struct page *page = data;
+ int ret;
+
+ ret = __block_write_begin_int(page, 0, length, NULL, iomap);
+ if (!ret)
+ ret = block_commit_write(page, 0, length);
+
+ return ret;
+}
+
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ struct iomap_ops *ops)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ unsigned long length;
+ loff_t size;
+ int ret;
+
+ lock_page(page);
+ size = i_size_read(inode);
+ if ((page->mapping != inode->i_mapping) ||
+ (page_offset(page) > size)) {
+ /* We overload EFAULT to mean page got truncated */
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ /* page is wholly or partially inside EOF */
+ if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
+ length = size & ~PAGE_CACHE_MASK;
+ else
+ length = PAGE_CACHE_SIZE;
+
+ ret = iomap_write_segment(inode, page_offset(page), length,
+ IOMAP_ALLOCATE, ops, page, iomap_page_mkwrite_actor);
+ if (unlikely(ret < 0))
+ goto out_unlock;
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+ return 0;
+out_unlock:
+ unlock_page(page);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 1b22197..ae0b92c 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -3,19 +3,64 @@
#include <linux/types.h>
-/* types of block ranges for multipage write mappings. */
+struct inode;
+struct iov_iter;
+struct kiocb;
+struct vm_area_struct;
+struct vm_fault;
+
+/*
+ * Types of block ranges for iomap mappings:
+ */
#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
+/*
+ * Magic value for blkno:
+ */
#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
struct iomap {
- sector_t blkno; /* first sector of mapping */
- loff_t offset; /* file offset of mapping, bytes */
- u64 length; /* length of mapping, bytes */
- int type; /* type of mapping */
+ sector_t blkno; /* first sector of mapping, fs blocks */
+ loff_t offset; /* file offset of mapping, bytes */
+ u64 length; /* length of mapping, bytes */
+ int type; /* type of mapping */
+ struct block_device *bdev; /* block device for I/O */
+};
+
+/*
+ * Flags for iomap_begin:
+ */
+#define IOMAP_ALLOCATE 0x01 /* allocate / reserve blocks if not present */
+
+struct iomap_ops {
+ /*
+ * Return the existing mapping at pos, or reserve space starting at
+ * pos for up to length, as long as we can do it as a single mapping.
+ * The actual length is returned in iomap->length.
+ */
+ int (*iomap_begin)(struct inode *inode, loff_t pos, ssize_t length,
+ unsigned flags, struct iomap *iomap);
+
+ /*
+ * Commit and/or unreserve space previous allocated using iomap_begin.
+ * Written indicates the length of the successful write operation which
+ * needs to be commited, while the rest needs to be unreserved.
+ * Written might be zero if no data was written.
+ */
+ int (*iomap_end)(struct inode *inode, loff_t pos, ssize_t length,
+ ssize_t written, struct iomap *iomap);
};
+ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
+ struct iomap_ops *ops);
+int iomap_zero_range(struct inode *inode, loff_t pos, u64 len, bool *did_zero,
+ struct iomap_ops *ops);
+int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ struct iomap_ops *ops);
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ struct iomap_ops *ops);
+
#endif /* LINUX_IOMAP_H */
--
2.1.4
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH 2/7] fs: introduce iomap infrastructure
2016-03-14 21:02 ` [PATCH 2/7] fs: introduce iomap infrastructure Christoph Hellwig
@ 2016-04-04 1:28 ` Dave Chinner
2016-04-04 1:47 ` Dave Chinner
0 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2016-04-04 1:28 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs
On Mon, Mar 14, 2016 at 10:02:45PM +0100, Christoph Hellwig wrote:
> Add infrastructure for multipage buffered writes. This is implemented
> using an main iterator that applies an actor function to a range that
> can be written.
>
> This infrastucture is used to implement a buffered write helper, one
> to zero file ranges and one to implement the ->page_mkwrite VM
> operations. All of them borrow a fair amount of code from fs/buffers.
> for now by using an internal version of __block_write_begin that
> gets passed an iomap and builds the corresponding buffer head.
>
> The file system is gets a set of paired ->iomap_begin and ->iomap_end
> calls which allow it to map/reserve a range and get a notification
> once the write code is finished with it.
>
> Based on earlier code from Dave Chinner.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
.....
> +/*
> + * Execute a iomap write on a segment of the mapping that spans a
> + * contiguous range of pages that have identical block mapping state.
> + *
> + * This avoids the need to map pages individually, do individual allocations
> + * for each page and most importantly avoid the need for filesystem specific
> + * locking per page. Instead, all the operations are amortised over the entire
> + * range of pages. It is assumed that the filesystems will lock whatever
> + * resources they require in the iomap_begin call, and release them in the
> + * iomap_end call.
> + */
> +static ssize_t
> +iomap_write_segment(struct inode *inode, loff_t pos, ssize_t length,
> + unsigned flags, struct iomap_ops *ops, void *data,
> + write_actor_t actor)
This requires external iteration to write the entire range required
if the allocation does not cover the entire length requested (i.e.
written < length).
Also, if the actor returns an error into written, that gets ignored
and the return status is whatever the ->iomap_end call returns.
....
> +int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
> + struct iomap_ops *ops)
> +{
> + struct page *page = vmf->page;
> + struct inode *inode = file_inode(vma->vm_file);
> + unsigned long length;
> + loff_t size;
> + int ret;
> +
> + lock_page(page);
> + size = i_size_read(inode);
> + if ((page->mapping != inode->i_mapping) ||
> + (page_offset(page) > size)) {
> + /* We overload EFAULT to mean page got truncated */
> + ret = -EFAULT;
> + goto out_unlock;
> + }
> +
> + /* page is wholly or partially inside EOF */
> + if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
> + length = size & ~PAGE_CACHE_MASK;
> + else
> + length = PAGE_CACHE_SIZE;
> +
> + ret = iomap_write_segment(inode, page_offset(page), length,
> + IOMAP_ALLOCATE, ops, page, iomap_page_mkwrite_actor);
> + if (unlikely(ret < 0))
> + goto out_unlock;
> + set_page_dirty(page);
> + wait_for_stable_page(page);
> + return 0;
> +out_unlock:
> + unlock_page(page);
> + return ret;
> +}
Because we don't handle short segment writes here,
iomap_page_mkwrite() fails to allocate blocks on partial pages when
block size < page size. This can be seen by generic/030 on XFS with
a 1k block size.
Patch below fixes the issue, as well as the fact that
iomap_page_mkwrite_actor() needs to return the count of bytes
"written", not zero on success for iomap_write_segment() to do the
right thing on multi-segment writes.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
iomap: fix page_mkwrite on bs < ps
Fixes generic/030.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/iomap.c | 30 ++++++++++++++++++++++--------
1 file changed, 22 insertions(+), 8 deletions(-)
diff --git a/fs/iomap.c b/fs/iomap.c
index d4528cb..c4d3511 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -337,10 +337,16 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, ssize_t length,
int ret;
ret = __block_write_begin_int(page, 0, length, NULL, iomap);
- if (!ret)
- ret = block_commit_write(page, 0, length);
+ if (ret)
+ return ret;
+
+ /*
+ * block_commit_write always returns 0, we need to return the length we
+ * successfully allocated.
+ */
+ block_commit_write(page, 0, length);
+ return length;
- return ret;
}
int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
@@ -350,7 +356,8 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
struct inode *inode = file_inode(vma->vm_file);
unsigned long length;
loff_t size;
- int ret;
+ loff_t offset;
+ ssize_t ret;
lock_page(page);
size = i_size_read(inode);
@@ -367,10 +374,17 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
else
length = PAGE_CACHE_SIZE;
- ret = iomap_write_segment(inode, page_offset(page), length,
- IOMAP_ALLOCATE, ops, page, iomap_page_mkwrite_actor);
- if (unlikely(ret < 0))
- goto out_unlock;
+ offset = page_offset(page);
+ while (length > 0) {
+ ret = iomap_write_segment(inode, offset, length,
+ IOMAP_ALLOCATE, ops, page,
+ iomap_page_mkwrite_actor);
+ if (unlikely(ret < 0))
+ goto out_unlock;
+ offset += ret;
+ length -= ret;
+ }
+
set_page_dirty(page);
wait_for_stable_page(page);
return 0;
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH 2/7] fs: introduce iomap infrastructure
2016-04-04 1:28 ` Dave Chinner
@ 2016-04-04 1:47 ` Dave Chinner
2016-04-04 7:12 ` Christoph Hellwig
0 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2016-04-04 1:47 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs
On Mon, Apr 04, 2016 at 11:28:49AM +1000, Dave Chinner wrote:
> On Mon, Mar 14, 2016 at 10:02:45PM +0100, Christoph Hellwig wrote:
> > Add infrastructure for multipage buffered writes. This is implemented
> > using an main iterator that applies an actor function to a range that
> > can be written.
....
> Patch below fixes the issue, as well as the fact that
> iomap_page_mkwrite_actor() needs to return the count of bytes
> "written", not zero on success for iomap_write_segment() to do the
> right thing on multi-segment writes.
Hmm - this fix then breaks generic/029. Christoph, can you look into
these issues?
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/7] fs: introduce iomap infrastructure
2016-04-04 1:47 ` Dave Chinner
@ 2016-04-04 7:12 ` Christoph Hellwig
2016-04-04 7:55 ` Dave Chinner
0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2016-04-04 7:12 UTC (permalink / raw)
To: Dave Chinner; +Cc: Christoph Hellwig, xfs
On Mon, Apr 04, 2016 at 11:47:06AM +1000, Dave Chinner wrote:
> > Patch below fixes the issue, as well as the fact that
> > iomap_page_mkwrite_actor() needs to return the count of bytes
> > "written", not zero on success for iomap_write_segment() to do the
> > right thing on multi-segment writes.
>
> Hmm - this fix then breaks generic/029. Christoph, can you look into
> these issues?
I've allocated some time for the series this week, I'll also add the
iomap based fiemap implementation from the gfs2 folks while I'm at it.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/7] fs: introduce iomap infrastructure
2016-04-04 7:12 ` Christoph Hellwig
@ 2016-04-04 7:55 ` Dave Chinner
0 siblings, 0 replies; 12+ messages in thread
From: Dave Chinner @ 2016-04-04 7:55 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs
On Mon, Apr 04, 2016 at 09:12:03AM +0200, Christoph Hellwig wrote:
> On Mon, Apr 04, 2016 at 11:47:06AM +1000, Dave Chinner wrote:
> > > Patch below fixes the issue, as well as the fact that
> > > iomap_page_mkwrite_actor() needs to return the count of bytes
> > > "written", not zero on success for iomap_write_segment() to do the
> > > right thing on multi-segment writes.
> >
> > Hmm - this fix then breaks generic/029. Christoph, can you look into
> > these issues?
>
> I've allocated some time for the series this week, I'll also add the
> iomap based fiemap implementation from the gfs2 folks while I'm at it.
Great! I've been running it here for the past week or so, and on 4k
block size filesystems it hasn't caused any regressions. I've been
running it with the straight-to-bio writeback patches for XFS as
well, and it seems to be playing nicely with that, too. The above
problems were seen independent of the writeback patch series,
though...
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 3/7] xfs: make xfs_find_bdev_for_inode available outside of xfs_aops.c
2016-03-14 21:02 [RFC] iomap infrastructure and multipage writes Christoph Hellwig
2016-03-14 21:02 ` [PATCH 1/7] fs: move struct iomap from exportfs.h to a separate header Christoph Hellwig
2016-03-14 21:02 ` [PATCH 2/7] fs: introduce iomap infrastructure Christoph Hellwig
@ 2016-03-14 21:02 ` Christoph Hellwig
2016-03-14 21:02 ` [PATCH 4/7] xfs: make xfs_bmbt_to_iomap available outside of xfs_pnfs.c Christoph Hellwig
` (3 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2016-03-14 21:02 UTC (permalink / raw)
To: xfs
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_aops.c | 17 ++---------------
fs/xfs/xfs_inode.h | 9 +++++++++
2 files changed, 11 insertions(+), 15 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 75a39a8..72ee3f2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -70,19 +70,6 @@ xfs_count_page_state(
} while ((bh = bh->b_this_page) != head);
}
-STATIC struct block_device *
-xfs_find_bdev_for_inode(
- struct inode *inode)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
-
- if (XFS_IS_REALTIME_INODE(ip))
- return mp->m_rtdev_targp->bt_bdev;
- else
- return mp->m_ddev_targp->bt_bdev;
-}
-
/*
* We're now finished for good with this ioend structure.
* Update the page state via the associated buffer_heads,
@@ -1256,7 +1243,7 @@ __xfs_get_blocks(
* If this is a realtime file, data may be on a different device.
* to that pointed to from the buffer_head b_bdev currently.
*/
- bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
+ bh_result->b_bdev = xfs_find_bdev_for_inode(ip);
/*
* If we previously allocated a block out beyond eof and we are now
@@ -1420,7 +1407,7 @@ xfs_vm_direct_IO(
xfs_get_blocks_direct, endio, 0);
}
- bdev = xfs_find_bdev_for_inode(inode);
+ bdev = xfs_find_bdev_for_inode(XFS_I(inode));
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
xfs_get_blocks_direct, endio, NULL, flags);
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 43e1d51..70bedf0 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -474,6 +474,15 @@ do { \
iput(VFS_I(ip)); \
} while (0)
+static inline struct block_device *xfs_find_bdev_for_inode(struct xfs_inode *ip)
+{
+ if (XFS_IS_REALTIME_INODE(ip))
+ return ip->i_mount->m_rtdev_targp->bt_bdev;
+ else
+ return ip->i_mount->m_ddev_targp->bt_bdev;
+}
+
+
extern struct kmem_zone *xfs_inode_zone;
/*
--
2.1.4
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 4/7] xfs: make xfs_bmbt_to_iomap available outside of xfs_pnfs.c
2016-03-14 21:02 [RFC] iomap infrastructure and multipage writes Christoph Hellwig
` (2 preceding siblings ...)
2016-03-14 21:02 ` [PATCH 3/7] xfs: make xfs_find_bdev_for_inode available outside of xfs_aops.c Christoph Hellwig
@ 2016-03-14 21:02 ` Christoph Hellwig
2016-03-14 21:02 ` [PATCH 5/7] xfs: reshuffle truncate Christoph Hellwig
` (2 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2016-03-14 21:02 UTC (permalink / raw)
To: xfs
And ensure it works for RT subvolume files and set the block device,
both of which will be needed to be able to use the function in the
buffered write path.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_iomap.c | 27 +++++++++++++++++++++++++++
fs/xfs/xfs_iomap.h | 4 ++++
fs/xfs/xfs_pnfs.c | 26 --------------------------
3 files changed, 31 insertions(+), 26 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d81bdc0..cc53430 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -15,6 +15,7 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <linux/iomap.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
@@ -959,3 +960,29 @@ error_on_bmapi_transaction:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
+
+void
+xfs_bmbt_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (imap->br_startblock == HOLESTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_DELALLOC;
+ } else {
+ iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+ if (imap->br_state == XFS_EXT_UNWRITTEN)
+ iomap->type = IOMAP_UNWRITTEN;
+ else
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+ iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+ iomap->bdev = xfs_find_bdev_for_inode(ip);
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8688e66..718f07c 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,6 +18,7 @@
#ifndef __XFS_IOMAP_H__
#define __XFS_IOMAP_H__
+struct iomap;
struct xfs_inode;
struct xfs_bmbt_irec;
@@ -29,4 +30,7 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
struct xfs_bmbt_irec *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
+void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+ struct xfs_bmbt_irec *);
+
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 8b6f20f..7c5a8d5 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -80,32 +80,6 @@ xfs_fs_get_uuid(
return 0;
}
-static void
-xfs_bmbt_to_iomap(
- struct xfs_inode *ip,
- struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
-{
- struct xfs_mount *mp = ip->i_mount;
-
- if (imap->br_startblock == HOLESTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_DELALLOC;
- } else {
- iomap->blkno =
- XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
- if (imap->br_state == XFS_EXT_UNWRITTEN)
- iomap->type = IOMAP_UNWRITTEN;
- else
- iomap->type = IOMAP_MAPPED;
- }
- iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
- iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-}
-
/*
* Get a layout for the pNFS client.
*/
--
2.1.4
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 5/7] xfs: reshuffle truncate
2016-03-14 21:02 [RFC] iomap infrastructure and multipage writes Christoph Hellwig
` (3 preceding siblings ...)
2016-03-14 21:02 ` [PATCH 4/7] xfs: make xfs_bmbt_to_iomap available outside of xfs_pnfs.c Christoph Hellwig
@ 2016-03-14 21:02 ` Christoph Hellwig
2016-03-14 21:02 ` [PATCH 6/7] xfs: implement iomap based buffered write path Christoph Hellwig
2016-03-14 21:02 ` [PATCH 7/7] xfs: remove buffered write support from __xfs_get_blocks Christoph Hellwig
6 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2016-03-14 21:02 UTC (permalink / raw)
To: xfs
---
fs/xfs/xfs_iops.c | 33 +++++++++++++++++++--------------
1 file changed, 19 insertions(+), 14 deletions(-)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index fb7dc61..1aab9f6 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -790,20 +790,35 @@ xfs_setattr_size(
return error;
/*
+ * Wait for all direct I/O to complete.
+ */
+ inode_dio_wait(inode);
+
+ /*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
* the transaction because the inode cannot be unlocked once it is a
* part of the transaction.
*
- * Start with zeroing any data block beyond EOF that we may expose on
- * file extension.
+ * Start with zeroing any data beyond EOF that we may expose on file
+ * extension, or zeroing out the rest of the block on a downward
+ * truncate.
*/
if (newsize > oldsize) {
error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
- if (error)
- return error;
+ } else {
+ if (IS_DAX(inode)) {
+ error = dax_truncate_page(inode, newsize,
+ xfs_get_blocks_direct);
+ } else {
+ error = block_truncate_page(inode->i_mapping, newsize,
+ xfs_get_blocks);
+ }
}
+ if (error)
+ return error;
+
/*
* We are going to log the inode size change in this transaction so
* any previous writes that are beyond the on disk EOF and the new
@@ -820,9 +835,6 @@ xfs_setattr_size(
return error;
}
- /* Now wait for all direct I/O to complete. */
- inode_dio_wait(inode);
-
/*
* We've already locked out new page faults, so now we can safely remove
* pages from the page cache knowing they won't get refaulted until we
@@ -840,13 +852,6 @@ xfs_setattr_size(
* to hope that the caller sees ENOMEM and retries the truncate
* operation.
*/
- if (IS_DAX(inode))
- error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
- else
- error = block_truncate_page(inode->i_mapping, newsize,
- xfs_get_blocks);
- if (error)
- return error;
truncate_setsize(inode, newsize);
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
--
2.1.4
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 6/7] xfs: implement iomap based buffered write path
2016-03-14 21:02 [RFC] iomap infrastructure and multipage writes Christoph Hellwig
` (4 preceding siblings ...)
2016-03-14 21:02 ` [PATCH 5/7] xfs: reshuffle truncate Christoph Hellwig
@ 2016-03-14 21:02 ` Christoph Hellwig
2016-03-14 21:02 ` [PATCH 7/7] xfs: remove buffered write support from __xfs_get_blocks Christoph Hellwig
6 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2016-03-14 21:02 UTC (permalink / raw)
To: xfs
Convert XFS to use the new iomap based multipage write path. This involves
implementing the ->iomap_begin and ->iomap_end methods, and switching the
buffered file write, page_mkwrite and xfs_iozero paths to the new iomap
helpers.
With this change __xfs_get_blocks will never be used for buffered writes,
and the code handling them can be removed.
Based on earlier code from Dave Chinner.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_aops.c | 205 -----------------------------------------------------
fs/xfs/xfs_file.c | 71 ++++++++-----------
fs/xfs/xfs_iomap.c | 121 +++++++++++++++++++++++++++++++
fs/xfs/xfs_iomap.h | 5 +-
fs/xfs/xfs_iops.c | 9 +--
fs/xfs/xfs_trace.h | 3 +
6 files changed, 163 insertions(+), 251 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 72ee3f2..32aae77 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1412,209 +1412,6 @@ xfs_vm_direct_IO(
xfs_get_blocks_direct, endio, NULL, flags);
}
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
- struct inode *inode,
- loff_t start,
- loff_t end)
-{
- struct xfs_inode *ip = XFS_I(inode);
- xfs_fileoff_t start_fsb;
- xfs_fileoff_t end_fsb;
- int error;
-
- start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
- end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
- if (end_fsb <= start_fsb)
- return;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- end_fsb - start_fsb);
- if (error) {
- /* something screwed, just bail */
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_alert(ip->i_mount,
- "xfs_vm_write_failed: unable to clean up ino %lld",
- ip->i_ino);
- }
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
- struct inode *inode,
- struct page *page,
- loff_t pos,
- unsigned len)
-{
- loff_t block_offset;
- loff_t block_start;
- loff_t block_end;
- loff_t from = pos & (PAGE_CACHE_SIZE - 1);
- loff_t to = from + len;
- struct buffer_head *bh, *head;
-
- /*
- * The request pos offset might be 32 or 64 bit, this is all fine
- * on 64-bit platform. However, for 64-bit pos request on 32-bit
- * platform, the high 32-bit will be masked off if we evaluate the
- * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
- * 0xfffff000 as an unsigned long, hence the result is incorrect
- * which could cause the following ASSERT failed in most cases.
- * In order to avoid this, we can evaluate the block_offset of the
- * start of the page by using shifts rather than masks the mismatch
- * problem.
- */
- block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
-
- ASSERT(block_offset + from == pos);
-
- head = page_buffers(page);
- block_start = 0;
- for (bh = head; bh != head || !block_start;
- bh = bh->b_this_page, block_start = block_end,
- block_offset += bh->b_size) {
- block_end = block_start + bh->b_size;
-
- /* skip buffers before the write */
- if (block_end <= from)
- continue;
-
- /* if the buffer is after the write, we're done */
- if (block_start >= to)
- break;
-
- /*
- * Process delalloc and unwritten buffers beyond EOF. We can
- * encounter unwritten buffers in the event that a file has
- * post-EOF unwritten extents and an extending write happens to
- * fail (e.g., an unaligned write that also involves a delalloc
- * to the same page).
- */
- if (!buffer_delay(bh) && !buffer_unwritten(bh))
- continue;
-
- if (!buffer_new(bh) && block_offset < i_size_read(inode))
- continue;
-
- if (buffer_delay(bh))
- xfs_vm_kill_delalloc_range(inode, block_offset,
- block_offset + bh->b_size);
-
- /*
- * This buffer does not contain data anymore. make sure anyone
- * who finds it knows that for certain.
- */
- clear_buffer_delay(bh);
- clear_buffer_uptodate(bh);
- clear_buffer_mapped(bh);
- clear_buffer_new(bh);
- clear_buffer_dirty(bh);
- clear_buffer_unwritten(bh);
- }
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
- struct file *file,
- struct address_space *mapping,
- loff_t pos,
- unsigned len,
- unsigned flags,
- struct page **pagep,
- void **fsdata)
-{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- struct page *page;
- int status;
-
- ASSERT(len <= PAGE_CACHE_SIZE);
-
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
-
- status = __block_write_begin(page, pos, len, xfs_get_blocks);
- if (unlikely(status)) {
- struct inode *inode = mapping->host;
- size_t isize = i_size_read(inode);
-
- xfs_vm_write_failed(inode, page, pos, len);
- unlock_page(page);
-
- /*
- * If the write is beyond EOF, we only want to kill blocks
- * allocated in this write, not blocks that were previously
- * written successfully.
- */
- if (pos + len > isize) {
- ssize_t start = max_t(ssize_t, pos, isize);
-
- truncate_pagecache_range(inode, start, pos + len);
- }
-
- page_cache_release(page);
- page = NULL;
- }
-
- *pagep = page;
- return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
- struct file *file,
- struct address_space *mapping,
- loff_t pos,
- unsigned len,
- unsigned copied,
- struct page *page,
- void *fsdata)
-{
- int ret;
-
- ASSERT(len <= PAGE_CACHE_SIZE);
-
- ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (unlikely(ret < len)) {
- struct inode *inode = mapping->host;
- size_t isize = i_size_read(inode);
- loff_t to = pos + len;
-
- if (to > isize) {
- /* only kill blocks in this write beyond EOF */
- if (pos > isize)
- isize = pos;
- xfs_vm_kill_delalloc_range(inode, isize, to);
- truncate_pagecache_range(inode, isize, to);
- }
- }
- return ret;
-}
-
STATIC sector_t
xfs_vm_bmap(
struct address_space *mapping,
@@ -1726,8 +1523,6 @@ const struct address_space_operations xfs_address_space_operations = {
.set_page_dirty = xfs_vm_set_page_dirty,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
- .write_begin = xfs_vm_write_begin,
- .write_end = xfs_vm_write_end,
.bmap = xfs_vm_bmap,
.direct_IO = xfs_vm_direct_IO,
.migratepage = buffer_migrate_page,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ac0fd32..f2cb984 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
+#include "xfs_iomap.h"
#include <linux/dcache.h>
#include <linux/falloc.h>
@@ -79,57 +80,27 @@ xfs_rw_ilock_demote(
inode_unlock(VFS_I(ip));
}
-/*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
- */
-int
-xfs_iozero(
- struct xfs_inode *ip, /* inode */
- loff_t pos, /* offset in file */
- size_t count) /* size of data to zero */
+static int
+xfs_dax_zero_range(
+ struct inode *inode,
+ loff_t pos,
+ size_t count)
{
- struct page *page;
- struct address_space *mapping;
int status = 0;
-
- mapping = VFS_I(ip)->i_mapping;
do {
unsigned offset, bytes;
- void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count)
bytes = count;
- if (IS_DAX(VFS_I(ip))) {
- status = dax_zero_page_range(VFS_I(ip), pos, bytes,
- xfs_get_blocks_direct);
- if (status)
- break;
- } else {
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
-
- zero_user(page, offset, bytes);
+ status = dax_zero_page_range(inode, pos, bytes,
+ xfs_get_blocks_direct);
+ if (status)
+ break;
- status = pagecache_write_end(NULL, mapping, pos, bytes,
- bytes, page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
- status = 0;
- }
pos += bytes;
count -= bytes;
} while (count);
@@ -137,6 +108,24 @@ xfs_iozero(
return status;
}
+/*
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is as they already are zeroed.
+ */
+int
+xfs_iozero(
+ struct xfs_inode *ip,
+ loff_t pos,
+ size_t count)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (IS_DAX(VFS_I(ip)))
+ return xfs_dax_zero_range(inode, pos, count);
+ else
+ return iomap_zero_range(inode, pos, count, NULL, &xfs_iomap_ops);
+}
+
int
xfs_update_prealloc_flags(
struct xfs_inode *ip,
@@ -844,7 +833,7 @@ xfs_file_buffered_aio_write(
write_retry:
trace_xfs_file_buffered_write(ip, iov_iter_count(from),
iocb->ki_pos, 0);
- ret = generic_perform_write(file, from, iocb->ki_pos);
+ ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
if (likely(ret >= 0))
iocb->ki_pos += ret;
@@ -1560,7 +1549,7 @@ xfs_filemap_page_mkwrite(
if (IS_DAX(inode)) {
ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
} else {
- ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index cc53430..68e8907 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -986,3 +986,124 @@ xfs_bmbt_to_iomap(
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(ip);
}
+
+static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+{
+ return !nimaps ||
+ imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_startblock == DELAYSTARTBLOCK;
+}
+
+static int
+xfs_file_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ ssize_t count,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int nimaps = 1, error = 0;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ if ((xfs_fsize_t)offset + count > mp->m_super->s_maxbytes)
+ count = mp->m_super->s_maxbytes - offset;
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ end_fsb = XFS_B_TO_FSB(mp, offset + count);
+
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, XFS_BMAPI_ENTIRE);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+ }
+
+ if ((flags & IOMAP_ALLOCATE) && imap_needs_alloc(&imap, nimaps)) {
+ if (xfs_get_extsz_hint(ip)) {
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
+ */
+ xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+ error = xfs_iomap_write_direct(ip, offset, count, &imap,
+ nimaps);
+ } else {
+ error = xfs_iomap_write_delay(ip, offset, count, &imap);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ trace_xfs_iomap_alloc(ip, offset, count, 0, &imap);
+ } else if (nimaps) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_found(ip, offset, count, 0, &imap);
+ } else {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_not_found(ip, offset, count, 0, &imap);
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ iomap->offset = offset;
+ iomap->length = count;
+
+ return 0;
+ }
+
+ if (!error)
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
+ return error;
+}
+
+static int
+xfs_file_iomap_end(
+ struct inode *inode,
+ loff_t offset,
+ ssize_t count,
+ ssize_t written,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+
+ if (iomap->type != IOMAP_DELALLOC)
+ return 0;
+
+ start_fsb = XFS_B_TO_FSB(mp, offset + written);
+ end_fsb = XFS_B_TO_FSB(mp, offset + count - written);
+
+ /*
+ * Trim back delalloc blocks if we didn't manage to write the whole
+ * range reserved.
+ *
+ * We don't need to care about racing delalloc as we hold i_mutex
+ * across the reserve/allocate/unreserve calls. If there are delalloc
+ * blocks in the range, they are ours.
+ */
+ if (start_fsb < end_fsb) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ end_fsb - start_fsb);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_alert(mp, "%s: unable to clean up ino %lld",
+ __func__, ip->i_ino);
+ }
+ }
+
+ return error;
+}
+
+struct iomap_ops xfs_iomap_ops = {
+ .iomap_begin = xfs_file_iomap_begin,
+ .iomap_end = xfs_file_iomap_end,
+};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 718f07c..e066d04 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,7 +18,8 @@
#ifndef __XFS_IOMAP_H__
#define __XFS_IOMAP_H__
-struct iomap;
+#include <linux/iomap.h>
+
struct xfs_inode;
struct xfs_bmbt_irec;
@@ -33,4 +34,6 @@ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
struct xfs_bmbt_irec *);
+extern struct iomap_ops xfs_iomap_ops;
+
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1aab9f6..cbd8fab 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
#include "xfs_dir2.h"
#include "xfs_trans_space.h"
#include "xfs_pnfs.h"
+#include "xfs_iomap.h"
#include <linux/capability.h>
#include <linux/xattr.h>
@@ -811,8 +812,8 @@ xfs_setattr_size(
error = dax_truncate_page(inode, newsize,
xfs_get_blocks_direct);
} else {
- error = block_truncate_page(inode->i_mapping, newsize,
- xfs_get_blocks);
+ error = iomap_truncate_page(inode, newsize,
+ &did_zeroing, &xfs_iomap_ops);
}
}
@@ -827,8 +828,8 @@ xfs_setattr_size(
* problem. Note that this includes any block zeroing we did above;
* otherwise those blocks may not be zeroed after a crash.
*/
- if (newsize > ip->i_d.di_size &&
- (oldsize != ip->i_d.di_size || did_zeroing)) {
+ if (did_zeroing ||
+ (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ip->i_d.di_size, newsize);
if (error)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c8d5842..1bf3840 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1297,6 +1297,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
--
2.1.4
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH 7/7] xfs: remove buffered write support from __xfs_get_blocks
2016-03-14 21:02 [RFC] iomap infrastructure and multipage writes Christoph Hellwig
` (5 preceding siblings ...)
2016-03-14 21:02 ` [PATCH 6/7] xfs: implement iomap based buffered write path Christoph Hellwig
@ 2016-03-14 21:02 ` Christoph Hellwig
6 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2016-03-14 21:02 UTC (permalink / raw)
To: xfs
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_aops.c | 77 +++++++++++++++----------------------------------------
1 file changed, 21 insertions(+), 56 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 32aae77..8b09e93 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1110,7 +1110,6 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
- bool direct,
bool dax_fault)
{
struct xfs_inode *ip = XFS_I(inode);
@@ -1131,22 +1130,14 @@ __xfs_get_blocks(
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
size = bh_result->b_size;
- if (!create && direct && offset >= i_size_read(inode))
+ if (!create && offset >= i_size_read(inode))
return 0;
/*
* Direct I/O is usually done on preallocated files, so try getting
- * a block mapping without an exclusive lock first. For buffered
- * writes we already have the exclusive iolock anyway, so avoiding
- * a lock roundtrip here by taking the ilock exclusive from the
- * beginning is a useful micro optimization.
+ * a block mapping without an exclusive lock first.
*/
- if (create && !direct) {
- lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, lockmode);
- } else {
- lockmode = xfs_ilock_data_map_shared(ip);
- }
+ lockmode = xfs_ilock_data_map_shared(ip);
ASSERT(offset <= mp->m_super->s_maxbytes);
if (offset + size > mp->m_super->s_maxbytes)
@@ -1165,37 +1156,19 @@ __xfs_get_blocks(
(imap.br_startblock == HOLESTARTBLOCK ||
imap.br_startblock == DELAYSTARTBLOCK) ||
(IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
- if (direct || xfs_get_extsz_hint(ip)) {
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
-
- error = xfs_iomap_write_direct(ip, offset, size,
- &imap, nimaps);
- if (error)
- return error;
- new = 1;
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
+ */
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
- } else {
- /*
- * Delalloc reservations do not require a transaction,
- * we can go on without dropping the lock here. If we
- * are allocating a new delalloc block, make sure that
- * we set the new flag so that we mark the buffer new so
- * that we know that it is newly allocated if the write
- * fails.
- */
- if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
- new = 1;
- error = xfs_iomap_write_delay(ip, offset, size, &imap);
- if (error)
- goto out_unlock;
+ error = xfs_iomap_write_direct(ip, offset, size,
+ &imap, nimaps);
+ if (error)
+ return error;
+ new = 1;
- xfs_iunlock(ip, lockmode);
- }
trace_xfs_get_blocks_alloc(ip, offset, size,
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
: XFS_IO_DELALLOC, &imap);
@@ -1216,9 +1189,7 @@ __xfs_get_blocks(
}
/* trim mapping down to size requested */
- if (direct || size > (1 << inode->i_blkbits))
- xfs_map_trim_size(inode, iblock, bh_result,
- &imap, offset, size);
+ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
/*
* For unwritten extents do not report a disk address in the buffered
@@ -1231,7 +1202,7 @@ __xfs_get_blocks(
if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
- if (create && direct) {
+ if (create) {
if (dax_fault)
ASSERT(!ISUNWRITTEN(&imap));
else
@@ -1260,14 +1231,7 @@ __xfs_get_blocks(
(new || ISUNWRITTEN(&imap))))
set_buffer_new(bh_result);
- if (imap.br_startblock == DELAYSTARTBLOCK) {
- BUG_ON(direct);
- if (create) {
- set_buffer_uptodate(bh_result);
- set_buffer_mapped(bh_result);
- set_buffer_delay(bh_result);
- }
- }
+ BUG_ON(imap.br_startblock == DELAYSTARTBLOCK);
return 0;
@@ -1283,7 +1247,8 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
+ BUG_ON(create);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}
int
@@ -1293,7 +1258,7 @@ xfs_get_blocks_direct(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}
int
@@ -1303,7 +1268,7 @@ xfs_get_blocks_dax_fault(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true);
}
/*
--
2.1.4
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 12+ messages in thread