From: Matthew Wilcox <willy@linux.intel.com>
To: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@linux.intel.com>,
"Wilcox, Matthew R" <matthew.r.wilcox@intel.com>,
"ross.zwisler@linux.intel.com" <ross.zwisler@linux.intel.com>,
"akpm@linux-foundation.org" <akpm@linux-foundation.org>,
"Dilger, Andreas" <andreas.dilger@intel.com>,
"axboe@kernel.dk" <axboe@kernel.dk>,
"boaz@plexistor.com" <boaz@plexistor.com>,
"david@fromorbit.com" <david@fromorbit.com>,
"hch@lst.de" <hch@lst.de>,
"kirill.shutemov@linux.intel.com"
<kirill.shutemov@linux.intel.com>,
"mathieu.desnoyers@efficios.com" <mathieu.desnoyers@efficios.com>,
"rdunlap@infradead.org" <rdunlap@infradead.org>,
"tytso@mit.edu" <tytso@mit.edu>,
"mm-commits@vger.kernel.org" <mm-commits@vger.kernel.org>,
"linux-ext4@vger.kernel.org" <linux-ext4@vger.kernel.org>,
xfs@oss.sgi.com
Subject: Re: + ext4-add-dax-functionality.patch added to -mm tree
Date: Fri, 20 Feb 2015 17:15:51 -0500 [thread overview]
Message-ID: <20150220221551.GB2780@wil.cx> (raw)
In-Reply-To: <20150218104009.GB4614@quack.suse.cz>
> So to handle this it can start transaction in ext4_dax_fault() /
> ext4_dax_mkwrite() if write is requested and call ext4_jbd2_file_inode()
> after dax_fault() / dax_mkwrite() returns. Complete function will look
> something like follows:
How about this? I tried to encompass both the unwritten extent conversion
and starting the journal at the right point in the locking hierarchy.
If we're going to expose do_dax_fault(), I think it needs to be called
__dax_fault().
I decided to return VM_FAULT_RETRY and a new flag VM_FAULT_UNWRITTEN from
__dax_fault(), rather than converting it to return an errno.
P.S. I love patches which touch *both* fs.h *and* mm.h. In case there
were any files that weren't already being rebuilt.
diff --git a/fs/dax.c b/fs/dax.c
index 556238f..81dbdaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -316,7 +316,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
return error;
}
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
{
struct file *file = vma->vm_file;
@@ -329,7 +329,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
sector_t block;
pgoff_t size;
int error;
- int major = 0;
+ int ret = 0;
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (vmf->pgoff >= size)
@@ -367,13 +367,15 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
error = -EIO; /* fs corruption? */
if (error)
goto unlock_page;
+ if (buffer_unwritten(&bh))
+ ret |= VM_FAULT_UNWRITTEN;
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
if (vmf->flags & FAULT_FLAG_WRITE) {
error = get_block(inode, block, &bh, 1);
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- major = VM_FAULT_MAJOR;
+ ret = VM_FAULT_MAJOR;
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO;
if (error)
@@ -407,7 +409,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
/* Check we didn't race with a read fault installing a new page */
- if (!page && major)
+ if (!page && (ret & VM_FAULT_MAJOR))
page = find_lock_page(mapping, vmf->pgoff);
if (page) {
@@ -421,12 +423,14 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
error = dax_insert_mapping(inode, &bh, vma, vmf);
out:
+ if (error == -ENOSPC)
+ return VM_FAULT_RETRY | ret;
if (error == -ENOMEM)
- return VM_FAULT_OOM | major;
+ return VM_FAULT_OOM | ret;
/* -EBUSY is fine, somebody else faulted on the same PTE */
if ((error < 0) && (error != -EBUSY))
- return VM_FAULT_SIGBUS | major;
- return VM_FAULT_NOPAGE | major;
+ return VM_FAULT_SIGBUS | ret;
+ return VM_FAULT_NOPAGE | ret;
unlock_page:
if (page) {
@@ -435,6 +439,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
goto out;
}
+EXPORT_SYMBOL_GPL(__dax_fault);
/**
* dax_fault - handle a page fault on a DAX file
@@ -455,7 +460,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
- result = do_dax_fault(vma, vmf, get_block);
+ result = __dax_fault(vma, vmf, get_block);
if (vmf->flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4340e38..84b4f1c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -194,7 +194,58 @@ errout:
#ifdef CONFIG_FS_DAX
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext4_get_block_write);
+ handle_t *handle;
+ int create = (vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page;
+ struct inode *inode = file_inode(vma->vm_file);
+ int ret, err = 0;
+ int retries = 0;
+
+ if (create) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ retry_alloc:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto err;
+ }
+ }
+
+ ret = __dax_fault(vma, vmf, ext4_get_block);
+
+ if (create) {
+ if (ret & VM_FAULT_UNWRITTEN) {
+ loff_t offset = (loff_t)vmf->pgoff << PAGE_SHIFT;
+ err = ext4_convert_unwritten_extents(NULL, inode,
+ offset, PAGE_SIZE);
+ ret &= ~VM_FAULT_UNWRITTEN;
+ }
+ if (!err &&
+ ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+ err = ext4_jbd2_file_inode(handle, inode);
+
+ if (err == -ENOSPC) {
+ ret |= VM_FAULT_RETRY;
+ err = 0;
+ }
+
+ ext4_journal_stop(handle);
+ if (err < 0)
+ goto err;
+ if ((ret & VM_FAULT_RETRY) &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_alloc;
+ ret &= ~VM_FAULT_RETRY;
+ }
+
+ out:
+ if (create)
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+ err:
+ ret = block_page_mkwrite_return(err);
+ goto out;
}
static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 85404f1..8f1ea7d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,18 +657,6 @@ has_zeroout:
return retval;
}
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
- struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
- loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
- int err;
- if (!uptodate)
- return;
- WARN_ON(!buffer_unwritten(bh));
- err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
@@ -706,11 +694,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
- if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
- bh->b_assoc_map = inode->i_mapping;
- bh->b_private = (void *)(unsigned long)iblock;
- bh->b_end_io = ext4_end_io_unwritten;
- }
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 239c89c..2af5050 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2597,6 +2597,7 @@ int dax_clear_blocks(struct inode *, sector_t block, long size);
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
unsigned int flags, get_block_t);
#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ceb50ec..ffc9947 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1100,7 +1100,7 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
#define VM_FAULT_SIGSEGV 0x0040
next prev parent reply other threads:[~2015-02-20 22:15 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <54b45495.+RptMlNQorYE9TTf%akpm@linux-foundation.org>
2015-01-15 12:41 ` + ext4-add-dax-functionality.patch added to -mm tree Jan Kara
2015-01-16 21:16 ` Wilcox, Matthew R
2015-01-19 14:18 ` Jan Kara
2015-02-17 8:52 ` Jan Kara
2015-02-17 13:37 ` Matthew Wilcox
2015-02-18 10:40 ` Jan Kara
2015-02-18 21:55 ` Dave Chinner
2015-02-18 21:59 ` hch
2015-02-19 15:42 ` Jan Kara
2015-02-19 21:12 ` Dave Chinner
2015-02-19 23:08 ` Dave Chinner
2015-02-20 12:05 ` Jan Kara
2015-02-20 22:15 ` Matthew Wilcox [this message]
2015-02-23 12:52 ` Jan Kara
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20150220221551.GB2780@wil.cx \
--to=willy@linux.intel.com \
--cc=akpm@linux-foundation.org \
--cc=andreas.dilger@intel.com \
--cc=axboe@kernel.dk \
--cc=boaz@plexistor.com \
--cc=david@fromorbit.com \
--cc=hch@lst.de \
--cc=jack@suse.cz \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-ext4@vger.kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=matthew.r.wilcox@intel.com \
--cc=mm-commits@vger.kernel.org \
--cc=rdunlap@infradead.org \
--cc=ross.zwisler@linux.intel.com \
--cc=tytso@mit.edu \
--cc=xfs@oss.sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).