From: Matthew Wilcox <matthew.r.wilcox@intel.com>
To: Ingo Molnar <mingo@redhat.com>, Andy Lutomirski <luto@amacapital.net>
Cc: Matthew Wilcox <willy@linux.intel.com>,
Kees Cook <keescook@chromium.org>,
Andrew Morton <akpm@linux-foundation.org>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: [PATCH 3/3] dax: Handle write faults more efficiently
Date: Mon, 25 Jan 2016 12:25:17 -0500 [thread overview]
Message-ID: <1453742717-10326-4-git-send-email-matthew.r.wilcox@intel.com> (raw)
In-Reply-To: <1453742717-10326-1-git-send-email-matthew.r.wilcox@intel.com>
From: Matthew Wilcox <willy@linux.intel.com>
When we handle a write-fault on a DAX mapping, we currently insert a
read-only mapping and then take the page fault again to convert it to
a writable mapping. This is necessary for the case where we cover a
hole with a read-only zero page, but when we have a data block already
allocated, it is inefficient.
Use the recently added vmf_insert_pfn_prot() to insert a writable mapping,
even though the default VM flags say to use a read-only mapping.
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
fs/dax.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 53 insertions(+), 20 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 206650f..3f6138d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -519,9 +519,44 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+/*
+ * The default page protections for DAX VMAs are set to "copy" so that
+ * we get notifications when zero pages are written to. This function
+ * is called when we're inserting a mapping to a data page. If this is
+ * a write fault, we've already done all the necessary accounting and
+ * it's pointless to insert this translation entry read-only. Convert
+ * the pgprot to be writable.
+ *
+ * While this is not the most elegant code, the compiler can see that (on
+ * any sane architecture) all four arms of the conditional are the same.
+ */
+static pgprot_t dax_pgprot(struct vm_area_struct *vma, bool write)
+{
+ pgprot_t pgprot = vma->vm_page_prot;
+ if (!write)
+ return pgprot;
+ if ((vma->vm_flags & (VM_READ|VM_EXEC)) == (VM_READ|VM_EXEC))
+ return __pgprot(pgprot_val(pgprot) ^
+ pgprot_val(__P111) ^
+ pgprot_val(__S111));
+ else if ((vma->vm_flags & (VM_READ|VM_EXEC)) == VM_READ)
+ return __pgprot(pgprot_val(pgprot) ^
+ pgprot_val(__P110) ^
+ pgprot_val(__S110));
+ else if ((vma->vm_flags & (VM_READ|VM_EXEC)) == VM_EXEC)
+ return __pgprot(pgprot_val(pgprot) ^
+ pgprot_val(__P011) ^
+ pgprot_val(__S011));
+ else
+ return __pgprot(pgprot_val(pgprot) ^
+ pgprot_val(__P010) ^
+ pgprot_val(__S010));
+}
+
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long vaddr = (unsigned long)vmf->virtual_address;
struct address_space *mapping = inode->i_mapping;
struct block_device *bdev = bh->b_bdev;
@@ -530,7 +565,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
.size = bh->b_size,
};
pgoff_t size;
- int error;
+ int result;
i_mmap_lock_read(mapping);
@@ -542,15 +577,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
* allocated past the end of the file.
*/
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
- error = -EIO;
- goto out;
- }
+ if (unlikely(vmf->pgoff >= size))
+ goto sigbus;
- if (dax_map_atomic(bdev, &dax) < 0) {
- error = PTR_ERR(dax.addr);
- goto out;
- }
+ if (dax_map_atomic(bdev, &dax) < 0)
+ goto sigbus;
if (buffer_unwritten(bh) || buffer_new(bh)) {
clear_pmem(dax.addr, PAGE_SIZE);
@@ -558,17 +589,19 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
}
dax_unmap_atomic(bdev, &dax);
- error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
- vmf->flags & FAULT_FLAG_WRITE);
- if (error)
- goto out;
+ if (dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, write))
+ goto sigbus;
- error = vm_insert_mixed(vma, vaddr, dax.pfn);
+ result = vmf_insert_pfn_prot(vma, vaddr, dax.pfn,
+ dax_pgprot(vma, write));
out:
i_mmap_unlock_read(mapping);
+ return result;
- return error;
+ sigbus:
+ result = VM_FAULT_SIGBUS;
+ goto out;
}
/**
@@ -599,7 +632,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
unsigned blkbits = inode->i_blkbits;
sector_t block;
pgoff_t size;
- int error;
+ int result, error;
int major = 0;
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -701,19 +734,19 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* indicate what the callback should do via the uptodate variable, same
* as for normal BH based IO completions.
*/
- error = dax_insert_mapping(inode, &bh, vma, vmf);
+ result = dax_insert_mapping(inode, &bh, vma, vmf);
if (buffer_unwritten(&bh)) {
if (complete_unwritten)
- complete_unwritten(&bh, !error);
+ complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
else
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
}
+ return result | major;
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
- /* -EBUSY is fine, somebody else faulted on the same PTE */
- if ((error < 0) && (error != -EBUSY))
+ if (error < 0)
return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major;
--
2.7.0.rc3
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2016-01-25 17:26 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-01-25 17:25 [PATCH 0/3] Fixes for vm_insert_pfn_prot() Matthew Wilcox
2016-01-25 17:25 ` [PATCH 1/3] x86: Honour passed pgprot in track_pfn_insert() and track_pfn_remap() Matthew Wilcox
2016-01-25 17:33 ` Andy Lutomirski
2016-01-25 17:46 ` Andy Lutomirski
2016-01-27 4:40 ` Matthew Wilcox
2016-01-27 5:44 ` Andy Lutomirski
2016-01-29 14:49 ` Matthew Wilcox
2016-01-29 22:19 ` Andy Lutomirski
2016-02-09 14:24 ` Ingo Molnar
2016-02-10 3:06 ` Andy Lutomirski
2016-01-25 17:25 ` [PATCH 2/3] mm: Convert vm_insert_pfn_prot to vmf_insert_pfn_prot Matthew Wilcox
2016-01-25 17:35 ` Andy Lutomirski
2016-01-27 4:18 ` Matthew Wilcox
2016-01-25 17:25 ` Matthew Wilcox [this message]
2016-01-25 17:38 ` [PATCH 3/3] dax: Handle write faults more efficiently Andy Lutomirski
2016-01-27 4:17 ` Matthew Wilcox
2016-01-27 5:22 ` Andy Lutomirski
2016-01-27 6:01 ` Andy Lutomirski
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1453742717-10326-4-git-send-email-matthew.r.wilcox@intel.com \
--to=matthew.r.wilcox@intel.com \
--cc=akpm@linux-foundation.org \
--cc=keescook@chromium.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=luto@amacapital.net \
--cc=mingo@redhat.com \
--cc=willy@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).