From: Jan Kara <jack@suse.cz>
To: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
linux-kernel@vger.kernel.org, willy@linux.intel.com
Subject: Re: [PATCH v7 02/22] Allow page fault handlers to perform the COW
Date: Tue, 8 Apr 2014 18:34:57 +0200 [thread overview]
Message-ID: <20140408163457.GD2713@quack.suse.cz> (raw)
In-Reply-To: <feee29988e167b019f5726cd497b1470b050a3ce.1395591795.git.matthew.r.wilcox@intel.com>
On Sun 23-03-14 15:08:28, Matthew Wilcox wrote:
> Currently COW of an XIP file is done by first bringing in a read-only
> mapping, then retrying the fault and copying the page. It is much more
> efficient to tell the fault handler that a COW is being attempted (by
> passing in the pre-allocated page in the vm_fault structure), and allow
> the handler to perform the COW operation itself.
>
> Where the filemap code protects against truncation of the file until
> the PTE has been installed with the page lock, the XIP code uses the
> i_mmap_mutex instead. We must therefore unlock the i_mmap_mutex after
> inserting the PTE.
Eww, leaking of locking details about DAX into generic fault code is
really ugly. It seems to me that once you pass the cow_page into the fault
handler (which looks OK to me), you can just directly install it in the PTE
via vm_insert_page() and you don't have to rely on do_cow_fault() for that.
Thus you can return VM_FAULT_NOPAGE and be done with it? Basically cow
faults will then work the same way as other faults for DAX... Or am I
missing something?
Honza
> Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
> ---
> include/linux/mm.h | 2 ++
> mm/memory.c | 45 +++++++++++++++++++++++++++++++++------------
> 2 files changed, 35 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c1b7414..513b78a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -205,6 +205,7 @@ struct vm_fault {
> pgoff_t pgoff; /* Logical page offset based on vma */
> void __user *virtual_address; /* Faulting virtual address */
>
> + struct page *cow_page; /* Handler may choose to COW */
> struct page *page; /* ->fault handlers should return a
> * page here, unless VM_FAULT_NOPAGE
> * is set (which is also implied by
> @@ -1010,6 +1011,7 @@ static inline int page_mapped(struct page *page)
> #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
> #define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
>
> +#define VM_FAULT_COWED 0x0080 /* ->fault COWed the page instead */
> #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
> #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
> #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
> diff --git a/mm/memory.c b/mm/memory.c
> index 07b4287..2a2ecac 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2602,6 +2602,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
> vmf.pgoff = page->index;
> vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
> vmf.page = page;
> + vmf.cow_page = NULL;
>
> ret = vma->vm_ops->page_mkwrite(vma, &vmf);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> @@ -3288,7 +3289,8 @@ oom:
> }
>
> static int __do_fault(struct vm_area_struct *vma, unsigned long address,
> - pgoff_t pgoff, unsigned int flags, struct page **page)
> + pgoff_t pgoff, unsigned int flags,
> + struct page *cow_page, struct page **page)
> {
> struct vm_fault vmf;
> int ret;
> @@ -3297,10 +3299,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
> vmf.pgoff = pgoff;
> vmf.flags = flags;
> vmf.page = NULL;
> + vmf.cow_page = cow_page;
>
> ret = vma->vm_ops->fault(vma, &vmf);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> return ret;
> + if (unlikely(ret & VM_FAULT_COWED))
> + goto out;
>
> if (unlikely(PageHWPoison(vmf.page))) {
> if (ret & VM_FAULT_LOCKED)
> @@ -3314,6 +3319,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
> else
> VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
>
> + out:
> *page = vmf.page;
> return ret;
> }
> @@ -3351,7 +3357,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> pte_t *pte;
> int ret;
>
> - ret = __do_fault(vma, address, pgoff, flags, &fault_page);
> + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> return ret;
>
> @@ -3368,6 +3374,12 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> return ret;
> }
>
> +/*
> + * If the fault handler performs the COW, it does not return a page,
> + * so cannot use the page's lock to protect against a concurrent truncate
> + * operation. Instead it returns with the i_mmap_mutex held, which must
> + * be released after the PTE has been inserted.
> + */
> static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long address, pmd_t *pmd,
> pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
> @@ -3389,25 +3401,34 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> return VM_FAULT_OOM;
> }
>
> - ret = __do_fault(vma, address, pgoff, flags, &fault_page);
> + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> goto uncharge_out;
>
> - copy_user_highpage(new_page, fault_page, address, vma);
> + if (!(ret & VM_FAULT_COWED))
> + copy_user_highpage(new_page, fault_page, address, vma);
> __SetPageUptodate(new_page);
>
> pte = pte_offset_map_lock(mm, pmd, address, &ptl);
> - if (unlikely(!pte_same(*pte, orig_pte))) {
> - pte_unmap_unlock(pte, ptl);
> + if (unlikely(!pte_same(*pte, orig_pte)))
> + goto unlock_out;
> + do_set_pte(vma, address, new_page, pte, true, true);
> + pte_unmap_unlock(pte, ptl);
> + if (ret & VM_FAULT_COWED) {
> + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
> + } else {
> unlock_page(fault_page);
> page_cache_release(fault_page);
> - goto uncharge_out;
> }
> - do_set_pte(vma, address, new_page, pte, true, true);
> - pte_unmap_unlock(pte, ptl);
> - unlock_page(fault_page);
> - page_cache_release(fault_page);
> return ret;
> +unlock_out:
> + pte_unmap_unlock(pte, ptl);
> + if (ret & VM_FAULT_COWED) {
> + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
> + } else {
> + unlock_page(fault_page);
> + page_cache_release(fault_page);
> + }
> uncharge_out:
> mem_cgroup_uncharge_page(new_page);
> page_cache_release(new_page);
> @@ -3424,7 +3445,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> int dirtied = 0;
> int ret, tmp;
>
> - ret = __do_fault(vma, address, pgoff, flags, &fault_page);
> + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> return ret;
>
> --
> 1.9.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
next prev parent reply other threads:[~2014-04-08 16:35 UTC|newest]
Thread overview: 90+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-03-23 19:08 [PATCH v7 00/22] Support ext4 on NV-DIMMs Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 01/22] Fix XIP fault vs truncate race Matthew Wilcox
2014-03-29 15:57 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 02/22] Allow page fault handlers to perform the COW Matthew Wilcox
2014-04-08 16:34 ` Jan Kara [this message]
2014-03-23 19:08 ` [PATCH v7 03/22] axonram: Fix bug in direct_access Matthew Wilcox
2014-03-29 16:22 ` Jan Kara
2014-04-02 19:24 ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 04/22] Change direct_access calling convention Matthew Wilcox
2014-03-29 16:30 ` Jan Kara
2014-04-02 19:27 ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 05/22] Introduce IS_DAX(inode) Matthew Wilcox
2014-04-08 15:32 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 06/22] Replace XIP read and write with DAX I/O Matthew Wilcox
2014-04-08 17:56 ` Jan Kara
2014-04-08 20:21 ` Matthew Wilcox
2014-04-09 9:14 ` Jan Kara
2014-04-09 15:19 ` Matthew Wilcox
2014-04-09 20:55 ` Jan Kara
2014-04-13 18:05 ` Matthew Wilcox
2014-04-09 12:04 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 07/22] Replace the XIP page fault handler with the DAX page fault handler Matthew Wilcox
2014-04-08 22:05 ` Jan Kara
2014-04-09 20:48 ` Matthew Wilcox
2014-04-09 21:12 ` Jan Kara
2014-04-13 11:21 ` Matthew Wilcox
2014-04-14 16:04 ` Jan Kara
2014-04-09 10:27 ` Jan Kara
2014-04-09 20:51 ` Matthew Wilcox
2014-04-09 21:43 ` Jan Kara
2014-04-13 18:03 ` Matthew Wilcox
2014-07-29 12:12 ` Matthew Wilcox
2014-07-29 21:04 ` Jan Kara
2014-07-29 21:23 ` Matthew Wilcox
2014-07-30 9:52 ` Jan Kara
2014-07-30 21:02 ` Matthew Wilcox
2014-08-09 11:00 ` Matthew Wilcox
2014-08-11 8:51 ` Jan Kara
2014-08-11 14:13 ` Matthew Wilcox
2014-08-11 14:35 ` Jan Kara
2014-08-11 15:02 ` Matthew Wilcox
2014-08-11 15:25 ` Jan Kara
2014-05-21 20:35 ` Toshi Kani
2014-06-05 22:38 ` Toshi Kani
2014-03-23 19:08 ` [PATCH v7 08/22] Replace xip_truncate_page with dax_truncate_page Matthew Wilcox
2014-04-08 22:17 ` Jan Kara
2014-04-09 9:26 ` Jan Kara
2014-04-13 19:07 ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 09/22] Remove mm/filemap_xip.c Matthew Wilcox
2014-04-08 18:21 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 10/22] Remove get_xip_mem Matthew Wilcox
2014-04-08 18:20 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 11/22] Replace ext2_clear_xip_target with dax_clear_blocks Matthew Wilcox
2014-04-09 9:46 ` Jan Kara
2014-04-10 14:16 ` Matthew Wilcox
2014-04-10 18:31 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 12/22] ext2: Remove ext2_xip_verify_sb() Matthew Wilcox
2014-04-09 9:52 ` Jan Kara
2014-04-10 14:22 ` Matthew Wilcox
2014-04-10 18:35 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 13/22] ext2: Remove ext2_use_xip Matthew Wilcox
2014-04-09 9:55 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 14/22] ext2: Remove xip.c and xip.h Matthew Wilcox
2014-04-09 9:59 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 15/22] Remove CONFIG_EXT2_FS_XIP and rename CONFIG_FS_XIP to CONFIG_FS_DAX Matthew Wilcox
2014-04-09 9:59 ` Jan Kara
2014-04-10 14:23 ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 16/22] ext2: Remove ext2_aops_xip Matthew Wilcox
2014-04-09 10:02 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 17/22] Get rid of most mentions of XIP in ext2 Matthew Wilcox
2014-04-09 10:04 ` Jan Kara
2014-04-10 14:26 ` Matthew Wilcox
2014-04-10 18:40 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 18/22] xip: Add xip_zero_page_range Matthew Wilcox
2014-04-09 10:15 ` Jan Kara
2014-04-10 14:27 ` Matthew Wilcox
2014-04-10 18:43 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 19/22] ext4: Make ext4_block_zero_page_range static Matthew Wilcox
2014-03-24 19:11 ` tytso
2014-03-23 19:08 ` [PATCH v7 20/22] ext4: Add DAX functionality Matthew Wilcox
2014-04-09 12:17 ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 21/22] ext4: Fix typos Matthew Wilcox
2014-03-24 19:16 ` tytso
2014-03-23 19:08 ` [PATCH v7 22/22] brd: Rename XIP to DAX Matthew Wilcox
2014-04-09 10:07 ` Jan Kara
2014-05-18 14:58 ` [PATCH v7 00/22] Support ext4 on NV-DIMMs Boaz Harrosh
2014-05-18 23:24 ` Matthew Wilcox
2014-06-17 18:11 ` Boaz Harrosh
2014-06-17 18:19 ` Matthew Wilcox
2014-06-17 18:39 ` Boaz Harrosh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20140408163457.GD2713@quack.suse.cz \
--to=jack@suse.cz \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=matthew.r.wilcox@intel.com \
--cc=willy@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).