linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Jan Kara <jack@suse.cz>
To: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org, willy@linux.intel.com
Subject: Re: [PATCH v7 02/22] Allow page fault handlers to perform the COW
Date: Tue, 8 Apr 2014 18:34:57 +0200	[thread overview]
Message-ID: <20140408163457.GD2713@quack.suse.cz> (raw)
In-Reply-To: <feee29988e167b019f5726cd497b1470b050a3ce.1395591795.git.matthew.r.wilcox@intel.com>

On Sun 23-03-14 15:08:28, Matthew Wilcox wrote:
> Currently COW of an XIP file is done by first bringing in a read-only
> mapping, then retrying the fault and copying the page.  It is much more
> efficient to tell the fault handler that a COW is being attempted (by
> passing in the pre-allocated page in the vm_fault structure), and allow
> the handler to perform the COW operation itself.
>
> Where the filemap code protects against truncation of the file until
> the PTE has been installed with the page lock, the XIP code use the
> i_mmap_mutex instead.  We must therefore unlock the i_mmap_mutex after
> inserting the PTE.
  Eww, leaking of locking details about DAX into generic fault code is
really ugly. It seems to me that once you pass the cow_page into the fault
handler (which looks OK to me), you can just directly install it in PTE
via vm_insert_page() and you don't have to rely on do_cow_fault() for that.
Thus you can return VM_FAULT_NOPAGE and be done with it? Basically cow
faults will then work the same way as other faults for DAX... Or am I
missing something?

								Honza

> Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
> ---
>  include/linux/mm.h |  2 ++
>  mm/memory.c        | 45 +++++++++++++++++++++++++++++++++------------
>  2 files changed, 35 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c1b7414..513b78a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -205,6 +205,7 @@ struct vm_fault {
>  	pgoff_t pgoff;			/* Logical page offset based on vma */
>  	void __user *virtual_address;	/* Faulting virtual address */
>  
> +	struct page *cow_page;		/* Handler may choose to COW */
>  	struct page *page;		/* ->fault handlers should return a
>  					 * page here, unless VM_FAULT_NOPAGE
>  					 * is set (which is also implied by
> @@ -1010,6 +1011,7 @@ static inline int page_mapped(struct page *page)
>  #define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
>  #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
>  
> +#define VM_FAULT_COWED	0x0080	/* ->fault COWed the page instead */
>  #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
>  #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
>  #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
> diff --git a/mm/memory.c b/mm/memory.c
> index 07b4287..2a2ecac 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2602,6 +2602,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
>  	vmf.pgoff = page->index;
>  	vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
>  	vmf.page = page;
> +	vmf.cow_page = NULL;
>  
>  	ret = vma->vm_ops->page_mkwrite(vma, &vmf);
>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> @@ -3288,7 +3289,8 @@ oom:
>  }
>  
>  static int __do_fault(struct vm_area_struct *vma, unsigned long address,
> -		pgoff_t pgoff, unsigned int flags, struct page **page)
> +			pgoff_t pgoff, unsigned int flags,
> +			struct page *cow_page, struct page **page)
>  {
>  	struct vm_fault vmf;
>  	int ret;
> @@ -3297,10 +3299,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
>  	vmf.pgoff = pgoff;
>  	vmf.flags = flags;
>  	vmf.page = NULL;
> +	vmf.cow_page = cow_page;
>  
>  	ret = vma->vm_ops->fault(vma, &vmf);
>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
>  		return ret;
> +	if (unlikely(ret & VM_FAULT_COWED))
> +		goto out;
>  
>  	if (unlikely(PageHWPoison(vmf.page))) {
>  		if (ret & VM_FAULT_LOCKED)
> @@ -3314,6 +3319,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
>  	else
>  		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
>  
> + out:
>  	*page = vmf.page;
>  	return ret;
>  }
> @@ -3351,7 +3357,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	pte_t *pte;
>  	int ret;
>  
> -	ret = __do_fault(vma, address, pgoff, flags, &fault_page);
> +	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
>  		return ret;
>  
> @@ -3368,6 +3374,12 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	return ret;
>  }
>  
> +/*
> + * If the fault handler performs the COW, it does not return a page,
> + * so cannot use the page's lock to protect against a concurrent truncate
> + * operation.  Instead it returns with the i_mmap_mutex held, which must
> + * be released after the PTE has been inserted.
> + */
>  static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  		unsigned long address, pmd_t *pmd,
>  		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
> @@ -3389,25 +3401,34 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  		return VM_FAULT_OOM;
>  	}
>  
> -	ret = __do_fault(vma, address, pgoff, flags, &fault_page);
> +	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
>  		goto uncharge_out;
>  
> -	copy_user_highpage(new_page, fault_page, address, vma);
> +	if (!(ret & VM_FAULT_COWED))
> +		copy_user_highpage(new_page, fault_page, address, vma);
>  	__SetPageUptodate(new_page);
>  
>  	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
> -	if (unlikely(!pte_same(*pte, orig_pte))) {
> -		pte_unmap_unlock(pte, ptl);
> +	if (unlikely(!pte_same(*pte, orig_pte)))
> +		goto unlock_out;
> +	do_set_pte(vma, address, new_page, pte, true, true);
> +	pte_unmap_unlock(pte, ptl);
> +	if (ret & VM_FAULT_COWED) {
> +		mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
> +	} else {
>  		unlock_page(fault_page);
>  		page_cache_release(fault_page);
> -		goto uncharge_out;
>  	}
> -	do_set_pte(vma, address, new_page, pte, true, true);
> -	pte_unmap_unlock(pte, ptl);
> -	unlock_page(fault_page);
> -	page_cache_release(fault_page);
>  	return ret;
> +unlock_out:
> +	pte_unmap_unlock(pte, ptl);
> +	if (ret & VM_FAULT_COWED) {
> +		mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
> +	} else {
> +		unlock_page(fault_page);
> +		page_cache_release(fault_page);
> +	}
>  uncharge_out:
>  	mem_cgroup_uncharge_page(new_page);
>  	page_cache_release(new_page);
> @@ -3424,7 +3445,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	int dirtied = 0;
>  	int ret, tmp;
>  
> -	ret = __do_fault(vma, address, pgoff, flags, &fault_page);
> +	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
>  		return ret;
>  
> -- 
> 1.9.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2014-04-08 16:35 UTC|newest]

Thread overview: 90+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-03-23 19:08 [PATCH v7 00/22] Support ext4 on NV-DIMMs Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 01/22] Fix XIP fault vs truncate race Matthew Wilcox
2014-03-29 15:57   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 02/22] Allow page fault handlers to perform the COW Matthew Wilcox
2014-04-08 16:34   ` Jan Kara [this message]
2014-03-23 19:08 ` [PATCH v7 03/22] axonram: Fix bug in direct_access Matthew Wilcox
2014-03-29 16:22   ` Jan Kara
2014-04-02 19:24     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 04/22] Change direct_access calling convention Matthew Wilcox
2014-03-29 16:30   ` Jan Kara
2014-04-02 19:27     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 05/22] Introduce IS_DAX(inode) Matthew Wilcox
2014-04-08 15:32   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 06/22] Replace XIP read and write with DAX I/O Matthew Wilcox
2014-04-08 17:56   ` Jan Kara
2014-04-08 20:21     ` Matthew Wilcox
2014-04-09  9:14       ` Jan Kara
2014-04-09 15:19         ` Matthew Wilcox
2014-04-09 20:55           ` Jan Kara
2014-04-13 18:05             ` Matthew Wilcox
2014-04-09 12:04   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 07/22] Replace the XIP page fault handler with the DAX page fault handler Matthew Wilcox
2014-04-08 22:05   ` Jan Kara
2014-04-09 20:48     ` Matthew Wilcox
2014-04-09 21:12       ` Jan Kara
2014-04-13 11:21         ` Matthew Wilcox
2014-04-14 16:04           ` Jan Kara
2014-04-09 10:27   ` Jan Kara
2014-04-09 20:51     ` Matthew Wilcox
2014-04-09 21:43       ` Jan Kara
2014-04-13 18:03         ` Matthew Wilcox
2014-07-29 12:12         ` Matthew Wilcox
2014-07-29 21:04           ` Jan Kara
2014-07-29 21:23             ` Matthew Wilcox
2014-07-30  9:52               ` Jan Kara
2014-07-30 21:02                 ` Matthew Wilcox
2014-08-09 11:00                 ` Matthew Wilcox
2014-08-11  8:51                   ` Jan Kara
2014-08-11 14:13                     ` Matthew Wilcox
2014-08-11 14:35                       ` Jan Kara
2014-08-11 15:02                         ` Matthew Wilcox
2014-08-11 15:25                           ` Jan Kara
2014-05-21 20:35   ` Toshi Kani
2014-06-05 22:38     ` Toshi Kani
2014-03-23 19:08 ` [PATCH v7 08/22] Replace xip_truncate_page with dax_truncate_page Matthew Wilcox
2014-04-08 22:17   ` Jan Kara
2014-04-09  9:26     ` Jan Kara
2014-04-13 19:07       ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 09/22] Remove mm/filemap_xip.c Matthew Wilcox
2014-04-08 18:21   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 10/22] Remove get_xip_mem Matthew Wilcox
2014-04-08 18:20   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 11/22] Replace ext2_clear_xip_target with dax_clear_blocks Matthew Wilcox
2014-04-09  9:46   ` Jan Kara
2014-04-10 14:16     ` Matthew Wilcox
2014-04-10 18:31       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 12/22] ext2: Remove ext2_xip_verify_sb() Matthew Wilcox
2014-04-09  9:52   ` Jan Kara
2014-04-10 14:22     ` Matthew Wilcox
2014-04-10 18:35       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 13/22] ext2: Remove ext2_use_xip Matthew Wilcox
2014-04-09  9:55   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 14/22] ext2: Remove xip.c and xip.h Matthew Wilcox
2014-04-09  9:59   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 15/22] Remove CONFIG_EXT2_FS_XIP and rename CONFIG_FS_XIP to CONFIG_FS_DAX Matthew Wilcox
2014-04-09  9:59   ` Jan Kara
2014-04-10 14:23     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 16/22] ext2: Remove ext2_aops_xip Matthew Wilcox
2014-04-09 10:02   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 17/22] Get rid of most mentions of XIP in ext2 Matthew Wilcox
2014-04-09 10:04   ` Jan Kara
2014-04-10 14:26     ` Matthew Wilcox
2014-04-10 18:40       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 18/22] xip: Add xip_zero_page_range Matthew Wilcox
2014-04-09 10:15   ` Jan Kara
2014-04-10 14:27     ` Matthew Wilcox
2014-04-10 18:43       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 19/22] ext4: Make ext4_block_zero_page_range static Matthew Wilcox
2014-03-24 19:11   ` tytso
2014-03-23 19:08 ` [PATCH v7 20/22] ext4: Add DAX functionality Matthew Wilcox
2014-04-09 12:17   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 21/22] ext4: Fix typos Matthew Wilcox
2014-03-24 19:16   ` tytso
2014-03-23 19:08 ` [PATCH v7 22/22] brd: Rename XIP to DAX Matthew Wilcox
2014-04-09 10:07   ` Jan Kara
2014-05-18 14:58 ` [PATCH v7 00/22] Support ext4 on NV-DIMMs Boaz Harrosh
2014-05-18 23:24   ` Matthew Wilcox
2014-06-17 18:11 ` Boaz Harrosh
2014-06-17 18:19   ` Matthew Wilcox
2014-06-17 18:39     ` Boaz Harrosh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140408163457.GD2713@quack.suse.cz \
    --to=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=matthew.r.wilcox@intel.com \
    --cc=willy@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).