All of lore.kernel.org
 help / color / mirror / Atom feed
From: Dmitry Monakhov <dmonlist@gmail.com>
To: Ross Zwisler <ross.zwisler@linux.intel.com>,
	linux-kernel@vger.kernel.org
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>,
	"H. Peter Anvin" <hpa@zytor.com>,
	"J. Bruce Fields" <bfields@fieldses.org>,
	Theodore Ts'o <tytso@mit.edu>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Andreas Dilger <adilger.kernel@dilger.ca>,
	Andrew Morton <akpm@linux-foundation.org>,
	Dan Williams <dan.j.williams@intel.com>,
	Dave Chinner <david@fromorbit.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Ingo Molnar <mingo@redhat.com>, Jan Kara <jack@suse.com>,
	Jeff Layton <jlayton@poochiereds.net>,
	Matthew Wilcox <matthew.r.wilcox@intel.com>,
	Matthew Wilcox <willy@linux.intel.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, linux-nvdimm@lists.01.org, x86@kernel.org,
	xfs@oss.sgi.com
Subject: Re: [PATCH v8 6/9] dax: add support for fsync/msync
Date: Sat, 06 Feb 2016 17:33:07 +0300	[thread overview]
Message-ID: <878u2xrjrw.fsf@openvz.org> (raw)
In-Reply-To: <1452230879-18117-7-git-send-email-ross.zwisler@linux.intel.com>

[-- Attachment #1: Type: text/plain, Size: 11561 bytes --]

Ross Zwisler <ross.zwisler@linux.intel.com> writes:

> To properly handle fsync/msync in an efficient way DAX needs to track dirty
> pages so it is able to flush them durably to media on demand.
Please see comments below
>
> The tracking of dirty pages is done via the radix tree in struct
> address_space.  This radix tree is already used by the page writeback
> infrastructure for tracking dirty pages associated with an open file, and
> it already has support for exceptional (non struct page*) entries.  We
> build upon these features to add exceptional entries to the radix tree for
> DAX dirty PMD or PTE pages at fault time.
>
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> ---
>  fs/dax.c            | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  include/linux/dax.h |   2 +
>  mm/filemap.c        |   6 ++
>  3 files changed, 196 insertions(+), 6 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 5b84a46..0db21ea 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -24,6 +24,7 @@
>  #include <linux/memcontrol.h>
>  #include <linux/mm.h>
>  #include <linux/mutex.h>
> +#include <linux/pagevec.h>
>  #include <linux/pmem.h>
>  #include <linux/sched.h>
>  #include <linux/uio.h>
> @@ -324,6 +325,174 @@ static int copy_user_bh(struct page *to, struct inode *inode,
>  	return 0;
>  }
>  
> +#define NO_SECTOR -1
> +
> +static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
IMHO it would be saner to name that function dax_radix_entry_insert()
> +		sector_t sector, bool pmd_entry, bool dirty)
> +{
> +	struct radix_tree_root *page_tree = &mapping->page_tree;
> +	int type, error = 0;
> +	void *entry;
> +
> +	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	entry = radix_tree_lookup(page_tree, index);
> +
> +	if (entry) {
> +		type = RADIX_DAX_TYPE(entry);
> +		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
> +					type != RADIX_DAX_PMD)) {
> +			error = -EIO;
> +			goto unlock;
> +		}
> +
> +		if (!pmd_entry || type == RADIX_DAX_PMD)
> +			goto dirty;
> +		radix_tree_delete(&mapping->page_tree, index);
> +		mapping->nrexceptional--;
> +	}
> +
> +	if (sector == NO_SECTOR) {
> +		/*
> +		 * This can happen during correct operation if our pfn_mkwrite
> +		 * fault raced against a hole punch operation.  If this
> +		 * happens the pte that was hole punched will have been
> +		 * unmapped and the radix tree entry will have been removed by
> +		 * the time we are called, but the call will still happen.  We
> +		 * will return all the way up to wp_pfn_shared(), where the
> +		 * pte_same() check will fail, eventually causing page fault
> +		 * to be retried by the CPU.
> +		 */
> +		goto unlock;
> +	}
> +
> +	error = radix_tree_insert(page_tree, index,
> +			RADIX_DAX_ENTRY(sector, pmd_entry));
> +	if (error)
> +		goto unlock;
> +
> +	mapping->nrexceptional++;
> + dirty:
> +	if (dirty)
> +		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
> + unlock:
> +	spin_unlock_irq(&mapping->tree_lock);
> +	return error;
> +}
> +
> +static int dax_writeback_one(struct block_device *bdev,
> +		struct address_space *mapping, pgoff_t index, void *entry)
> +{
> +	struct radix_tree_root *page_tree = &mapping->page_tree;
> +	int type = RADIX_DAX_TYPE(entry);
> +	struct radix_tree_node *node;
> +	struct blk_dax_ctl dax;
> +	void **slot;
> +	int ret = 0;
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	/*
> +	 * Regular page slots are stabilized by the page lock even
> +	 * without the tree itself locked.  These unlocked entries
> +	 * need verification under the tree lock.
> +	 */
> +	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
> +		goto unlock;
> +	if (*slot != entry)
> +		goto unlock;
> +
> +	/* another fsync thread may have already written back this entry */
> +	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
> +		goto unlock;
> +
> +	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
> +
> +	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
> +		ret = -EIO;
> +		goto unlock;
> +	}
> +
> +	dax.sector = RADIX_DAX_SECTOR(entry);
> +	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
> +	spin_unlock_irq(&mapping->tree_lock);
> +
> +	/*
> +	 * We cannot hold tree_lock while calling dax_map_atomic() because it
> +	 * eventually calls cond_resched().
> +	 */
> +	ret = dax_map_atomic(bdev, &dax);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (WARN_ON_ONCE(ret < dax.size)) {
> +		ret = -EIO;
> +		goto unmap;
> +	}
> +
> +	wb_cache_pmem(dax.addr, dax.size);
> + unmap:
> +	dax_unmap_atomic(bdev, &dax);
> +	return ret;
> +
> + unlock:
> +	spin_unlock_irq(&mapping->tree_lock);
> +	return ret;
> +}
> +
> +/*
> + * Flush the mapping to the persistent domain within the byte range of [start,
> + * end]. This is required by data integrity operations to ensure file data is
> + * on persistent storage prior to completion of the operation.
> + */
> +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
> +		loff_t end)
> +{
> +	struct inode *inode = mapping->host;
> +	struct block_device *bdev = inode->i_sb->s_bdev;
> +	pgoff_t indices[PAGEVEC_SIZE];
> +	pgoff_t start_page, end_page;
> +	struct pagevec pvec;
> +	void *entry;
> +	int i, ret = 0;
> +
> +	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
> +		return -EIO;
> +
> +	rcu_read_lock();
> +	entry = radix_tree_lookup(&mapping->page_tree, start & PMD_MASK);
> +	rcu_read_unlock();
> +
> +	/* see if the start of our range is covered by a PMD entry */
> +	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
> +		start &= PMD_MASK;
> +
> +	start_page = start >> PAGE_CACHE_SHIFT;
> +	end_page = end >> PAGE_CACHE_SHIFT;
> +
> +	tag_pages_for_writeback(mapping, start_page, end_page);
> +
> +	pagevec_init(&pvec, 0);
> +	while (1) {
> +		pvec.nr = find_get_entries_tag(mapping, start_page,
> +				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
> +				pvec.pages, indices);
> +
> +		if (pvec.nr == 0)
> +			break;
> +
> +		for (i = 0; i < pvec.nr; i++) {
> +			ret = dax_writeback_one(bdev, mapping, indices[i],
> +					pvec.pages[i]);
> +			if (ret < 0)
> +				return ret;
> +		}
I think it would be more efficient to use batched locking, as follows:
                spin_lock_irq(&mapping->tree_lock);
		for (i = 0; i < pvec.nr; i++) {
                    struct blk_dax_ctl dax[PAGEVEC_SIZE];                
                    radix_tree_tag_clear(page_tree, indices[i], PAGECACHE_TAG_TOWRITE);
                    /* It is also reasonable to merge adjacent dax
                     * regions in to one */
                    dax[i].sector = RADIX_DAX_SECTOR(entry);
                    dax[i].size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);                    

                }
                spin_unlock_irq(&mapping->tree_lock);
               	if (blk_queue_enter(q, true) != 0)
                    goto error;
                for (i = 0; i < pvec.nr; i++) {
                    rc = bdev_direct_access(bdev, dax[i]);
                    wb_cache_pmem(dax[i].addr, dax[i].size);
                }
                ret = blk_queue_exit(q, true)
> +	}
> +	wmb_pmem();
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
> +
>  static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  			struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> @@ -363,6 +532,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  	}
>  	dax_unmap_atomic(bdev, &dax);
>  
> +	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
> +			vmf->flags & FAULT_FLAG_WRITE);
> +	if (error)
> +		goto out;
> +
>  	error = vm_insert_mixed(vma, vaddr, dax.pfn);
>  
>   out:
> @@ -487,6 +661,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		delete_from_page_cache(page);
>  		unlock_page(page);
>  		page_cache_release(page);
> +		page = NULL;
>  	}
I've realized that I do not understand why the dax_fault code works at all.
During dax_fault we want to remove the page from the mapping and insert a dax entry.
Basically the code looks as follows:
0 page = find_get_page()
1 lock_page(page)
2 delete_from_page_cache(page);
3 unlock_page(page);
4 dax_insert_mapping(inode, &bh, vma, vmf);

BUT what on earth protects us from another process reinserting the page
after step (2) but before step (4)?
Imagine we do a write to a file hole, which results in dax_fault(write), but
another task also does a read fault and reinserts the deleted page via dax_hole_load.
As a result dax_radix_entry will fail with EIO.
The testcase looks very trivial, but I cannot reproduce this.
>  
>  	/*
> @@ -591,7 +766,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	pgoff_t size, pgoff;
>  	loff_t lstart, lend;
>  	sector_t block;
> -	int result = 0;
> +	int error, result = 0;
>  
>  	/* dax pmd mappings require pfn_t_devmap() */
>  	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
> @@ -733,6 +908,16 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  		}
>  		dax_unmap_atomic(bdev, &dax);
>  
> +		if (write) {
> +			error = dax_radix_entry(mapping, pgoff, dax.sector,
> +					true, true);
> +			if (error) {
> +				dax_pmd_dbg(&bh, address,
> +						"PMD radix insertion failed");
> +				goto fallback;
> +			}
> +		}
> +
>  		dev_dbg(part_to_dev(bdev->bd_part),
>  				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
>  				__func__, current->comm, address,
> @@ -791,15 +976,12 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
>   * dax_pfn_mkwrite - handle first write to DAX page
>   * @vma: The virtual memory area where the fault occurred
>   * @vmf: The description of the fault
> - *
>   */
>  int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> +	struct file *file = vma->vm_file;
>  
> -	sb_start_pagefault(sb);
> -	file_update_time(vma->vm_file);
> -	sb_end_pagefault(sb);
> +	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
>  	return VM_FAULT_NOPAGE;
>  }
>  EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index e9d57f68..8204c3d 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -41,4 +41,6 @@ static inline bool dax_mapping(struct address_space *mapping)
>  {
>  	return mapping->host && IS_DAX(mapping->host);
>  }
> +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
> +		loff_t end);
>  #endif
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1e215fc..2e7c8d9 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -482,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
>  {
>  	int err = 0;
>  
> +	if (dax_mapping(mapping) && mapping->nrexceptional) {
> +		err = dax_writeback_mapping_range(mapping, lstart, lend);
> +		if (err)
> +			return err;
> +	}
> +
>  	if (mapping->nrpages) {
>  		err = __filemap_fdatawrite_range(mapping, lstart, lend,
>  						 WB_SYNC_ALL);
> -- 
> 2.5.0
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 472 bytes --]

WARNING: multiple messages have this Message-ID (diff)
From: Dmitry Monakhov <dmonlist@gmail.com>
To: Ross Zwisler <ross.zwisler@linux.intel.com>,
	linux-kernel@vger.kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>,
	"J. Bruce Fields" <bfields@fieldses.org>,
	Theodore Ts'o <tytso@mit.edu>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Andreas Dilger <adilger.kernel@dilger.ca>,
	Andrew Morton <akpm@linux-foundation.org>,
	Dan Williams <dan.j.williams@intel.com>,
	Dave Chinner <david@fromorbit.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Ingo Molnar <mingo@redhat.com>, Jan Kara <jack@suse.com>,
	Jeff Layton <jlayton@poochiereds.net>,
	Matthew Wilcox <matthew.r.wilcox@intel.com>,
	Matthew Wilcox <willy@linux.intel.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, linux-nvdimm@lists.01.org, x86@kernel.org,
	xfs@oss.sgi.com
Subject: Re: [PATCH v8 6/9] dax: add support for fsync/msync
Date: Sat, 06 Feb 2016 17:33:07 +0300	[thread overview]
Message-ID: <878u2xrjrw.fsf@openvz.org> (raw)
In-Reply-To: <1452230879-18117-7-git-send-email-ross.zwisler@linux.intel.com>

[-- Attachment #1: Type: text/plain, Size: 11561 bytes --]

Ross Zwisler <ross.zwisler@linux.intel.com> writes:

> To properly handle fsync/msync in an efficient way DAX needs to track dirty
> pages so it is able to flush them durably to media on demand.
Please see comments below
>
> The tracking of dirty pages is done via the radix tree in struct
> address_space.  This radix tree is already used by the page writeback
> infrastructure for tracking dirty pages associated with an open file, and
> it already has support for exceptional (non struct page*) entries.  We
> build upon these features to add exceptional entries to the radix tree for
> DAX dirty PMD or PTE pages at fault time.
>
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> ---
>  fs/dax.c            | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  include/linux/dax.h |   2 +
>  mm/filemap.c        |   6 ++
>  3 files changed, 196 insertions(+), 6 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 5b84a46..0db21ea 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -24,6 +24,7 @@
>  #include <linux/memcontrol.h>
>  #include <linux/mm.h>
>  #include <linux/mutex.h>
> +#include <linux/pagevec.h>
>  #include <linux/pmem.h>
>  #include <linux/sched.h>
>  #include <linux/uio.h>
> @@ -324,6 +325,174 @@ static int copy_user_bh(struct page *to, struct inode *inode,
>  	return 0;
>  }
>  
> +#define NO_SECTOR -1
> +
> +static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
IMHO it would be saner to name that function dax_radix_entry_insert()
> +		sector_t sector, bool pmd_entry, bool dirty)
> +{
> +	struct radix_tree_root *page_tree = &mapping->page_tree;
> +	int type, error = 0;
> +	void *entry;
> +
> +	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	entry = radix_tree_lookup(page_tree, index);
> +
> +	if (entry) {
> +		type = RADIX_DAX_TYPE(entry);
> +		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
> +					type != RADIX_DAX_PMD)) {
> +			error = -EIO;
> +			goto unlock;
> +		}
> +
> +		if (!pmd_entry || type == RADIX_DAX_PMD)
> +			goto dirty;
> +		radix_tree_delete(&mapping->page_tree, index);
> +		mapping->nrexceptional--;
> +	}
> +
> +	if (sector == NO_SECTOR) {
> +		/*
> +		 * This can happen during correct operation if our pfn_mkwrite
> +		 * fault raced against a hole punch operation.  If this
> +		 * happens the pte that was hole punched will have been
> +		 * unmapped and the radix tree entry will have been removed by
> +		 * the time we are called, but the call will still happen.  We
> +		 * will return all the way up to wp_pfn_shared(), where the
> +		 * pte_same() check will fail, eventually causing page fault
> +		 * to be retried by the CPU.
> +		 */
> +		goto unlock;
> +	}
> +
> +	error = radix_tree_insert(page_tree, index,
> +			RADIX_DAX_ENTRY(sector, pmd_entry));
> +	if (error)
> +		goto unlock;
> +
> +	mapping->nrexceptional++;
> + dirty:
> +	if (dirty)
> +		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
> + unlock:
> +	spin_unlock_irq(&mapping->tree_lock);
> +	return error;
> +}
> +
> +static int dax_writeback_one(struct block_device *bdev,
> +		struct address_space *mapping, pgoff_t index, void *entry)
> +{
> +	struct radix_tree_root *page_tree = &mapping->page_tree;
> +	int type = RADIX_DAX_TYPE(entry);
> +	struct radix_tree_node *node;
> +	struct blk_dax_ctl dax;
> +	void **slot;
> +	int ret = 0;
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	/*
> +	 * Regular page slots are stabilized by the page lock even
> +	 * without the tree itself locked.  These unlocked entries
> +	 * need verification under the tree lock.
> +	 */
> +	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
> +		goto unlock;
> +	if (*slot != entry)
> +		goto unlock;
> +
> +	/* another fsync thread may have already written back this entry */
> +	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
> +		goto unlock;
> +
> +	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
> +
> +	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
> +		ret = -EIO;
> +		goto unlock;
> +	}
> +
> +	dax.sector = RADIX_DAX_SECTOR(entry);
> +	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
> +	spin_unlock_irq(&mapping->tree_lock);
> +
> +	/*
> +	 * We cannot hold tree_lock while calling dax_map_atomic() because it
> +	 * eventually calls cond_resched().
> +	 */
> +	ret = dax_map_atomic(bdev, &dax);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (WARN_ON_ONCE(ret < dax.size)) {
> +		ret = -EIO;
> +		goto unmap;
> +	}
> +
> +	wb_cache_pmem(dax.addr, dax.size);
> + unmap:
> +	dax_unmap_atomic(bdev, &dax);
> +	return ret;
> +
> + unlock:
> +	spin_unlock_irq(&mapping->tree_lock);
> +	return ret;
> +}
> +
> +/*
> + * Flush the mapping to the persistent domain within the byte range of [start,
> + * end]. This is required by data integrity operations to ensure file data is
> + * on persistent storage prior to completion of the operation.
> + */
> +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
> +		loff_t end)
> +{
> +	struct inode *inode = mapping->host;
> +	struct block_device *bdev = inode->i_sb->s_bdev;
> +	pgoff_t indices[PAGEVEC_SIZE];
> +	pgoff_t start_page, end_page;
> +	struct pagevec pvec;
> +	void *entry;
> +	int i, ret = 0;
> +
> +	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
> +		return -EIO;
> +
> +	rcu_read_lock();
> +	entry = radix_tree_lookup(&mapping->page_tree, start & PMD_MASK);
> +	rcu_read_unlock();
> +
> +	/* see if the start of our range is covered by a PMD entry */
> +	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
> +		start &= PMD_MASK;
> +
> +	start_page = start >> PAGE_CACHE_SHIFT;
> +	end_page = end >> PAGE_CACHE_SHIFT;
> +
> +	tag_pages_for_writeback(mapping, start_page, end_page);
> +
> +	pagevec_init(&pvec, 0);
> +	while (1) {
> +		pvec.nr = find_get_entries_tag(mapping, start_page,
> +				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
> +				pvec.pages, indices);
> +
> +		if (pvec.nr == 0)
> +			break;
> +
> +		for (i = 0; i < pvec.nr; i++) {
> +			ret = dax_writeback_one(bdev, mapping, indices[i],
> +					pvec.pages[i]);
> +			if (ret < 0)
> +				return ret;
> +		}
I think it would be more efficient to use batched locking, as follows:
                spin_lock_irq(&mapping->tree_lock);
		for (i = 0; i < pvec.nr; i++) {
                    struct blk_dax_ctl dax[PAGEVEC_SIZE];                
                    radix_tree_tag_clear(page_tree, indices[i], PAGECACHE_TAG_TOWRITE);
                    /* It is also reasonable to merge adjacent dax
                     * regions in to one */
                    dax[i].sector = RADIX_DAX_SECTOR(entry);
                    dax[i].size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);                    

                }
                spin_unlock_irq(&mapping->tree_lock);
               	if (blk_queue_enter(q, true) != 0)
                    goto error;
                for (i = 0; i < pvec.nr; i++) {
                    rc = bdev_direct_access(bdev, dax[i]);
                    wb_cache_pmem(dax[i].addr, dax[i].size);
                }
                ret = blk_queue_exit(q, true)
> +	}
> +	wmb_pmem();
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
> +
>  static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  			struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> @@ -363,6 +532,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  	}
>  	dax_unmap_atomic(bdev, &dax);
>  
> +	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
> +			vmf->flags & FAULT_FLAG_WRITE);
> +	if (error)
> +		goto out;
> +
>  	error = vm_insert_mixed(vma, vaddr, dax.pfn);
>  
>   out:
> @@ -487,6 +661,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		delete_from_page_cache(page);
>  		unlock_page(page);
>  		page_cache_release(page);
> +		page = NULL;
>  	}
I've realized that I do not understand why the dax_fault code works at all.
During dax_fault we want to remove the page from the mapping and insert a dax entry.
Basically the code looks as follows:
0 page = find_get_page()
1 lock_page(page)
2 delete_from_page_cache(page);
3 unlock_page(page);
4 dax_insert_mapping(inode, &bh, vma, vmf);

BUT what on earth protects us from another process reinserting the page
after step (2) but before step (4)?
Imagine we do a write to a file hole, which results in dax_fault(write), but
another task also does a read fault and reinserts the deleted page via dax_hole_load.
As a result dax_radix_entry will fail with EIO.
The testcase looks very trivial, but I cannot reproduce this.
>  
>  	/*
> @@ -591,7 +766,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	pgoff_t size, pgoff;
>  	loff_t lstart, lend;
>  	sector_t block;
> -	int result = 0;
> +	int error, result = 0;
>  
>  	/* dax pmd mappings require pfn_t_devmap() */
>  	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
> @@ -733,6 +908,16 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  		}
>  		dax_unmap_atomic(bdev, &dax);
>  
> +		if (write) {
> +			error = dax_radix_entry(mapping, pgoff, dax.sector,
> +					true, true);
> +			if (error) {
> +				dax_pmd_dbg(&bh, address,
> +						"PMD radix insertion failed");
> +				goto fallback;
> +			}
> +		}
> +
>  		dev_dbg(part_to_dev(bdev->bd_part),
>  				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
>  				__func__, current->comm, address,
> @@ -791,15 +976,12 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
>   * dax_pfn_mkwrite - handle first write to DAX page
>   * @vma: The virtual memory area where the fault occurred
>   * @vmf: The description of the fault
> - *
>   */
>  int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> +	struct file *file = vma->vm_file;
>  
> -	sb_start_pagefault(sb);
> -	file_update_time(vma->vm_file);
> -	sb_end_pagefault(sb);
> +	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
>  	return VM_FAULT_NOPAGE;
>  }
>  EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index e9d57f68..8204c3d 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -41,4 +41,6 @@ static inline bool dax_mapping(struct address_space *mapping)
>  {
>  	return mapping->host && IS_DAX(mapping->host);
>  }
> +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
> +		loff_t end);
>  #endif
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1e215fc..2e7c8d9 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -482,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
>  {
>  	int err = 0;
>  
> +	if (dax_mapping(mapping) && mapping->nrexceptional) {
> +		err = dax_writeback_mapping_range(mapping, lstart, lend);
> +		if (err)
> +			return err;
> +	}
> +
>  	if (mapping->nrpages) {
>  		err = __filemap_fdatawrite_range(mapping, lstart, lend,
>  						 WB_SYNC_ALL);
> -- 
> 2.5.0
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 472 bytes --]

WARNING: multiple messages have this Message-ID (diff)
From: Dmitry Monakhov <dmonlist@gmail.com>
To: Ross Zwisler <ross.zwisler@linux.intel.com>,
	linux-kernel@vger.kernel.org
Cc: Dave Hansen <dave.hansen@linux.intel.com>,
	"J. Bruce Fields" <bfields@fieldses.org>,
	linux-mm@kvack.org, Andreas Dilger <adilger.kernel@dilger.ca>,
	"H. Peter Anvin" <hpa@zytor.com>,
	Jeff Layton <jlayton@poochiereds.net>,
	Dan Williams <dan.j.williams@intel.com>,
	linux-nvdimm@lists.01.org, x86@kernel.org,
	Ingo Molnar <mingo@redhat.com>,
	Matthew Wilcox <willy@linux.intel.com>,
	linux-ext4@vger.kernel.org, xfs@oss.sgi.com,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Thomas Gleixner <tglx@linutronix.de>,
	Theodore Ts'o <tytso@mit.edu>, Jan Kara <jack@suse.com>,
	linux-fsdevel@vger.kernel.org,
	Andrew Morton <akpm@linux-foundation.org>,
	Matthew Wilcox <matthew.r.wilcox@intel.com>
Subject: Re: [PATCH v8 6/9] dax: add support for fsync/msync
Date: Sat, 06 Feb 2016 17:33:07 +0300	[thread overview]
Message-ID: <878u2xrjrw.fsf@openvz.org> (raw)
In-Reply-To: <1452230879-18117-7-git-send-email-ross.zwisler@linux.intel.com>


[-- Attachment #1.1: Type: text/plain, Size: 11561 bytes --]

Ross Zwisler <ross.zwisler@linux.intel.com> writes:

> To properly handle fsync/msync in an efficient way DAX needs to track dirty
> pages so it is able to flush them durably to media on demand.
Please see comments below
>
> The tracking of dirty pages is done via the radix tree in struct
> address_space.  This radix tree is already used by the page writeback
> infrastructure for tracking dirty pages associated with an open file, and
> it already has support for exceptional (non struct page*) entries.  We
> build upon these features to add exceptional entries to the radix tree for
> DAX dirty PMD or PTE pages at fault time.
>
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> ---
>  fs/dax.c            | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  include/linux/dax.h |   2 +
>  mm/filemap.c        |   6 ++
>  3 files changed, 196 insertions(+), 6 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 5b84a46..0db21ea 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -24,6 +24,7 @@
>  #include <linux/memcontrol.h>
>  #include <linux/mm.h>
>  #include <linux/mutex.h>
> +#include <linux/pagevec.h>
>  #include <linux/pmem.h>
>  #include <linux/sched.h>
>  #include <linux/uio.h>
> @@ -324,6 +325,174 @@ static int copy_user_bh(struct page *to, struct inode *inode,
>  	return 0;
>  }
>  
> +#define NO_SECTOR -1
> +
> +static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
IMHO it would be saner to name that function dax_radix_entry_insert()
> +		sector_t sector, bool pmd_entry, bool dirty)
> +{
> +	struct radix_tree_root *page_tree = &mapping->page_tree;
> +	int type, error = 0;
> +	void *entry;
> +
> +	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	entry = radix_tree_lookup(page_tree, index);
> +
> +	if (entry) {
> +		type = RADIX_DAX_TYPE(entry);
> +		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
> +					type != RADIX_DAX_PMD)) {
> +			error = -EIO;
> +			goto unlock;
> +		}
> +
> +		if (!pmd_entry || type == RADIX_DAX_PMD)
> +			goto dirty;
> +		radix_tree_delete(&mapping->page_tree, index);
> +		mapping->nrexceptional--;
> +	}
> +
> +	if (sector == NO_SECTOR) {
> +		/*
> +		 * This can happen during correct operation if our pfn_mkwrite
> +		 * fault raced against a hole punch operation.  If this
> +		 * happens the pte that was hole punched will have been
> +		 * unmapped and the radix tree entry will have been removed by
> +		 * the time we are called, but the call will still happen.  We
> +		 * will return all the way up to wp_pfn_shared(), where the
> +		 * pte_same() check will fail, eventually causing page fault
> +		 * to be retried by the CPU.
> +		 */
> +		goto unlock;
> +	}
> +
> +	error = radix_tree_insert(page_tree, index,
> +			RADIX_DAX_ENTRY(sector, pmd_entry));
> +	if (error)
> +		goto unlock;
> +
> +	mapping->nrexceptional++;
> + dirty:
> +	if (dirty)
> +		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
> + unlock:
> +	spin_unlock_irq(&mapping->tree_lock);
> +	return error;
> +}
> +
> +static int dax_writeback_one(struct block_device *bdev,
> +		struct address_space *mapping, pgoff_t index, void *entry)
> +{
> +	struct radix_tree_root *page_tree = &mapping->page_tree;
> +	int type = RADIX_DAX_TYPE(entry);
> +	struct radix_tree_node *node;
> +	struct blk_dax_ctl dax;
> +	void **slot;
> +	int ret = 0;
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	/*
> +	 * Regular page slots are stabilized by the page lock even
> +	 * without the tree itself locked.  These unlocked entries
> +	 * need verification under the tree lock.
> +	 */
> +	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
> +		goto unlock;
> +	if (*slot != entry)
> +		goto unlock;
> +
> +	/* another fsync thread may have already written back this entry */
> +	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
> +		goto unlock;
> +
> +	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
> +
> +	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
> +		ret = -EIO;
> +		goto unlock;
> +	}
> +
> +	dax.sector = RADIX_DAX_SECTOR(entry);
> +	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
> +	spin_unlock_irq(&mapping->tree_lock);
> +
> +	/*
> +	 * We cannot hold tree_lock while calling dax_map_atomic() because it
> +	 * eventually calls cond_resched().
> +	 */
> +	ret = dax_map_atomic(bdev, &dax);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (WARN_ON_ONCE(ret < dax.size)) {
> +		ret = -EIO;
> +		goto unmap;
> +	}
> +
> +	wb_cache_pmem(dax.addr, dax.size);
> + unmap:
> +	dax_unmap_atomic(bdev, &dax);
> +	return ret;
> +
> + unlock:
> +	spin_unlock_irq(&mapping->tree_lock);
> +	return ret;
> +}
> +
> +/*
> + * Flush the mapping to the persistent domain within the byte range of [start,
> + * end]. This is required by data integrity operations to ensure file data is
> + * on persistent storage prior to completion of the operation.
> + */
> +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
> +		loff_t end)
> +{
> +	struct inode *inode = mapping->host;
> +	struct block_device *bdev = inode->i_sb->s_bdev;
> +	pgoff_t indices[PAGEVEC_SIZE];
> +	pgoff_t start_page, end_page;
> +	struct pagevec pvec;
> +	void *entry;
> +	int i, ret = 0;
> +
> +	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
> +		return -EIO;
> +
> +	rcu_read_lock();
> +	entry = radix_tree_lookup(&mapping->page_tree, start & PMD_MASK);
> +	rcu_read_unlock();
> +
> +	/* see if the start of our range is covered by a PMD entry */
> +	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
> +		start &= PMD_MASK;
> +
> +	start_page = start >> PAGE_CACHE_SHIFT;
> +	end_page = end >> PAGE_CACHE_SHIFT;
> +
> +	tag_pages_for_writeback(mapping, start_page, end_page);
> +
> +	pagevec_init(&pvec, 0);
> +	while (1) {
> +		pvec.nr = find_get_entries_tag(mapping, start_page,
> +				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
> +				pvec.pages, indices);
> +
> +		if (pvec.nr == 0)
> +			break;
> +
> +		for (i = 0; i < pvec.nr; i++) {
> +			ret = dax_writeback_one(bdev, mapping, indices[i],
> +					pvec.pages[i]);
> +			if (ret < 0)
> +				return ret;
> +		}
I think it would be more efficient to use batched locking, as follows:
                spin_lock_irq(&mapping->tree_lock);
		for (i = 0; i < pvec.nr; i++) {
                    struct blk_dax_ctl dax[PAGEVEC_SIZE];                
                    radix_tree_tag_clear(page_tree, indices[i], PAGECACHE_TAG_TOWRITE);
                    /* It is also reasonable to merge adjacent dax
                     * regions into one */
                    dax[i].sector = RADIX_DAX_SECTOR(entry);
                    dax[i].size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);                    

                }
                spin_unlock_irq(&mapping->tree_lock);
               	if (blk_queue_enter(q, true) != 0)
                    goto error;
                for (i = 0; i < pvec.nr; i++) {
                    rc = bdev_direct_access(bdev, dax[i]);
                    wb_cache_pmem(dax[i].addr, dax[i].size);
                }
                ret = blk_queue_exit(q, true)
> +	}
> +	wmb_pmem();
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
> +
>  static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  			struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> @@ -363,6 +532,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  	}
>  	dax_unmap_atomic(bdev, &dax);
>  
> +	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
> +			vmf->flags & FAULT_FLAG_WRITE);
> +	if (error)
> +		goto out;
> +
>  	error = vm_insert_mixed(vma, vaddr, dax.pfn);
>  
>   out:
> @@ -487,6 +661,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		delete_from_page_cache(page);
>  		unlock_page(page);
>  		page_cache_release(page);
> +		page = NULL;
>  	}
I've realized that I do not understand why the dax_fault code works at all.
During dax_fault we want to remove the page from the mapping and insert a
DAX entry. Basically, the code looks as follows:
0 page = find_get_page()
1 lock_page(page)
2 delete_from_page_cache(page);
3 unlock_page(page);
4 dax_insert_mapping(inode, &bh, vma, vmf);

BUT what on earth protects us from another process reinserting the page
after step (2) but before step (4)?
Imagine we write to a file hole, which results in dax_fault(write), but
another task also takes a read fault and reinserts the deleted page via
dax_load_hole(). As a result, dax_radix_entry() will fail with -EIO.
The test case looks very trivial, but I cannot reproduce this.
>  
>  	/*
> @@ -591,7 +766,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	pgoff_t size, pgoff;
>  	loff_t lstart, lend;
>  	sector_t block;
> -	int result = 0;
> +	int error, result = 0;
>  
>  	/* dax pmd mappings require pfn_t_devmap() */
>  	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
> @@ -733,6 +908,16 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  		}
>  		dax_unmap_atomic(bdev, &dax);
>  
> +		if (write) {
> +			error = dax_radix_entry(mapping, pgoff, dax.sector,
> +					true, true);
> +			if (error) {
> +				dax_pmd_dbg(&bh, address,
> +						"PMD radix insertion failed");
> +				goto fallback;
> +			}
> +		}
> +
>  		dev_dbg(part_to_dev(bdev->bd_part),
>  				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
>  				__func__, current->comm, address,
> @@ -791,15 +976,12 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
>   * dax_pfn_mkwrite - handle first write to DAX page
>   * @vma: The virtual memory area where the fault occurred
>   * @vmf: The description of the fault
> - *
>   */
>  int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> +	struct file *file = vma->vm_file;
>  
> -	sb_start_pagefault(sb);
> -	file_update_time(vma->vm_file);
> -	sb_end_pagefault(sb);
> +	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
>  	return VM_FAULT_NOPAGE;
>  }
>  EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index e9d57f68..8204c3d 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -41,4 +41,6 @@ static inline bool dax_mapping(struct address_space *mapping)
>  {
>  	return mapping->host && IS_DAX(mapping->host);
>  }
> +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
> +		loff_t end);
>  #endif
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1e215fc..2e7c8d9 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -482,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
>  {
>  	int err = 0;
>  
> +	if (dax_mapping(mapping) && mapping->nrexceptional) {
> +		err = dax_writeback_mapping_range(mapping, lstart, lend);
> +		if (err)
> +			return err;
> +	}
> +
>  	if (mapping->nrpages) {
>  		err = __filemap_fdatawrite_range(mapping, lstart, lend,
>  						 WB_SYNC_ALL);
> -- 
> 2.5.0
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 472 bytes --]

[-- Attachment #2: Type: text/plain, Size: 121 bytes --]

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

WARNING: multiple messages have this Message-ID (diff)
From: Dmitry Monakhov <dmonlist@gmail.com>
To: Ross Zwisler <ross.zwisler@linux.intel.com>,
	linux-kernel@vger.kernel.org
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>,
	"H. Peter Anvin" <hpa@zytor.com>,
	"J. Bruce Fields" <bfields@fieldses.org>,
	"Theodore Ts'o" <tytso@mit.edu>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Andreas Dilger <adilger.kernel@dilger.ca>,
	Andrew Morton <akpm@linux-foundation.org>,
	Dan Williams <dan.j.williams@intel.com>,
	Dave Chinner <david@fromorbit.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Ingo Molnar <mingo@redhat.com>, Jan Kara <jack@suse.com>,
	Jeff Layton <jlayton@poochiereds.net>,
	Matthew Wilcox <matthew.r.wilcox@intel.com>,
	Matthew Wilcox <willy@linux.intel.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, linux-nvdimm@ml01.01.org, x86@kernel.org,
	xfs@oss.sgi.com
Subject: Re: [PATCH v8 6/9] dax: add support for fsync/msync
Date: Sat, 06 Feb 2016 17:33:07 +0300	[thread overview]
Message-ID: <878u2xrjrw.fsf@openvz.org> (raw)
In-Reply-To: <1452230879-18117-7-git-send-email-ross.zwisler@linux.intel.com>

[-- Attachment #1: Type: text/plain, Size: 11561 bytes --]

Ross Zwisler <ross.zwisler@linux.intel.com> writes:

> To properly handle fsync/msync in an efficient way DAX needs to track dirty
> pages so it is able to flush them durably to media on demand.
Please see comments below.
>
> The tracking of dirty pages is done via the radix tree in struct
> address_space.  This radix tree is already used by the page writeback
> infrastructure for tracking dirty pages associated with an open file, and
> it already has support for exceptional (non struct page*) entries.  We
> build upon these features to add exceptional entries to the radix tree for
> DAX dirty PMD or PTE pages at fault time.
>
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> ---
>  fs/dax.c            | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  include/linux/dax.h |   2 +
>  mm/filemap.c        |   6 ++
>  3 files changed, 196 insertions(+), 6 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 5b84a46..0db21ea 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -24,6 +24,7 @@
>  #include <linux/memcontrol.h>
>  #include <linux/mm.h>
>  #include <linux/mutex.h>
> +#include <linux/pagevec.h>
>  #include <linux/pmem.h>
>  #include <linux/sched.h>
>  #include <linux/uio.h>
> @@ -324,6 +325,174 @@ static int copy_user_bh(struct page *to, struct inode *inode,
>  	return 0;
>  }
>  
> +#define NO_SECTOR -1
> +
> +static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
IMHO it would be saner to name this function dax_radix_entry_insert().
> +		sector_t sector, bool pmd_entry, bool dirty)
> +{
> +	struct radix_tree_root *page_tree = &mapping->page_tree;
> +	int type, error = 0;
> +	void *entry;
> +
> +	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	entry = radix_tree_lookup(page_tree, index);
> +
> +	if (entry) {
> +		type = RADIX_DAX_TYPE(entry);
> +		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
> +					type != RADIX_DAX_PMD)) {
> +			error = -EIO;
> +			goto unlock;
> +		}
> +
> +		if (!pmd_entry || type == RADIX_DAX_PMD)
> +			goto dirty;
> +		radix_tree_delete(&mapping->page_tree, index);
> +		mapping->nrexceptional--;
> +	}
> +
> +	if (sector == NO_SECTOR) {
> +		/*
> +		 * This can happen during correct operation if our pfn_mkwrite
> +		 * fault raced against a hole punch operation.  If this
> +		 * happens the pte that was hole punched will have been
> +		 * unmapped and the radix tree entry will have been removed by
> +		 * the time we are called, but the call will still happen.  We
> +		 * will return all the way up to wp_pfn_shared(), where the
> +		 * pte_same() check will fail, eventually causing page fault
> +		 * to be retried by the CPU.
> +		 */
> +		goto unlock;
> +	}
> +
> +	error = radix_tree_insert(page_tree, index,
> +			RADIX_DAX_ENTRY(sector, pmd_entry));
> +	if (error)
> +		goto unlock;
> +
> +	mapping->nrexceptional++;
> + dirty:
> +	if (dirty)
> +		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
> + unlock:
> +	spin_unlock_irq(&mapping->tree_lock);
> +	return error;
> +}
> +
> +static int dax_writeback_one(struct block_device *bdev,
> +		struct address_space *mapping, pgoff_t index, void *entry)
> +{
> +	struct radix_tree_root *page_tree = &mapping->page_tree;
> +	int type = RADIX_DAX_TYPE(entry);
> +	struct radix_tree_node *node;
> +	struct blk_dax_ctl dax;
> +	void **slot;
> +	int ret = 0;
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	/*
> +	 * Regular page slots are stabilized by the page lock even
> +	 * without the tree itself locked.  These unlocked entries
> +	 * need verification under the tree lock.
> +	 */
> +	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
> +		goto unlock;
> +	if (*slot != entry)
> +		goto unlock;
> +
> +	/* another fsync thread may have already written back this entry */
> +	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
> +		goto unlock;
> +
> +	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
> +
> +	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
> +		ret = -EIO;
> +		goto unlock;
> +	}
> +
> +	dax.sector = RADIX_DAX_SECTOR(entry);
> +	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
> +	spin_unlock_irq(&mapping->tree_lock);
> +
> +	/*
> +	 * We cannot hold tree_lock while calling dax_map_atomic() because it
> +	 * eventually calls cond_resched().
> +	 */
> +	ret = dax_map_atomic(bdev, &dax);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (WARN_ON_ONCE(ret < dax.size)) {
> +		ret = -EIO;
> +		goto unmap;
> +	}
> +
> +	wb_cache_pmem(dax.addr, dax.size);
> + unmap:
> +	dax_unmap_atomic(bdev, &dax);
> +	return ret;
> +
> + unlock:
> +	spin_unlock_irq(&mapping->tree_lock);
> +	return ret;
> +}
> +
> +/*
> + * Flush the mapping to the persistent domain within the byte range of [start,
> + * end]. This is required by data integrity operations to ensure file data is
> + * on persistent storage prior to completion of the operation.
> + */
> +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
> +		loff_t end)
> +{
> +	struct inode *inode = mapping->host;
> +	struct block_device *bdev = inode->i_sb->s_bdev;
> +	pgoff_t indices[PAGEVEC_SIZE];
> +	pgoff_t start_page, end_page;
> +	struct pagevec pvec;
> +	void *entry;
> +	int i, ret = 0;
> +
> +	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
> +		return -EIO;
> +
> +	rcu_read_lock();
> +	entry = radix_tree_lookup(&mapping->page_tree, start & PMD_MASK);
> +	rcu_read_unlock();
> +
> +	/* see if the start of our range is covered by a PMD entry */
> +	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
> +		start &= PMD_MASK;
> +
> +	start_page = start >> PAGE_CACHE_SHIFT;
> +	end_page = end >> PAGE_CACHE_SHIFT;
> +
> +	tag_pages_for_writeback(mapping, start_page, end_page);
> +
> +	pagevec_init(&pvec, 0);
> +	while (1) {
> +		pvec.nr = find_get_entries_tag(mapping, start_page,
> +				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
> +				pvec.pages, indices);
> +
> +		if (pvec.nr == 0)
> +			break;
> +
> +		for (i = 0; i < pvec.nr; i++) {
> +			ret = dax_writeback_one(bdev, mapping, indices[i],
> +					pvec.pages[i]);
> +			if (ret < 0)
> +				return ret;
> +		}
I think it would be more efficient to use batched locking, as follows:
                spin_lock_irq(&mapping->tree_lock);
		for (i = 0; i < pvec.nr; i++) {
                    struct blk_dax_ctl dax[PAGEVEC_SIZE];                
                    radix_tree_tag_clear(page_tree, indices[i], PAGECACHE_TAG_TOWRITE);
                    /* It is also reasonable to merge adjacent dax
                     * regions into one */
                    dax[i].sector = RADIX_DAX_SECTOR(entry);
                    dax[i].size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);                    

                }
                spin_unlock_irq(&mapping->tree_lock);
               	if (blk_queue_enter(q, true) != 0)
                    goto error;
                for (i = 0; i < pvec.nr; i++) {
                    rc = bdev_direct_access(bdev, dax[i]);
                    wb_cache_pmem(dax[i].addr, dax[i].size);
                }
                ret = blk_queue_exit(q, true)
> +	}
> +	wmb_pmem();
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
> +
>  static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  			struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> @@ -363,6 +532,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  	}
>  	dax_unmap_atomic(bdev, &dax);
>  
> +	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
> +			vmf->flags & FAULT_FLAG_WRITE);
> +	if (error)
> +		goto out;
> +
>  	error = vm_insert_mixed(vma, vaddr, dax.pfn);
>  
>   out:
> @@ -487,6 +661,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		delete_from_page_cache(page);
>  		unlock_page(page);
>  		page_cache_release(page);
> +		page = NULL;
>  	}
I've realized that I do not understand why the dax_fault code works at all.
During dax_fault we want to remove the page from the mapping and insert a
DAX entry. Basically, the code looks as follows:
0 page = find_get_page()
1 lock_page(page)
2 delete_from_page_cache(page);
3 unlock_page(page);
4 dax_insert_mapping(inode, &bh, vma, vmf);

BUT what on earth protects us from another process reinserting the page
after step (2) but before step (4)?
Imagine we write to a file hole, which results in dax_fault(write), but
another task also takes a read fault and reinserts the deleted page via
dax_load_hole(). As a result, dax_radix_entry() will fail with -EIO.
The test case looks very trivial, but I cannot reproduce this.
>  
>  	/*
> @@ -591,7 +766,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	pgoff_t size, pgoff;
>  	loff_t lstart, lend;
>  	sector_t block;
> -	int result = 0;
> +	int error, result = 0;
>  
>  	/* dax pmd mappings require pfn_t_devmap() */
>  	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
> @@ -733,6 +908,16 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  		}
>  		dax_unmap_atomic(bdev, &dax);
>  
> +		if (write) {
> +			error = dax_radix_entry(mapping, pgoff, dax.sector,
> +					true, true);
> +			if (error) {
> +				dax_pmd_dbg(&bh, address,
> +						"PMD radix insertion failed");
> +				goto fallback;
> +			}
> +		}
> +
>  		dev_dbg(part_to_dev(bdev->bd_part),
>  				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
>  				__func__, current->comm, address,
> @@ -791,15 +976,12 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
>   * dax_pfn_mkwrite - handle first write to DAX page
>   * @vma: The virtual memory area where the fault occurred
>   * @vmf: The description of the fault
> - *
>   */
>  int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> +	struct file *file = vma->vm_file;
>  
> -	sb_start_pagefault(sb);
> -	file_update_time(vma->vm_file);
> -	sb_end_pagefault(sb);
> +	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
>  	return VM_FAULT_NOPAGE;
>  }
>  EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index e9d57f68..8204c3d 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -41,4 +41,6 @@ static inline bool dax_mapping(struct address_space *mapping)
>  {
>  	return mapping->host && IS_DAX(mapping->host);
>  }
> +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
> +		loff_t end);
>  #endif
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1e215fc..2e7c8d9 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -482,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
>  {
>  	int err = 0;
>  
> +	if (dax_mapping(mapping) && mapping->nrexceptional) {
> +		err = dax_writeback_mapping_range(mapping, lstart, lend);
> +		if (err)
> +			return err;
> +	}
> +
>  	if (mapping->nrpages) {
>  		err = __filemap_fdatawrite_range(mapping, lstart, lend,
>  						 WB_SYNC_ALL);
> -- 
> 2.5.0
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 472 bytes --]

  parent reply	other threads:[~2016-02-06 14:33 UTC|newest]

Thread overview: 89+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-01-08  5:27 [PATCH v8 0/9] DAX fsync/msync support Ross Zwisler
2016-01-08  5:27 ` Ross Zwisler
2016-01-08  5:27 ` Ross Zwisler
2016-01-08  5:27 ` Ross Zwisler
2016-01-08  5:27 ` [PATCH v8 1/9] dax: fix NULL pointer dereference in __dax_dbg() Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-12  9:34   ` Jan Kara
2016-01-12  9:34     ` Jan Kara
2016-01-12  9:34     ` Jan Kara
2016-01-13  7:08     ` Ross Zwisler
2016-01-13  7:08       ` Ross Zwisler
2016-01-13  7:08       ` Ross Zwisler
2016-01-13  9:07       ` Jan Kara
2016-01-13  9:07         ` Jan Kara
2016-01-13  9:07         ` Jan Kara
2016-01-08  5:27 ` [PATCH v8 2/9] dax: fix conversion of holes to PMDs Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-12  9:44   ` Jan Kara
2016-01-12  9:44     ` Jan Kara
2016-01-12  9:44     ` Jan Kara
2016-01-13  7:37     ` Ross Zwisler
2016-01-13  7:37       ` Ross Zwisler
2016-01-13  7:37       ` Ross Zwisler
2016-01-08  5:27 ` [PATCH v8 3/9] pmem: add wb_cache_pmem() to the PMEM API Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27 ` [PATCH v8 4/9] dax: support dirty DAX entries in radix tree Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-13  9:44   ` Jan Kara
2016-01-13  9:44     ` Jan Kara
2016-01-13  9:44     ` Jan Kara
2016-01-13 18:48     ` Ross Zwisler
2016-01-13 18:48       ` Ross Zwisler
2016-01-13 18:48       ` Ross Zwisler
2016-01-13 18:48       ` Ross Zwisler
2016-01-15 13:22       ` Jan Kara
2016-01-15 13:22         ` Jan Kara
2016-01-15 13:22         ` Jan Kara
2016-01-15 13:22         ` Jan Kara
2016-01-15 19:03         ` Ross Zwisler
2016-01-15 19:03           ` Ross Zwisler
2016-01-15 19:03           ` Ross Zwisler
2016-02-03 16:42         ` Ross Zwisler
2016-02-03 16:42           ` Ross Zwisler
2016-02-03 16:42           ` Ross Zwisler
2016-01-08  5:27 ` [PATCH v8 5/9] mm: add find_get_entries_tag() Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27 ` [PATCH v8 6/9] dax: add support for fsync/msync Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-12 10:57   ` Jan Kara
2016-01-12 10:57     ` Jan Kara
2016-01-12 10:57     ` Jan Kara
2016-01-13  7:30     ` Ross Zwisler
2016-01-13  7:30       ` Ross Zwisler
2016-01-13  7:30       ` Ross Zwisler
2016-01-13  9:35       ` Jan Kara
2016-01-13  9:35         ` Jan Kara
2016-01-13  9:35         ` Jan Kara
2016-01-13 18:58         ` Ross Zwisler
2016-01-13 18:58           ` Ross Zwisler
2016-01-13 18:58           ` Ross Zwisler
2016-01-15 13:10           ` Jan Kara
2016-01-15 13:10             ` Jan Kara
2016-01-15 13:10             ` Jan Kara
2016-02-06 14:33   ` Dmitry Monakhov [this message]
2016-02-06 14:33     ` Dmitry Monakhov
2016-02-06 14:33     ` Dmitry Monakhov
2016-02-06 14:33     ` Dmitry Monakhov
2016-02-08  9:44     ` Jan Kara
2016-02-08  9:44       ` Jan Kara
2016-02-08  9:44       ` Jan Kara
2016-02-08 22:06     ` Ross Zwisler
2016-02-08 22:06       ` Ross Zwisler
2016-02-08 22:06       ` Ross Zwisler
2016-01-08  5:27 ` [PATCH v8 7/9] ext2: call dax_pfn_mkwrite() for DAX fsync/msync Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27 ` [PATCH v8 8/9] ext4: " Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27 ` [PATCH v8 9/9] xfs: " Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler
2016-01-08  5:27   ` Ross Zwisler

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=878u2xrjrw.fsf@openvz.org \
    --to=dmonlist@gmail.com \
    --cc=adilger.kernel@dilger.ca \
    --cc=akpm@linux-foundation.org \
    --cc=bfields@fieldses.org \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@fromorbit.com \
    --cc=hpa@zytor.com \
    --cc=jack@suse.com \
    --cc=jlayton@poochiereds.net \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-nvdimm@lists.01.org \
    --cc=matthew.r.wilcox@intel.com \
    --cc=mingo@redhat.com \
    --cc=ross.zwisler@linux.intel.com \
    --cc=tglx@linutronix.de \
    --cc=tytso@mit.edu \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@linux.intel.com \
    --cc=x86@kernel.org \
    --cc=xfs@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.