linux-mm.kvack.org archive mirror
* [RFC PATCH] Support map_pages() for DAX
@ 2014-03-14 23:03 Toshi Kani
  2014-03-14 23:32 ` Kirill A. Shutemov
  0 siblings, 1 reply; 9+ messages in thread
From: Toshi Kani @ 2014-03-14 23:03 UTC (permalink / raw)
  To: willy, kirill.shutemov, david, linux-fsdevel
  Cc: linux-kernel, linux-mm, Toshi Kani

DAX provides direct access to NVDIMM storage and bypasses the page
cache.  The newly introduced map_pages() callback reduces page faults
by setting up mappings around a faulted page, but it is not yet
supported for DAX.

This patch implements the map_pages() callback for DAX.  It reduces
the number of page faults and increases DAX read performance as shown
below.  The values in parentheses are relative to the base DAX results.

iozone results of mmap read/re-read tests [KB/sec]
 64KB:  read: 3,560,777 (x1.6) re-read: 9,086,412 (x1.8) pfault:   121 (-20%)
 128MB: read: 4,374,906 (x1.7) re-read: 6,137,189 (x2.4) pfault: 8,312 (-87%)

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
---
Applies on top of the DAX patchset [1] and the fault-around patchset [2].

[1] https://lkml.org/lkml/2014/2/25/460
[2] https://lkml.org/lkml/2014/2/27/546
---
 fs/dax.c           |   68 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/file.c     |    6 +++++
 include/linux/fs.h |    5 ++++
 3 files changed, 79 insertions(+)

diff --git a/fs/dax.c b/fs/dax.c
index c8dfab0..bc54705 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -476,3 +476,71 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
+static void dax_set_pte(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn, pte_t *pte)
+{
+	pte_t entry;
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return;
+
+	if (!pte_none(*pte))
+		return;
+
+	entry = pte_mkspecial(pfn_pte(pfn, vma->vm_page_prot));
+	set_pte_at(vma->vm_mm, addr, pte, entry);
+	update_mmu_cache(vma, addr, pte);
+}
+
+void dax_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf,
+		get_block_t get_block)
+{
+	struct file *file = vma->vm_file;
+	struct inode *inode = file_inode(file);
+	struct buffer_head bh;
+	struct address_space *mapping = file->f_mapping;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	pgoff_t pgoff = vmf->pgoff;
+	sector_t block;
+	pgoff_t size;
+	unsigned long pfn;
+	pte_t *pte = vmf->pte;
+	int error;
+
+	while (pgoff < vmf->max_pgoff) {
+		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (pgoff >= size)
+			return;
+
+		memset(&bh, 0, sizeof(bh));
+		block = (sector_t)pgoff << (PAGE_SHIFT - inode->i_blkbits);
+		bh.b_size = PAGE_SIZE;
+		error = get_block(inode, block, &bh, 0);
+		if (error || bh.b_size < PAGE_SIZE)
+			goto next;
+
+		if (!buffer_mapped(&bh) || buffer_unwritten(&bh) ||
+		    buffer_new(&bh))
+			goto next;
+
+		/* Recheck i_size under i_mmap_mutex */
+		mutex_lock(&mapping->i_mmap_mutex);
+		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (unlikely(pgoff >= size)) {
+			mutex_unlock(&mapping->i_mmap_mutex);
+			return;
+		}
+
+		error = dax_get_pfn(inode, &bh, &pfn);
+		if (error > 0)
+			dax_set_pte(vma, vaddr, pfn, pte);
+
+		mutex_unlock(&mapping->i_mmap_mutex);
+next:
+		vaddr += PAGE_SIZE;
+		pgoff++;
+		pte++;
+	}
+}
+EXPORT_SYMBOL_GPL(dax_map_pages);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index eb19383..15965ea 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -205,6 +205,11 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 					/* Is this the right get_block? */
 }
 
+static void ext4_dax_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_map_pages(vma, vmf, ext4_get_block);
+}
+
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	return dax_mkwrite(vma, vmf, ext4_get_block);
@@ -212,6 +217,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
+	.map_pages	= ext4_dax_map_pages,
 	.page_mkwrite	= ext4_dax_mkwrite,
 	.remap_pages	= generic_file_remap_pages,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d0381ab..3bd1042 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2527,6 +2527,7 @@ ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, const struct iovec *,
 		loff_t, unsigned segs, get_block_t, dio_iodone_t, int flags);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_mkwrite(struct vm_area_struct *, struct vm_fault *, get_block_t);
+void dax_map_pages(struct vm_area_struct *, struct vm_fault *, get_block_t);
 #else
 static inline int dax_clear_blocks(struct inode *i, sector_t blk, long sz)
 {
@@ -2545,6 +2546,10 @@ static inline ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
 {
 	return -ENOTTY;
 }
+static inline void dax_map_pages(struct vm_area_struct *vma,
+		struct vm_fault *vmf, get_block_t get_block)
+{
+}
 #endif
 
 /* Can't be a function because PAGE_CACHE_SIZE is defined in pagemap.h */


* Re: [RFC PATCH] Support map_pages() for DAX
  2014-03-14 23:03 [RFC PATCH] Support map_pages() for DAX Toshi Kani
@ 2014-03-14 23:32 ` Kirill A. Shutemov
  2014-03-14 23:58   ` Toshi Kani
  2014-03-16  2:46   ` Matthew Wilcox
  0 siblings, 2 replies; 9+ messages in thread
From: Kirill A. Shutemov @ 2014-03-14 23:32 UTC (permalink / raw)
  To: Toshi Kani
  Cc: willy, kirill.shutemov, david, linux-fsdevel, linux-kernel,
	linux-mm

On Fri, Mar 14, 2014 at 05:03:19PM -0600, Toshi Kani wrote:
> +void dax_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf,
> +		get_block_t get_block)
> +{
> +	struct file *file = vma->vm_file;
> +	struct inode *inode = file_inode(file);
> +	struct buffer_head bh;
> +	struct address_space *mapping = file->f_mapping;
> +	unsigned long vaddr = (unsigned long)vmf->virtual_address;
> +	pgoff_t pgoff = vmf->pgoff;
> +	sector_t block;
> +	pgoff_t size;
> +	unsigned long pfn;
> +	pte_t *pte = vmf->pte;
> +	int error;
> +
> +	while (pgoff < vmf->max_pgoff) {
> +		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +		if (pgoff >= size)
> +			return;
> +
> +		memset(&bh, 0, sizeof(bh));
> +		block = (sector_t)pgoff << (PAGE_SHIFT - inode->i_blkbits);
> +		bh.b_size = PAGE_SIZE;
> +		error = get_block(inode, block, &bh, 0);
> +		if (error || bh.b_size < PAGE_SIZE)
> +			goto next;
> +
> +		if (!buffer_mapped(&bh) || buffer_unwritten(&bh) ||
> +		    buffer_new(&bh))
> +			goto next;
> +
> +		/* Recheck i_size under i_mmap_mutex */
> +		mutex_lock(&mapping->i_mmap_mutex);

NAK. Have you tested this with lockdep enabled?

->map_pages() is called with the page table lock taken, and ->i_mmap_mutex
should be taken before it.  It seems we need to take ->i_mmap_mutex in
do_read_fault() before calling ->map_pages().
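
Something along these lines in do_read_fault(), as a rough sketch only
(the ->map_pages/do_fault_around names come from the fault-around
patchset; the rest is approximate, not actual kernel code):

	if (vma->vm_ops->map_pages) {
		struct address_space *mapping = vma->vm_file->f_mapping;

		/* take i_mmap_mutex before taking the PTE lock ... */
		mutex_lock(&mapping->i_mmap_mutex);
		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
		do_fault_around(vma, address, pte, pgoff, flags);
		pte_unmap_unlock(pte, ptl);
		/* ... so ->map_pages() would not need to take it itself */
		mutex_unlock(&mapping->i_mmap_mutex);
	}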

Side note: I'm sceptical about the whole idea of using i_mmap_mutex to
protect against truncate.  It will not scale well enough compared to
lock_page() with its granularity.

> +		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +		if (unlikely(pgoff >= size)) {
> +			mutex_unlock(&mapping->i_mmap_mutex);
> +			return;
> +		}
> +
> +		error = dax_get_pfn(inode, &bh, &pfn);
> +		if (error > 0)
> +			dax_set_pte(vma, vaddr, pfn, pte);
> +
> +		mutex_unlock(&mapping->i_mmap_mutex);
> +next:
> +		vaddr += PAGE_SIZE;
> +		pgoff++;
> +		pte++;
> +	}
> +}
> +EXPORT_SYMBOL_GPL(dax_map_pages);

-- 
 Kirill A. Shutemov


* Re: [RFC PATCH] Support map_pages() for DAX
  2014-03-14 23:32 ` Kirill A. Shutemov
@ 2014-03-14 23:58   ` Toshi Kani
  2014-03-16  2:46   ` Matthew Wilcox
  1 sibling, 0 replies; 9+ messages in thread
From: Toshi Kani @ 2014-03-14 23:58 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: willy, kirill.shutemov, david, linux-fsdevel, linux-kernel,
	linux-mm

On Sat, 2014-03-15 at 01:32 +0200, Kirill A. Shutemov wrote:
> On Fri, Mar 14, 2014 at 05:03:19PM -0600, Toshi Kani wrote:
> > +void dax_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf,
> > +		get_block_t get_block)
> > +{
> > +	struct file *file = vma->vm_file;
> > +	struct inode *inode = file_inode(file);
> > +	struct buffer_head bh;
> > +	struct address_space *mapping = file->f_mapping;
> > +	unsigned long vaddr = (unsigned long)vmf->virtual_address;
> > +	pgoff_t pgoff = vmf->pgoff;
> > +	sector_t block;
> > +	pgoff_t size;
> > +	unsigned long pfn;
> > +	pte_t *pte = vmf->pte;
> > +	int error;
> > +
> > +	while (pgoff < vmf->max_pgoff) {
> > +		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> > +		if (pgoff >= size)
> > +			return;
> > +
> > +		memset(&bh, 0, sizeof(bh));
> > +		block = (sector_t)pgoff << (PAGE_SHIFT - inode->i_blkbits);
> > +		bh.b_size = PAGE_SIZE;
> > +		error = get_block(inode, block, &bh, 0);
> > +		if (error || bh.b_size < PAGE_SIZE)
> > +			goto next;
> > +
> > +		if (!buffer_mapped(&bh) || buffer_unwritten(&bh) ||
> > +		    buffer_new(&bh))
> > +			goto next;
> > +
> > +		/* Recheck i_size under i_mmap_mutex */
> > +		mutex_lock(&mapping->i_mmap_mutex);
> 
> NAK. Have you tested this with lockdep enabled?
>
> ->map_pages() is called with the page table lock taken, and ->i_mmap_mutex
> should be taken before it.  It seems we need to take ->i_mmap_mutex in
> do_read_fault() before calling ->map_pages().

Thanks for pointing this out! I will make sure to test with lockdep next
time.

> Side note: I'm sceptical about the whole idea of using i_mmap_mutex to
> protect against truncate.  It will not scale well enough compared to
> lock_page() with its granularity.

I see.  I will think about it as well.  

Thanks,
-Toshi




* Re: [RFC PATCH] Support map_pages() for DAX
  2014-03-14 23:32 ` Kirill A. Shutemov
  2014-03-14 23:58   ` Toshi Kani
@ 2014-03-16  2:46   ` Matthew Wilcox
  2014-03-17 11:43     ` Kirill A. Shutemov
  1 sibling, 1 reply; 9+ messages in thread
From: Matthew Wilcox @ 2014-03-16  2:46 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Toshi Kani, kirill.shutemov, david, linux-fsdevel, linux-kernel,
	linux-mm

On Sat, Mar 15, 2014 at 01:32:33AM +0200, Kirill A. Shutemov wrote:
> Side note: I'm sceptical about the whole idea of using i_mmap_mutex to
> protect against truncate.  It will not scale well enough compared to
> lock_page() with its granularity.

I'm actually working on this now.  The basic idea is to put an entry in
the radix tree for each page.  For zero pages, that's a pagecache page.
For pages that map to the media, it's an exceptional entry.  Radix tree
exceptional entries take two bits, leaving us with 30 or 62 bits depending
on sizeof(void *).  We can then take two more bits for Dirty and Lock,
leaving 28 or 60 bits that we can use to cache the PFN on the page,
meaning that we won't have to call the filesystem's get_block as often.

This means that shmem_find_get_pages_and_swap() moves from the shmem code
to filemap and gets renamed to find_get_rtes().  The next step is making
the truncate code use it so that truncate locks against DAX code paths
as well as pagecache code paths.
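
To make that concrete, a minimal sketch of the 64-bit encoding (the
DAX_RTE_* names below are placeholders, not a final API; only
RADIX_TREE_EXCEPTIONAL_ENTRY is the existing radix tree bit):

#define DAX_RTE_LOCK		(1UL << 2)	/* two flag bits ... */
#define DAX_RTE_DIRTY		(1UL << 3)
#define DAX_RTE_PFN_SHIFT	4		/* ... 60 bits of PFN left */

static inline void *dax_pfn_to_rte(unsigned long pfn, unsigned long flags)
{
	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
			(pfn << DAX_RTE_PFN_SHIFT));
}

static inline unsigned long dax_rte_to_pfn(void *entry)
{
	return (unsigned long)entry >> DAX_RTE_PFN_SHIFT;
}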


* Re: [RFC PATCH] Support map_pages() for DAX
  2014-03-16  2:46   ` Matthew Wilcox
@ 2014-03-17 11:43     ` Kirill A. Shutemov
  2014-03-17 14:45       ` Matthew Wilcox
  0 siblings, 1 reply; 9+ messages in thread
From: Kirill A. Shutemov @ 2014-03-17 11:43 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Toshi Kani, kirill.shutemov, david, linux-fsdevel, linux-kernel,
	linux-mm

On Sat, Mar 15, 2014 at 10:46:13PM -0400, Matthew Wilcox wrote:
> On Sat, Mar 15, 2014 at 01:32:33AM +0200, Kirill A. Shutemov wrote:
> > Side note: I'm sceptical about the whole idea of using i_mmap_mutex to
> > protect against truncate.  It will not scale well enough compared to
> > lock_page() with its granularity.
> 
> I'm actually working on this now.  The basic idea is to put an entry in
> the radix tree for each page.  For zero pages, that's a pagecache page.
> For pages that map to the media, it's an exceptional entry.  Radix tree
> exceptional entries take two bits, leaving us with 30 or 62 bits depending
> on sizeof(void *).  We can then take two more bits for Dirty and Lock,
> leaving 28 or 60 bits that we can use to cache the PFN on the page,
> meaning that we won't have to call the filesystem's get_block as often.

Sounds reasonable to me. Implementation of ->map_pages should be trivial
with this.

Few questions:
 - why would you need Dirty for DAX?
 - are you sure that 28 bits is enough for PFN everywhere?
   ARM with LPAE can have up to 40 physical address lines. Is there any
   32-bit machine with more address lines?

-- 
 Kirill A. Shutemov


* Re: [RFC PATCH] Support map_pages() for DAX
  2014-03-17 11:43     ` Kirill A. Shutemov
@ 2014-03-17 14:45       ` Matthew Wilcox
  2014-03-17 15:24         ` Amit Golander
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Wilcox @ 2014-03-17 14:45 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Toshi Kani, kirill.shutemov, david, linux-fsdevel, linux-kernel,
	linux-mm

On Mon, Mar 17, 2014 at 01:43:21PM +0200, Kirill A. Shutemov wrote:
> On Sat, Mar 15, 2014 at 10:46:13PM -0400, Matthew Wilcox wrote:
> > I'm actually working on this now.  The basic idea is to put an entry in
> > the radix tree for each page.  For zero pages, that's a pagecache page.
> > For pages that map to the media, it's an exceptional entry.  Radix tree
> > exceptional entries take two bits, leaving us with 30 or 62 bits depending
> > on sizeof(void *).  We can then take two more bits for Dirty and Lock,
> > leaving 28 or 60 bits that we can use to cache the PFN on the page,
> > meaning that we won't have to call the filesystem's get_block as often.
> 
> Sounds reasonable to me. Implementation of ->map_pages should be trivial
> with this.
> 
> Few questions:
>  - why would you need Dirty for DAX?

One of the areas ignored by the original XIP code was CPU caches.  Maybe
s390 has write-through caches or something, but on x86 we need to write back
the lines from the CPU cache to the memory on an msync().  We'll also need
to do this for a write(), although that's a SMOP.

>  - are you sure that 28 bits is enough for PFN everywhere?
>    ARM with LPAE can have up to 40 physical address lines. Is there any
>    32-bit machine with more address lines?

It's clearly not enough :-)  My plan is to have a pair of functions
pfn_to_rte() and rte_to_pfn() with default implementations that work well
on 64-bit and can be overridden by address-space deficient architectures.
If rte_to_pfn() returns RTE_PFN_UNKNOWN (which is probably -1), we'll
just go off and call get_block and ->direct_access.  This will be a
well-tested codepath because it'll be the same as the codepath used the
first time we look up a block.
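
To sketch the lookup path (illustrative only: dax_get_pfn() is from the
existing DAX patches, rte_to_pfn()/RTE_PFN_UNKNOWN are the functions
described above, and the remaining names are placeholders):

static unsigned long dax_lookup_pfn(struct address_space *mapping,
		struct inode *inode, pgoff_t pgoff, get_block_t get_block)
{
	void *entry = radix_tree_lookup(&mapping->page_tree, pgoff);
	unsigned long pfn = RTE_PFN_UNKNOWN;

	if (entry && radix_tree_exceptional_entry(entry))
		pfn = rte_to_pfn(entry);

	if (pfn == RTE_PFN_UNKNOWN) {
		/* same path as the first time we look up this block */
		struct buffer_head bh = { .b_size = PAGE_SIZE };
		sector_t block = (sector_t)pgoff <<
				 (PAGE_SHIFT - inode->i_blkbits);

		if (!get_block(inode, block, &bh, 0) && buffer_mapped(&bh) &&
		    dax_get_pfn(inode, &bh, &pfn) <= 0)
			pfn = RTE_PFN_UNKNOWN;	/* ->direct_access failed */
	}
	return pfn;
}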

Architectures can use whatever fancy scheme they like to optimise
rte_to_pfn() ... I don't think suggesting that enabling DAX grows
the radix tree entries from 32 to 64 bit would be a popular idea, but
that'd be something for those architecture maintainers to figure out.
I certainly don't care much about an x86-32 kernel with DAX ... I can
see it maybe being interesting in a virtualisation environment, but
probably not.


* Re: [RFC PATCH] Support map_pages() for DAX
  2014-03-17 14:45       ` Matthew Wilcox
@ 2014-03-17 15:24         ` Amit Golander
  0 siblings, 0 replies; 9+ messages in thread
From: Amit Golander @ 2014-03-17 15:24 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Kirill A. Shutemov, Toshi Kani, kirill.shutemov, david,
	linux-fsdevel, linux-kernel, linux-mm

On Mon, Mar 17, 2014 at 4:45 PM, Matthew Wilcox <willy@linux.intel.com>wrote:

> On Mon, Mar 17, 2014 at 01:43:21PM +0200, Kirill A. Shutemov wrote:
> > On Sat, Mar 15, 2014 at 10:46:13PM -0400, Matthew Wilcox wrote:
> > > I'm actually working on this now.  The basic idea is to put an entry in
> > > the radix tree for each page.  For zero pages, that's a pagecache page.
> > > For pages that map to the media, it's an exceptional entry.  Radix tree
> > > exceptional entries take two bits, leaving us with 30 or 62 bits
> depending
> > > on sizeof(void *).  We can then take two more bits for Dirty and Lock,
> > > leaving 28 or 60 bits that we can use to cache the PFN on the page,
> > > meaning that we won't have to call the filesystem's get_block as often.
> >
> > Sounds reasonable to me. Implementation of ->map_pages should be trivial
> > with this.
> >
> > Few questions:
> >  - why would you need Dirty for DAX?
>
> One of the areas ignored by the original XIP code was CPU caches.  Maybe
> s390 has write-through caches or something, but on x86 we need to write
> back
> the lines from the CPU cache to the memory on an msync().  We'll also need
> to do this for a write(), although that's a SMOP.
>

Indeed CLFLUSH has to be used extensively in order to guarantee that the
data is seen by the memory controller. This adds many instructions to the
execution path, and more importantly is associated with a substantial
latency penalty. This sub-optimal behavior derives from the current
hardware implementation (e.g. Intel E5-26xx v2), which does not ADR-protect
WB caches. Hopefully, in the future, processor vendors will extend the ADR
protection to the WB caches, which will free us from the need to CLFLUSH.



> >  - are you sure that 28 bits is enough for PFN everywhere?
> >    ARM with LPAE can have up to 40 physical address lines. Is there any
> >    32-bit machine with more address lines?
>
> It's clearly not enough :-)  My plan is to have a pair of functions
> pfn_to_rte() and rte_to_pfn() with default implementations that work well
> on 64-bit and can be overridden by address-space deficient architectures.
> If rte_to_pfn() returns RTE_PFN_UNKNOWN (which is probably -1), we'll
> just go off and call get_block and ->direct_access.  This will be a
> well-tested codepath because it'll be the same as the codepath used the
> first time we look up a block.
>
> Architectures can use whatever fancy scheme they like to optimise
> rte_to_pfn() ... I don't think suggesting that enabling DAX grows
> the radix tree entries from 32 to 64 bit would be a popular idea, but
> that'd be something for those architecture maintainers to figure out.
> I certainly don't care much about an x86-32 kernel with DAX ... I can
> see it maybe being interesting in a virtualisation environment, but
> probably not.
>

* RE: [RFC PATCH] Support map_pages() for DAX
@ 2014-03-18 13:10 Zuckerman, Boris
  2014-03-18 14:00 ` Matthew Wilcox
  0 siblings, 1 reply; 9+ messages in thread
From: Zuckerman, Boris @ 2014-03-18 13:10 UTC (permalink / raw)
  To: Matthew Wilcox, Kirill A. Shutemov
  Cc: Kani, Toshimitsu, kirill.shutemov@linux.intel.com,
	david@fromorbit.com, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org

Matthew,

First of all, thank you for doing this job! 
Supporting persistent memory for any OS is a bit more than adding "just another device".
There are some thoughts and questions below. Perhaps you discussed those already. If so, please point me to that discussion!

> > Few questions:
> >  - why would you need Dirty for DAX?
> 
> One of the areas ignored by the original XIP code was CPU caches.  Maybe
> s390 has write-through caches or something, but on x86 we need to write back the
> lines from the CPU cache to the memory on an msync().  We'll also need to do this for
> a write(), although that's a SMOP.
> 

x86 cache lines are much smaller than a page. Cache lines are flushed "naturally", but we have no visibility into when that happens.
How many Dirty pages do we anticipate? What is the performance cost of msync()? Is it higher if we do page-based accounting?

Reasons and frequency of msync():
Atomicity: needs barriers, happens frequently, leaves relatively small number of Dirty pages. Here the cost is probably smaller. 
Durability of application updates: issued infrequently, leaves many Dirty pages. The cost could be high, right?

Let's assume that at some point we get CPU/Persistent Memory Controller combinations that support atomicity of multiple updates in hardware. Would you need to mark pages Dirty in such cases? If not, what is the right layer to build that support for x86?


* Re: [RFC PATCH] Support map_pages() for DAX
  2014-03-18 13:10 Zuckerman, Boris
@ 2014-03-18 14:00 ` Matthew Wilcox
  0 siblings, 0 replies; 9+ messages in thread
From: Matthew Wilcox @ 2014-03-18 14:00 UTC (permalink / raw)
  To: Zuckerman, Boris
  Cc: Kirill A. Shutemov, Kani, Toshimitsu,
	kirill.shutemov@linux.intel.com, david@fromorbit.com,
	linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org

On Tue, Mar 18, 2014 at 01:10:44PM +0000, Zuckerman, Boris wrote:
> x86 cache lines are much smaller than a page. Cache lines are flushed "naturally", but we have no visibility into when that happens.
> How many Dirty pages do we anticipate? What is the performance cost of msync()? Is it higher if we do page-based accounting?

The number of dirty pages is going to depend on the workload.  The problem
with looking at today's workloads as an approximation of what workloads
will look like is that people will optimise their software for persistent
memory once persistent memory becomes more widely available.  So all
we can do is point out "hey, you have a lot of dirty pages, maybe you'd
like to change your algorithms".

> Reasons and frequency of msync():
> Atomicity: needs barriers, happens frequently, leaves relatively small number of Dirty pages. Here the cost is probably smaller. 
> Durability of application updates: issued infrequently, leaves many Dirty pages. The cost could be high, right?

We have two ways on x86 to implement msync.  One is to flush every
cacheline and the other is to flush the entire cache.  If the user asks
to flush a terabyte of memory, it's clearly cheaper to flush the cache.
If the user asks to flush 64 bytes, we should clearly just flush a single
line.  Somewhere in-between there's a cross-over point, and that's going
to depend on the size of the CPU's cache, the nature of the workload,
and a few other factors.  I'm not worrying about where that is right now,
because we can't make that decision without first tracking which pages
are dirty and which are clean.
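
Purely to illustrate the shape of it (the threshold is a made-up
placeholder; picking it is exactly the open question above):

#define DAX_FLUSH_WHOLE_CACHE_THRESHOLD	(8UL << 20)	/* placeholder */

static void dax_writeback_range(void *addr, size_t len)
{
	if (len >= DAX_FLUSH_WHOLE_CACHE_THRESHOLD)
		wbinvd_on_all_cpus();		/* flush the entire cache */
	else
		clflush_cache_range(addr, len);	/* flush line by line */
}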

> Let's assume that at some point we get CPU/Persistent Memory Controller
> combinations that support atomicity of multiple updates in hardware. Would
> you need to mark pages Dirty in such cases? If not, what is the right
> layer to build that support for x86?

Regardless of future hardware innovations, we need to support the
semantics of msync().  That is, if the user calls msync(), sees that
it has successfully synced a range of data to media, and then after a
reboot discovers that all or part of that msync hadn't actually happened,
we're going to have a very unhappy user on our hands.

If we have a write-through cache, then we don't need to implement
dirty bit tracking.  But my impression is that write-through caches are
significantly worse performing than write-back, so I don't intend to
optimise for them.

If there's some fancy new hardware that lets you do an update of multiple
cachelines atomically and persistently, then I guess the software will
be calling that instead of msync(), so the question about whether msync()
would need to flush the cachelines for that page won't actually arise.


end of thread, other threads:[~2014-03-18 14:02 UTC | newest]

Thread overview: 9+ messages
2014-03-14 23:03 [RFC PATCH] Support map_pages() for DAX Toshi Kani
2014-03-14 23:32 ` Kirill A. Shutemov
2014-03-14 23:58   ` Toshi Kani
2014-03-16  2:46   ` Matthew Wilcox
2014-03-17 11:43     ` Kirill A. Shutemov
2014-03-17 14:45       ` Matthew Wilcox
2014-03-17 15:24         ` Amit Golander
  -- strict thread matches above, loose matches on Subject: below --
2014-03-18 13:10 Zuckerman, Boris
2014-03-18 14:00 ` Matthew Wilcox
