linux-mm.kvack.org archive mirror
* RE: [RFC PATCH] Support map_pages() for DAX
@ 2014-03-18 13:10 Zuckerman, Boris
  2014-03-18 14:00 ` Matthew Wilcox
  0 siblings, 1 reply; 9+ messages in thread
From: Zuckerman, Boris @ 2014-03-18 13:10 UTC (permalink / raw)
  To: Matthew Wilcox, Kirill A. Shutemov
  Cc: Kani, Toshimitsu, kirill.shutemov@linux.intel.com,
	david@fromorbit.com, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org

Matthew,

First of all, thank you for doing this work!
Supporting persistent memory in any OS is a bit more than adding "just another device".
Some thoughts and questions follow. Perhaps you have discussed these already; if so, please point me to that discussion!

> > Few questions:
> >  - why would you need Dirty for DAX?
> 
> One of the areas ignored by the original XIP code was CPU caches.  Maybe
> s390 has write-through caches or something, but on x86 we need to write back the
> lines from the CPU cache to the memory on an msync().  We'll also need to do this for
> a write(), although that's a SMOP.
> 
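For concreteness, here is a minimal sketch of the write-back Matthew describes (illustrative only; it assumes x86, and the function name is hypothetical, built on the kernel's clflush() and boot_cpu_data.x86_clflush_size):

/*
 * Illustrative sketch, not from any posted patch: write back the CPU
 * cache lines covering a range of a DAX mapping so that the data an
 * msync() promises to persist actually reaches the memory on x86.
 */
static void dax_writeback_range(void *addr, size_t len)
{
	unsigned int clsize = boot_cpu_data.x86_clflush_size;
	unsigned long p = (unsigned long)addr & ~(unsigned long)(clsize - 1);
	unsigned long end = (unsigned long)addr + len;

	mb();				/* order prior stores before flushing */
	for (; p < end; p += clsize)
		clflush((void *)p);	/* write back + invalidate one line */
	mb();				/* wait for the flushes to complete */
}

The kernel's existing clflush_cache_range() does essentially this loop.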

x86 cache lines are much smaller than a page. Cache lines are flushed "naturally" as they are evicted, but we get no notification when that happens.
How many Dirty pages do we anticipate? What is the performance cost of msync()? Is it higher if we do page-based accounting?
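For scale, assuming 64-byte cache lines and 4KB pages: writing back one fully dirty page is 4096 / 64 = 64 clflush operations, and page-granularity dirty tracking means flushing all 64 lines even when only one of them was actually written. An msync() covering N dirty pages is then on the order of 64*N flushes plus fences.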

Reasons and frequency of msync():
Atomicity: needs barriers, happens frequently, and leaves a relatively small number of Dirty pages. Here the cost is probably smaller.
Durability of application updates: issued infrequently, leaves many Dirty pages. The cost could be high, right? (Both patterns are sketched below.)
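To make those two patterns concrete, a hedged userspace sketch (function names hypothetical; msync() requires a page-aligned start, hence the rounding):

#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

/* (1) atomicity/ordering: called frequently, few dirty pages per call */
static int commit_record(void *rec, size_t rec_len)
{
	uintptr_t mask = (uintptr_t)sysconf(_SC_PAGESIZE) - 1;
	uintptr_t page = (uintptr_t)rec & ~mask;

	return msync((void *)page, (uintptr_t)rec + rec_len - page, MS_SYNC);
}

/* (2) durability checkpoint: called rarely, may cover many dirty pages */
static int checkpoint(void *map_base, size_t map_len)
{
	return msync(map_base, map_len, MS_SYNC);
}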

Let's assume that at some point we get CPU/Persistent Memory Controller combinations that support atomicity of multiple updates in hardware. Would you still need to mark pages Dirty in such cases? If not, what is the right layer in which to build that support for x86?


* [RFC PATCH] Support map_pages() for DAX
@ 2014-03-14 23:03 Toshi Kani
  2014-03-14 23:32 ` Kirill A. Shutemov
  0 siblings, 1 reply; 9+ messages in thread
From: Toshi Kani @ 2014-03-14 23:03 UTC (permalink / raw)
  To: willy, kirill.shutemov, david, linux-fsdevel
  Cc: linux-kernel, linux-mm, Toshi Kani

DAX provides direct access to NVDIMM and bypasses the page cache.
The newly introduced map_pages() callback reduces page faults by
setting up mappings around a faulted page, but it is not yet
supported for DAX.

This patch implements the map_pages() callback for DAX.  It reduces
the number of page faults and increases DAX read performance as shown
below.  The values in parentheses are relative to the baseline DAX
results.

iozone results of mmap read/re-read tests [KB/sec]
 64KB:  read: 3,560,777 (x1.6) re-read: 9,086,412 (x1.8) pfault:   121 (-20%)
 128MB: read: 4,374,906 (x1.7) re-read: 6,137,189 (x2.4) pfault: 8,312 (-87%)
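
(Editorial illustration, not part of the patch: a minimal reader of the kind iozone exercises above. Without ->map_pages() every 4KB touch can take a minor fault; with it, one fault populates a whole window of PTEs.)

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct stat st;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0) {
		perror(argv[1]);
		return 1;
	}

	unsigned char *p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED,
				fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	volatile unsigned char sum = 0;
	for (off_t i = 0; i < st.st_size; i += 4096)	/* touch each page */
		sum += p[i];

	munmap(p, st.st_size);
	close(fd);
	return sum & 1;		/* keep the loop from being optimized away */
}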

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
----
Applies on top of DAX patchset [1] and fault-around patchset [2].

[1] https://lkml.org/lkml/2014/2/25/460
[2] https://lkml.org/lkml/2014/2/27/546
---
 fs/dax.c           |   68 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/file.c     |    6 +++++
 include/linux/fs.h |    5 ++++
 3 files changed, 79 insertions(+)

diff --git a/fs/dax.c b/fs/dax.c
index c8dfab0..bc54705 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -476,3 +476,71 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
+static void dax_set_pte(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn, pte_t *pte)
+{
+	pte_t entry;
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return;
+
+	if (!pte_none(*pte))
+		return;
+
+	entry = pte_mkspecial(pfn_pte(pfn, vma->vm_page_prot));
+	set_pte_at(vma->vm_mm, addr, pte, entry);
+	update_mmu_cache(vma, addr, pte);
+}
+
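+/*
+ * Fill PTEs for already-mapped blocks around a faulting page.  Called
+ * from the fault-around path with vmf->pte/pgoff at the start of the
+ * window and vmf->max_pgoff at its end.  Holes, unwritten and new
+ * blocks are skipped, and i_size is rechecked under i_mmap_mutex
+ * before each PTE is set.
+ */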
+void dax_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf,
+		get_block_t get_block)
+{
+	struct file *file = vma->vm_file;
+	struct inode *inode = file_inode(file);
+	struct buffer_head bh;
+	struct address_space *mapping = file->f_mapping;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	pgoff_t pgoff = vmf->pgoff;
+	sector_t block;
+	pgoff_t size;
+	unsigned long pfn;
+	pte_t *pte = vmf->pte;
+	int error;
+
+	while (pgoff < vmf->max_pgoff) {
+		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (pgoff >= size)
+			return;
+
+		memset(&bh, 0, sizeof(bh));
+		block = (sector_t)pgoff << (PAGE_SHIFT - inode->i_blkbits);
+		bh.b_size = PAGE_SIZE;
+		error = get_block(inode, block, &bh, 0);
+		if (error || bh.b_size < PAGE_SIZE)
+			goto next;
+
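+		/* only map blocks that already exist and are written */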
+		if (!buffer_mapped(&bh) || buffer_unwritten(&bh) ||
+		    buffer_new(&bh))
+			goto next;
+
+		/* Recheck i_size under i_mmap_mutex */
+		mutex_lock(&mapping->i_mmap_mutex);
+		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (unlikely(pgoff >= size)) {
+			mutex_unlock(&mapping->i_mmap_mutex);
+			return;
+		}
+
+		error = dax_get_pfn(inode, &bh, &pfn);
+		if (error > 0)
+			dax_set_pte(vma, vaddr, pfn, pte);
+
+		mutex_unlock(&mapping->i_mmap_mutex);
+next:
+		vaddr += PAGE_SIZE;
+		pgoff++;
+		pte++;
+	}
+}
+EXPORT_SYMBOL_GPL(dax_map_pages);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index eb19383..15965ea 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -205,6 +205,11 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 					/* Is this the right get_block? */
 }
 
+static void ext4_dax_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_map_pages(vma, vmf, ext4_get_block);
+}
+
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	return dax_mkwrite(vma, vmf, ext4_get_block);
@@ -212,6 +217,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
+	.map_pages	= ext4_dax_map_pages,
 	.page_mkwrite	= ext4_dax_mkwrite,
 	.remap_pages	= generic_file_remap_pages,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d0381ab..3bd1042 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2527,6 +2527,7 @@ ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, const struct iovec *,
 		loff_t, unsigned segs, get_block_t, dio_iodone_t, int flags);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_mkwrite(struct vm_area_struct *, struct vm_fault *, get_block_t);
+void dax_map_pages(struct vm_area_struct *, struct vm_fault *, get_block_t);
 #else
 static inline int dax_clear_blocks(struct inode *i, sector_t blk, long sz)
 {
@@ -2545,6 +2546,10 @@ static inline ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
 {
 	return -ENOTTY;
 }
+static inline void dax_map_pages(struct vm_area_struct *vma,
+		struct vm_fault *vmf, get_block_t get_block)
+{
+}
 #endif
 
 /* Can't be a function because PAGE_CACHE_SIZE is defined in pagemap.h */
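
(Editorial note: for readers without the fault-around series [2] handy, here is a paraphrased, simplified sketch of the caller side; constants such as FAULT_AROUND_MASK/FAULT_AROUND_PAGES and other details may differ from the exact series this patch applies on top of.)

static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
			    pte_t *pte, pgoff_t pgoff, unsigned int flags)
{
	unsigned long start_addr;
	struct vm_fault vmf;
	int off;

	/* align pte/pgoff down to the start of the fault-around window */
	start_addr = max(address & FAULT_AROUND_MASK, vma->vm_start);
	off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	pte -= off;
	pgoff -= off;

	vmf.virtual_address = (void __user *)start_addr;
	vmf.pte = pte;
	vmf.pgoff = pgoff;
	/* window ends at the VMA or after FAULT_AROUND_PAGES, whichever
	 * comes first; dax_map_pages() above walks exactly this window */
	vmf.max_pgoff = min_t(pgoff_t, pgoff + FAULT_AROUND_PAGES - 1,
			      vma->vm_pgoff + vma_pages(vma) - 1);
	vmf.flags = flags;
	vma->vm_ops->map_pages(vma, &vmf);
}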



Thread overview: 9+ messages
2014-03-18 13:10 [RFC PATCH] Support map_pages() for DAX Zuckerman, Boris
2014-03-18 14:00 ` Matthew Wilcox
  -- strict thread matches above, loose matches on Subject: below --
2014-03-14 23:03 Toshi Kani
2014-03-14 23:32 ` Kirill A. Shutemov
2014-03-14 23:58   ` Toshi Kani
2014-03-16  2:46   ` Matthew Wilcox
2014-03-17 11:43     ` Kirill A. Shutemov
2014-03-17 14:45       ` Matthew Wilcox
2014-03-17 15:24         ` Amit Golander
