* [PATCH 0/2] block: fix raw block device dax support @ 2016-01-29 15:18 Dan Williams 2016-01-29 15:18 ` [PATCH 1/2] block: revert runtime dax control of the raw block device Dan Williams 2016-01-29 15:18 ` [PATCH 2/2] block: use DAX for partition table reads Dan Williams 0 siblings, 2 replies; 9+ messages in thread From: Dan Williams @ 2016-01-29 15:18 UTC (permalink / raw) To: linux-block Cc: linux-nvdimm, Dave Chinner, linux-kernel, Christoph Hellwig, axboe, Jeff Moyer, Jan Kara, linux-fsdevel, Matthew Wilcox, Andrew Morton, Ross Zwisler The dax support for a raw block device did not account for page cache entries established by the kernel for partition reads. This breaks dax as it assumes that page cache entries are limited to covering holes in files, or are exceptional entries marking dirty pages. Additionally, the facility to toggle dax at runtime fails to handle evacuating the page cache when switching from non-dax-mode to dax-mode. It needs to be rethought as enabling dax needs to be atomic with flushing the page cache. --- Dan Williams (2): block: revert runtime dax control of the raw block device block: use DAX for partition table reads block/ioctl.c | 38 -------------------------------------- block/partition-generic.c | 18 +++++++++++++++--- fs/block_dev.c | 28 ---------------------------- fs/dax.c | 20 ++++++++++++++++++++ include/linux/blkdev.h | 10 ++++++++++ include/linux/fs.h | 3 --- include/uapi/linux/fs.h | 1 - 7 files changed, 45 insertions(+), 73 deletions(-) ^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 1/2] block: revert runtime dax control of the raw block device 2016-01-29 15:18 [PATCH 0/2] block: fix raw block device dax support Dan Williams @ 2016-01-29 15:18 ` Dan Williams 2016-01-29 17:54 ` Ross Zwisler 2016-01-29 15:18 ` [PATCH 2/2] block: use DAX for partition table reads Dan Williams 1 sibling, 1 reply; 9+ messages in thread From: Dan Williams @ 2016-01-29 15:18 UTC (permalink / raw) To: linux-block Cc: linux-nvdimm, Dave Chinner, linux-kernel, Christoph Hellwig, axboe, Jeff Moyer, Jan Kara, linux-fsdevel, Matthew Wilcox, Andrew Morton, Ross Zwisler Dynamically enabling DAX requires that the page cache first be flushed and invalidated. This must occur atomically with the change of DAX mode otherwise we confuse the fsync/msync tracking and violate data durability guarantees. Eliminate the possibility of DAX-disabled to DAX-enabled transitions for now and revisit this for the next cycle. Cc: Jan Kara <jack@suse.com> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Chinner <david@fromorbit.com> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> --- block/ioctl.c | 38 -------------------------------------- fs/block_dev.c | 28 ---------------------------- include/linux/fs.h | 3 --- include/uapi/linux/fs.h | 1 - 4 files changed, 70 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 77f5d17779d6..d8996bbd7f12 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -434,42 +434,6 @@ bool blkdev_dax_capable(struct block_device *bdev) return true; } - -static int blkdev_daxset(struct block_device *bdev, unsigned long argp) -{ - unsigned long arg; - int rc = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (get_user(arg, (int __user *)(argp))) - return -EFAULT; - arg = !!arg; - if (arg == !!(bdev->bd_inode->i_flags & S_DAX)) - return 0; - - if (arg) - arg = S_DAX; - - if 
(arg && !blkdev_dax_capable(bdev)) - return -ENOTTY; - - inode_lock(bdev->bd_inode); - if (bdev->bd_map_count == 0) - inode_set_flags(bdev->bd_inode, arg, S_DAX); - else - rc = -EBUSY; - inode_unlock(bdev->bd_inode); - return rc; -} -#else -static int blkdev_daxset(struct block_device *bdev, int arg) -{ - if (arg) - return -ENOTTY; - return 0; -} #endif static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, @@ -634,8 +598,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); - case BLKDAXSET: - return blkdev_daxset(bdev, arg); case BLKDAXGET: return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); break; diff --git a/fs/block_dev.c b/fs/block_dev.c index 7b9cd49622b1..afb437484362 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1736,37 +1736,13 @@ static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); } -static void blkdev_vm_open(struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(vma->vm_file); - struct block_device *bdev = I_BDEV(bd_inode); - - inode_lock(bd_inode); - bdev->bd_map_count++; - inode_unlock(bd_inode); -} - -static void blkdev_vm_close(struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(vma->vm_file); - struct block_device *bdev = I_BDEV(bd_inode); - - inode_lock(bd_inode); - bdev->bd_map_count--; - inode_unlock(bd_inode); -} - static const struct vm_operations_struct blkdev_dax_vm_ops = { - .open = blkdev_vm_open, - .close = blkdev_vm_close, .fault = blkdev_dax_fault, .pmd_fault = blkdev_dax_pmd_fault, .pfn_mkwrite = blkdev_dax_fault, }; static const struct vm_operations_struct blkdev_default_vm_ops = { - .open = blkdev_vm_open, - .close = blkdev_vm_close, .fault = filemap_fault, .map_pages = filemap_map_pages, }; @@ -1774,18 +1750,14 @@ static const struct vm_operations_struct 
blkdev_default_vm_ops = { static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *bd_inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(bd_inode); file_accessed(file); - inode_lock(bd_inode); - bdev->bd_map_count++; if (IS_DAX(bd_inode)) { vma->vm_ops = &blkdev_dax_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; } else { vma->vm_ops = &blkdev_default_vm_ops; } - inode_unlock(bd_inode); return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index b10002d4a5f5..ae681002100a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -484,9 +484,6 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; -#ifdef CONFIG_FS_DAX - int bd_map_count; -#endif }; /* diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 41e0433b4a83..149bec83a907 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -222,7 +222,6 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) -#define BLKDAXSET _IO(0x12,128) #define BLKDAXGET _IO(0x12,129) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ ^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] block: revert runtime dax control of the raw block device 2016-01-29 15:18 ` [PATCH 1/2] block: revert runtime dax control of the raw block device Dan Williams @ 2016-01-29 17:54 ` Ross Zwisler 0 siblings, 0 replies; 9+ messages in thread From: Ross Zwisler @ 2016-01-29 17:54 UTC (permalink / raw) To: Dan Williams Cc: linux-block, linux-nvdimm, Dave Chinner, linux-kernel, Christoph Hellwig, axboe, Jeff Moyer, Jan Kara, linux-fsdevel, Matthew Wilcox, Andrew Morton, Ross Zwisler On Fri, Jan 29, 2016 at 07:18:41AM -0800, Dan Williams wrote: > Dynamically enabling DAX requires that the page cache first be flushed > and invalidated. This must occur atomically with the change of DAX mode > otherwise we confuse the fsync/msync tracking and violate data > durability guarantees. Eliminate the possibility of DAX-disabled to > DAX-enabled transitions for now and revisit this for the next cycle. > > Cc: Jan Kara <jack@suse.com> > Cc: Jeff Moyer <jmoyer@redhat.com> > Cc: Christoph Hellwig <hch@lst.de> > Cc: Dave Chinner <david@fromorbit.com> > Cc: Matthew Wilcox <willy@linux.intel.com> > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Ross Zwisler <ross.zwisler@linux.intel.com> > Signed-off-by: Dan Williams <dan.j.williams@intel.com> Sure, makes sense. 
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> > --- > block/ioctl.c | 38 -------------------------------------- > fs/block_dev.c | 28 ---------------------------- > include/linux/fs.h | 3 --- > include/uapi/linux/fs.h | 1 - > 4 files changed, 70 deletions(-) > > diff --git a/block/ioctl.c b/block/ioctl.c > index 77f5d17779d6..d8996bbd7f12 100644 > --- a/block/ioctl.c > +++ b/block/ioctl.c > @@ -434,42 +434,6 @@ bool blkdev_dax_capable(struct block_device *bdev) > > return true; > } > - > -static int blkdev_daxset(struct block_device *bdev, unsigned long argp) > -{ > - unsigned long arg; > - int rc = 0; > - > - if (!capable(CAP_SYS_ADMIN)) > - return -EACCES; > - > - if (get_user(arg, (int __user *)(argp))) > - return -EFAULT; > - arg = !!arg; > - if (arg == !!(bdev->bd_inode->i_flags & S_DAX)) > - return 0; > - > - if (arg) > - arg = S_DAX; > - > - if (arg && !blkdev_dax_capable(bdev)) > - return -ENOTTY; > - > - inode_lock(bdev->bd_inode); > - if (bdev->bd_map_count == 0) > - inode_set_flags(bdev->bd_inode, arg, S_DAX); > - else > - rc = -EBUSY; > - inode_unlock(bdev->bd_inode); > - return rc; > -} > -#else > -static int blkdev_daxset(struct block_device *bdev, int arg) > -{ > - if (arg) > - return -ENOTTY; > - return 0; > -} > #endif > > static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, > @@ -634,8 +598,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, > case BLKTRACESETUP: > case BLKTRACETEARDOWN: > return blk_trace_ioctl(bdev, cmd, argp); > - case BLKDAXSET: > - return blkdev_daxset(bdev, arg); > case BLKDAXGET: > return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); > break; > diff --git a/fs/block_dev.c b/fs/block_dev.c > index 7b9cd49622b1..afb437484362 100644 > --- a/fs/block_dev.c > +++ b/fs/block_dev.c > @@ -1736,37 +1736,13 @@ static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, > return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); > } > > -static 
void blkdev_vm_open(struct vm_area_struct *vma) > -{ > - struct inode *bd_inode = bdev_file_inode(vma->vm_file); > - struct block_device *bdev = I_BDEV(bd_inode); > - > - inode_lock(bd_inode); > - bdev->bd_map_count++; > - inode_unlock(bd_inode); > -} > - > -static void blkdev_vm_close(struct vm_area_struct *vma) > -{ > - struct inode *bd_inode = bdev_file_inode(vma->vm_file); > - struct block_device *bdev = I_BDEV(bd_inode); > - > - inode_lock(bd_inode); > - bdev->bd_map_count--; > - inode_unlock(bd_inode); > -} > - > static const struct vm_operations_struct blkdev_dax_vm_ops = { > - .open = blkdev_vm_open, > - .close = blkdev_vm_close, > .fault = blkdev_dax_fault, > .pmd_fault = blkdev_dax_pmd_fault, > .pfn_mkwrite = blkdev_dax_fault, > }; > > static const struct vm_operations_struct blkdev_default_vm_ops = { > - .open = blkdev_vm_open, > - .close = blkdev_vm_close, > .fault = filemap_fault, > .map_pages = filemap_map_pages, > }; > @@ -1774,18 +1750,14 @@ static const struct vm_operations_struct blkdev_default_vm_ops = { > static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) > { > struct inode *bd_inode = bdev_file_inode(file); > - struct block_device *bdev = I_BDEV(bd_inode); > > file_accessed(file); > - inode_lock(bd_inode); > - bdev->bd_map_count++; > if (IS_DAX(bd_inode)) { > vma->vm_ops = &blkdev_dax_vm_ops; > vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; > } else { > vma->vm_ops = &blkdev_default_vm_ops; > } > - inode_unlock(bd_inode); > > return 0; > } > diff --git a/include/linux/fs.h b/include/linux/fs.h > index b10002d4a5f5..ae681002100a 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -484,9 +484,6 @@ struct block_device { > int bd_fsfreeze_count; > /* Mutex for freeze */ > struct mutex bd_fsfreeze_mutex; > -#ifdef CONFIG_FS_DAX > - int bd_map_count; > -#endif > }; > > /* > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > index 41e0433b4a83..149bec83a907 100644 > --- a/include/uapi/linux/fs.h > +++ 
b/include/uapi/linux/fs.h > @@ -222,7 +222,6 @@ struct fsxattr { > #define BLKSECDISCARD _IO(0x12,125) > #define BLKROTATIONAL _IO(0x12,126) > #define BLKZEROOUT _IO(0x12,127) > -#define BLKDAXSET _IO(0x12,128) > #define BLKDAXGET _IO(0x12,129) > > #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ > ^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 2/2] block: use DAX for partition table reads 2016-01-29 15:18 [PATCH 0/2] block: fix raw block device dax support Dan Williams 2016-01-29 15:18 ` [PATCH 1/2] block: revert runtime dax control of the raw block device Dan Williams @ 2016-01-29 15:18 ` Dan Williams 2016-01-29 17:46 ` Jens Axboe ` (3 more replies) 1 sibling, 4 replies; 9+ messages in thread From: Dan Williams @ 2016-01-29 15:18 UTC (permalink / raw) To: linux-block Cc: linux-nvdimm, Dave Chinner, linux-kernel, Christoph Hellwig, axboe, Jeff Moyer, Jan Kara, linux-fsdevel, Matthew Wilcox, Andrew Morton, Ross Zwisler Avoid populating pagecache when the block device is in DAX mode. Otherwise these page cache entries collide with the fsync/msync implementation and break data durability guarantees. Cc: Jan Kara <jack@suse.com> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Chinner <david@fromorbit.com> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> --- block/partition-generic.c | 18 +++++++++++++++--- fs/dax.c | 20 ++++++++++++++++++++ include/linux/blkdev.h | 10 ++++++++++ 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 746935a5973c..8e6fa1868249 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -16,6 +16,7 @@ #include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/blkdev.h> #include <linux/blktrace_api.h> #include "partitions/check.h" @@ -550,13 +551,24 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n) { struct address_space *mapping = bdev->bd_inode->i_mapping; + + return 
read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + NULL); +} + +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +{ struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), - NULL); + /* don't populate page cache for dax capable devices */ + if (IS_DAX(bdev->bd_inode)) + page = read_dax_sector(bdev, n); + else + page = read_pagecache_sector(bdev, n); + if (!IS_ERR(page)) { if (PageError(page)) goto fail; diff --git a/fs/dax.c b/fs/dax.c index 4fd6b0c5c6b5..227974adecb9 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -58,6 +58,26 @@ static void dax_unmap_atomic(struct block_device *bdev, blk_queue_exit(bdev->bd_queue); } +struct page *read_dax_sector(struct block_device *bdev, sector_t n) +{ + struct page *page = __page_cache_alloc(GFP_KERNEL | __GFP_COLD); + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), + }; + long rc; + + if (!page) + return ERR_PTR(-ENOMEM); + + rc = dax_map_atomic(bdev, &dax); + if (rc < 0) + return ERR_PTR(rc); + memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); + dax_unmap_atomic(bdev, &dax); + return page; +} + /* * dax_clear_blocks() is called from within transaction context from XFS, * and hence this means the stack from this point must follow GFP_NOFS diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 29189aeace19..b1452c04f1a9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1633,6 +1633,16 @@ struct blk_dax_ctl { pfn_t pfn; }; +#ifdef CONFIG_FS_DAX +struct page *read_dax_sector(struct block_device *bdev, sector_t n); +#else +static inline struct page *read_dax_sector(struct block_device *bdev, + sector_t n) +{ + return ERR_PTR(-ENXIO); +} +#endif + struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); ^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] block: use DAX for partition table reads 2016-01-29 15:18 ` [PATCH 2/2] block: use DAX for partition table reads Dan Williams @ 2016-01-29 17:46 ` Jens Axboe 2016-01-29 17:54 ` Dan Williams 2016-01-29 19:24 ` Ross Zwisler ` (2 subsequent siblings) 3 siblings, 1 reply; 9+ messages in thread From: Jens Axboe @ 2016-01-29 17:46 UTC (permalink / raw) To: Dan Williams, linux-block Cc: linux-nvdimm, Dave Chinner, linux-kernel, Christoph Hellwig, Jeff Moyer, Jan Kara, linux-fsdevel, Matthew Wilcox, Andrew Morton, Ross Zwisler On 01/29/2016 08:18 AM, Dan Williams wrote: > +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) > +{ > struct page *page; > > - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), > - NULL); > + /* don't populate page cache for dax capable devices */ > + if (IS_DAX(bdev->bd_inode)) > + page = read_dax_sector(bdev, n); > + else > + page = read_pagecache_sector(bdev, n); > + Fall back to non-dax, if dax fails? > +struct page *read_dax_sector(struct block_device *bdev, sector_t n) > +{ > + struct page *page = __page_cache_alloc(GFP_KERNEL | __GFP_COLD); Why isn't that just alloc_pages()? -- Jens Axboe ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] block: use DAX for partition table reads 2016-01-29 17:46 ` Jens Axboe @ 2016-01-29 17:54 ` Dan Williams 0 siblings, 0 replies; 9+ messages in thread From: Dan Williams @ 2016-01-29 17:54 UTC (permalink / raw) To: Jens Axboe Cc: linux-block, linux-nvdimm@lists.01.org, Dave Chinner, linux-kernel@vger.kernel.org, Christoph Hellwig, Jeff Moyer, Jan Kara, linux-fsdevel, Matthew Wilcox, Andrew Morton, Ross Zwisler On Fri, Jan 29, 2016 at 9:46 AM, Jens Axboe <axboe@fb.com> wrote: > On 01/29/2016 08:18 AM, Dan Williams wrote: >> >> +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, >> Sector *p) >> +{ >> struct page *page; >> >> - page = read_mapping_page(mapping, (pgoff_t)(n >> >> (PAGE_CACHE_SHIFT-9)), >> - NULL); >> + /* don't populate page cache for dax capable devices */ >> + if (IS_DAX(bdev->bd_inode)) >> + page = read_dax_sector(bdev, n); >> + else >> + page = read_pagecache_sector(bdev, n); >> + > > > Fall back to non-dax, if dax fails? I think we need to fail hard otherwise we're back to the original problem of confusing the dax code that expects to find an empty page cache. > >> +struct page *read_dax_sector(struct block_device *bdev, sector_t n) >> +{ >> + struct page *page = __page_cache_alloc(GFP_KERNEL | __GFP_COLD); > > > Why isn't that just alloc_pages()? Just for symmetry with the same allocation that the pagecache path makes, but alloc_pages() works too... ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] block: use DAX for partition table reads 2016-01-29 15:18 ` [PATCH 2/2] block: use DAX for partition table reads Dan Williams 2016-01-29 17:46 ` Jens Axboe @ 2016-01-29 19:24 ` Ross Zwisler 2016-01-29 22:45 ` Matthew Wilcox 2016-01-30 0:33 ` [PATCH v2] " Dan Williams 3 siblings, 0 replies; 9+ messages in thread From: Ross Zwisler @ 2016-01-29 19:24 UTC (permalink / raw) To: Dan Williams Cc: linux-block, linux-nvdimm, Dave Chinner, linux-kernel, Christoph Hellwig, axboe, Jeff Moyer, Jan Kara, linux-fsdevel, Matthew Wilcox, Andrew Morton, Ross Zwisler On Fri, Jan 29, 2016 at 07:18:46AM -0800, Dan Williams wrote: > Avoid populating pagecache when the block device is in DAX mode. > Otherwise these page cache entries collide with the fsync/msync > implementation and break data durability guarantees. > > Cc: Jan Kara <jack@suse.com> > Cc: Jeff Moyer <jmoyer@redhat.com> > Cc: Christoph Hellwig <hch@lst.de> > Cc: Dave Chinner <david@fromorbit.com> > Cc: Matthew Wilcox <willy@linux.intel.com> > Cc: Andrew Morton <akpm@linux-foundation.org> > Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com> > Signed-off-by: Dan Williams <dan.j.williams@intel.com> This solves the problem for me, thanks! Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com> ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] block: use DAX for partition table reads 2016-01-29 15:18 ` [PATCH 2/2] block: use DAX for partition table reads Dan Williams 2016-01-29 17:46 ` Jens Axboe 2016-01-29 19:24 ` Ross Zwisler @ 2016-01-29 22:45 ` Matthew Wilcox 2016-01-30 0:33 ` [PATCH v2] " Dan Williams 3 siblings, 0 replies; 9+ messages in thread From: Matthew Wilcox @ 2016-01-29 22:45 UTC (permalink / raw) To: Dan Williams Cc: linux-block, linux-nvdimm, Dave Chinner, linux-kernel, Christoph Hellwig, axboe, Jeff Moyer, Jan Kara, linux-fsdevel, Andrew Morton, Ross Zwisler On Fri, Jan 29, 2016 at 07:18:46AM -0800, Dan Williams wrote: > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > index 29189aeace19..b1452c04f1a9 100644 > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -1633,6 +1633,16 @@ struct blk_dax_ctl { > pfn_t pfn; > }; > > +#ifdef CONFIG_FS_DAX > +struct page *read_dax_sector(struct block_device *bdev, sector_t n); > +#else > +static inline struct page *read_dax_sector(struct block_device *bdev, > + sector_t n) > +{ > + return ERR_PTR(-ENXIO); > +} > +#endif > + Can you move this to include/linux/dax.h? I'd like to keep it that all functions in dax.c have a prototype in dax.h. With that change, Reviewed-by: Matthew Wilcox <willy@linux.intel.com> ^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH v2] block: use DAX for partition table reads 2016-01-29 15:18 ` [PATCH 2/2] block: use DAX for partition table reads Dan Williams ` (2 preceding siblings ...) 2016-01-29 22:45 ` Matthew Wilcox @ 2016-01-30 0:33 ` Dan Williams 3 siblings, 0 replies; 9+ messages in thread From: Dan Williams @ 2016-01-30 0:33 UTC (permalink / raw) To: linux-block Cc: linux-nvdimm, Dave Chinner, linux-kernel, Christoph Hellwig, axboe, Jeff Moyer, Jan Kara, linux-fsdevel, Matthew Wilcox, Andrew Morton, Ross Zwisler Avoid populating pagecache when the block device is in DAX mode. Otherwise these page cache entries collide with the fsync/msync implementation and break data durability guarantees. Cc: Jan Kara <jack@suse.com> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Chinner <david@fromorbit.com> Cc: Andrew Morton <akpm@linux-foundation.org> Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com> Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com> Reviewed-by: Matthew Wilcox <willy@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> --- Changes in v2: 1/ Switch from __page_cache_alloc to alloc_pages (Jens) 2/ Move read_dax_sector() declaration to include/linux/dax.h (Willy) 3/ Collect Reviewed-by and Tested-by tags from Willy and Ross. 
block/partition-generic.c | 18 +++++++++++++++--- fs/dax.c | 20 ++++++++++++++++++++ include/linux/dax.h | 11 +++++++++++ 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 746935a5973c..fefd01b496a0 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -16,6 +16,7 @@ #include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/dax.h> #include <linux/blktrace_api.h> #include "partitions/check.h" @@ -550,13 +551,24 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n) { struct address_space *mapping = bdev->bd_inode->i_mapping; + + return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + NULL); +} + +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +{ struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), - NULL); + /* don't populate page cache for dax capable devices */ + if (IS_DAX(bdev->bd_inode)) + page = read_dax_sector(bdev, n); + else + page = read_pagecache_sector(bdev, n); + if (!IS_ERR(page)) { if (PageError(page)) goto fail; diff --git a/fs/dax.c b/fs/dax.c index 4fd6b0c5c6b5..e0e9358baf35 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -58,6 +58,26 @@ static void dax_unmap_atomic(struct block_device *bdev, blk_queue_exit(bdev->bd_queue); } +struct page *read_dax_sector(struct block_device *bdev, sector_t n) +{ + struct page *page = alloc_pages(GFP_KERNEL, 0); + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), + }; + long rc; + + if (!page) + return ERR_PTR(-ENOMEM); + + rc = dax_map_atomic(bdev, &dax); + if (rc < 0) + return ERR_PTR(rc); + memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); + 
dax_unmap_atomic(bdev, &dax); + return page; +} + /* * dax_clear_blocks() is called from within transaction context from XFS, * and hence this means the stack from this point must follow GFP_NOFS diff --git a/include/linux/dax.h b/include/linux/dax.h index 8204c3dc3800..818e45078929 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,17 @@ int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); + +#ifdef CONFIG_FS_DAX +struct page *read_dax_sector(struct block_device *bdev, sector_t n); +#else +static inline struct page *read_dax_sector(struct block_device *bdev, + sector_t n) +{ + return ERR_PTR(-ENXIO); +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, unsigned int flags, get_block_t, dax_iodone_t); ^ permalink raw reply related [flat|nested] 9+ messages in thread
end of thread, other threads:[~2016-01-30 0:33 UTC | newest] Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2016-01-29 15:18 [PATCH 0/2] block: fix raw block device dax support Dan Williams 2016-01-29 15:18 ` [PATCH 1/2] block: revert runtime dax control of the raw block device Dan Williams 2016-01-29 17:54 ` Ross Zwisler 2016-01-29 15:18 ` [PATCH 2/2] block: use DAX for partition table reads Dan Williams 2016-01-29 17:46 ` Jens Axboe 2016-01-29 17:54 ` Dan Williams 2016-01-29 19:24 ` Ross Zwisler 2016-01-29 22:45 ` Matthew Wilcox 2016-01-30 0:33 ` [PATCH v2] " Dan Williams
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).