* [PATCH] dax: allow DAX to look up an inode's block device
@ 2016-02-02 23:11 Ross Zwisler
2016-02-02 23:19 ` Al Viro
0 siblings, 1 reply; 8+ messages in thread
From: Ross Zwisler @ 2016-02-02 23:11 UTC (permalink / raw)
To: linux-kernel
Cc: Jeff Layton, Andrew Morton, linux-nvdimm, xfs, J. Bruce Fields,
Alexander Viro, Jan Kara, linux-fsdevel, Matthew Wilcox,
Ross Zwisler, Dan Williams
There are a number of places in dax.c that look up the struct block_device
associated with an inode. Previously this was done by just using
inode->i_sb->s_bdev. This is correct in some cases, such as when using
ext2 and ext4.
However, for raw block devices and for XFS with a real-time device, the
value in inode->i_sb->s_bdev is not correct. With the code as it is
currently written, an fsync or msync to a DAX enabled raw block device will
cause a NULL pointer dereference kernel BUG. For this to work correctly we
need to ask the block device or filesystem what struct block_device is
appropriate for our inode.
To that end, add a get_bdev(struct inode *) entry point to struct
super_operations. If this function pointer is non-NULL, this notifies DAX
that it needs to use it to look up the correct block_device. If
i_sb->get_bdev() is NULL DAX will default to inode->i_sb->s_bdev.
I added the function to super_operations instead of another alternative
like inode_operations because the function pointer varies by filesystem or
block device, not per inode. I believe that this will also save memory
because there is only one struct super_operations per mounted filesystem
but there could be many struct inode_operations and there is no need to
keep many copies of the same function pointer in memory.
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
fs/block_dev.c | 6 ++++++
fs/dax.c | 20 ++++++++++++++------
fs/xfs/xfs_aops.c | 2 +-
fs/xfs/xfs_aops.h | 1 +
fs/xfs/xfs_super.c | 1 +
include/linux/fs.h | 1 +
6 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index fa0507a..845b049 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -156,6 +156,11 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
+static struct block_device *blkdev_get_bdev(struct inode *inode)
+{
+ return I_BDEV(inode);
+}
+
static struct inode *bdev_file_inode(struct file *file)
{
return file->f_mapping->host;
@@ -569,6 +574,7 @@ static const struct super_operations bdev_sops = {
.alloc_inode = bdev_alloc_inode,
.destroy_inode = bdev_destroy_inode,
.drop_inode = generic_delete_inode,
+ .get_bdev = blkdev_get_bdev,
.evict_inode = bdev_evict_inode,
};
diff --git a/fs/dax.c b/fs/dax.c
index 227974a..c701ea4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,6 +32,14 @@
#include <linux/pfn_t.h>
#include <linux/sizes.h>
+static struct block_device *dax_get_bdev(struct inode *inode)
+{
+ if (inode->i_sb->s_op->get_bdev)
+ return inode->i_sb->s_op->get_bdev(inode);
+ else
+ return inode->i_sb->s_bdev;
+}
+
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
struct request_queue *q = bdev->bd_queue;
@@ -85,7 +93,7 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
*/
int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
{
- struct block_device *bdev = inode->i_sb->s_bdev;
+ struct block_device *bdev = dax_get_bdev(inode);
struct blk_dax_ctl dax = {
.sector = block << (inode->i_blkbits - 9),
.size = _size,
@@ -266,7 +274,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
loff_t end = pos + iov_iter_count(iter);
memset(&bh, 0, sizeof(bh));
- bh.b_bdev = inode->i_sb->s_bdev;
+ bh.b_bdev = dax_get_bdev(inode);
if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
struct address_space *mapping = inode->i_mapping;
@@ -488,7 +496,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
loff_t end)
{
struct inode *inode = mapping->host;
- struct block_device *bdev = inode->i_sb->s_bdev;
+ struct block_device *bdev = dax_get_bdev(inode);
pgoff_t start_index, end_index, pmd_index;
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
@@ -628,7 +636,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
memset(&bh, 0, sizeof(bh));
block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
- bh.b_bdev = inode->i_sb->s_bdev;
+ bh.b_bdev = dax_get_bdev(inode);
bh.b_size = PAGE_SIZE;
repeat:
@@ -847,7 +855,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
}
memset(&bh, 0, sizeof(bh));
- bh.b_bdev = inode->i_sb->s_bdev;
+ bh.b_bdev = dax_get_bdev(inode);
block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
bh.b_size = PMD_SIZE;
@@ -1100,7 +1108,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
BUG_ON((offset + length) > PAGE_CACHE_SIZE);
memset(&bh, 0, sizeof(bh));
- bh.b_bdev = inode->i_sb->s_bdev;
+ bh.b_bdev = dax_get_bdev(inode);
bh.b_size = PAGE_CACHE_SIZE;
err = get_block(inode, index, &bh, 0);
if (err < 0)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 379c089..fc20518 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -55,7 +55,7 @@ xfs_count_page_state(
} while ((bh = bh->b_this_page) != head);
}
-STATIC struct block_device *
+struct block_device *
xfs_find_bdev_for_inode(
struct inode *inode)
{
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f6ffc9a..a4343c6 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -62,5 +62,6 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
extern void xfs_count_page_state(struct page *, int *, int *);
+extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 59c9b7b..26e7051 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1623,6 +1623,7 @@ static const struct super_operations xfs_super_operations = {
.destroy_inode = xfs_fs_destroy_inode,
.evict_inode = xfs_fs_evict_inode,
.drop_inode = xfs_fs_drop_inode,
+ .get_bdev = xfs_find_bdev_for_inode,
.put_super = xfs_fs_put_super,
.sync_fs = xfs_fs_sync_fs,
.freeze_fs = xfs_fs_freeze,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b10002d..5b636eb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1730,6 +1730,7 @@ struct super_operations {
int (*write_inode) (struct inode *, struct writeback_control *wbc);
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
+ struct block_device *(*get_bdev) (struct inode *);
void (*put_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
int (*freeze_super) (struct super_block *);
--
2.5.0
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 8+ messages in thread* Re: [PATCH] dax: allow DAX to look up an inode's block device
2016-02-02 23:11 [PATCH] dax: allow DAX to look up an inode's block device Ross Zwisler
@ 2016-02-02 23:19 ` Al Viro
2016-02-02 23:36 ` Jared Hulbert
2016-02-02 23:38 ` Dan Williams
0 siblings, 2 replies; 8+ messages in thread
From: Al Viro @ 2016-02-02 23:19 UTC (permalink / raw)
To: Ross Zwisler
Cc: Jeff Layton, linux-nvdimm, linux-kernel, xfs, J. Bruce Fields,
Jan Kara, linux-fsdevel, Matthew Wilcox, Andrew Morton,
Dan Williams
On Tue, Feb 02, 2016 at 04:11:42PM -0700, Ross Zwisler wrote:
> However, for raw block devices and for XFS with a real-time device, the
> value in inode->i_sb->s_bdev is not correct. With the code as it is
> currently written, an fsync or msync to a DAX enabled raw block device will
> cause a NULL pointer dereference kernel BUG. For this to work correctly we
> need to ask the block device or filesystem what struct block_device is
> appropriate for our inode.
>
> To that end, add a get_bdev(struct inode *) entry point to struct
> super_operations. If this function pointer is non-NULL, this notifies DAX
> that it needs to use it to look up the correct block_device. If
> i_sb->get_bdev() is NULL DAX will default to inode->i_sb->s_bdev.
Umm... It assumes that bdev will stay pinned for as long as inode is
referenced, presumably? If so, that needs to be documented (and verified
for existing fs instances). In principle, multi-disk fs might want to
support things like "silently move the inodes backed by that disk to other
ones"...
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] dax: allow DAX to look up an inode's block device
2016-02-02 23:19 ` Al Viro
@ 2016-02-02 23:36 ` Jared Hulbert
2016-02-02 23:41 ` Dan Williams
2016-02-02 23:38 ` Dan Williams
1 sibling, 1 reply; 8+ messages in thread
From: Jared Hulbert @ 2016-02-02 23:36 UTC (permalink / raw)
To: Al Viro, Dan Williams
Cc: Andrew Morton, linux-nvdimm, LKML, xfs, J. Bruce Fields, Jan Kara,
Linux FS Devel, Ross Zwisler, Jeff Layton
On Tue, Feb 2, 2016 at 3:19 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Tue, Feb 02, 2016 at 04:11:42PM -0700, Ross Zwisler wrote:
>
> > However, for raw block devices and for XFS with a real-time device, the
> > value in inode->i_sb->s_bdev is not correct. With the code as it is
> > currently written, an fsync or msync to a DAX enabled raw block device will
> > cause a NULL pointer dereference kernel BUG. For this to work correctly we
> > need to ask the block device or filesystem what struct block_device is
> > appropriate for our inode.
> >
> > To that end, add a get_bdev(struct inode *) entry point to struct
> > super_operations. If this function pointer is non-NULL, this notifies DAX
> > that it needs to use it to look up the correct block_device. If
> > i_sb->get_bdev() is NULL DAX will default to inode->i_sb->s_bdev.
>
> Umm... It assumes that bdev will stay pinned for as long as inode is
> referenced, presumably? If so, that needs to be documented (and verified
> for existing fs instances). In principle, multi-disk fs might want to
> support things like "silently move the inodes backed by that disk to other
> ones"...
Dan, This is exactly the kind of thing I'm taking about WRT the
weirder device models and directly calling bdev_direct_access().
Filesystems don't have the monogamous relationship with a device that
is implicitly assumed in DAX, you have to ask the filesystem what the
relationship is and is migrating to, and allow the filesystem to
update DAX when the relationship is changing. As we start to see many
DIMM's and 10s TiB pmem systems this is going be an even bigger deal
as load balancing, wear leveling, and fault tolerance concerned are
inevitably driven by the filesystem.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] dax: allow DAX to look up an inode's block device
2016-02-02 23:36 ` Jared Hulbert
@ 2016-02-02 23:41 ` Dan Williams
2016-02-03 0:33 ` Jared Hulbert
0 siblings, 1 reply; 8+ messages in thread
From: Dan Williams @ 2016-02-02 23:41 UTC (permalink / raw)
To: Jared Hulbert
Cc: Andrew Morton, linux-nvdimm, LKML, XFS Developers,
J. Bruce Fields, Al Viro, Jan Kara, Linux FS Devel, Jeff Layton,
Ross Zwisler
On Tue, Feb 2, 2016 at 3:36 PM, Jared Hulbert <jaredeh@gmail.com> wrote:
> On Tue, Feb 2, 2016 at 3:19 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>
>> On Tue, Feb 02, 2016 at 04:11:42PM -0700, Ross Zwisler wrote:
>>
>> > However, for raw block devices and for XFS with a real-time device, the
>> > value in inode->i_sb->s_bdev is not correct. With the code as it is
>> > currently written, an fsync or msync to a DAX enabled raw block device will
>> > cause a NULL pointer dereference kernel BUG. For this to work correctly we
>> > need to ask the block device or filesystem what struct block_device is
>> > appropriate for our inode.
>> >
>> > To that end, add a get_bdev(struct inode *) entry point to struct
>> > super_operations. If this function pointer is non-NULL, this notifies DAX
>> > that it needs to use it to look up the correct block_device. If
>> > i_sb->get_bdev() is NULL DAX will default to inode->i_sb->s_bdev.
>>
>> Umm... It assumes that bdev will stay pinned for as long as inode is
>> referenced, presumably? If so, that needs to be documented (and verified
>> for existing fs instances). In principle, multi-disk fs might want to
>> support things like "silently move the inodes backed by that disk to other
>> ones"...
>
> Dan, This is exactly the kind of thing I'm taking about WRT the
> weirder device models and directly calling bdev_direct_access().
> Filesystems don't have the monogamous relationship with a device that
> is implicitly assumed in DAX, you have to ask the filesystem what the
> relationship is and is migrating to, and allow the filesystem to
> update DAX when the relationship is changing.
That's precisely what ->get_bdev() does. When the answer
inode->i_sb->s_bdev lookup is invalid, use ->get_bdev().
> As we start to see many
> DIMM's and 10s TiB pmem systems this is going be an even bigger deal
> as load balancing, wear leveling, and fault tolerance concerned are
> inevitably driven by the filesystem.
No, there are no plans on the horizon for an fs to manage these media
specific concerns for persistent memory.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] dax: allow DAX to look up an inode's block device
2016-02-02 23:41 ` Dan Williams
@ 2016-02-03 0:33 ` Jared Hulbert
0 siblings, 0 replies; 8+ messages in thread
From: Jared Hulbert @ 2016-02-03 0:33 UTC (permalink / raw)
To: Dan Williams
Cc: Andrew Morton, linux-nvdimm, LKML, XFS Developers,
J. Bruce Fields, Al Viro, Jan Kara, Linux FS Devel, Jeff Layton,
Ross Zwisler
On Tue, Feb 2, 2016 at 3:41 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Tue, Feb 2, 2016 at 3:36 PM, Jared Hulbert <jaredeh@gmail.com> wrote:
>> On Tue, Feb 2, 2016 at 3:19 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>>
>>> On Tue, Feb 02, 2016 at 04:11:42PM -0700, Ross Zwisler wrote:
>>>
>>> > However, for raw block devices and for XFS with a real-time device, the
>>> > value in inode->i_sb->s_bdev is not correct. With the code as it is
>>> > currently written, an fsync or msync to a DAX enabled raw block device will
>>> > cause a NULL pointer dereference kernel BUG. For this to work correctly we
>>> > need to ask the block device or filesystem what struct block_device is
>>> > appropriate for our inode.
>>> >
>>> > To that end, add a get_bdev(struct inode *) entry point to struct
>>> > super_operations. If this function pointer is non-NULL, this notifies DAX
>>> > that it needs to use it to look up the correct block_device. If
>>> > i_sb->get_bdev() is NULL DAX will default to inode->i_sb->s_bdev.
>>>
>>> Umm... It assumes that bdev will stay pinned for as long as inode is
>>> referenced, presumably? If so, that needs to be documented (and verified
>>> for existing fs instances). In principle, multi-disk fs might want to
>>> support things like "silently move the inodes backed by that disk to other
>>> ones"...
>>
>> Dan, This is exactly the kind of thing I'm taking about WRT the
>> weirder device models and directly calling bdev_direct_access().
>> Filesystems don't have the monogamous relationship with a device that
>> is implicitly assumed in DAX, you have to ask the filesystem what the
>> relationship is and is migrating to, and allow the filesystem to
>> update DAX when the relationship is changing.
>
> That's precisely what ->get_bdev() does. When the answer
> inode->i_sb->s_bdev lookup is invalid, use ->get_bdev().
>
>> As we start to see many
>> DIMM's and 10s TiB pmem systems this is going be an even bigger deal
>> as load balancing, wear leveling, and fault tolerance concerned are
>> inevitably driven by the filesystem.
>
> No, there are no plans on the horizon for an fs to manage these media
> specific concerns for persistent memory.
So the filesystem is now directly in charge of mapping user pages to
physical memory. The filesystem is effectively bypassing NUMA and
zones and all that stuff that tries to balance memory bus and QPI
traffic etc. You don't think the filesystem will therefore be in
charge of memory bus hotspots?
Alright. We can just agree to disagree on that point.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] dax: allow DAX to look up an inode's block device
2016-02-02 23:19 ` Al Viro
2016-02-02 23:36 ` Jared Hulbert
@ 2016-02-02 23:38 ` Dan Williams
2016-02-02 23:39 ` Dan Williams
1 sibling, 1 reply; 8+ messages in thread
From: Dan Williams @ 2016-02-02 23:38 UTC (permalink / raw)
To: Al Viro
Cc: Jeff Layton, linux-nvdimm, linux-kernel@vger.kernel.org,
XFS Developers, J. Bruce Fields, inux-btrfs, Jan Kara,
linux-fsdevel, Matthew Wilcox, Ross Zwisler, Andrew Morton
[ adding btrfs ]
On Tue, Feb 2, 2016 at 3:19 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Tue, Feb 02, 2016 at 04:11:42PM -0700, Ross Zwisler wrote:
>
>> However, for raw block devices and for XFS with a real-time device, the
>> value in inode->i_sb->s_bdev is not correct. With the code as it is
>> currently written, an fsync or msync to a DAX enabled raw block device will
>> cause a NULL pointer dereference kernel BUG. For this to work correctly we
>> need to ask the block device or filesystem what struct block_device is
>> appropriate for our inode.
>>
>> To that end, add a get_bdev(struct inode *) entry point to struct
>> super_operations. If this function pointer is non-NULL, this notifies DAX
>> that it needs to use it to look up the correct block_device. If
>> i_sb->get_bdev() is NULL DAX will default to inode->i_sb->s_bdev.
>
> Umm... It assumes that bdev will stay pinned for as long as inode is
> referenced, presumably? If so, that needs to be documented (and verified
> for existing fs instances). In principle, multi-disk fs might want to
> support things like "silently move the inodes backed by that disk to other
> ones"...
I assume btrfs is the only fs we have that might reassign the bdev for
a given inode on the fly? Hopefully we don't need anything stronger
than rcu_read_lock() to pin the result as valid.
At least in this case the initial user is dax-fsync where the
->get_bdev() answer should be static for the life of the inode, and
btrfs does not currently interface with dax. But yes, we need to get
the expected semantics clear.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] dax: allow DAX to look up an inode's block device
2016-02-02 23:38 ` Dan Williams
@ 2016-02-02 23:39 ` Dan Williams
2016-02-02 23:52 ` Matthew Wilcox
0 siblings, 1 reply; 8+ messages in thread
From: Dan Williams @ 2016-02-02 23:39 UTC (permalink / raw)
To: Al Viro
Cc: Jeff Layton, linux-nvdimm, linux-kernel@vger.kernel.org,
XFS Developers, J. Bruce Fields, Jan Kara, linux-fsdevel,
Matthew Wilcox, Ross Zwisler, Andrew Morton, linux-btrfs
[ adding btrfs, resend with the correct list address ]
On Tue, Feb 2, 2016 at 3:19 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Tue, Feb 02, 2016 at 04:11:42PM -0700, Ross Zwisler wrote:
>
>> However, for raw block devices and for XFS with a real-time device, the
>> value in inode->i_sb->s_bdev is not correct. With the code as it is
>> currently written, an fsync or msync to a DAX enabled raw block device will
>> cause a NULL pointer dereference kernel BUG. For this to work correctly we
>> need to ask the block device or filesystem what struct block_device is
>> appropriate for our inode.
>>
>> To that end, add a get_bdev(struct inode *) entry point to struct
>> super_operations. If this function pointer is non-NULL, this notifies DAX
>> that it needs to use it to look up the correct block_device. If
>> i_sb->get_bdev() is NULL DAX will default to inode->i_sb->s_bdev.
>
> Umm... It assumes that bdev will stay pinned for as long as inode is
> referenced, presumably? If so, that needs to be documented (and verified
> for existing fs instances). In principle, multi-disk fs might want to
> support things like "silently move the inodes backed by that disk to other
> ones"...
I assume btrfs is the only fs we have that might reassign the bdev for
a given inode on the fly? Hopefully we don't need anything stronger
than rcu_read_lock() to pin the result as valid.
At least in this case the initial user is dax-fsync where the
->get_bdev() answer should be static for the life of the inode, and
btrfs does not currently interface with dax. But yes, we need to get
the expected semantics clear.
On Tue, Feb 2, 2016 at 3:38 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> [ adding btrfs ]
>
> On Tue, Feb 2, 2016 at 3:19 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>> On Tue, Feb 02, 2016 at 04:11:42PM -0700, Ross Zwisler wrote:
>>
>>> However, for raw block devices and for XFS with a real-time device, the
>>> value in inode->i_sb->s_bdev is not correct. With the code as it is
>>> currently written, an fsync or msync to a DAX enabled raw block device will
>>> cause a NULL pointer dereference kernel BUG. For this to work correctly we
>>> need to ask the block device or filesystem what struct block_device is
>>> appropriate for our inode.
>>>
>>> To that end, add a get_bdev(struct inode *) entry point to struct
>>> super_operations. If this function pointer is non-NULL, this notifies DAX
>>> that it needs to use it to look up the correct block_device. If
>>> i_sb->get_bdev() is NULL DAX will default to inode->i_sb->s_bdev.
>>
>> Umm... It assumes that bdev will stay pinned for as long as inode is
>> referenced, presumably? If so, that needs to be documented (and verified
>> for existing fs instances). In principle, multi-disk fs might want to
>> support things like "silently move the inodes backed by that disk to other
>> ones"...
>
> I assume btrfs is the only fs we have that might reassign the bdev for
> a given inode on the fly? Hopefully we don't need anything stronger
> than rcu_read_lock() to pin the result as valid.
>
> At least in this case the initial user is dax-fsync where the
> ->get_bdev() answer should be static for the life of the inode, and
> btrfs does not currently interface with dax. But yes, we need to get
> the expected semantics clear.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] dax: allow DAX to look up an inode's block device
2016-02-02 23:39 ` Dan Williams
@ 2016-02-02 23:52 ` Matthew Wilcox
0 siblings, 0 replies; 8+ messages in thread
From: Matthew Wilcox @ 2016-02-02 23:52 UTC (permalink / raw)
To: Dan Williams
Cc: Jeff Layton, linux-nvdimm, linux-kernel@vger.kernel.org,
XFS Developers, J. Bruce Fields, Al Viro, Jan Kara, linux-fsdevel,
Ross Zwisler, Andrew Morton, linux-btrfs
On Tue, Feb 02, 2016 at 03:39:15PM -0800, Dan Williams wrote:
> On Tue, Feb 2, 2016 at 3:19 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > On Tue, Feb 02, 2016 at 04:11:42PM -0700, Ross Zwisler wrote:
> >> However, for raw block devices and for XFS with a real-time device, the
> >> value in inode->i_sb->s_bdev is not correct. With the code as it is
> >> currently written, an fsync or msync to a DAX enabled raw block device will
> >> cause a NULL pointer dereference kernel BUG. For this to work correctly we
> >> need to ask the block device or filesystem what struct block_device is
> >> appropriate for our inode.
> >>
> >> To that end, add a get_bdev(struct inode *) entry point to struct
> >> super_operations. If this function pointer is non-NULL, this notifies DAX
> >> that it needs to use it to look up the correct block_device. If
> >> i_sb->get_bdev() is NULL DAX will default to inode->i_sb->s_bdev.
> >
> > Umm... It assumes that bdev will stay pinned for as long as inode is
> > referenced, presumably? If so, that needs to be documented (and verified
> > for existing fs instances). In principle, multi-disk fs might want to
> > support things like "silently move the inodes backed by that disk to other
> > ones"...
>
> I assume btrfs is the only fs we have that might reassign the bdev for
> a given inode on the fly? Hopefully we don't need anything stronger
> than rcu_read_lock() to pin the result as valid.
>
> At least in this case the initial user is dax-fsync where the
> ->get_bdev() answer should be static for the life of the inode, and
> btrfs does not currently interface with dax. But yes, we need to get
> the expected semantics clear.
Let's be clear though. ->get_bdev is a temporary hack. The need for
it goes away when DAX doesn't rely on being on a block_device any more.
I don't expect it to live longer than six months.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2016-02-03 0:33 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-02-02 23:11 [PATCH] dax: allow DAX to look up an inode's block device Ross Zwisler
2016-02-02 23:19 ` Al Viro
2016-02-02 23:36 ` Jared Hulbert
2016-02-02 23:41 ` Dan Williams
2016-02-03 0:33 ` Jared Hulbert
2016-02-02 23:38 ` Dan Williams
2016-02-02 23:39 ` Dan Williams
2016-02-02 23:52 ` Matthew Wilcox
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox