* [PATCH 28/29] xfs: Convert to bdev_open_by_path()
2023-08-11 11:04 [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle Jan Kara
@ 2023-08-11 11:04 ` Jan Kara
[not found] ` <CGME20230814102748eucas1p269b8a53ed09fae1eb57dce3d2a7de752@eucas1p2.samsung.com>
2023-08-11 12:27 ` [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle Christoph Hellwig
2023-08-25 1:58 ` Al Viro
2 siblings, 1 reply; 11+ messages in thread
From: Jan Kara @ 2023-08-11 11:04 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, Christoph Hellwig, Jan Kara, Darrick J. Wong,
linux-xfs
Convert xfs to use bdev_open_by_path() and pass the handle around.
CC: "Darrick J. Wong" <djwong@kernel.org>
CC: linux-xfs@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
fs/xfs/xfs_buf.c | 11 +++++-----
fs/xfs/xfs_buf.h | 3 ++-
fs/xfs/xfs_super.c | 54 +++++++++++++++++++++++++---------------------
3 files changed, 37 insertions(+), 31 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 15d1e5a7c2d3..461a5fb6155b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1989,7 +1989,7 @@ xfs_setsize_buftarg_early(
struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev)
+ struct bdev_handle *bdev_handle)
{
xfs_buftarg_t *btp;
const struct dax_holder_operations *ops = NULL;
@@ -2000,9 +2000,10 @@ xfs_alloc_buftarg(
btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
btp->bt_mount = mp;
- btp->bt_dev = bdev->bd_dev;
- btp->bt_bdev = bdev;
- btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
+ btp->bt_bdev_handle = bdev_handle;
+ btp->bt_dev = bdev_handle->bdev->bd_dev;
+ btp->bt_bdev = bdev_handle->bdev;
+ btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
/*
@@ -2012,7 +2013,7 @@ xfs_alloc_buftarg(
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
- if (xfs_setsize_buftarg_early(btp, bdev))
+ if (xfs_setsize_buftarg_early(btp, btp->bt_bdev))
goto error_free;
if (list_lru_init(&btp->bt_lru))
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 549c60942208..f6418c1312f5 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -92,6 +92,7 @@ typedef unsigned int xfs_buf_flags_t;
*/
typedef struct xfs_buftarg {
dev_t bt_dev;
+ struct bdev_handle *bt_bdev_handle;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
u64 bt_dax_part_off;
@@ -351,7 +352,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
* Handling of buftargs.
*/
struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
- struct block_device *bdev);
+ struct bdev_handle *bdev_handle);
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 5340f2dc28bd..6189f726b309 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -381,14 +381,15 @@ STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
const char *name,
- struct block_device **bdevp)
+ struct bdev_handle **handlep)
{
int error = 0;
- *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
- mp->m_super, &fs_holder_ops);
- if (IS_ERR(*bdevp)) {
- error = PTR_ERR(*bdevp);
+ *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ mp->m_super, &fs_holder_ops);
+ if (IS_ERR(*handlep)) {
+ error = PTR_ERR(*handlep);
+ *handlep = NULL;
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
}
@@ -397,11 +398,10 @@ xfs_blkdev_get(
STATIC void
xfs_blkdev_put(
- struct xfs_mount *mp,
- struct block_device *bdev)
+ struct bdev_handle *handle)
{
- if (bdev)
- blkdev_put(bdev, mp->m_super);
+ if (handle)
+ bdev_release(handle);
}
STATIC void
@@ -409,16 +409,18 @@ xfs_close_devices(
struct xfs_mount *mp)
{
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
- struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
+ struct bdev_handle *logdev_handle =
+ mp->m_logdev_targp->bt_bdev_handle;
xfs_free_buftarg(mp->m_logdev_targp);
- xfs_blkdev_put(mp, logdev);
+ xfs_blkdev_put(logdev_handle);
}
if (mp->m_rtdev_targp) {
- struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
+ struct bdev_handle *rtdev_handle =
+ mp->m_rtdev_targp->bt_bdev_handle;
xfs_free_buftarg(mp->m_rtdev_targp);
- xfs_blkdev_put(mp, rtdev);
+ xfs_blkdev_put(rtdev_handle);
}
xfs_free_buftarg(mp->m_ddev_targp);
}
@@ -439,7 +441,7 @@ xfs_open_devices(
{
struct super_block *sb = mp->m_super;
struct block_device *ddev = sb->s_bdev;
- struct block_device *logdev = NULL, *rtdev = NULL;
+ struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL;
int error;
/*
@@ -452,17 +454,19 @@ xfs_open_devices(
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
- error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
+ error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
if (error)
goto out_relock;
}
if (mp->m_rtname) {
- error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
+ error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle);
if (error)
goto out_close_logdev;
- if (rtdev == ddev || rtdev == logdev) {
+ if (rtdev_handle->bdev == ddev ||
+ (logdev_handle &&
+ rtdev_handle->bdev == logdev_handle->bdev)) {
xfs_warn(mp,
"Cannot mount filesystem with identical rtdev and ddev/logdev.");
error = -EINVAL;
@@ -474,18 +478,18 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
- if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
+ if (rtdev_handle) {
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
- if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
+ if (logdev_handle && logdev_handle->bdev != ddev) {
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
@@ -503,10 +507,10 @@ xfs_open_devices(
out_free_ddev_targ:
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
- xfs_blkdev_put(mp, rtdev);
+ xfs_blkdev_put(rtdev_handle);
out_close_logdev:
- if (logdev && logdev != ddev)
- xfs_blkdev_put(mp, logdev);
+ if (logdev_handle && logdev_handle->bdev != ddev)
+ xfs_blkdev_put(logdev_handle);
goto out_relock;
}
--
2.35.3
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-11 11:04 [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle Jan Kara
2023-08-11 11:04 ` [PATCH 28/29] xfs: Convert to bdev_open_by_path() Jan Kara
@ 2023-08-11 12:27 ` Christoph Hellwig
2023-08-25 1:58 ` Al Viro
2 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2023-08-11 12:27 UTC (permalink / raw)
To: Jan Kara
Cc: linux-fsdevel, linux-block, Christoph Hellwig, Alasdair Kergon,
Andrew Morton, Anna Schumaker, Chao Yu, Christian Borntraeger,
Darrick J. Wong, Dave Kleikamp, David Sterba, dm-devel, drbd-dev,
Gao Xiang, Jack Wang, Jaegeuk Kim, jfs-discussion, Joern Engel,
Joseph Qi, Kent Overstreet, linux-bcache, linux-btrfs,
linux-erofs, linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd,
linux-nfs, linux-nilfs, linux-nvme, linux-pm, linux-raid,
linux-s390, linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer,
Minchan Kim, ocfs2-devel, reiserfs-devel, Sergey Senozhatsky,
Song Liu, Sven Schnelle, target-devel, Ted Tso, Trond Myklebust,
xen-devel
Except for a mostly cosmetic nitpick this looks good to me:
Acked-by: Christoph Hellwig <hch@lst.de>
That's not eactly the deep review I'd like to do, but as I'm about to
head out for vacation that's probably as good as it gets.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-11 11:04 [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle Jan Kara
2023-08-11 11:04 ` [PATCH 28/29] xfs: Convert to bdev_open_by_path() Jan Kara
2023-08-11 12:27 ` [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle Christoph Hellwig
@ 2023-08-25 1:58 ` Al Viro
2023-08-25 13:47 ` Jan Kara
2 siblings, 1 reply; 11+ messages in thread
From: Al Viro @ 2023-08-25 1:58 UTC (permalink / raw)
To: Jan Kara
Cc: linux-fsdevel, linux-block, Christoph Hellwig, Alasdair Kergon,
Andrew Morton, Anna Schumaker, Chao Yu, Christian Borntraeger,
Darrick J. Wong, Dave Kleikamp, David Sterba, dm-devel, drbd-dev,
Gao Xiang, Jack Wang, Jaegeuk Kim, jfs-discussion, Joern Engel,
Joseph Qi, Kent Overstreet, linux-bcache, linux-btrfs,
linux-erofs, linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd,
linux-nfs, linux-nilfs, linux-nvme, linux-pm, linux-raid,
linux-s390, linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer,
Minchan Kim, ocfs2-devel, reiserfs-devel, Sergey Senozhatsky,
Song Liu, Sven Schnelle, target-devel, Ted Tso, Trond Myklebust,
xen-devel
On Fri, Aug 11, 2023 at 01:04:31PM +0200, Jan Kara wrote:
> Hello,
>
> this is a v2 of the patch series which implements the idea of blkdev_get_by_*()
> calls returning bdev_handle which is then passed to blkdev_put() [1]. This
> makes the get and put calls for bdevs more obviously matching and allows us to
> propagate context from get to put without having to modify all the users
> (again!). In particular I need to propagate used open flags to blkdev_put() to
> be able count writeable opens and add support for blocking writes to mounted
> block devices. I'll send that series separately.
>
> The series is based on Christian's vfs tree as of yesterday as there is quite
> some overlap. Patches have passed some reasonable testing - I've tested block
> changes, md, dm, bcache, xfs, btrfs, ext4, swap. This obviously doesn't cover
> everything so I'd like to ask respective maintainers to review / test their
> changes. Thanks! I've pushed out the full branch to:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git bdev_handle
>
> to ease review / testing.
Hmm... Completely Insane Idea(tm): how about turning that thing inside out and
having your bdev_open_by... return an actual opened struct file?
After all, we do that for sockets and pipes just fine and that's a whole lot
hotter area.
Suppose we leave blkdev_open()/blkdev_release() as-is. No need to mess with
what we have for normal opened files for block devices. And have block_open_by_dev()
that would find bdev, etc., same yours does and shove it into anon file.
Paired with plain fput() - no need to bother with new primitives for closing.
With a helper returning I_BDEV(bdev_file_inode(file)) to get from those to bdev.
NOTE: I'm not suggesting replacing ->s_bdev with struct file * if we do that -
we want that value cached, obviously. Just store both...
Not saying it's a good idea, but... might be interesting to look into.
Comments?
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-25 1:58 ` Al Viro
@ 2023-08-25 13:47 ` Jan Kara
2023-08-26 2:28 ` Al Viro
` (2 more replies)
0 siblings, 3 replies; 11+ messages in thread
From: Jan Kara @ 2023-08-25 13:47 UTC (permalink / raw)
To: Al Viro
Cc: Jan Kara, linux-fsdevel, linux-block, Christoph Hellwig,
Alasdair Kergon, Andrew Morton, Anna Schumaker, Chao Yu,
Christian Borntraeger, Darrick J. Wong, Dave Kleikamp,
David Sterba, dm-devel, drbd-dev, Gao Xiang, Jack Wang,
Jaegeuk Kim, jfs-discussion, Joern Engel, Joseph Qi,
Kent Overstreet, linux-bcache, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd, linux-nfs,
linux-nilfs, linux-nvme, linux-pm, linux-raid, linux-s390,
linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer, Minchan Kim,
ocfs2-devel, reiserfs-devel, Sergey Senozhatsky, Song Liu,
Sven Schnelle, target-devel, Ted Tso, Trond Myklebust, xen-devel,
Jens Axboe, Christian Brauner
On Fri 25-08-23 02:58:43, Al Viro wrote:
> On Fri, Aug 11, 2023 at 01:04:31PM +0200, Jan Kara wrote:
> > Hello,
> >
> > this is a v2 of the patch series which implements the idea of blkdev_get_by_*()
> > calls returning bdev_handle which is then passed to blkdev_put() [1]. This
> > makes the get and put calls for bdevs more obviously matching and allows us to
> > propagate context from get to put without having to modify all the users
> > (again!). In particular I need to propagate used open flags to blkdev_put() to
> > be able count writeable opens and add support for blocking writes to mounted
> > block devices. I'll send that series separately.
> >
> > The series is based on Christian's vfs tree as of yesterday as there is quite
> > some overlap. Patches have passed some reasonable testing - I've tested block
> > changes, md, dm, bcache, xfs, btrfs, ext4, swap. This obviously doesn't cover
> > everything so I'd like to ask respective maintainers to review / test their
> > changes. Thanks! I've pushed out the full branch to:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git bdev_handle
> >
> > to ease review / testing.
>
> Hmm... Completely Insane Idea(tm): how about turning that thing inside out and
> having your bdev_open_by... return an actual opened struct file?
>
> After all, we do that for sockets and pipes just fine and that's a whole lot
> hotter area.
>
> Suppose we leave blkdev_open()/blkdev_release() as-is. No need to mess with
> what we have for normal opened files for block devices. And have block_open_by_dev()
> that would find bdev, etc., same yours does and shove it into anon file.
>
> Paired with plain fput() - no need to bother with new primitives for closing.
> With a helper returning I_BDEV(bdev_file_inode(file)) to get from those to bdev.
>
> NOTE: I'm not suggesting replacing ->s_bdev with struct file * if we do that -
> we want that value cached, obviously. Just store both...
>
> Not saying it's a good idea, but... might be interesting to look into.
> Comments?
I can see the appeal of not having to introduce the new bdev_handle type
and just using struct file which unifies in-kernel and userspace block
device opens. But I can see downsides too - the last fput() happening from
task work makes me a bit nervous whether it will not break something
somewhere with exclusive bdev opens. Getting from struct file to bdev is
somewhat harder but I guess a helper like F_BDEV() would solve that just
fine.
So besides my last fput() worry about I think this could work and would be
probably a bit nicer than what I have. But before going and redoing the whole
series let me gather some more feedback so that we don't go back and forth.
Christoph, Christian, Jens, any opinion?
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-25 13:47 ` Jan Kara
@ 2023-08-26 2:28 ` Al Viro
2023-08-28 14:27 ` Christoph Hellwig
2023-08-28 13:20 ` Christian Brauner
2023-08-28 14:22 ` Christoph Hellwig
2 siblings, 1 reply; 11+ messages in thread
From: Al Viro @ 2023-08-26 2:28 UTC (permalink / raw)
To: Jan Kara
Cc: linux-fsdevel, linux-block, Christoph Hellwig, Alasdair Kergon,
Andrew Morton, Anna Schumaker, Chao Yu, Christian Borntraeger,
Darrick J. Wong, Dave Kleikamp, David Sterba, dm-devel, drbd-dev,
Gao Xiang, Jack Wang, Jaegeuk Kim, jfs-discussion, Joern Engel,
Joseph Qi, Kent Overstreet, linux-bcache, linux-btrfs,
linux-erofs, linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd,
linux-nfs, linux-nilfs, linux-nvme, linux-pm, linux-raid,
linux-s390, linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer,
Minchan Kim, ocfs2-devel, reiserfs-devel, Sergey Senozhatsky,
Song Liu, Sven Schnelle, target-devel, Ted Tso, Trond Myklebust,
xen-devel, Jens Axboe, Christian Brauner
On Fri, Aug 25, 2023 at 03:47:56PM +0200, Jan Kara wrote:
> I can see the appeal of not having to introduce the new bdev_handle type
> and just using struct file which unifies in-kernel and userspace block
> device opens. But I can see downsides too - the last fput() happening from
> task work makes me a bit nervous whether it will not break something
> somewhere with exclusive bdev opens. Getting from struct file to bdev is
> somewhat harder but I guess a helper like F_BDEV() would solve that just
> fine.
>
> So besides my last fput() worry about I think this could work and would be
> probably a bit nicer than what I have. But before going and redoing the whole
> series let me gather some more feedback so that we don't go back and forth.
> Christoph, Christian, Jens, any opinion?
Redoing is not an issue - it can be done on top of your series just
as well. Async behaviour of fput() might be, but... need to look
through the actual users; for a lot of them it's perfectly fine.
FWIW, from a cursory look there appears to be a missing primitive: take
an opened bdev (or bdev_handle, with your variant, or opened file if we
go that way eventually) and claim it.
I mean, look at claim_swapfile() for example:
p->bdev = blkdev_get_by_dev(inode->i_rdev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
if (IS_ERR(p->bdev)) {
error = PTR_ERR(p->bdev);
p->bdev = NULL;
return error;
}
p->old_block_size = block_size(p->bdev);
error = set_blocksize(p->bdev, PAGE_SIZE);
if (error < 0)
return error;
we already have the file opened, and we keep it opened all the way until
the swapoff(2); here we have noticed that it's a block device and we
* open the fucker again (by device number), this time claiming
it with our swap_info_struct as holder, to be closed at swapoff(2) time
(just before we close the file)
* flip the block size to PAGE_SIZE, to be reverted at swapoff(2)
time That really looks like it ought to be
* take the opened file, see that it's a block device
* try to claim it with that holder
* on success, flip the block size
with close_filp() in the swapoff(2) (or failure exit path in swapon(2))
doing what it would've done for an O_EXCL opened block device.
The only difference from O_EXCL userland open is that here we would
end up with holder pointing not to struct file in question, but to our
swap_info_struct. It will do the right thing.
This extra open is entirely due to "well, we need to claim it and the
primitive that does that happens to be tied to opening"; feels rather
counter-intuitive.
For that matter, we could add an explicit "unclaim" primitive - might
be easier to follow. That would add another example where that could
be used - in blkdev_bszset() we have an opened block device (it's an
ioctl, after all), we want to change block size and we *really* don't
want to have that happen under a mounted filesystem. So if it's not
opened exclusive, we do a temporary exclusive open of own and act on
that instead. Might as well go for a temporary claim...
BTW, what happens if two threads call ioctl(fd, BLKBSZSET, &n)
for the same descriptor that happens to have been opened O_EXCL?
Without O_EXCL they would've been unable to claim the sucker at the same
time - the holder we are using is the address of a function argument,
i.e. something that points to kernel stack of the caller. Those would
conflict and we either get set_blocksize() calls fully serialized, or
one of the callers would eat -EBUSY. Not so in "opened with O_EXCL"
case - they can very well overlap and IIRC set_blocksize() does *not*
expect that kind of crap... It's all under CAP_SYS_ADMIN, so it's not
as if it was a meaningful security hole anyway, but it does look fishy.
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-26 2:28 ` Al Viro
@ 2023-08-28 14:27 ` Christoph Hellwig
0 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2023-08-28 14:27 UTC (permalink / raw)
To: Al Viro
Cc: Jan Kara, linux-fsdevel, linux-block, Christoph Hellwig,
Alasdair Kergon, Andrew Morton, Anna Schumaker, Chao Yu,
Christian Borntraeger, Darrick J. Wong, Dave Kleikamp,
David Sterba, dm-devel, drbd-dev, Gao Xiang, Jack Wang,
Jaegeuk Kim, jfs-discussion, Joern Engel, Joseph Qi,
Kent Overstreet, linux-bcache, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd, linux-nfs,
linux-nilfs, linux-nvme, linux-pm, linux-raid, linux-s390,
linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer, Minchan Kim,
ocfs2-devel, reiserfs-devel, Sergey Senozhatsky, Song Liu,
Sven Schnelle, target-devel, Ted Tso, Trond Myklebust, xen-devel,
Jens Axboe, Christian Brauner
On Sat, Aug 26, 2023 at 03:28:52AM +0100, Al Viro wrote:
> I mean, look at claim_swapfile() for example:
> p->bdev = blkdev_get_by_dev(inode->i_rdev,
> FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
> if (IS_ERR(p->bdev)) {
> error = PTR_ERR(p->bdev);
> p->bdev = NULL;
> return error;
> }
> p->old_block_size = block_size(p->bdev);
> error = set_blocksize(p->bdev, PAGE_SIZE);
> if (error < 0)
> return error;
> we already have the file opened, and we keep it opened all the way until
> the swapoff(2); here we have noticed that it's a block device and we
> * open the fucker again (by device number), this time claiming
> it with our swap_info_struct as holder, to be closed at swapoff(2) time
> (just before we close the file)
Note that some drivers look at FMODE_EXCL/BLK_OPEN_EXCL in ->open.
These are probably bogus and maybe we want to kill them, but that will
need an audit first.
> BTW, what happens if two threads call ioctl(fd, BLKBSZSET, &n)
> for the same descriptor that happens to have been opened O_EXCL?
> Without O_EXCL they would've been unable to claim the sucker at the same
> time - the holder we are using is the address of a function argument,
> i.e. something that points to kernel stack of the caller. Those would
> conflict and we either get set_blocksize() calls fully serialized, or
> one of the callers would eat -EBUSY. Not so in "opened with O_EXCL"
> case - they can very well overlap and IIRC set_blocksize() does *not*
> expect that kind of crap... It's all under CAP_SYS_ADMIN, so it's not
> as if it was a meaningful security hole anyway, but it does look fishy.
The user get to keep the pieces.. BLKBSZSET is kinda bogus anyway
as the soft blocksize only matters for buffer_head-like I/O, and
there only for file systems. Not idea why anyone would set it manually.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-25 13:47 ` Jan Kara
2023-08-26 2:28 ` Al Viro
@ 2023-08-28 13:20 ` Christian Brauner
2023-08-28 14:22 ` Christoph Hellwig
2 siblings, 0 replies; 11+ messages in thread
From: Christian Brauner @ 2023-08-28 13:20 UTC (permalink / raw)
To: Jan Kara
Cc: Al Viro, linux-fsdevel, linux-block, Christoph Hellwig,
Alasdair Kergon, Andrew Morton, Anna Schumaker, Chao Yu,
Christian Borntraeger, Darrick J. Wong, Dave Kleikamp,
David Sterba, dm-devel, drbd-dev, Gao Xiang, Jack Wang,
Jaegeuk Kim, jfs-discussion, Joern Engel, Joseph Qi,
Kent Overstreet, linux-bcache, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd, linux-nfs,
linux-nilfs, linux-nvme, linux-pm, linux-raid, linux-s390,
linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer, Minchan Kim,
ocfs2-devel, reiserfs-devel, Sergey Senozhatsky, Song Liu,
Sven Schnelle, target-devel, Ted Tso, Trond Myklebust, xen-devel,
Jens Axboe
> So besides my last fput() worry about I think this could work and would be
> probably a bit nicer than what I have. But before going and redoing the whole
> series let me gather some more feedback so that we don't go back and forth.
> Christoph, Christian, Jens, any opinion?
I'll be a bit under water for the next few days, I expect but I'll get
back to this. I think not making you redo this whole thing from scratch
is what I'd prefer unless there's really clear advantages. But I don't
want to offer a haphazard opinion in the middle of the merge window.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-25 13:47 ` Jan Kara
2023-08-26 2:28 ` Al Viro
2023-08-28 13:20 ` Christian Brauner
@ 2023-08-28 14:22 ` Christoph Hellwig
2 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2023-08-28 14:22 UTC (permalink / raw)
To: Jan Kara
Cc: Al Viro, linux-fsdevel, linux-block, Christoph Hellwig,
Alasdair Kergon, Andrew Morton, Anna Schumaker, Chao Yu,
Christian Borntraeger, Darrick J. Wong, Dave Kleikamp,
David Sterba, dm-devel, drbd-dev, Gao Xiang, Jack Wang,
Jaegeuk Kim, jfs-discussion, Joern Engel, Joseph Qi,
Kent Overstreet, linux-bcache, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd, linux-nfs,
linux-nilfs, linux-nvme, linux-pm, linux-raid, linux-s390,
linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer, Minchan Kim,
ocfs2-devel, reiserfs-devel, Sergey Senozhatsky, Song Liu,
Sven Schnelle, target-devel, Ted Tso, Trond Myklebust, xen-devel,
Jens Axboe, Christian Brauner
On Fri, Aug 25, 2023 at 03:47:56PM +0200, Jan Kara wrote:
> I can see the appeal of not having to introduce the new bdev_handle type
> and just using struct file which unifies in-kernel and userspace block
> device opens. But I can see downsides too - the last fput() happening from
> task work makes me a bit nervous whether it will not break something
> somewhere with exclusive bdev opens. Getting from struct file to bdev is
> somewhat harder but I guess a helper like F_BDEV() would solve that just
> fine.
>
> So besides my last fput() worry about I think this could work and would be
> probably a bit nicer than what I have. But before going and redoing the whole
> series let me gather some more feedback so that we don't go back and forth.
> Christoph, Christian, Jens, any opinion?
I did think about the file a bit. The fact that we'd need something
like an anon_file for the by_dev open was always a huge turn off for
me, but maybe my concern is overblown. Having a struct file would
actually be really useful for a bunch of users.
^ permalink raw reply [flat|nested] 11+ messages in thread