* [PATCH 25/29] nfs/blocklayout: Convert to use bdev_open_by_dev/path()
2023-08-11 11:04 [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle Jan Kara
@ 2023-08-11 11:04 ` Jan Kara
2023-08-11 12:27 ` [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle Christoph Hellwig
2023-08-25 1:58 ` Al Viro
2 siblings, 0 replies; 9+ messages in thread
From: Jan Kara @ 2023-08-11 11:04 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, Christoph Hellwig, Jan Kara, linux-nfs,
Trond Myklebust, Anna Schumaker
Convert block device handling to use bdev_open_by_dev/path() and pass
the handle around.
CC: linux-nfs@vger.kernel.org
CC: Trond Myklebust <trond.myklebust@hammerspace.com>
CC: Anna Schumaker <anna@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
fs/nfs/blocklayout/blocklayout.h | 2 +-
fs/nfs/blocklayout/dev.c | 76 ++++++++++++++++----------------
2 files changed, 38 insertions(+), 40 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 716bc75e9ed2..b4294a8aa2d4 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -108,7 +108,7 @@ struct pnfs_block_dev {
struct pnfs_block_dev *children;
u64 chunk_size;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
u64 disk_offset;
u64 pr_key;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 70f5563a8e81..0b57b8a4b7e2 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev)
} else {
if (dev->pr_registered) {
const struct pr_ops *ops =
- dev->bdev->bd_disk->fops->pr_ops;
+ dev->bdev_handle->bdev->bd_disk->fops->pr_ops;
int error;
- error = ops->pr_register(dev->bdev, dev->pr_key, 0,
- false);
+ error = ops->pr_register(dev->bdev_handle->bdev,
+ dev->pr_key, 0, false);
if (error)
pr_err("failed to unregister PR key.\n");
}
- if (dev->bdev)
- blkdev_put(dev->bdev, NULL);
+ if (dev->bdev_handle)
+ bdev_release(dev->bdev_handle);
}
}
@@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
map->start = dev->start;
map->len = dev->len;
map->disk_offset = dev->disk_offset;
- map->bdev = dev->bdev;
+ map->bdev = dev->bdev_handle->bdev;
return true;
}
@@ -236,28 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
- bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
- NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ NULL, NULL);
+ if (IS_ERR(bdev_handle)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
- MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
- return PTR_ERR(bdev);
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle));
+ return PTR_ERR(bdev_handle);
}
- d->bdev = bdev;
-
-
- d->len = bdev_nr_bytes(d->bdev);
+ d->bdev_handle = bdev_handle;
+ d->len = bdev_nr_bytes(bdev_handle->bdev);
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
- d->bdev->bd_disk->disk_name);
+ bdev_handle->bdev->bd_disk->disk_name);
return 0;
}
@@ -302,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
-static struct block_device *
+static struct bdev_handle *
bl_open_path(struct pnfs_block_volume *v, const char *prefix)
{
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
const char *devname;
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
@@ -313,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix)
if (!devname)
return ERR_PTR(-ENOMEM);
- bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
- NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ NULL, NULL);
+ if (IS_ERR(bdev_handle)) {
pr_warn("pNFS: failed to open device %s (%ld)\n",
- devname, PTR_ERR(bdev));
+ devname, PTR_ERR(bdev_handle));
}
kfree(devname);
- return bdev;
+ return bdev_handle;
}
static int
@@ -329,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
const struct pr_ops *ops;
int error;
@@ -342,32 +340,32 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
* On other distributions like Debian, the default SCSI by-id path will
* point to the dm-multipath device if one exists.
*/
- bdev = bl_open_path(v, "dm-uuid-mpath-0x");
- if (IS_ERR(bdev))
- bdev = bl_open_path(v, "wwn-0x");
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
- d->bdev = bdev;
-
- d->len = bdev_nr_bytes(d->bdev);
+ bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x");
+ if (IS_ERR(bdev_handle))
+ bdev_handle = bl_open_path(v, "wwn-0x");
+ if (IS_ERR(bdev_handle))
+ return PTR_ERR(bdev_handle);
+ d->bdev_handle = bdev_handle;
+
+ d->len = bdev_nr_bytes(d->bdev_handle->bdev);
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
- d->bdev->bd_disk->disk_name, d->pr_key);
+ d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key);
- ops = d->bdev->bd_disk->fops->pr_ops;
+ ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: block device %s does not support reservations.",
- d->bdev->bd_disk->disk_name);
+ d->bdev_handle->bdev->bd_disk->disk_name);
error = -EINVAL;
goto out_blkdev_put;
}
- error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+ error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true);
if (error) {
pr_err("pNFS: failed to register key for block device %s.",
- d->bdev->bd_disk->disk_name);
+ d->bdev_handle->bdev->bd_disk->disk_name);
goto out_blkdev_put;
}
@@ -375,7 +373,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
out_blkdev_put:
- blkdev_put(d->bdev, NULL);
+ bdev_release(d->bdev_handle);
return error;
}
--
2.35.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-11 11:04 [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle Jan Kara
2023-08-11 11:04 ` [PATCH 25/29] nfs/blocklayout: Convert to use bdev_open_by_dev/path() Jan Kara
@ 2023-08-11 12:27 ` Christoph Hellwig
2023-08-25 1:58 ` Al Viro
2 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2023-08-11 12:27 UTC (permalink / raw)
To: Jan Kara
Cc: linux-fsdevel, linux-block, Christoph Hellwig, Alasdair Kergon,
Andrew Morton, Anna Schumaker, Chao Yu, Christian Borntraeger,
Darrick J. Wong, Dave Kleikamp, David Sterba, dm-devel, drbd-dev,
Gao Xiang, Jack Wang, Jaegeuk Kim, jfs-discussion, Joern Engel,
Joseph Qi, Kent Overstreet, linux-bcache, linux-btrfs,
linux-erofs, linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd,
linux-nfs, linux-nilfs, linux-nvme, linux-pm, linux-raid,
linux-s390, linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer,
Minchan Kim, ocfs2-devel, reiserfs-devel, Sergey Senozhatsky,
Song Liu, Sven Schnelle, target-devel, Ted Tso, Trond Myklebust,
xen-devel
Except for a mostly cosmetic nitpick this looks good to me:
Acked-by: Christoph Hellwig <hch@lst.de>
That's not exactly the deep review I'd like to do, but as I'm about to
head out for vacation that's probably as good as it gets.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-11 11:04 [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle Jan Kara
2023-08-11 11:04 ` [PATCH 25/29] nfs/blocklayout: Convert to use bdev_open_by_dev/path() Jan Kara
2023-08-11 12:27 ` [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle Christoph Hellwig
@ 2023-08-25 1:58 ` Al Viro
2023-08-25 13:47 ` Jan Kara
2 siblings, 1 reply; 9+ messages in thread
From: Al Viro @ 2023-08-25 1:58 UTC (permalink / raw)
To: Jan Kara
Cc: linux-fsdevel, linux-block, Christoph Hellwig, Alasdair Kergon,
Andrew Morton, Anna Schumaker, Chao Yu, Christian Borntraeger,
Darrick J. Wong, Dave Kleikamp, David Sterba, dm-devel, drbd-dev,
Gao Xiang, Jack Wang, Jaegeuk Kim, jfs-discussion, Joern Engel,
Joseph Qi, Kent Overstreet, linux-bcache, linux-btrfs,
linux-erofs, linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd,
linux-nfs, linux-nilfs, linux-nvme, linux-pm, linux-raid,
linux-s390, linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer,
Minchan Kim, ocfs2-devel, reiserfs-devel, Sergey Senozhatsky,
Song Liu, Sven Schnelle, target-devel, Ted Tso, Trond Myklebust,
xen-devel
On Fri, Aug 11, 2023 at 01:04:31PM +0200, Jan Kara wrote:
> Hello,
>
> this is a v2 of the patch series which implements the idea of blkdev_get_by_*()
> calls returning bdev_handle which is then passed to blkdev_put() [1]. This
> makes the get and put calls for bdevs more obviously matching and allows us to
> propagate context from get to put without having to modify all the users
> (again!). In particular I need to propagate used open flags to blkdev_put() to
> be able count writeable opens and add support for blocking writes to mounted
> block devices. I'll send that series separately.
>
> The series is based on Christian's vfs tree as of yesterday as there is quite
> some overlap. Patches have passed some reasonable testing - I've tested block
> changes, md, dm, bcache, xfs, btrfs, ext4, swap. This obviously doesn't cover
> everything so I'd like to ask respective maintainers to review / test their
> changes. Thanks! I've pushed out the full branch to:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git bdev_handle
>
> to ease review / testing.
Hmm... Completely Insane Idea(tm): how about turning that thing inside out and
having your bdev_open_by... return an actual opened struct file?
After all, we do that for sockets and pipes just fine and that's a whole lot
hotter area.
Suppose we leave blkdev_open()/blkdev_release() as-is. No need to mess with
what we have for normal opened files for block devices. And have block_open_by_dev()
that would find bdev, etc., same as yours does, and shove it into an anon file.
Paired with plain fput() - no need to bother with new primitives for closing.
With a helper returning I_BDEV(bdev_file_inode(file)) to get from those to bdev.
NOTE: I'm not suggesting replacing ->s_bdev with struct file * if we do that -
we want that value cached, obviously. Just store both...
Not saying it's a good idea, but... might be interesting to look into.
Comments?
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-25 1:58 ` Al Viro
@ 2023-08-25 13:47 ` Jan Kara
2023-08-26 2:28 ` Al Viro
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Jan Kara @ 2023-08-25 13:47 UTC (permalink / raw)
To: Al Viro
Cc: Jan Kara, linux-fsdevel, linux-block, Christoph Hellwig,
Alasdair Kergon, Andrew Morton, Anna Schumaker, Chao Yu,
Christian Borntraeger, Darrick J. Wong, Dave Kleikamp,
David Sterba, dm-devel, drbd-dev, Gao Xiang, Jack Wang,
Jaegeuk Kim, jfs-discussion, Joern Engel, Joseph Qi,
Kent Overstreet, linux-bcache, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd, linux-nfs,
linux-nilfs, linux-nvme, linux-pm, linux-raid, linux-s390,
linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer, Minchan Kim,
ocfs2-devel, reiserfs-devel, Sergey Senozhatsky, Song Liu,
Sven Schnelle, target-devel, Ted Tso, Trond Myklebust, xen-devel,
Jens Axboe, Christian Brauner
On Fri 25-08-23 02:58:43, Al Viro wrote:
> On Fri, Aug 11, 2023 at 01:04:31PM +0200, Jan Kara wrote:
> > Hello,
> >
> > this is a v2 of the patch series which implements the idea of blkdev_get_by_*()
> > calls returning bdev_handle which is then passed to blkdev_put() [1]. This
> > makes the get and put calls for bdevs more obviously matching and allows us to
> > propagate context from get to put without having to modify all the users
> > (again!). In particular I need to propagate used open flags to blkdev_put() to
> > be able count writeable opens and add support for blocking writes to mounted
> > block devices. I'll send that series separately.
> >
> > The series is based on Christian's vfs tree as of yesterday as there is quite
> > some overlap. Patches have passed some reasonable testing - I've tested block
> > changes, md, dm, bcache, xfs, btrfs, ext4, swap. This obviously doesn't cover
> > everything so I'd like to ask respective maintainers to review / test their
> > changes. Thanks! I've pushed out the full branch to:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git bdev_handle
> >
> > to ease review / testing.
>
> Hmm... Completely Insane Idea(tm): how about turning that thing inside out and
> having your bdev_open_by... return an actual opened struct file?
>
> After all, we do that for sockets and pipes just fine and that's a whole lot
> hotter area.
>
> Suppose we leave blkdev_open()/blkdev_release() as-is. No need to mess with
> what we have for normal opened files for block devices. And have block_open_by_dev()
> that would find bdev, etc., same yours does and shove it into anon file.
>
> Paired with plain fput() - no need to bother with new primitives for closing.
> With a helper returning I_BDEV(bdev_file_inode(file)) to get from those to bdev.
>
> NOTE: I'm not suggesting replacing ->s_bdev with struct file * if we do that -
> we want that value cached, obviously. Just store both...
>
> Not saying it's a good idea, but... might be interesting to look into.
> Comments?
I can see the appeal of not having to introduce the new bdev_handle type
and just using struct file which unifies in-kernel and userspace block
device opens. But I can see downsides too - the last fput() happening from
task work makes me a bit nervous whether it will not break something
somewhere with exclusive bdev opens. Getting from struct file to bdev is
somewhat harder but I guess a helper like F_BDEV() would solve that just
fine.
So besides my worry about the last fput(), I think this could work and would be
probably a bit nicer than what I have. But before going and redoing the whole
series let me gather some more feedback so that we don't go back and forth.
Christoph, Christian, Jens, any opinion?
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-25 13:47 ` Jan Kara
@ 2023-08-26 2:28 ` Al Viro
2023-08-28 14:27 ` Christoph Hellwig
2023-08-28 13:20 ` Christian Brauner
2023-08-28 14:22 ` Christoph Hellwig
2 siblings, 1 reply; 9+ messages in thread
From: Al Viro @ 2023-08-26 2:28 UTC (permalink / raw)
To: Jan Kara
Cc: linux-fsdevel, linux-block, Christoph Hellwig, Alasdair Kergon,
Andrew Morton, Anna Schumaker, Chao Yu, Christian Borntraeger,
Darrick J. Wong, Dave Kleikamp, David Sterba, dm-devel, drbd-dev,
Gao Xiang, Jack Wang, Jaegeuk Kim, jfs-discussion, Joern Engel,
Joseph Qi, Kent Overstreet, linux-bcache, linux-btrfs,
linux-erofs, linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd,
linux-nfs, linux-nilfs, linux-nvme, linux-pm, linux-raid,
linux-s390, linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer,
Minchan Kim, ocfs2-devel, reiserfs-devel, Sergey Senozhatsky,
Song Liu, Sven Schnelle, target-devel, Ted Tso, Trond Myklebust,
xen-devel, Jens Axboe, Christian Brauner
On Fri, Aug 25, 2023 at 03:47:56PM +0200, Jan Kara wrote:
> I can see the appeal of not having to introduce the new bdev_handle type
> and just using struct file which unifies in-kernel and userspace block
> device opens. But I can see downsides too - the last fput() happening from
> task work makes me a bit nervous whether it will not break something
> somewhere with exclusive bdev opens. Getting from struct file to bdev is
> somewhat harder but I guess a helper like F_BDEV() would solve that just
> fine.
>
> So besides my last fput() worry about I think this could work and would be
> probably a bit nicer than what I have. But before going and redoing the whole
> series let me gather some more feedback so that we don't go back and forth.
> Christoph, Christian, Jens, any opinion?
Redoing is not an issue - it can be done on top of your series just
as well. Async behaviour of fput() might be, but... need to look
through the actual users; for a lot of them it's perfectly fine.
FWIW, from a cursory look there appears to be a missing primitive: take
an opened bdev (or bdev_handle, with your variant, or opened file if we
go that way eventually) and claim it.
I mean, look at claim_swapfile() for example:
p->bdev = blkdev_get_by_dev(inode->i_rdev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
if (IS_ERR(p->bdev)) {
error = PTR_ERR(p->bdev);
p->bdev = NULL;
return error;
}
p->old_block_size = block_size(p->bdev);
error = set_blocksize(p->bdev, PAGE_SIZE);
if (error < 0)
return error;
we already have the file opened, and we keep it opened all the way until
the swapoff(2); here we have noticed that it's a block device and we
* open the fucker again (by device number), this time claiming
it with our swap_info_struct as holder, to be closed at swapoff(2) time
(just before we close the file)
* flip the block size to PAGE_SIZE, to be reverted at swapoff(2)
time
That really looks like it ought to be
* take the opened file, see that it's a block device
* try to claim it with that holder
* on success, flip the block size
with close_filp() in the swapoff(2) (or failure exit path in swapon(2))
doing what it would've done for an O_EXCL opened block device.
The only difference from O_EXCL userland open is that here we would
end up with holder pointing not to struct file in question, but to our
swap_info_struct. It will do the right thing.
This extra open is entirely due to "well, we need to claim it and the
primitive that does that happens to be tied to opening"; feels rather
counter-intuitive.
For that matter, we could add an explicit "unclaim" primitive - might
be easier to follow. That would add another example where that could
be used - in blkdev_bszset() we have an opened block device (it's an
ioctl, after all), we want to change block size and we *really* don't
want to have that happen under a mounted filesystem. So if it's not
opened exclusive, we do a temporary exclusive open of our own and act on
that instead. Might as well go for a temporary claim...
BTW, what happens if two threads call ioctl(fd, BLKBSZSET, &n)
for the same descriptor that happens to have been opened O_EXCL?
Without O_EXCL they would've been unable to claim the sucker at the same
time - the holder we are using is the address of a function argument,
i.e. something that points to kernel stack of the caller. Those would
conflict and we either get set_blocksize() calls fully serialized, or
one of the callers would eat -EBUSY. Not so in "opened with O_EXCL"
case - they can very well overlap and IIRC set_blocksize() does *not*
expect that kind of crap... It's all under CAP_SYS_ADMIN, so it's not
as if it was a meaningful security hole anyway, but it does look fishy.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-26 2:28 ` Al Viro
@ 2023-08-28 14:27 ` Christoph Hellwig
0 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2023-08-28 14:27 UTC (permalink / raw)
To: Al Viro
Cc: Jan Kara, linux-fsdevel, linux-block, Christoph Hellwig,
Alasdair Kergon, Andrew Morton, Anna Schumaker, Chao Yu,
Christian Borntraeger, Darrick J. Wong, Dave Kleikamp,
David Sterba, dm-devel, drbd-dev, Gao Xiang, Jack Wang,
Jaegeuk Kim, jfs-discussion, Joern Engel, Joseph Qi,
Kent Overstreet, linux-bcache, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd, linux-nfs,
linux-nilfs, linux-nvme, linux-pm, linux-raid, linux-s390,
linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer, Minchan Kim,
ocfs2-devel, reiserfs-devel, Sergey Senozhatsky, Song Liu,
Sven Schnelle, target-devel, Ted Tso, Trond Myklebust, xen-devel,
Jens Axboe, Christian Brauner
On Sat, Aug 26, 2023 at 03:28:52AM +0100, Al Viro wrote:
> I mean, look at claim_swapfile() for example:
> p->bdev = blkdev_get_by_dev(inode->i_rdev,
> FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
> if (IS_ERR(p->bdev)) {
> error = PTR_ERR(p->bdev);
> p->bdev = NULL;
> return error;
> }
> p->old_block_size = block_size(p->bdev);
> error = set_blocksize(p->bdev, PAGE_SIZE);
> if (error < 0)
> return error;
> we already have the file opened, and we keep it opened all the way until
> the swapoff(2); here we have noticed that it's a block device and we
> * open the fucker again (by device number), this time claiming
> it with our swap_info_struct as holder, to be closed at swapoff(2) time
> (just before we close the file)
Note that some drivers look at FMODE_EXCL/BLK_OPEN_EXCL in ->open.
These are probably bogus and maybe we want to kill them, but that will
need an audit first.
> BTW, what happens if two threads call ioctl(fd, BLKBSZSET, &n)
> for the same descriptor that happens to have been opened O_EXCL?
> Without O_EXCL they would've been unable to claim the sucker at the same
> time - the holder we are using is the address of a function argument,
> i.e. something that points to kernel stack of the caller. Those would
> conflict and we either get set_blocksize() calls fully serialized, or
> one of the callers would eat -EBUSY. Not so in "opened with O_EXCL"
> case - they can very well overlap and IIRC set_blocksize() does *not*
> expect that kind of crap... It's all under CAP_SYS_ADMIN, so it's not
> as if it was a meaningful security hole anyway, but it does look fishy.
The user gets to keep the pieces. BLKBSZSET is kinda bogus anyway
as the soft blocksize only matters for buffer_head-like I/O, and
there only for file systems. No idea why anyone would set it manually.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-25 13:47 ` Jan Kara
2023-08-26 2:28 ` Al Viro
@ 2023-08-28 13:20 ` Christian Brauner
2023-08-28 14:22 ` Christoph Hellwig
2 siblings, 0 replies; 9+ messages in thread
From: Christian Brauner @ 2023-08-28 13:20 UTC (permalink / raw)
To: Jan Kara
Cc: Al Viro, linux-fsdevel, linux-block, Christoph Hellwig,
Alasdair Kergon, Andrew Morton, Anna Schumaker, Chao Yu,
Christian Borntraeger, Darrick J. Wong, Dave Kleikamp,
David Sterba, dm-devel, drbd-dev, Gao Xiang, Jack Wang,
Jaegeuk Kim, jfs-discussion, Joern Engel, Joseph Qi,
Kent Overstreet, linux-bcache, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd, linux-nfs,
linux-nilfs, linux-nvme, linux-pm, linux-raid, linux-s390,
linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer, Minchan Kim,
ocfs2-devel, reiserfs-devel, Sergey Senozhatsky, Song Liu,
Sven Schnelle, target-devel, Ted Tso, Trond Myklebust, xen-devel,
Jens Axboe
> So besides my last fput() worry about I think this could work and would be
> probably a bit nicer than what I have. But before going and redoing the whole
> series let me gather some more feedback so that we don't go back and forth.
> Christoph, Christian, Jens, any opinion?
I'll be a bit under water for the next few days, I expect, but I'll get
back to this. I think not making you redo this whole thing from scratch
is what I'd prefer unless there are really clear advantages. But I don't
want to offer a haphazard opinion in the middle of the merge window.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle
2023-08-25 13:47 ` Jan Kara
2023-08-26 2:28 ` Al Viro
2023-08-28 13:20 ` Christian Brauner
@ 2023-08-28 14:22 ` Christoph Hellwig
2 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2023-08-28 14:22 UTC (permalink / raw)
To: Jan Kara
Cc: Al Viro, linux-fsdevel, linux-block, Christoph Hellwig,
Alasdair Kergon, Andrew Morton, Anna Schumaker, Chao Yu,
Christian Borntraeger, Darrick J. Wong, Dave Kleikamp,
David Sterba, dm-devel, drbd-dev, Gao Xiang, Jack Wang,
Jaegeuk Kim, jfs-discussion, Joern Engel, Joseph Qi,
Kent Overstreet, linux-bcache, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-mm, linux-mtd, linux-nfs,
linux-nilfs, linux-nvme, linux-pm, linux-raid, linux-s390,
linux-scsi, linux-xfs, Md. Haris Iqbal, Mike Snitzer, Minchan Kim,
ocfs2-devel, reiserfs-devel, Sergey Senozhatsky, Song Liu,
Sven Schnelle, target-devel, Ted Tso, Trond Myklebust, xen-devel,
Jens Axboe, Christian Brauner
On Fri, Aug 25, 2023 at 03:47:56PM +0200, Jan Kara wrote:
> I can see the appeal of not having to introduce the new bdev_handle type
> and just using struct file which unifies in-kernel and userspace block
> device opens. But I can see downsides too - the last fput() happening from
> task work makes me a bit nervous whether it will not break something
> somewhere with exclusive bdev opens. Getting from struct file to bdev is
> somewhat harder but I guess a helper like F_BDEV() would solve that just
> fine.
>
> So besides my last fput() worry about I think this could work and would be
> probably a bit nicer than what I have. But before going and redoing the whole
> series let me gather some more feedback so that we don't go back and forth.
> Christoph, Christian, Jens, any opinion?
I did think about the file a bit. The fact that we'd need something
like an anon_file for the by_dev open was always a huge turn off for
me, but maybe my concern is overblown. Having a struct file would
actually be really useful for a bunch of users.
^ permalink raw reply [flat|nested] 9+ messages in thread