* Topology ioctls
@ 2009-09-23 14:26 Martin K. Petersen
2009-09-23 18:28 ` Jamie Lokier
0 siblings, 1 reply; 3+ messages in thread
From: Martin K. Petersen @ 2009-09-23 14:26 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Eric Sandeen, Andreas Dilger, Jim Meyering, jens.axboe
The original rationale for exporting the topology information via sysfs
was that we intended to support multiple heterogeneous regions within a
block device. And that fit poorly with an ioctl approach.
However, with a single region per device it is trivial to provide the
topology. And while mkfs.* will continue to use the libblkid interface,
there are users that would like to get access to this information
without having to traverse sysfs and stitch things together manually.
Example:
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
/*
 * Block-device ioctl request codes used by this example, spelled out
 * locally (presumably because installed userspace headers do not yet
 * carry the new ones -- confirm against your <linux/fs.h>).
 * Codes 120-123 match the values the accompanying patch adds to
 * include/linux/fs.h. Each request stores an int-sized result through
 * the pointer passed as the ioctl argument.
 */
#define BLKSSZGET _IO(0x12,104) /* logical block size */
#define BLKIOMIN _IO(0x12,120) /* minimum (preferred random) I/O size */
#define BLKIOOPT _IO(0x12,121) /* optimal (preferred sustained) I/O size */
#define BLKALIGNOFF _IO(0x12,122) /* alignment offset; -1 when misaligned */
#define BLKPBSZGET _IO(0x12,123) /* physical block size */
/*
 * Report a fatal error on stderr and terminate the process.
 *
 * fmt and the variadic arguments are formatted printf-style. If errno was
 * non-zero when die() was entered, ": " followed by the strerror() text
 * for that value is appended. A newline is always written, after which
 * the process exits with EXIT_FAILURE.
 *
 * This function never returns; the int return type is kept only so that
 * existing call sites remain valid. strerror() is declared in <string.h>,
 * which must be included for the returned pointer to be typed correctly.
 */
static int die(const char *fmt, ...)
{
	int err = errno;	/* capture first: the stdio calls below may change errno */
	va_list val;

	va_start(val, fmt);
	vfprintf(stderr, fmt, val);
	va_end(val);

	if (err != 0)
		fprintf(stderr, ": %s", strerror(err));
	fprintf(stderr, "\n");

	exit(EXIT_FAILURE);
}
/*
 * Print the I/O topology of the block device named on the command line.
 *
 * Expects exactly one argument, the device path. The device is opened
 * read-only and queried with BLKSSZGET, BLKPBSZGET, BLKIOMIN, BLKIOOPT
 * and BLKALIGNOFF; any failure is reported through die(), which exits
 * with EXIT_FAILURE. On success the values are printed to stdout and the
 * program returns EXIT_SUCCESS.
 */
int main(int argc, char *argv[])
{
	int fd;
	int lbs, align;			/* signed: align uses -1 as a sentinel */
	unsigned int pbs, min, opt;	/* sizes are unsigned quantities */

	if (argc != 2)
		die("Usage: %s <dev>", argv[0]);

	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		die("Can't open %s", argv[1]);

	if (ioctl(fd, BLKSSZGET, &lbs) < 0)
		die("Can't get logical block size");

	if (ioctl(fd, BLKPBSZGET, &pbs) < 0)
		die("Can't get physical block size");

	if (ioctl(fd, BLKIOMIN, &min) < 0)
		die("Can't get preferred random I/O size");

	if (ioctl(fd, BLKIOOPT, &opt) < 0)
		die("Can't get preferred sustained I/O size");

	if (ioctl(fd, BLKALIGNOFF, &align) < 0)
		die("Can't get alignment offset");

	/* Conversion specifiers match the argument types (%d int, %u unsigned). */
	printf("%s:\n", argv[1]);
	printf("\tlogical block size: %d\n", lbs);
	printf("\tphysical block size: %u\n", pbs);
	printf("\trandom I/O size: %u\n", min);
	printf("\tsustained I/O size: %u\n", opt);

	/* -1 is reported when the device's alignment is inconsistent. */
	if (align == -1)
		printf("\talignment offset: inconsistent\n");
	else
		printf("\talignment offset: %d\n", align);

	close(fd);
	return EXIT_SUCCESS;
}
Patch:
block: Topology ioctls
Not all users of the topology information want to use libblkid. Provide
the topology information through bdev ioctls.
Also clarify sector size comments for existing BLK ioctls.
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 7865a34..bcc8bec 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -734,6 +734,14 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
switch (cmd) {
case HDIO_GETGEO:
return compat_hdio_getgeo(disk, bdev, compat_ptr(arg));
+ case BLKPBSZGET:
+ return compat_put_int(arg, bdev_physical_block_size(bdev));
+ case BLKIOMIN:
+ return compat_put_int(arg, bdev_io_min(bdev));
+ case BLKIOOPT:
+ return compat_put_int(arg, bdev_io_opt(bdev));
+ case BLKALIGNOFF:
+ return compat_put_int(arg, bdev_alignment_offset(bdev));
case BLKFLSBUF:
case BLKROSET:
case BLKDISCARD:
diff --git a/block/ioctl.c b/block/ioctl.c
index d3e6b58..fea6f2c 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -263,10 +263,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
case BLKROGET:
return put_int(arg, bdev_read_only(bdev) != 0);
- case BLKBSZGET: /* get the logical block size (cf. BLKSSZGET) */
+ case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
return put_int(arg, block_size(bdev));
- case BLKSSZGET: /* get block device hardware sector size */
+ case BLKSSZGET: /* get block device logical block size */
return put_int(arg, bdev_logical_block_size(bdev));
+ case BLKPBSZGET: /* get block device physical block size */
+ return put_int(arg, bdev_physical_block_size(bdev));
case BLKSECTGET:
return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
case BLKRASET:
@@ -309,6 +311,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return put_ulong(arg, size >> 9);
case BLKGETSIZE64:
return put_u64(arg, bdev->bd_inode->i_size);
+ case BLKIOMIN:
+ return put_int(arg, bdev_io_min(bdev));
+ case BLKIOOPT:
+ return put_int(arg, bdev_io_opt(bdev));
+ case BLKALIGNOFF:
+ return put_int(arg, bdev_alignment_offset(bdev));
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACESETUP:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e23a86c..935bcb0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1080,16 +1080,31 @@ static inline unsigned int queue_physical_block_size(struct request_queue *q)
return q->limits.physical_block_size;
}
+static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
+{
+ return queue_physical_block_size(bdev_get_queue(bdev));
+}
+
static inline unsigned int queue_io_min(struct request_queue *q)
{
return q->limits.io_min;
}
+static inline unsigned int bdev_io_min(struct block_device *bdev)
+{
+ return queue_io_min(bdev_get_queue(bdev));
+}
+
static inline unsigned int queue_io_opt(struct request_queue *q)
{
return q->limits.io_opt;
}
+static inline unsigned int bdev_io_opt(struct block_device *bdev)
+{
+ return queue_io_opt(bdev_get_queue(bdev));
+}
+
static inline int queue_alignment_offset(struct request_queue *q)
{
if (q && q->limits.misaligned)
@@ -1108,6 +1123,19 @@ static inline int queue_sector_alignment_offset(struct request_queue *q,
& (q->limits.io_min - 1);
}
+static inline int bdev_alignment_offset(struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (!q || q->limits.misaligned)
+ return -1;
+
+ if (bdev != bdev->bd_contains)
+ return bdev->bd_part->alignment_offset;
+
+ return q->limits.alignment_offset;
+}
+
static inline int queue_dma_alignment(struct request_queue *q)
{
return q ? q->dma_alignment : 511;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 90162fb..3f401fc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -300,6 +300,10 @@ struct inodes_stat_t {
#define BLKTRACESTOP _IO(0x12,117)
#define BLKTRACETEARDOWN _IO(0x12,118)
#define BLKDISCARD _IO(0x12,119)
+#define BLKIOMIN _IO(0x12,120)
+#define BLKIOOPT _IO(0x12,121)
+#define BLKALIGNOFF _IO(0x12,122)
+#define BLKPBSZGET _IO(0x12,123)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: Topology ioctls
2009-09-23 14:26 Topology ioctls Martin K. Petersen
@ 2009-09-23 18:28 ` Jamie Lokier
2009-09-24 4:06 ` Martin K. Petersen
0 siblings, 1 reply; 3+ messages in thread
From: Jamie Lokier @ 2009-09-23 18:28 UTC (permalink / raw)
To: Martin K. Petersen
Cc: linux-fsdevel, Eric Sandeen, Andreas Dilger, Jim Meyering,
jens.axboe
Martin K. Petersen wrote:
>
> The original rationale for exporting the topology information via sysfs
> was that we intended to support multiple heterogeneous regions within a
> block device. And that fit poorly with an ioctl approach.
>
> However, with a single region per device it is trivial to provide the
> topology. And while mkfs.* will continue to use the libblkid interface,
> there are users that would like to get access to this information
> without having to traverse sysfs and stitch things together manually.
Quite nice, I can see it coming in handy.
One more bit of information I'd like is the "write affected block
size". When you write to a single disk, it's the sector size (512 or
soon to be 4096). For a RAID, it's probably quite large, depending on
implementation details. For some kind of flash - see threads from
Pavel and LWN about sector writes causing erase-block sizes to be
lost.
That size is useful to any program which does journalling/logging type
of write pattern, i.e. databases and filesystems-in-a-file, so they
can write new commit blocks sufficiently far apart.
-- Jamie
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: Topology ioctls
2009-09-23 18:28 ` Jamie Lokier
@ 2009-09-24 4:06 ` Martin K. Petersen
0 siblings, 0 replies; 3+ messages in thread
From: Martin K. Petersen @ 2009-09-24 4:06 UTC (permalink / raw)
To: Jamie Lokier
Cc: Martin K. Petersen, linux-fsdevel, Eric Sandeen, Andreas Dilger,
Jim Meyering, jens.axboe
>>>>> "Jamie" == Jamie Lokier <jamie@shareable.org> writes:
Jamie> One more bit of information I'd like is the "write affected block
Jamie> size". When you write to a single disk, it's the sector size
Jamie> (512 or soon to be 4096). For a RAID, it's probably quite large,
Jamie> depending on implementation details.
That's what BLKIOMIN is all about...
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2009-09-24 4:07 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-23 14:26 Topology ioctls Martin K. Petersen
2009-09-23 18:28 ` Jamie Lokier
2009-09-24 4:06 ` Martin K. Petersen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).