linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] fs: add ioctl to query protection info capabilities
       [not found] <CGME20250527105950epcas5p1b53753ab614bf6bde4ffbf5165c7d263@epcas5p1.samsung.com>
@ 2025-05-27 10:42 ` Anuj Gupta
  2025-05-29  3:02   ` Martin K. Petersen
  0 siblings, 1 reply; 13+ messages in thread
From: Anuj Gupta @ 2025-05-27 10:42 UTC (permalink / raw)
  To: jack, anuj1072538, axboe, viro, brauner, hch, martin.petersen
  Cc: linux-block, linux-fsdevel, joshi.k, Anuj Gupta

Add a new ioctl, FS_IOC_GETPICAP, to query protection info (PI)
capabilities. This ioctl returns information about the files integrity
profile. This is useful for userspace applications to understand a files
end-to-end data protection support and configure the I/O accordingly.

For now this interface is only supported by block devices. However the
design and placement of this ioctl in generic FS ioctl space allows us
to extend it to work over files as well. This maybe useful when
filesystems start supporting  PI-aware layouts.

A new structure struct fs_pi_cap is introduced, which contains the
following fields:
1. flags: Bitmask of capability flags.
2. interval: the data block interval (in bytes) for which the protection
information is generated.
3. csum type: type of checksum used.
4. tuple_size: size (in bytes) of the protection information tuple.
5. tag_size: size (in bytes) of tag information.
6. pi_offset: offset of protection info within the tuple.
7. rsvd: reserved for future use.

The internal logic to fetch the capability is encapsulated in a helper
function blk_get_pi_cap(), which uses the blk_integrity profile
associated with the device. The ioctl returns -EOPNOTSUPP, if
CONFIG_BLK_DEV_INTEGRITY is not enabled.

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
 block/blk-integrity.c         | 24 ++++++++++++++++++++++++
 block/ioctl.c                 |  3 +++
 include/linux/blk-integrity.h |  6 ++++++
 include/uapi/linux/fs.h       | 25 +++++++++++++++++++++++++
 4 files changed, 58 insertions(+)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index a1678f0a9f81..81c25b7ec1eb 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -54,6 +54,30 @@ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
 	return segments;
 }
 
+int blk_get_pi_cap(struct block_device *bdev, struct fs_pi_cap __user *argp)
+{
+	struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk);
+	struct fs_pi_cap pi_cap = {};
+
+	if (!bi)
+		goto out;
+
+	if (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE)
+		pi_cap.flags |= FILE_PI_CAP_INTEGRITY;
+	if (bi->flags & BLK_INTEGRITY_REF_TAG)
+		pi_cap.flags |= FILE_PI_CAP_REFTAG;
+	pi_cap.csum_type = bi->csum_type;
+	pi_cap.tuple_size = bi->tuple_size;
+	pi_cap.tag_size = bi->tag_size;
+	pi_cap.interval = 1 << bi->interval_exp;
+	pi_cap.pi_offset = bi->pi_offset;
+
+out:
+	if (copy_to_user(argp, &pi_cap, sizeof(struct fs_pi_cap)))
+		return -EFAULT;
+	return 0;
+}
+
 /**
  * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
  * @rq:		request to map
diff --git a/block/ioctl.c b/block/ioctl.c
index e472cc1030c6..53b35bf3e6fa 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -13,6 +13,7 @@
 #include <linux/uaccess.h>
 #include <linux/pagemap.h>
 #include <linux/io_uring/cmd.h>
+#include <linux/blk-integrity.h>
 #include <uapi/linux/blkdev.h>
 #include "blk.h"
 #include "blk-crypto-internal.h"
@@ -643,6 +644,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
 		return blkdev_pr_preempt(bdev, mode, argp, true);
 	case IOC_PR_CLEAR:
 		return blkdev_pr_clear(bdev, mode, argp);
+	case FS_IOC_GETPICAP:
+		return blk_get_pi_cap(bdev, argp);
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index c7eae0bfb013..6118a0c28605 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -29,6 +29,7 @@ int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
 			      ssize_t bytes);
+int blk_get_pi_cap(struct block_device *bdev, struct fs_pi_cap __user *argp);
 
 static inline bool
 blk_integrity_queue_supports_integrity(struct request_queue *q)
@@ -92,6 +93,11 @@ static inline struct bio_vec rq_integrity_vec(struct request *rq)
 				 rq->bio->bi_integrity->bip_iter);
 }
 #else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline int blk_get_pi_cap(struct block_device *bdev,
+				 struct fs_pi_cap __user *argp)
+{
+	return -EOPNOTSUPP;
+}
 static inline int blk_rq_count_integrity_sg(struct request_queue *q,
 					    struct bio *b)
 {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index e762e1af650c..32383efca896 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -91,6 +91,29 @@ struct fs_sysfs_path {
 	__u8			name[128];
 };
 
+#define	FILE_PI_CAP_INTEGRITY		(1 << 0)
+#define	FILE_PI_CAP_REFTAG		(1 << 1)
+
+/*
+ * struct fs_pi_cap - protection information(PI) capability descriptor
+ * @flags:			Bitmask of capability flags
+ * @interval:			Number of bytes of data per PI tuple
+ * @csum_type:			Checksum type
+ * @tuple_size:			Size in bytes of the PI tuple
+ * @tag_size:			Size of the tag area within the tuple
+ * @pi_offset:			Offset in bytes of the PI metadata within the tuple
+ * @rsvd:			Reserved for future use
+ */
+struct fs_pi_cap {
+	__u32	flags;
+	__u16	interval;
+	__u8	csum_type;
+	__u8	tuple_size;
+	__u8	tag_size;
+	__u8	pi_offset;
+	__u8	rsvd[6];
+};
+
 /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
 #define FILE_DEDUPE_RANGE_SAME		0
 #define FILE_DEDUPE_RANGE_DIFFERS	1
@@ -247,6 +270,8 @@ struct fsxattr {
  * also /sys/kernel/debug/ for filesystems with debugfs exports
  */
 #define FS_IOC_GETFSSYSFSPATH		_IOR(0x15, 1, struct fs_sysfs_path)
+/* Get protection info capability details */
+#define FS_IOC_GETPICAP			_IOR('f', 3, struct fs_pi_cap)
 
 /*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-05-27 10:42 ` [RFC] fs: add ioctl to query protection info capabilities Anuj Gupta
@ 2025-05-29  3:02   ` Martin K. Petersen
  2025-05-29  7:12     ` Anuj Gupta/Anuj Gupta
  0 siblings, 1 reply; 13+ messages in thread
From: Martin K. Petersen @ 2025-05-29  3:02 UTC (permalink / raw)
  To: Anuj Gupta
  Cc: jack, anuj1072538, axboe, viro, brauner, hch, martin.petersen,
	linux-block, linux-fsdevel, joshi.k


Hi Anuj!

Thanks for working on this!

> 4. tuple_size: size (in bytes) of the protection information tuple.
> 6. pi_offset: offset of protection info within the tuple.

I find this a little confusing. The T10 PI tuple is <guard, app, ref>.

I acknowledge things currently are a bit muddy in the block layer since
tuple_size has been transmogrified to hold the NVMe metadata size.

But for a new user-visible interface I think we should make the
terminology clear. The tuple is the PI and not the rest of the metadata.

So I think you'd want:

4. metadata_size: size (in bytes) of the metadata associated with each interval.
6. pi_offset: offset of protection information tuple within the metadata.

> +#define	FILE_PI_CAP_INTEGRITY		(1 << 0)
> +#define	FILE_PI_CAP_REFTAG		(1 << 1)

You'll also need to have corresponding uapi defines for:

enum blk_integrity_checksum {
        BLK_INTEGRITY_CSUM_NONE         = 0,
        BLK_INTEGRITY_CSUM_IP           = 1,
        BLK_INTEGRITY_CSUM_CRC          = 2,
        BLK_INTEGRITY_CSUM_CRC64        = 3,
} __packed ;

> +
> +/*
> + * struct fs_pi_cap - protection information(PI) capability descriptor
> + * @flags:			Bitmask of capability flags
> + * @interval:		Number of bytes of data per PI tuple
> + * @csum_type:		Checksum type
> + * @tuple_size:		Size in bytes of the PI tuple
> + * @tag_size:		Size of the tag area within the tuple
> + * @pi_offset:		Offset in bytes of the PI metadata within the tuple
> + * @rsvd:			Reserved for future use

See above for distinction between metadata and PI tuple. The question is
whether we need to report the size of those two separately (both in
kernel and in this structure). Otherwise how do we know how big the PI
tuple is? Or do we infer that from the csum_type?

Also, for the extended NVMe PI types we'd probably need to know the size
of the ref tag and the storage tag.

-- 
Martin K. Petersen

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-05-29  3:02   ` Martin K. Petersen
@ 2025-05-29  7:12     ` Anuj Gupta/Anuj Gupta
  2025-05-29 17:59       ` Eric Biggers
                         ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Anuj Gupta/Anuj Gupta @ 2025-05-29  7:12 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: jack, anuj1072538, axboe, viro, brauner, hch, linux-block,
	linux-fsdevel, joshi.k

On 5/29/2025 8:32 AM, Martin K. Petersen wrote:
> 
> Hi Anuj!
> 
> Thanks for working on this!
> 
Hi Martin,
Thanks for the feedback!

>> 4. tuple_size: size (in bytes) of the protection information tuple.
>> 6. pi_offset: offset of protection info within the tuple.
> 
> I find this a little confusing. The T10 PI tuple is <guard, app, ref>.
> 
> I acknowledge things currently are a bit muddy in the block layer since
> tuple_size has been transmogrified to hold the NVMe metadata size.
> 
> But for a new user-visible interface I think we should make the
> terminology clear. The tuple is the PI and not the rest of the metadata.
> 
> So I think you'd want:
> 
> 4. metadata_size: size (in bytes) of the metadata associated with each interval.
> 6. pi_offset: offset of protection information tuple within the metadata.
> 

Yes, this representation looks better. Will make this change.

>> +#define	FILE_PI_CAP_INTEGRITY		(1 << 0)
>> +#define	FILE_PI_CAP_REFTAG		(1 << 1)
> 
> You'll also need to have corresponding uapi defines for:
> 
> enum blk_integrity_checksum {
>          BLK_INTEGRITY_CSUM_NONE         = 0,
>          BLK_INTEGRITY_CSUM_IP           = 1,
>          BLK_INTEGRITY_CSUM_CRC          = 2,
>          BLK_INTEGRITY_CSUM_CRC64        = 3,
> } __packed ;
>

Right, I'll add these definitions to the UAPI.
>> +
>> +/*
>> + * struct fs_pi_cap - protection information(PI) capability descriptor
>> + * @flags:			Bitmask of capability flags
>> + * @interval:		Number of bytes of data per PI tuple
>> + * @csum_type:		Checksum type
>> + * @tuple_size:		Size in bytes of the PI tuple
>> + * @tag_size:		Size of the tag area within the tuple
>> + * @pi_offset:		Offset in bytes of the PI metadata within the tuple
>> + * @rsvd:			Reserved for future use
> 
> See above for distinction between metadata and PI tuple. The question is
> whether we need to report the size of those two separately (both in
> kernel and in this structure). Otherwise how do we know how big the PI
> tuple is? Or do we infer that from the csum_type?
> 

The block layer currently infers this by looking at the csum_type (e.g.,
in blk_integrity_generate). I assumed userspace could do the same, so I
didn't expose a separate pi_tuple_size field. Do you see this
differently?

As you mentioned, the other option would be to report the PI tuple size
explicitly in both the kernel and in the uapi struct.

> Also, for the extended NVMe PI types we'd probably need to know the size
> of the ref tag and the storage tag.
>

Makes sense, I will introduce ref_tag_size and storage_tag_size in the
UAPI struct to account for this.
I did a respin based on your feedback here [1]. If this looks good to
you, I'll roll out a v2.

Thanks,
Anuj Gupta

[1]

[PATCH] fs: add ioctl to query protection info capabilities

Add a new ioctl, FS_IOC_GETPICAP, to query protection info (PI)
capabilities. This ioctl returns information about the files integrity
profile. This is useful for userspace applications to understand a files
end-to-end data protection support and configure the I/O accordingly.

For now this interface is only supported by block devices. However the
design and placement of this ioctl in generic FS ioctl space allows us
to extend it to work over files as well. This maybe useful when
filesystems start supporting  PI-aware layouts.

A new structure struct fs_pi_cap is introduced, which contains the
following fields:
1. flags: bitmask of capability flags.
2. interval: the data block interval (in bytes) for which the protection
information is generated.
3. csum type: type of checksum used.
4. metadata_size: size (in bytes) of the metadata associated with each
interval.
5. tag_size: size (in bytes) of tag information.
6. pi_offset: offset of protection information tuple within the
metadata.
7. ref_tag_size: size in bytes of the reference tag.
8. storage_tag_size: size in bytes of the storage tag.
9. rsvd: reserved for future use.

The internal logic to fetch the capability is encapsulated in a helper
function blk_get_pi_cap(), which uses the blk_integrity profile
associated with the device. The ioctl returns -EOPNOTSUPP, if
CONFIG_BLK_DEV_INTEGRITY is not enabled.

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
  block/blk-integrity.c         | 38 +++++++++++++++++++++++++++++++++++
  block/ioctl.c                 |  3 +++
  include/linux/blk-integrity.h |  6 ++++++
  include/uapi/linux/fs.h       | 36 +++++++++++++++++++++++++++++++++
  4 files changed, 83 insertions(+)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index a1678f0a9f81..9bd2888a85ce 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -13,6 +13,7 @@
  #include <linux/scatterlist.h>
  #include <linux/export.h>
  #include <linux/slab.h>
+#include <linux/t10-pi.h>

  #include "blk.h"

@@ -54,6 +55,43 @@ int blk_rq_count_integrity_sg(struct request_queue 
*q, struct bio *bio)
  	return segments;
  }

+int blk_get_pi_cap(struct block_device *bdev, struct fs_pi_cap __user 
*argp)
+{
+	struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk);
+	struct fs_pi_cap pi_cap = {};
+
+	if (!bi)
+		goto out;
+
+	if (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE)
+		pi_cap.flags |= FILE_PI_CAP_INTEGRITY;
+	if (bi->flags & BLK_INTEGRITY_REF_TAG)
+		pi_cap.flags |= FILE_PI_CAP_REFTAG;
+	pi_cap.csum_type = bi->csum_type;
+	pi_cap.tuple_size = bi->tuple_size;
+	pi_cap.tag_size = bi->tag_size;
+	pi_cap.interval = 1 << bi->interval_exp;
+	pi_cap.pi_offset = bi->pi_offset;
+	switch (bi->csum_type) {
+		case BLK_INTEGRITY_CSUM_CRC64:
+			pi_cap.ref_tag_size = sizeof_field(struct crc64_pi_tuple
+							   , ref_tag);
+			break;
+		case BLK_INTEGRITY_CSUM_CRC:
+		case BLK_INTEGRITY_CSUM_IP:
+			pi_cap.ref_tag_size = sizeof_field(struct t10_pi_tuple,
+							   ref_tag);
+			break;
+		default:
+			break;
+	}
+
+out:
+	if (copy_to_user(argp, &pi_cap, sizeof(struct fs_pi_cap)))
+		return -EFAULT;
+	return 0;
+}
+
  /**
   * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
   * @rq:		request to map
diff --git a/block/ioctl.c b/block/ioctl.c
index e472cc1030c6..53b35bf3e6fa 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -13,6 +13,7 @@
  #include <linux/uaccess.h>
  #include <linux/pagemap.h>
  #include <linux/io_uring/cmd.h>
+#include <linux/blk-integrity.h>
  #include <uapi/linux/blkdev.h>
  #include "blk.h"
  #include "blk-crypto-internal.h"
@@ -643,6 +644,8 @@ static int blkdev_common_ioctl(struct block_device 
*bdev, blk_mode_t mode,
  		return blkdev_pr_preempt(bdev, mode, argp, true);
  	case IOC_PR_CLEAR:
  		return blkdev_pr_clear(bdev, mode, argp);
+	case FS_IOC_GETPICAP:
+		return blk_get_pi_cap(bdev, argp);
  	default:
  		return -ENOIOCTLCMD;
  	}
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index c7eae0bfb013..6118a0c28605 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -29,6 +29,7 @@ int blk_rq_map_integrity_sg(struct request *, struct 
scatterlist *);
  int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
  int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
  			      ssize_t bytes);
+int blk_get_pi_cap(struct block_device *bdev, struct fs_pi_cap __user 
*argp);

  static inline bool
  blk_integrity_queue_supports_integrity(struct request_queue *q)
@@ -92,6 +93,11 @@ static inline struct bio_vec rq_integrity_vec(struct 
request *rq)
  				 rq->bio->bi_integrity->bip_iter);
  }
  #else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline int blk_get_pi_cap(struct block_device *bdev,
+				 struct fs_pi_cap __user *argp)
+{
+	return -EOPNOTSUPP;
+}
  static inline int blk_rq_count_integrity_sg(struct request_queue *q,
  					    struct bio *b)
  {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index e762e1af650c..c70584b09bed 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -91,6 +91,40 @@ struct fs_sysfs_path {
  	__u8			name[128];
  };

+/* Protection info capability flags */
+#define	FILE_PI_CAP_INTEGRITY		(1 << 0)
+#define	FILE_PI_CAP_REFTAG		(1 << 1)
+
+/* Checksum types for Protection Information */
+#define FS_PI_CSUM_NONE			0
+#define FS_PI_CSUM_IP			1
+#define FS_PI_CSUM_CRC			2
+#define FS_PI_CSUM_CRC64		3
+
+/*
+ * struct fs_pi_cap - protection information(PI) capability descriptor
+ * @flags:			Bitmask of capability flags
+ * @interval:			Number of bytes of data per PI tuple
+ * @csum_type:			Checksum type
+ * @metadata_size:		Size in bytes of the metadata associated with each 
interval
+ * @tag_size:			Size of the tag area within the tuple
+ * @pi_offset:			Offset of protection information tuple within the metadata
+ * @ref_tag_size:		Size in bytes of the reference tag
+ * @storage_tag_size:		Size in bytes of the storage tag
+ * @rsvd:			Reserved for future use
+ */
+struct fs_pi_cap {
+	__u32	flags;
+	__u16	interval;
+	__u8	csum_type;
+	__u8	tuple_size;
+	__u8	tag_size;
+	__u8	pi_offset;
+	__u8	ref_tag_size;
+	__u8	storage_tag_size;
+	__u8	rsvd[4];
+};
+
  /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl 
definitions */
  #define FILE_DEDUPE_RANGE_SAME		0
  #define FILE_DEDUPE_RANGE_DIFFERS	1
@@ -247,6 +281,8 @@ struct fsxattr {
   * also /sys/kernel/debug/ for filesystems with debugfs exports
   */
  #define FS_IOC_GETFSSYSFSPATH		_IOR(0x15, 1, struct fs_sysfs_path)
+/* Get protection info capability details */
+#define FS_IOC_GETPICAP			_IOR('f', 3, struct fs_pi_cap)

  /*
   * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
-- 
2.25.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-05-29  7:12     ` Anuj Gupta/Anuj Gupta
@ 2025-05-29 17:59       ` Eric Biggers
  2025-05-30  5:24         ` Christian Brauner
  2025-05-29 21:14       ` [RFC] " Andreas Dilger
  2025-06-03  3:12       ` [RFC] fs: " Martin K. Petersen
  2 siblings, 1 reply; 13+ messages in thread
From: Eric Biggers @ 2025-05-29 17:59 UTC (permalink / raw)
  To: Anuj Gupta/Anuj Gupta
  Cc: Martin K. Petersen, jack, anuj1072538, axboe, viro, brauner, hch,
	linux-block, linux-fsdevel, joshi.k

On Thu, May 29, 2025 at 12:42:45PM +0530, Anuj Gupta/Anuj Gupta wrote:
> On 5/29/2025 8:32 AM, Martin K. Petersen wrote:
> > 
> > Hi Anuj!
> > 
> > Thanks for working on this!
> > 
> Hi Martin,
> Thanks for the feedback!
> 
> >> 4. tuple_size: size (in bytes) of the protection information tuple.
> >> 6. pi_offset: offset of protection info within the tuple.
> > 
> > I find this a little confusing. The T10 PI tuple is <guard, app, ref>.
> > 
> > I acknowledge things currently are a bit muddy in the block layer since
> > tuple_size has been transmogrified to hold the NVMe metadata size.
> > 
> > But for a new user-visible interface I think we should make the
> > terminology clear. The tuple is the PI and not the rest of the metadata.
> > 
> > So I think you'd want:
> > 
> > 4. metadata_size: size (in bytes) of the metadata associated with each interval.
> > 6. pi_offset: offset of protection information tuple within the metadata.
> > 
> 
> Yes, this representation looks better. Will make this change.
> 
> >> +#define	FILE_PI_CAP_INTEGRITY		(1 << 0)
> >> +#define	FILE_PI_CAP_REFTAG		(1 << 1)
> > 
> > You'll also need to have corresponding uapi defines for:
> > 
> > enum blk_integrity_checksum {
> >          BLK_INTEGRITY_CSUM_NONE         = 0,
> >          BLK_INTEGRITY_CSUM_IP           = 1,
> >          BLK_INTEGRITY_CSUM_CRC          = 2,
> >          BLK_INTEGRITY_CSUM_CRC64        = 3,
> > } __packed ;
> >
> 
> Right, I'll add these definitions to the UAPI.

Would it make sense to give the CRCs clearer names?  For example CRC16_T10DIF
and CRC64_NVME.

- Eric

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] add ioctl to query protection info capabilities
  2025-05-29  7:12     ` Anuj Gupta/Anuj Gupta
  2025-05-29 17:59       ` Eric Biggers
@ 2025-05-29 21:14       ` Andreas Dilger
  2025-06-03  3:12       ` [RFC] fs: " Martin K. Petersen
  2 siblings, 0 replies; 13+ messages in thread
From: Andreas Dilger @ 2025-05-29 21:14 UTC (permalink / raw)
  To: Anuj Gupta/Anuj Gupta
  Cc: Martin Petersen, jack, anuj1072538, axboe, viro, brauner, hch,
	linux-block, linux-fsdevel, joshi.k

[-- Attachment #1: Type: text/plain, Size: 2846 bytes --]

On May 29, 2025, at 1:12 AM, Anuj Gupta/Anuj Gupta <anuj20.g@samsung.com> wrote:
> +/* Protection info capability flags */
> +#define	FILE_PI_CAP_INTEGRITY		(1 << 0)
> +#define	FILE_PI_CAP_REFTAG		(1 << 1)
> +
> +/* Checksum types for Protection Information */
> +#define FS_PI_CSUM_NONE			0
> +#define FS_PI_CSUM_IP			1
> +#define FS_PI_CSUM_CRC			2
> +#define FS_PI_CSUM_CRC64		3
> +
> +/*
> + * struct fs_pi_cap - protection information(PI) capability descriptor
> + * @flags:			Bitmask of capability flags
> + * @interval:			Number of bytes of data per PI tuple
> + * @csum_type:			Checksum type
> + * @metadata_size:		Size in bytes of the metadata associated with each
> interval
> + * @tag_size:			Size of the tag area within the tuple
> + * @pi_offset:			Offset of protection information tuple within the metadata
> + * @ref_tag_size:		Size in bytes of the reference tag
> + * @storage_tag_size:		Size in bytes of the storage tag
> + * @rsvd:			Reserved for future use
> + */
> +struct fs_pi_cap {
> +	__u32	flags;

Minor nits on the struct.

It would be preferable to have a struct prefix on these fields, like "fpc_"
so that tags for "flags" don't return a million different structs.

> +	__u16	interval;
> +	__u8	csum_type;
> +	__u8	tuple_size;
> +	__u8	tag_size;
> +	__u8	pi_offset;
> +	__u8	ref_tag_size;
> +	__u8	storage_tag_size;
> +	__u8	rsvd[4];
> +};

It seems strange to have padding to align this struct to a 20-byte size.
Having 4 bytes of padding is probably insufficient for future expansion
(e.g. just a single int if that was needed), and 20 bytes isn't exactly
a "normal" power-of-two size.

Since ioctls take the size of the struct, you could either remove rsvd
entirely, and use the struct size to "version" the ioctl if it needs to
change (at the cost of consuming more ioctls), or expand the struct to
be large enough to allow proper expansion in the future, like 32 bytes
with fpc_rsvd[24] (using "flags" to indicate the validity of the new fields).

>  #define FS_IOC_GETFSSYSFSPATH		_IOR(0x15, 1, struct fs_sysfs_path)
> +/* Get protection info capability details */
> +#define FS_IOC_GETPICAP			_IOR('f', 3, struct fs_pi_cap)

Note that _IOR('f', 3, int) is already used as EXT4_IOC32_GETVERSION, though
this does not strictly conflict because of the different struct size.

At a minimum, FS_IOC_GETPICAP should be declared right after FS_IOC_SETFLAGS
_IOR('f', 2,) so that it is clear that _IOR('f', 3,) is used.

However, in Documentation/userspace-api/ioctl/ioctl-number.rst it reports
that 'f' 00-1f is reserved for ext4, while 0x15 00-ff is reserved for generic
FS_IOC_* ioctls, so that would be the better one to use.  Currently only
_IOR(0x15, 0,) and _IOR(0x15, 1,) are used, so _IOR(0x15, 3) should be safe.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-05-29 17:59       ` Eric Biggers
@ 2025-05-30  5:24         ` Christian Brauner
  2025-06-03 18:43           ` Anuj gupta
  0 siblings, 1 reply; 13+ messages in thread
From: Christian Brauner @ 2025-05-30  5:24 UTC (permalink / raw)
  To: Eric Biggers, Amir Goldstein
  Cc: Anuj Gupta/Anuj Gupta, Martin K. Petersen, jack, anuj1072538,
	axboe, viro, hch, linux-block, linux-fsdevel, joshi.k

On Thu, May 29, 2025 at 05:59:34PM +0000, Eric Biggers wrote:
> On Thu, May 29, 2025 at 12:42:45PM +0530, Anuj Gupta/Anuj Gupta wrote:
> > On 5/29/2025 8:32 AM, Martin K. Petersen wrote:
> > > 
> > > Hi Anuj!
> > > 
> > > Thanks for working on this!
> > > 
> > Hi Martin,
> > Thanks for the feedback!
> > 
> > >> 4. tuple_size: size (in bytes) of the protection information tuple.
> > >> 6. pi_offset: offset of protection info within the tuple.
> > > 
> > > I find this a little confusing. The T10 PI tuple is <guard, app, ref>.
> > > 
> > > I acknowledge things currently are a bit muddy in the block layer since
> > > tuple_size has been transmogrified to hold the NVMe metadata size.
> > > 
> > > But for a new user-visible interface I think we should make the
> > > terminology clear. The tuple is the PI and not the rest of the metadata.
> > > 
> > > So I think you'd want:
> > > 
> > > 4. metadata_size: size (in bytes) of the metadata associated with each interval.
> > > 6. pi_offset: offset of protection information tuple within the metadata.
> > > 
> > 
> > Yes, this representation looks better. Will make this change.
> > 
> > >> +#define	FILE_PI_CAP_INTEGRITY		(1 << 0)
> > >> +#define	FILE_PI_CAP_REFTAG		(1 << 1)
> > > 
> > > You'll also need to have corresponding uapi defines for:
> > > 
> > > enum blk_integrity_checksum {
> > >          BLK_INTEGRITY_CSUM_NONE         = 0,
> > >          BLK_INTEGRITY_CSUM_IP           = 1,
> > >          BLK_INTEGRITY_CSUM_CRC          = 2,
> > >          BLK_INTEGRITY_CSUM_CRC64        = 3,
> > > } __packed ;
> > >
> > 
> > Right, I'll add these definitions to the UAPI.
> 
> Would it make sense to give the CRCs clearer names?  For example CRC16_T10DIF
> and CRC64_NVME.

Hm, I wonder whether we should just make all of this an extension of the
new file_getattr() system call we're about to add instead of adding a
separate ioctl for this.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-05-29  7:12     ` Anuj Gupta/Anuj Gupta
  2025-05-29 17:59       ` Eric Biggers
  2025-05-29 21:14       ` [RFC] " Andreas Dilger
@ 2025-06-03  3:12       ` Martin K. Petersen
  2025-06-03  6:30         ` Christoph Hellwig
  2 siblings, 1 reply; 13+ messages in thread
From: Martin K. Petersen @ 2025-06-03  3:12 UTC (permalink / raw)
  To: Anuj Gupta/Anuj Gupta
  Cc: Martin K. Petersen, jack, anuj1072538, axboe, viro, brauner, hch,
	linux-block, linux-fsdevel, joshi.k


Hi Anuj!

I've been mulling over this for a few days...

> The block layer currently infers this by looking at the csum_type
> (e.g., in blk_integrity_generate). I assumed userspace could do the
> same, so I didn't expose a separate pi_tuple_size field. Do you see
> this differently?

When the block layer data integrity code was originally designed, the
concept of non-PI metadata didn't exist.

Then NVMe came along and we added support for opaque metadata in
addition to the PI.

As a result, the block layer considers the opaque metadata part of the
PI but it technically isn't. It really should be the other way around:
The PI is a subset of the metadata.

It would require quite a bit of rototilling to metadata-ize the block
layer plumbing at this point. But for a new user API, I do think we
should try to align with the architecture outlined in the standards.

-- 
Martin K. Petersen

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-06-03  3:12       ` [RFC] fs: " Martin K. Petersen
@ 2025-06-03  6:30         ` Christoph Hellwig
  2025-06-04  1:48           ` Martin K. Petersen
  0 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2025-06-03  6:30 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Anuj Gupta/Anuj Gupta, jack, anuj1072538, axboe, viro, brauner,
	hch, linux-block, linux-fsdevel, joshi.k

On Mon, Jun 02, 2025 at 11:12:38PM -0400, Martin K. Petersen wrote:
> As a result, the block layer considers the opaque metadata part of the
> PI but it technically isn't. It really should be the other way around:
> The PI is a subset of the metadata.
> 
> It would require quite a bit of rototilling to metadata-ize the block
> layer plumbing at this point. But for a new user API, I do think we
> should try to align with the architecture outlined in the standards.

As in exporting the total metadata size and PI tuple size?


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-05-30  5:24         ` Christian Brauner
@ 2025-06-03 18:43           ` Anuj gupta
  2025-06-04  7:53             ` Christian Brauner
  0 siblings, 1 reply; 13+ messages in thread
From: Anuj gupta @ 2025-06-03 18:43 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Eric Biggers, Amir Goldstein, Anuj Gupta/Anuj Gupta,
	Martin K. Petersen, jack, axboe, viro, hch, linux-block,
	linux-fsdevel, joshi.k

> Hm, I wonder whether we should just make all of this an extension of the
> new file_getattr() system call we're about to add instead of adding a
> separate ioctl for this.

Hi Christian,
Thanks for the suggestion to explore file_getattr() for exposing PI
capabilities. I spent some time evaluating this path.

Block devices don’t implement inode_operations, including fileattr_get,
so invoking file_getattr() on something like /dev/nvme0n1 currently
returns -EOPNOTSUPP.  Supporting this would require introducing
inode_operations, and then wiring up fileattr_get in the block layer.

Given that, I think sticking with an ioctl may be the cleaner approach.
Do you see this differently?

Thanks,
Anuj

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-06-03  6:30         ` Christoph Hellwig
@ 2025-06-04  1:48           ` Martin K. Petersen
  0 siblings, 0 replies; 13+ messages in thread
From: Martin K. Petersen @ 2025-06-04  1:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Martin K. Petersen, Anuj Gupta/Anuj Gupta, jack, anuj1072538,
	axboe, viro, brauner, linux-block, linux-fsdevel, joshi.k


Christoph,

>> It would require quite a bit of rototilling to metadata-ize the block
>> layer plumbing at this point. But for a new user API, I do think we
>> should try to align with the architecture outlined in the standards.
>
> As in exporting the total metadata size and PI tuple size?

Yep. An alternative would be to have uapi defines for the PI tuple size
given each of the checksum types. But I do think it is clearer to make
the sizes explicit in the returned struct.

-- 
Martin K. Petersen

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-06-03 18:43           ` Anuj gupta
@ 2025-06-04  7:53             ` Christian Brauner
  2025-06-04  7:57               ` Christoph Hellwig
  0 siblings, 1 reply; 13+ messages in thread
From: Christian Brauner @ 2025-06-04  7:53 UTC (permalink / raw)
  To: Anuj gupta, hch
  Cc: Eric Biggers, Amir Goldstein, Anuj Gupta/Anuj Gupta,
	Martin K. Petersen, jack, axboe, viro, linux-block, linux-fsdevel,
	joshi.k

On Wed, Jun 04, 2025 at 12:13:38AM +0530, Anuj gupta wrote:
> > Hm, I wonder whether we should just make all of this an extension of the
> > new file_getattr() system call we're about to add instead of adding a
> > separate ioctl for this.
> 
> Hi Christian,
> Thanks for the suggestion to explore file_getattr() for exposing PI
> capabilities. I spent some time evaluating this path.
> 
> Block devices don’t implement inode_operations, including fileattr_get,
> so invoking file_getattr() on something like /dev/nvme0n1 currently
> returns -EOPNOTSUPP.  Supporting this would require introducing
> inode_operations, and then wiring up fileattr_get in the block layer.
> 
> Given that, I think sticking with an ioctl may be the cleaner approach.
> Do you see this differently?

Would it be so bad to add custom inode operations?
It's literally just something like:

diff --git a/block/bdev.c b/block/bdev.c
index b77ddd12dc06..9b4f76e2afca 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -453,6 +453,11 @@ void __init bdev_cache_init(void)
        blockdev_superblock = blockdev_mnt->mnt_sb;   /* For writeback */
 }

+static const struct inode_operations bdev_inode_operations = {
+       .fileattr_get   = bdev_file_attr_get,
+       .fileattr_set   = bdev_file_attr_set,
+}
+
 struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 {
        struct block_device *bdev;
@@ -462,6 +467,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
        if (!inode)
                return NULL;
        inode->i_mode = S_IFBLK;
+       inode->i_op = &bdev_inode_operations;
        inode->i_rdev = 0;
        inode->i_data.a_ops = &def_blk_aops;
        mapping_set_gfp_mask(&inode->i_data, GFP_USER);


instead of using empty_iops.

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-06-04  7:53             ` Christian Brauner
@ 2025-06-04  7:57               ` Christoph Hellwig
  2025-06-05  8:24                 ` Christian Brauner
  0 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2025-06-04  7:57 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Anuj gupta, hch, Eric Biggers, Amir Goldstein,
	Anuj Gupta/Anuj Gupta, Martin K. Petersen, jack, axboe, viro,
	linux-block, linux-fsdevel, joshi.k

On Wed, Jun 04, 2025 at 09:53:10AM +0200, Christian Brauner wrote:
> On Wed, Jun 04, 2025 at 12:13:38AM +0530, Anuj gupta wrote:
> > > Hm, I wonder whether we should just make all of this an extension of the
> > > new file_getattr() system call we're about to add instead of adding a
> > > separate ioctl for this.
> > 
> > Hi Christian,
> > Thanks for the suggestion to explore file_getattr() for exposing PI
> > capabilities. I spent some time evaluating this path.
> > 
> > Block devices don’t implement inode_operations, including fileattr_get,
> > so invoking file_getattr() on something like /dev/nvme0n1 currently
> > returns -EOPNOTSUPP.  Supporting this would require introducing
> > inode_operations, and then wiring up fileattr_get in the block layer.
> > 
> > Given that, I think sticking with an ioctl may be the cleaner approach.
> > Do you see this differently?
> 
> Would it be so bad to add custom inode operations?
> It's literally just something like:

That doesn't help as the inode operations for the underlying block device
inode won't ever be called.  The visible inode is the block device node
in the containing file system.

Given fileattr get/set seems to be about the actual files in the file
system, using them for something that affects the I/O path for block
device nodes does not feel like a good fit.  And I think the reason they
are inode and not file operations is exactly to be able to cover
attributes always controlled by the containing file system.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] fs: add ioctl to query protection info capabilities
  2025-06-04  7:57               ` Christoph Hellwig
@ 2025-06-05  8:24                 ` Christian Brauner
  0 siblings, 0 replies; 13+ messages in thread
From: Christian Brauner @ 2025-06-05  8:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Anuj gupta, Eric Biggers, Amir Goldstein, Anuj Gupta/Anuj Gupta,
	Martin K. Petersen, jack, axboe, viro, linux-block, linux-fsdevel,
	joshi.k

On Wed, Jun 04, 2025 at 12:57:18AM -0700, Christoph Hellwig wrote:
> On Wed, Jun 04, 2025 at 09:53:10AM +0200, Christian Brauner wrote:
> > On Wed, Jun 04, 2025 at 12:13:38AM +0530, Anuj gupta wrote:
> > > > Hm, I wonder whether we should just make all of this an extension of the
> > > > new file_getattr() system call we're about to add instead of adding a
> > > > separate ioctl for this.
> > > 
> > > Hi Christian,
> > > Thanks for the suggestion to explore file_getattr() for exposing PI
> > > capabilities. I spent some time evaluating this path.
> > > 
> > > Block devices don’t implement inode_operations, including fileattr_get,
> > > so invoking file_getattr() on something like /dev/nvme0n1 currently
> > > returns -EOPNOTSUPP.  Supporting this would require introducing
> > > inode_operations, and then wiring up fileattr_get in the block layer.
> > > 
> > > Given that, I think sticking with an ioctl may be the cleaner approach.
> > > Do you see this differently?
> > 
> > Would it be so bad to add custom inode operations?
> > It's literally just something like:
> 
> That doesn't help as the inode operations for the underlying block device
> inode won't ever be called.  The visible inode is the block device node
> in the containing file system.

Ah, it's the same thing as with sockets, I forgot about that. Thanks.

> Given fileattr get/set seems to be about the actual files in the file
> system, using them for something that affects the I/O path for block
> device nodes does not feel like a good fit.  And I think the reason they
> are inode and not file operations is exactly to be able to cover
> attributes always controlled by the containing file system.

Yes, that seems fine then.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2025-06-05  8:24 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <CGME20250527105950epcas5p1b53753ab614bf6bde4ffbf5165c7d263@epcas5p1.samsung.com>
2025-05-27 10:42 ` [RFC] fs: add ioctl to query protection info capabilities Anuj Gupta
2025-05-29  3:02   ` Martin K. Petersen
2025-05-29  7:12     ` Anuj Gupta/Anuj Gupta
2025-05-29 17:59       ` Eric Biggers
2025-05-30  5:24         ` Christian Brauner
2025-06-03 18:43           ` Anuj gupta
2025-06-04  7:53             ` Christian Brauner
2025-06-04  7:57               ` Christoph Hellwig
2025-06-05  8:24                 ` Christian Brauner
2025-05-29 21:14       ` [RFC] " Andreas Dilger
2025-06-03  3:12       ` [RFC] fs: " Martin K. Petersen
2025-06-03  6:30         ` Christoph Hellwig
2025-06-04  1:48           ` Martin K. Petersen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).