* [PATCH v3 1/2] dmaengine: idxd: Expose DSA3.0 capabilities through sysfs
2025-08-21 8:51 [PATCH v3 0/2] dmaengine: idxd: Add basic DSA 3.0 capability and SGL support Yi Sun
@ 2025-08-21 8:51 ` Yi Sun
2025-09-02 7:25 ` Vinod Koul
2025-08-21 8:51 ` [PATCH v3 2/2] dmaengine: idxd: Add Max SGL Size Support for DSA3.0 Yi Sun
1 sibling, 1 reply; 6+ messages in thread
From: Yi Sun @ 2025-08-21 8:51 UTC (permalink / raw)
To: vkoul
Cc: vinicius.gomes, dave.jiang, dmaengine, linux-kernel, yi.sun,
gordon.jin, fenghuay, yi1.lai, Anil S Keshavamurthy
Introduce sysfs interfaces for 3 new Data Streaming Accelerator (DSA)
capability registers (dsacap0-2) to enable userspace awareness of hardware
features in DSA version 3 and later devices.
Userspace components (e.g. configure libraries, workload Apps) require this
information to:
1. Select optimal data transfer strategies based on SGL capabilities
2. Enable hardware-specific optimizations for floating-point operations
3. Configure memory operations with proper numerical handling
4. Verify compute operation compatibility before submitting jobs
The output format is <dsacap2>,<dsacap1>,<dsacap0>, where each DSA
capability value is a 64-bit hexadecimal number, separated by commas.
The ordering follows the DSA 3.0 specification layout:
Offset: 0x190 0x188 0x180
Reg: dsacap2 dsacap1 dsacap0
Example:
cat /sys/bus/dsa/devices/dsa0/dsacaps
000000000000f18d,0014000e000007aa,00fa01ff01ff03ff
According to the DSA 3.0 specification, there are 15 fields defined for
the three dsacap registers. However, there's no need to define all
register structures unless a use case requires them. At this point,
support for the Scatter-Gather List (SGL) located in dsacap0 is necessary,
so only dsacap0 is defined accordingly.
For reference, the DSA 3.0 specification is available at:
Link: https://software.intel.com/content/www/us/en/develop/articles/intel-data-streaming-accelerator-architecture-specification.html
Signed-off-by: Yi Sun <yi.sun@intel.com>
Co-developed-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Yi Lai <yi1.lai@intel.com>
Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd
index 4a355e6747ae..bd281063d626 100644
--- a/Documentation/ABI/stable/sysfs-driver-dma-idxd
+++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd
@@ -136,6 +136,21 @@ Description: The last executed device administrative command's status/error.
Also last configuration error overloaded.
Writing to it will clear the status.
+What: /sys/bus/dsa/devices/dsa<m>/dsacaps
+Date: Oct 5, 2025
+KernelVersion: 6.17.0
+Contact: dmaengine@vger.kernel.org
+Description: The DSA3 specification introduces three new capability
+ registers: dsacap[0-2]. User components (e.g., configuration
+ libraries and workload applications) require this information
+ to properly utilize the DSA3 features.
+ This includes SGL capability support, Enabling hardware-specific
+ optimizations, Configuring memory, etc.
+ The output format is '<dsacap2>,<dsacap1>,<dsacap0>' where each
+ DSA cap value is a 64 bit hex value.
+ This attribute should only be visible on DSA devices of version
+ 3 or later.
+
What: /sys/bus/dsa/devices/dsa<m>/iaa_cap
Date: Sept 14, 2022
KernelVersion: 6.0.0
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 74e6695881e6..cc0a3fe1c957 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -252,6 +252,9 @@ struct idxd_hw {
struct opcap opcap;
u32 cmd_cap;
union iaa_cap_reg iaa_cap;
+ union dsacap0_reg dsacap0;
+ union dsacap1_reg dsacap1;
+ union dsacap2_reg dsacap2;
};
enum idxd_device_state {
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 35bdefd3728b..084df60d407b 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -582,6 +582,12 @@ static void idxd_read_caps(struct idxd_device *idxd)
}
multi_u64_to_bmap(idxd->opcap_bmap, &idxd->hw.opcap.bits[0], 4);
+ if (idxd->hw.version >= DEVICE_VERSION_3) {
+ idxd->hw.dsacap0.bits = ioread64(idxd->reg_base + IDXD_DSACAP0_OFFSET);
+ idxd->hw.dsacap1.bits = ioread64(idxd->reg_base + IDXD_DSACAP1_OFFSET);
+ idxd->hw.dsacap2.bits = ioread64(idxd->reg_base + IDXD_DSACAP2_OFFSET);
+ }
+
/* read iaa cap */
if (idxd->data->type == IDXD_TYPE_IAX && idxd->hw.version >= DEVICE_VERSION_2)
idxd->hw.iaa_cap.bits = ioread64(idxd->reg_base + IDXD_IAACAP_OFFSET);
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index 9c1c546fe443..439bbc311591 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -13,6 +13,7 @@
#define DEVICE_VERSION_1 0x100
#define DEVICE_VERSION_2 0x200
+#define DEVICE_VERSION_3 0x300
#define IDXD_MMIO_BAR 0
#define IDXD_WQ_BAR 2
@@ -582,6 +583,30 @@ union evl_status_reg {
u64 bits;
};
+#define IDXD_DSACAP0_OFFSET 0x180
+union dsacap0_reg {
+ u64 bits;
+ struct {
+ u64 max_sgl_shift:4;
+ u64 max_gr_block_shift:4;
+ u64 ops_inter_domain:7;
+ u64 rsvd1:17;
+ u64 sgl_formats:16;
+ u64 max_sg_process:8;
+ u64 rsvd2:8;
+ };
+};
+
+#define IDXD_DSACAP1_OFFSET 0x188
+union dsacap1_reg {
+ u64 bits;
+};
+
+#define IDXD_DSACAP2_OFFSET 0x190
+union dsacap2_reg {
+ u64 bits;
+};
+
#define IDXD_MAX_BATCH_IDENT 256
struct __evl_entry {
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index 9f0701021af0..cc2c83d7f710 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -1713,6 +1713,18 @@ static ssize_t event_log_size_store(struct device *dev,
}
static DEVICE_ATTR_RW(event_log_size);
+static ssize_t dsacaps_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct idxd_device *idxd = confdev_to_idxd(dev);
+
+ return sysfs_emit(buf, "%016llx,%016llx,%016llx\n",
+ (u64)idxd->hw.dsacap2.bits,
+ (u64)idxd->hw.dsacap1.bits,
+ (u64)idxd->hw.dsacap0.bits);
+}
+static DEVICE_ATTR_RO(dsacaps);
+
static bool idxd_device_attr_max_batch_size_invisible(struct attribute *attr,
struct idxd_device *idxd)
{
@@ -1750,6 +1762,14 @@ static bool idxd_device_attr_event_log_size_invisible(struct attribute *attr,
!idxd->hw.gen_cap.evl_support);
}
+static bool idxd_device_attr_dsacaps_invisible(struct attribute *attr,
+ struct idxd_device *idxd)
+{
+ return attr == &dev_attr_dsacaps.attr &&
+ (idxd->data->type != IDXD_TYPE_DSA ||
+ idxd->hw.version < DEVICE_VERSION_3);
+}
+
static umode_t idxd_device_attr_visible(struct kobject *kobj,
struct attribute *attr, int n)
{
@@ -1768,6 +1788,9 @@ static umode_t idxd_device_attr_visible(struct kobject *kobj,
if (idxd_device_attr_event_log_size_invisible(attr, idxd))
return 0;
+ if (idxd_device_attr_dsacaps_invisible(attr, idxd))
+ return 0;
+
return attr->mode;
}
@@ -1795,6 +1818,7 @@ static struct attribute *idxd_device_attributes[] = {
&dev_attr_cmd_status.attr,
&dev_attr_iaa_cap.attr,
&dev_attr_event_log_size.attr,
+ &dev_attr_dsacaps.attr,
NULL,
};
--
2.43.0
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH v3 2/2] dmaengine: idxd: Add Max SGL Size Support for DSA3.0
2025-08-21 8:51 [PATCH v3 0/2] dmaengine: idxd: Add basic DSA 3.0 capability and SGL support Yi Sun
2025-08-21 8:51 ` [PATCH v3 1/2] dmaengine: idxd: Expose DSA3.0 capabilities through sysfs Yi Sun
@ 2025-08-21 8:51 ` Yi Sun
1 sibling, 0 replies; 6+ messages in thread
From: Yi Sun @ 2025-08-21 8:51 UTC (permalink / raw)
To: vkoul
Cc: vinicius.gomes, dave.jiang, dmaengine, linux-kernel, yi.sun,
gordon.jin, fenghuay, yi1.lai, Anil S Keshavamurthy
Certain DSA 3.0 opcodes, such as Gather copy and Gather reduce, require max
SGL configured for workqueues prior to supporting these opcodes.
Configure the maximum scatter-gather list (SGL) size for workqueues during
setup on the supported HW. Application can then properly handle the SGL
size without explicitly setting it.
Signed-off-by: Yi Sun <yi.sun@intel.com>
Co-developed-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Yi Lai <yi1.lai@intel.com>
Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index 5cf419fe6b46..1c10b030bea7 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -375,6 +375,7 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq)
memset(wq->name, 0, WQ_NAME_SIZE);
wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER;
idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH);
+ idxd_wq_set_init_max_sgl_size(idxd, wq);
if (wq->opcap_bmap)
bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS);
}
@@ -974,6 +975,8 @@ static int idxd_wq_config_write(struct idxd_wq *wq)
/* bytes 12-15 */
wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes);
idxd_wqcfg_set_max_batch_shift(idxd->data->type, wq->wqcfg, ilog2(wq->max_batch_size));
+ if (idxd_sgl_supported(idxd))
+ wq->wqcfg->max_sgl_shift = ilog2(wq->max_sgl_size);
/* bytes 32-63 */
if (idxd->hw.wq_cap.op_config && wq->opcap_bmap) {
@@ -1152,6 +1155,8 @@ static int idxd_wq_load_config(struct idxd_wq *wq)
wq->max_xfer_bytes = 1ULL << wq->wqcfg->max_xfer_shift;
idxd_wq_set_max_batch_size(idxd->data->type, wq, 1U << wq->wqcfg->max_batch_shift);
+ if (idxd_sgl_supported(idxd))
+ wq->max_sgl_size = 1U << wq->wqcfg->max_sgl_shift;
for (i = 0; i < WQCFG_STRIDES(idxd); i++) {
wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, i);
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index cc0a3fe1c957..ea8c4daed38d 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -227,6 +227,7 @@ struct idxd_wq {
char name[WQ_NAME_SIZE + 1];
u64 max_xfer_bytes;
u32 max_batch_size;
+ u32 max_sgl_size;
/* Lock to protect upasid_xa access. */
struct mutex uc_lock;
@@ -348,6 +349,7 @@ struct idxd_device {
u64 max_xfer_bytes;
u32 max_batch_size;
+ u32 max_sgl_size;
int max_groups;
int max_engines;
int max_rdbufs;
@@ -692,6 +694,20 @@ static inline void idxd_wq_set_max_batch_size(int idxd_type, struct idxd_wq *wq,
wq->max_batch_size = max_batch_size;
}
+static bool idxd_sgl_supported(struct idxd_device *idxd)
+{
+ return idxd->data->type == IDXD_TYPE_DSA &&
+ idxd->hw.version >= DEVICE_VERSION_3 &&
+ idxd->hw.dsacap0.sgl_formats;
+}
+
+static inline void idxd_wq_set_init_max_sgl_size(struct idxd_device *idxd,
+ struct idxd_wq *wq)
+{
+ if (idxd_sgl_supported(idxd))
+ wq->max_sgl_size = 1U << idxd->hw.dsacap0.max_sgl_shift;
+}
+
static inline void idxd_wqcfg_set_max_batch_shift(int idxd_type, union wqcfg *wqcfg,
u32 max_batch_shift)
{
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 084df60d407b..395d5486aee6 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -217,6 +217,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd)
init_completion(&wq->wq_resurrect);
wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER;
idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH);
+ idxd_wq_set_init_max_sgl_size(idxd, wq);
wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES;
wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev));
if (!wq->wqcfg) {
@@ -587,6 +588,10 @@ static void idxd_read_caps(struct idxd_device *idxd)
idxd->hw.dsacap1.bits = ioread64(idxd->reg_base + IDXD_DSACAP1_OFFSET);
idxd->hw.dsacap2.bits = ioread64(idxd->reg_base + IDXD_DSACAP2_OFFSET);
}
+ if (idxd_sgl_supported(idxd)) {
+ idxd->max_sgl_size = 1U << idxd->hw.dsacap0.max_sgl_shift;
+ dev_dbg(dev, "max sgl size: %u\n", idxd->max_sgl_size);
+ }
/* read iaa cap */
if (idxd->data->type == IDXD_TYPE_IAX && idxd->hw.version >= DEVICE_VERSION_2)
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index 439bbc311591..c5f344c55a69 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -385,7 +385,8 @@ union wqcfg {
/* bytes 12-15 */
u32 max_xfer_shift:5;
u32 max_batch_shift:4;
- u32 rsvd4:23;
+ u32 max_sgl_shift:4;
+ u32 rsvd4:19;
/* bytes 16-19 */
u16 occupancy_inth;
--
2.43.0
^ permalink raw reply related [flat|nested] 6+ messages in thread