* [PATCH rc v7 1/4] iommu/arm-smmu-v3: Add update_safe bits to fix STE update sequence
2026-01-15 18:23 [PATCH rc v7 0/4] iommu/arm-smmu-v3: Fix hitless STE update in nesting cases Nicolin Chen
@ 2026-01-15 18:23 ` Nicolin Chen
2026-01-15 18:23 ` [PATCH rc v7 2/4] iommu/arm-smmu-v3: Mark STE MEV safe when computing the " Nicolin Chen
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Nicolin Chen @ 2026-01-15 18:23 UTC (permalink / raw)
To: will
Cc: jgg, robin.murphy, joro, linux-arm-kernel, iommu, linux-kernel,
skolothumtho, praan, xueshuai, smostafa
From: Jason Gunthorpe <jgg@nvidia.com>
C_BAD_STE was observed when updating a nested STE from an S1-bypass mode to
an S1DSS-bypass mode. As both modes enable S2, the used bits are slightly
different from those of the normal S1-bypass and S1DSS-bypass modes. As a
result, fields like MEV and EATS in S2's used list marked word1 as a
critical word, which requested an STE.V=0 update. This breaks a hitless
update.
However, neither MEV nor EATS is critical in terms of STE update. One
controls the merging of events and the other controls ATS, which is
managed by the driver at the same time via pci_enable_ats().
Add arm_smmu_get_ste_update_safe() to allow the STE update algorithm to
relax those fields, avoiding the STE update breakages.
After this change, entry_set has no caller checking its return value, so
change it to void.
Note that this change is required by both MEV and EATS fields, which were
introduced in different kernel versions. So add get_update_safe() first.
MEV and EATS will be added to arm_smmu_get_ste_update_safe() separately.
Fixes: 1e8be08d1c91 ("iommu/arm-smmu-v3: Support IOMMU_DOMAIN_NESTED")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Shuai Xue <xueshuai@linux.alibaba.com>
Reviewed-by: Mostafa Saleh <smostafa@google.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 4 +++
.../iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c | 31 +++++++++++++++++--
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 28 ++++++++++++-----
3 files changed, 53 insertions(+), 10 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index ae23aacc3840..287e223c054d 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -900,6 +900,8 @@ struct arm_smmu_entry_writer {
struct arm_smmu_entry_writer_ops {
void (*get_used)(const __le64 *entry, __le64 *used);
+ void (*get_update_safe)(const __le64 *cur, const __le64 *target,
+ __le64 *safe_bits);
void (*sync)(struct arm_smmu_entry_writer *writer);
};
@@ -911,6 +913,8 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
#if IS_ENABLED(CONFIG_KUNIT)
void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits);
+void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target,
+ __le64 *safe_bits);
void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur,
const __le64 *target);
void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
index d2671bfd3798..b254a94b2003 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
@@ -38,13 +38,16 @@ enum arm_smmu_test_master_feat {
static bool arm_smmu_entry_differs_in_used_bits(const __le64 *entry,
const __le64 *used_bits,
const __le64 *target,
+ const __le64 *safe,
unsigned int length)
{
bool differs = false;
unsigned int i;
for (i = 0; i < length; i++) {
- if ((entry[i] & used_bits[i]) != target[i])
+ __le64 used = used_bits[i] & ~safe[i];
+
+ if ((entry[i] & used) != (target[i] & used))
differs = true;
}
return differs;
@@ -56,12 +59,24 @@ arm_smmu_test_writer_record_syncs(struct arm_smmu_entry_writer *writer)
struct arm_smmu_test_writer *test_writer =
container_of(writer, struct arm_smmu_test_writer, writer);
__le64 *entry_used_bits;
+ __le64 *safe_target;
+ __le64 *safe_init;
entry_used_bits = kunit_kzalloc(
test_writer->test, sizeof(*entry_used_bits) * NUM_ENTRY_QWORDS,
GFP_KERNEL);
KUNIT_ASSERT_NOT_NULL(test_writer->test, entry_used_bits);
+ safe_target = kunit_kzalloc(test_writer->test,
+ sizeof(*safe_target) * NUM_ENTRY_QWORDS,
+ GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test_writer->test, safe_target);
+
+ safe_init = kunit_kzalloc(test_writer->test,
+ sizeof(*safe_init) * NUM_ENTRY_QWORDS,
+ GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test_writer->test, safe_init);
+
pr_debug("STE value is now set to: ");
print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8,
test_writer->entry,
@@ -79,14 +94,23 @@ arm_smmu_test_writer_record_syncs(struct arm_smmu_entry_writer *writer)
* configuration.
*/
writer->ops->get_used(test_writer->entry, entry_used_bits);
+ if (writer->ops->get_update_safe)
+ writer->ops->get_update_safe(test_writer->entry,
+ test_writer->init_entry,
+ safe_init);
+ if (writer->ops->get_update_safe)
+ writer->ops->get_update_safe(test_writer->entry,
+ test_writer->target_entry,
+ safe_target);
KUNIT_EXPECT_FALSE(
test_writer->test,
arm_smmu_entry_differs_in_used_bits(
test_writer->entry, entry_used_bits,
- test_writer->init_entry, NUM_ENTRY_QWORDS) &&
+ test_writer->init_entry, safe_init,
+ NUM_ENTRY_QWORDS) &&
arm_smmu_entry_differs_in_used_bits(
test_writer->entry, entry_used_bits,
- test_writer->target_entry,
+ test_writer->target_entry, safe_target,
NUM_ENTRY_QWORDS));
}
}
@@ -106,6 +130,7 @@ arm_smmu_v3_test_debug_print_used_bits(struct arm_smmu_entry_writer *writer,
static const struct arm_smmu_entry_writer_ops test_ste_ops = {
.sync = arm_smmu_test_writer_record_syncs,
.get_used = arm_smmu_get_ste_used,
+ .get_update_safe = arm_smmu_get_ste_update_safe,
};
static const struct arm_smmu_entry_writer_ops test_cd_ops = {
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index d16d35c78c06..390446d259ab 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1082,6 +1082,13 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
}
EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_ste_used);
+VISIBLE_IF_KUNIT
+void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target,
+ __le64 *safe_bits)
+{
+}
+EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_ste_update_safe);
+
/*
* Figure out if we can do a hitless update of entry to become target. Returns a
* bit mask where 1 indicates that qword needs to be set disruptively.
@@ -1094,13 +1101,22 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
{
__le64 target_used[NUM_ENTRY_QWORDS] = {};
__le64 cur_used[NUM_ENTRY_QWORDS] = {};
+ __le64 safe[NUM_ENTRY_QWORDS] = {};
u8 used_qword_diff = 0;
unsigned int i;
writer->ops->get_used(entry, cur_used);
writer->ops->get_used(target, target_used);
+ if (writer->ops->get_update_safe)
+ writer->ops->get_update_safe(entry, target, safe);
for (i = 0; i != NUM_ENTRY_QWORDS; i++) {
+ /*
+ * Safe is only used for bits that are used by both entries,
+ * otherwise it is sequenced according to the unused entry.
+ */
+ safe[i] &= target_used[i] & cur_used[i];
+
/*
* Check that masks are up to date, the make functions are not
* allowed to set a bit to 1 if the used function doesn't say it
@@ -1109,6 +1125,7 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
WARN_ON_ONCE(target[i] & ~target_used[i]);
/* Bits can change because they are not currently being used */
+ cur_used[i] &= ~safe[i];
unused_update[i] = (entry[i] & cur_used[i]) |
(target[i] & ~cur_used[i]);
/*
@@ -1121,7 +1138,7 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
return used_qword_diff;
}
-static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
+static void entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
const __le64 *target, unsigned int start,
unsigned int len)
{
@@ -1137,7 +1154,6 @@ static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
if (changed)
writer->ops->sync(writer);
- return changed;
}
/*
@@ -1207,12 +1223,9 @@ void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
entry_set(writer, entry, target, 0, 1);
} else {
/*
- * No inuse bit changed. Sanity check that all unused bits are 0
- * in the entry. The target was already sanity checked by
- * compute_qword_diff().
+ * No inuse bit changed, though safe bits may have changed.
*/
- WARN_ON_ONCE(
- entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS));
+ entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS);
}
}
EXPORT_SYMBOL_IF_KUNIT(arm_smmu_write_entry);
@@ -1543,6 +1556,7 @@ static void arm_smmu_ste_writer_sync_entry(struct arm_smmu_entry_writer *writer)
static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = {
.sync = arm_smmu_ste_writer_sync_entry,
.get_used = arm_smmu_get_ste_used,
+ .get_update_safe = arm_smmu_get_ste_update_safe,
};
static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
--
2.43.0
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH rc v7 3/4] iommu/arm-smmu-v3: Mark EATS_TRANS safe when computing the update sequence
2026-01-15 18:23 [PATCH rc v7 0/4] iommu/arm-smmu-v3: Fix hitless STE update in nesting cases Nicolin Chen
2026-01-15 18:23 ` [PATCH rc v7 1/4] iommu/arm-smmu-v3: Add update_safe bits to fix STE update sequence Nicolin Chen
2026-01-15 18:23 ` [PATCH rc v7 2/4] iommu/arm-smmu-v3: Mark STE MEV safe when computing the " Nicolin Chen
@ 2026-01-15 18:23 ` Nicolin Chen
2026-01-15 18:23 ` [PATCH rc v7 4/4] iommu/arm-smmu-v3-test: Add nested s1bypass/s1dssbypass coverage Nicolin Chen
2026-01-23 17:49 ` [PATCH rc v7 0/4] iommu/arm-smmu-v3: Fix hitless STE update in nesting cases Will Deacon
4 siblings, 0 replies; 6+ messages in thread
From: Nicolin Chen @ 2026-01-15 18:23 UTC (permalink / raw)
To: will
Cc: jgg, robin.murphy, joro, linux-arm-kernel, iommu, linux-kernel,
skolothumtho, praan, xueshuai, smostafa
From: Jason Gunthorpe <jgg@nvidia.com>
If a VM wants to toggle EATS_TRANS off at the same time as changing the CFG,
the hypervisor will see EATS change to 0 and insert a V=0 breaking update
into the STE even though the VM did not ask for that.
In bare metal, EATS_TRANS is ignored by CFG=ABORT/BYPASS, which is why this
does not cause a problem until we have the nested case where CFG is always
a variation of S2 trans that does use EATS_TRANS.
Relax the rules for EATS_TRANS sequencing: we don't need it to be exact, as
the enclosing code will always disable ATS at the PCI device when changing
EATS_TRANS. This ensures there are no ATS transactions that can race with
an EATS_TRANS change, so we don't need to carefully sequence these bits.
Fixes: 1e8be08d1c91 ("iommu/arm-smmu-v3: Support IOMMU_DOMAIN_NESTED")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Shuai Xue <xueshuai@linux.alibaba.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 26 +++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index ccd6357fa5a8..77a87af5c673 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1086,6 +1086,32 @@ VISIBLE_IF_KUNIT
void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target,
__le64 *safe_bits)
{
+ const __le64 eats_s1chk =
+ FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_S1CHK);
+ const __le64 eats_trans =
+ FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS);
+
+ /*
+ * When an STE changes EATS_TRANS, the sequencing code in the attach
+ * logic already will have the PCI cap for ATS disabled. Thus at this
+ * moment we can expect that the device will not generate ATS queries
+ * and so we don't care about the sequencing of EATS. The purpose of
+ * EATS_TRANS is to protect the system from hostile untrusted devices
+ * that issue ATS when the PCI config space is disabled. However, if
+ * EATS_TRANS is being changed, then we must have already trusted the
+ * device as the EATS_TRANS security block is being disabled.
+ *
+ * Note: now the EATS_TRANS update is moved to the first entry_set().
+ * Changing S2S and EATS might transiently result in S2S=1 and EATS=1
+ * which is a bad STE (see "5.2 Stream Table Entry"). In such a case,
+ * we can't do a hitless update. Also, it should not be added to the
+ * safe bits with STRTAB_STE_1_EATS_S1CHK, because EATS=0b11 would be
+ * effectively an errant 0b00 configuration.
+ */
+ if (!((cur[1] | target[1]) & cpu_to_le64(eats_s1chk)) &&
+ !((cur[2] | target[2]) & cpu_to_le64(STRTAB_STE_2_S2S)))
+ safe_bits[1] |= cpu_to_le64(eats_trans);
+
/*
* MEV does not meaningfully impact the operation of the HW, it only
* changes how many fault events are generated, thus we can relax it
--
2.43.0
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH rc v7 4/4] iommu/arm-smmu-v3-test: Add nested s1bypass/s1dssbypass coverage
2026-01-15 18:23 [PATCH rc v7 0/4] iommu/arm-smmu-v3: Fix hitless STE update in nesting cases Nicolin Chen
` (2 preceding siblings ...)
2026-01-15 18:23 ` [PATCH rc v7 3/4] iommu/arm-smmu-v3: Mark EATS_TRANS " Nicolin Chen
@ 2026-01-15 18:23 ` Nicolin Chen
2026-01-23 17:49 ` [PATCH rc v7 0/4] iommu/arm-smmu-v3: Fix hitless STE update in nesting cases Will Deacon
4 siblings, 0 replies; 6+ messages in thread
From: Nicolin Chen @ 2026-01-15 18:23 UTC (permalink / raw)
To: will
Cc: jgg, robin.murphy, joro, linux-arm-kernel, iommu, linux-kernel,
skolothumtho, praan, xueshuai, smostafa
An STE in a nested case requires both S1 and S2 fields, which makes this use
case different from the existing one.
Add coverage for the previously failing cases that shift between S2-only and
S1+S2 STEs.
Reviewed-by: Shuai Xue <xueshuai@linux.alibaba.com>
Reviewed-by: Mostafa Saleh <smostafa@google.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
.../iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c | 47 +++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
index b254a94b2003..69c9ef441fc1 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
@@ -33,8 +33,12 @@ static struct mm_struct sva_mm = {
enum arm_smmu_test_master_feat {
ARM_SMMU_MASTER_TEST_ATS = BIT(0),
ARM_SMMU_MASTER_TEST_STALL = BIT(1),
+ ARM_SMMU_MASTER_TEST_NESTED = BIT(2),
};
+static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste,
+ enum arm_smmu_test_master_feat feat);
+
static bool arm_smmu_entry_differs_in_used_bits(const __le64 *entry,
const __le64 *used_bits,
const __le64 *target,
@@ -210,6 +214,18 @@ static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste,
};
arm_smmu_make_cdtable_ste(ste, &master, ats_enabled, s1dss);
+ if (feat & ARM_SMMU_MASTER_TEST_NESTED) {
+ struct arm_smmu_ste s2ste;
+ int i;
+
+ arm_smmu_test_make_s2_ste(&s2ste,
+ feat & ~ARM_SMMU_MASTER_TEST_NESTED);
+ ste->data[0] |= cpu_to_le64(
+ FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_NESTED));
+ ste->data[1] |= cpu_to_le64(STRTAB_STE_1_MEV);
+ for (i = 2; i < NUM_ENTRY_QWORDS; i++)
+ ste->data[i] = s2ste.data[i];
+ }
}
static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test)
@@ -567,6 +583,35 @@ static void arm_smmu_v3_write_ste_test_s2_to_s1_stall(struct kunit *test)
NUM_EXPECTED_SYNCS(3));
}
+static void
+arm_smmu_v3_write_ste_test_nested_s1dssbypass_to_s1bypass(struct kunit *test)
+{
+ struct arm_smmu_ste s1_ste;
+ struct arm_smmu_ste s2_ste;
+
+ arm_smmu_test_make_cdtable_ste(
+ &s1_ste, STRTAB_STE_1_S1DSS_BYPASS, fake_cdtab_dma_addr,
+ ARM_SMMU_MASTER_TEST_ATS | ARM_SMMU_MASTER_TEST_NESTED);
+ arm_smmu_test_make_s2_ste(&s2_ste, 0);
+ /* Expect an additional sync to unset ignored bits: EATS and MEV */
+ arm_smmu_v3_test_ste_expect_hitless_transition(test, &s1_ste, &s2_ste,
+ NUM_EXPECTED_SYNCS(3));
+}
+
+static void
+arm_smmu_v3_write_ste_test_nested_s1bypass_to_s1dssbypass(struct kunit *test)
+{
+ struct arm_smmu_ste s1_ste;
+ struct arm_smmu_ste s2_ste;
+
+ arm_smmu_test_make_cdtable_ste(
+ &s1_ste, STRTAB_STE_1_S1DSS_BYPASS, fake_cdtab_dma_addr,
+ ARM_SMMU_MASTER_TEST_ATS | ARM_SMMU_MASTER_TEST_NESTED);
+ arm_smmu_test_make_s2_ste(&s2_ste, 0);
+ arm_smmu_v3_test_ste_expect_hitless_transition(test, &s2_ste, &s1_ste,
+ NUM_EXPECTED_SYNCS(2));
+}
+
static void arm_smmu_v3_write_cd_test_sva_clear(struct kunit *test)
{
struct arm_smmu_cd cd = {};
@@ -613,6 +658,8 @@ static struct kunit_case arm_smmu_v3_test_cases[] = {
KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_change_asid),
KUNIT_CASE(arm_smmu_v3_write_ste_test_s1_to_s2_stall),
KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_s1_stall),
+ KUNIT_CASE(arm_smmu_v3_write_ste_test_nested_s1dssbypass_to_s1bypass),
+ KUNIT_CASE(arm_smmu_v3_write_ste_test_nested_s1bypass_to_s1dssbypass),
KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_clear),
KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_release),
{},
--
2.43.0
^ permalink raw reply related [flat|nested] 6+ messages in thread