* [PATCH v5 12/20] dma-direct: select DMA address encoding from DMA_ATTR_CC_SHARED
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:28 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Make the dma-direct helpers derive the DMA address encoding from
DMA_ATTR_CC_SHARED instead of implicitly relying on
force_dma_unencrypted() inside phys_to_dma_direct()
Pass an explicit unencrypted/decrypted state into phys_to_dma_direct(),
make the alloc paths return DMA addresses that match the requested buffer
encryption state. Also only call dma_set_decrypted() when
DMA_ATTR_CC_SHARED is actually set.
Tested-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
kernel/dma/direct.c | 42 +++++++++++++++++++++++++-----------------
1 file changed, 25 insertions(+), 17 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index d7eee77f95af..6f3aff8448a0 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -24,11 +24,11 @@
u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
static inline dma_addr_t phys_to_dma_direct(struct device *dev,
- phys_addr_t phys)
+ phys_addr_t phys, bool unencrypted)
{
- if (force_dma_unencrypted(dev))
+ if (unencrypted)
return phys_to_dma_unencrypted(dev, phys);
- return phys_to_dma(dev, phys);
+ return phys_to_dma_encrypted(dev, phys);
}
static inline struct page *dma_direct_to_page(struct device *dev,
@@ -39,8 +39,9 @@ static inline struct page *dma_direct_to_page(struct device *dev,
u64 dma_direct_get_required_mask(struct device *dev)
{
+ bool require_decrypted = force_dma_unencrypted(dev);
phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT;
- u64 max_dma = phys_to_dma_direct(dev, phys);
+ u64 max_dma = phys_to_dma_direct(dev, phys, require_decrypted);
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
@@ -69,7 +70,8 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
- dma_addr_t dma_addr = phys_to_dma_direct(dev, phys);
+ bool require_decrypted = force_dma_unencrypted(dev);
+ dma_addr_t dma_addr = phys_to_dma_direct(dev, phys, require_decrypted);
if (dma_addr == DMA_MAPPING_ERROR)
return false;
@@ -79,17 +81,18 @@ bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
{
- if (!force_dma_unencrypted(dev))
- return 0;
- return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size));
+ int ret;
+
+ ret = set_memory_decrypted((unsigned long)vaddr, PFN_UP(size));
+ if (ret)
+ pr_warn_ratelimited("leaking DMA memory that can't be decrypted\n");
+ return ret;
}
static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
{
int ret;
- if (!force_dma_unencrypted(dev))
- return 0;
ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size));
if (ret)
pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
@@ -169,7 +172,8 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
dma_coherent_ok);
if (!page)
return NULL;
- *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page),
+ !!(attrs & DMA_ATTR_CC_SHARED));
return ret;
}
@@ -185,9 +189,11 @@ static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
/* remove any dirty cache lines on the kernel alias */
if (!PageHighMem(page))
arch_dma_prep_coherent(page, size);
-
- /* return the page pointer as the opaque cookie */
- *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ /*
+ * return the page pointer as the opaque cookie.
+ * Never used for unencrypted allocation
+ */
+ *dma_handle = phys_to_dma_encrypted(dev, page_to_phys(page));
return page;
}
@@ -332,7 +338,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
ret = uncached_cpu_addr;
}
- *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page),
+ !!(attrs & DMA_ATTR_CC_SHARED));
return ret;
@@ -455,11 +462,12 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
return NULL;
ret = page_address(page);
- if (dma_set_decrypted(dev, ret, size))
+ if ((attrs & DMA_ATTR_CC_SHARED) && dma_set_decrypted(dev, ret, size))
goto out_leak_pages;
setup_page:
memset(ret, 0, size);
- *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page),
+ !!(attrs & DMA_ATTR_CC_SHARED));
return page;
out_leak_pages:
return NULL;
--
2.43.0
^ permalink raw reply related
* [PATCH v5 11/20] dma-direct: set decrypted flag for remapped DMA allocations
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:28 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Devices that are DMA non-coherent and require a remap were skipping
dma_set_decrypted(), leaving DMA buffers encrypted even when the device
requires unencrypted access. Move the call after the if (remap) branch
so that both the direct and remapped allocation paths correctly mark the
allocation as decrypted (or fail cleanly) before use.
Fix dma_direct_alloc() and dma_direct_free() to apply set_memory_*() to
the linear-map alias of the backing pages instead of the remapped CPU
address. Also disallow highmem pages for DMA_ATTR_CC_SHARED, because
highmem buffers do not provide a usable linear-map address.
Fixes: f3c962226dbe ("dma-direct: clean up the remapping checks in dma_direct_alloc")
Tested-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
kernel/dma/direct.c | 55 ++++++++++++++++++++++++++++++++++++---------
1 file changed, 44 insertions(+), 11 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 9e65c8432b6e..d7eee77f95af 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -196,6 +196,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
{
bool remap = false, set_uncached = false;
bool mark_mem_decrypt = false;
+ bool allow_highmem = true;
struct page *page;
void *ret;
@@ -214,6 +215,15 @@ void *dma_direct_alloc(struct device *dev, size_t size,
mark_mem_decrypt = true;
}
+ if (attrs & DMA_ATTR_CC_SHARED)
+ /*
+ * Unencrypted/shared DMA requires a linear-mapped buffer
+ * address to look up the PFN and set architecture-required PFN
+ * attributes. This is not possible with HighMem. Avoid HighMem
+ * allocation.
+ */
+ allow_highmem = false;
+
size = PAGE_ALIGN(size);
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
@@ -272,7 +282,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
}
/* we always manually zero the memory once we are done */
- page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, allow_highmem);
if (!page)
return NULL;
@@ -287,6 +297,14 @@ void *dma_direct_alloc(struct device *dev, size_t size,
set_uncached = false;
}
+ if (mark_mem_decrypt) {
+ void *lm_addr;
+
+ lm_addr = page_address(page);
+ if (set_memory_decrypted((unsigned long)lm_addr, PFN_UP(size)))
+ goto out_leak_pages;
+ }
+
if (remap) {
pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
@@ -297,29 +315,36 @@ void *dma_direct_alloc(struct device *dev, size_t size,
ret = dma_common_contiguous_remap(page, size, prot,
__builtin_return_address(0));
if (!ret)
- goto out_free_pages;
+ goto out_encrypt_pages;
} else {
ret = page_address(page);
- if (mark_mem_decrypt && dma_set_decrypted(dev, ret, size))
- goto out_leak_pages;
}
memset(ret, 0, size);
if (set_uncached) {
+ void *uncached_cpu_addr;
+
arch_dma_prep_coherent(page, size);
- ret = arch_dma_set_uncached(ret, size);
- if (IS_ERR(ret))
- goto out_encrypt_pages;
+ uncached_cpu_addr = arch_dma_set_uncached(ret, size);
+ if (IS_ERR(uncached_cpu_addr))
+ goto out_free_remap_pages;
+ ret = uncached_cpu_addr;
}
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return ret;
+
+out_free_remap_pages:
+ if (remap)
+ dma_common_free_remap(ret, size);
+
out_encrypt_pages:
- if (mark_mem_decrypt && dma_set_encrypted(dev, page_address(page), size))
- return NULL;
-out_free_pages:
+ if (mark_mem_decrypt &&
+ dma_set_encrypted(dev, page_address(page), size))
+ goto out_leak_pages;
+
if (!swiotlb_free(dev, page, size))
dma_free_contiguous(dev, page, size);
return NULL;
@@ -384,8 +409,16 @@ void dma_direct_free(struct device *dev, size_t size,
} else {
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
arch_dma_clear_uncached(cpu_addr, size);
- if (mark_mem_encrypted && dma_set_encrypted(dev, cpu_addr, size))
+ }
+
+ if (mark_mem_encrypted) {
+ void *lm_addr;
+
+ lm_addr = phys_to_virt(phys);
+ if (set_memory_encrypted((unsigned long)lm_addr, PFN_UP(size))) {
+ pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
return;
+ }
}
if (swiotlb_pool)
--
2.43.0
^ permalink raw reply related
* [PATCH v5 10/20] dma-direct: make dma_direct_map_phys() honor DMA_ATTR_CC_SHARED
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:28 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Teach dma_direct_map_phys() to select the DMA address encoding based on
DMA_ATTR_CC_SHARED.
Use phys_to_dma_unencrypted() for decrypted mappings and
phys_to_dma_encrypted() otherwise. If a device requires unencrypted DMA
but the source physical address is still encrypted, force the mapping
through swiotlb so the DMA address and backing memory attributes remain
consistent.
Update the arm64, x86, s390 and powerpc secure-guest setup to not use
swiotlb force option
Tested-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
Changes from v3:
* Handle DMA_ATTR_MMIO
---
arch/arm64/mm/init.c | 4 +--
arch/powerpc/platforms/pseries/svm.c | 2 +-
arch/s390/mm/init.c | 2 +-
arch/x86/kernel/pci-dma.c | 4 +--
kernel/dma/direct.c | 4 ++-
kernel/dma/direct.h | 45 +++++++++++++++-------------
6 files changed, 31 insertions(+), 30 deletions(-)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index c1b223e7cc8e..a087ac5b15f7 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -340,10 +340,8 @@ void __init arch_mm_preinit(void)
unsigned int flags = SWIOTLB_VERBOSE;
bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
- if (is_realm_world()) {
+ if (is_realm_world())
swiotlb = true;
- flags |= SWIOTLB_FORCE;
- }
if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
/*
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
index 384c9dc1899a..7a403dbd35ee 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -29,7 +29,7 @@ static int __init init_svm(void)
* need to use the SWIOTLB buffer for DMA even if dma_capable() says
* otherwise.
*/
- ppc_swiotlb_flags |= SWIOTLB_ANY | SWIOTLB_FORCE;
+ ppc_swiotlb_flags |= SWIOTLB_ANY;
/* Share the SWIOTLB buffer with the host. */
swiotlb_update_mem_attributes();
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ad3c6d92b801..581af1483c42 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -163,7 +163,7 @@ static void __init pv_init(void)
virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
/* make sure bounce buffers are shared */
- swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE);
+ swiotlb_init(true, SWIOTLB_VERBOSE);
swiotlb_update_mem_attributes();
}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 6267363e0189..75cf8f6ae8cd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -59,10 +59,8 @@ static void __init pci_swiotlb_detect(void)
* bounce buffers as the hypervisor can't access arbitrary VM memory
* that is not explicitly shared with it.
*/
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
x86_swiotlb_enable = true;
- x86_swiotlb_flags |= SWIOTLB_FORCE;
- }
}
#else
static inline void __init pci_swiotlb_detect(void)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 429791b2599a..9e65c8432b6e 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -702,8 +702,10 @@ size_t dma_direct_max_mapping_size(struct device *dev)
{
/* If SWIOTLB is active, use its maximum mapping size */
if (is_swiotlb_active(dev) &&
- (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev)))
+ (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev) ||
+ force_dma_unencrypted(dev)))
return swiotlb_max_mapping_size(dev);
+
return SIZE_MAX;
}
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index e05dc7649366..f3fc28f352ba 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -88,37 +88,40 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
{
dma_addr_t dma_addr;
+ /*
+ * For a device requiring unencrypted DMA, MMIO memory is treated
+ * as shared by default.
+ */
+ if (force_dma_unencrypted(dev) && (attrs & DMA_ATTR_MMIO))
+ attrs |= DMA_ATTR_CC_SHARED;
+
if (is_swiotlb_force_bounce(dev)) {
- if (!(attrs & DMA_ATTR_CC_SHARED)) {
- if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
- return DMA_MAPPING_ERROR;
+ if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
+ return DMA_MAPPING_ERROR;
- return swiotlb_map(dev, phys, size, dir, attrs);
- }
- } else if (attrs & DMA_ATTR_CC_SHARED) {
- return DMA_MAPPING_ERROR;
+ return swiotlb_map(dev, phys, size, dir, attrs);
}
- if (attrs & DMA_ATTR_MMIO) {
- dma_addr = phys;
- if (unlikely(!dma_capable(dev, dma_addr, size, false, attrs)))
- goto err_overflow;
- } else if (attrs & DMA_ATTR_CC_SHARED) {
+ if (attrs & DMA_ATTR_CC_SHARED)
dma_addr = phys_to_dma_unencrypted(dev, phys);
+ else
+ dma_addr = phys_to_dma_encrypted(dev, phys);
+
+ if (attrs & DMA_ATTR_MMIO) {
if (unlikely(!dma_capable(dev, dma_addr, size, false, attrs)))
goto err_overflow;
- } else {
- dma_addr = phys_to_dma(dev, phys);
- if (unlikely(!dma_capable(dev, dma_addr, size, true, attrs)) ||
- dma_kmalloc_needs_bounce(dev, size, dir)) {
- if (is_swiotlb_active(dev) &&
- !(attrs & DMA_ATTR_REQUIRE_COHERENT))
- return swiotlb_map(dev, phys, size, dir, attrs);
+ goto dma_mapped;
+ }
- goto err_overflow;
- }
+ if (unlikely(!dma_capable(dev, dma_addr, size, true, attrs)) ||
+ dma_kmalloc_needs_bounce(dev, size, dir)) {
+ if (is_swiotlb_active(dev) &&
+ !(attrs & DMA_ATTR_REQUIRE_COHERENT))
+ return swiotlb_map(dev, phys, size, dir, attrs);
+ goto err_overflow;
}
+dma_mapped:
if (!dev_is_dma_coherent(dev) &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
arch_sync_dma_for_device(phys, size, dir);
--
2.43.0
^ permalink raw reply related
* [PATCH v5 09/20] dma-direct: pass attrs to dma_capable() for DMA_ATTR_CC_SHARED checks
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:28 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Teach dma_capable() about DMA_ATTR_CC_SHARED so the capability
check can reject encrypted DMA addresses for devices that require
unencrypted/shared DMA.
Also propagate DMA_ATTR_CC_SHARED in swiotlb_map() when the selected
SWIOTLB pool is decrypted so the capability check sees the correct DMA
address attribute.
Tested-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
arch/x86/kernel/amd_gart_64.c | 30 ++++++++++++++++--------------
drivers/xen/swiotlb-xen.c | 6 +++---
include/linux/dma-direct.h | 10 +++++++++-
kernel/dma/direct.h | 6 +++---
kernel/dma/swiotlb.c | 2 +-
5 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e8000a56732e..b5f1f031d45b 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -180,22 +180,23 @@ static void iommu_full(struct device *dev, size_t size, int dir)
}
static inline int
-need_iommu(struct device *dev, unsigned long addr, size_t size)
+need_iommu(struct device *dev, unsigned long addr, size_t size, unsigned long attrs)
{
- return force_iommu || !dma_capable(dev, addr, size, true);
+ return force_iommu || !dma_capable(dev, addr, size, true, attrs);
}
static inline int
-nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
+nonforced_iommu(struct device *dev, unsigned long addr, size_t size,
+ unsigned long attrs)
{
- return !dma_capable(dev, addr, size, true);
+ return !dma_capable(dev, addr, size, true, attrs);
}
/* Map a single continuous physical area into the IOMMU.
* Caller needs to check if the iommu is needed and flush.
*/
static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
- size_t size, int dir, unsigned long align_mask)
+ size_t size, int dir, unsigned long align_mask, unsigned long attrs)
{
unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
unsigned long iommu_page;
@@ -206,7 +207,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
iommu_page = alloc_iommu(dev, npages, align_mask);
if (iommu_page == -1) {
- if (!nonforced_iommu(dev, phys_mem, size))
+ if (!nonforced_iommu(dev, phys_mem, size, attrs))
return phys_mem;
if (panic_on_overflow)
panic("dma_map_area overflow %lu bytes\n", size);
@@ -231,10 +232,10 @@ static dma_addr_t gart_map_phys(struct device *dev, phys_addr_t paddr,
if (unlikely(attrs & DMA_ATTR_MMIO))
return DMA_MAPPING_ERROR;
- if (!need_iommu(dev, paddr, size))
+ if (!need_iommu(dev, paddr, size, attrs))
return paddr;
- bus = dma_map_area(dev, paddr, size, dir, 0);
+ bus = dma_map_area(dev, paddr, size, dir, 0, attrs);
flush_gart();
return bus;
@@ -289,7 +290,7 @@ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
/* Fallback for dma_map_sg in case of overflow */
static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
- int nents, int dir)
+ int nents, int dir, unsigned long attrs)
{
struct scatterlist *s;
int i;
@@ -301,8 +302,8 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
for_each_sg(sg, s, nents, i) {
unsigned long addr = sg_phys(s);
- if (nonforced_iommu(dev, addr, s->length)) {
- addr = dma_map_area(dev, addr, s->length, dir, 0);
+ if (nonforced_iommu(dev, addr, s->length, attrs)) {
+ addr = dma_map_area(dev, addr, s->length, dir, 0, attrs);
if (addr == DMA_MAPPING_ERROR) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir, 0);
@@ -401,7 +402,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
s->dma_address = addr;
BUG_ON(s->length == 0);
- nextneed = need_iommu(dev, addr, s->length);
+ nextneed = need_iommu(dev, addr, s->length, attrs);
/* Handle the previous not yet processed entries */
if (i > start) {
@@ -449,7 +450,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
/* When it was forced or merged try again in a dumb way */
if (force_iommu || iommu_merge) {
- out = dma_map_sg_nonforce(dev, sg, nents, dir);
+ out = dma_map_sg_nonforce(dev, sg, nents, dir, attrs);
if (out > 0)
return out;
}
@@ -473,7 +474,8 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
return vaddr;
*dma_addr = dma_map_area(dev, virt_to_phys(vaddr), size,
- DMA_BIDIRECTIONAL, (1UL << get_order(size)) - 1);
+ DMA_BIDIRECTIONAL,
+ (1UL << get_order(size)) - 1, attrs);
flush_gart();
if (unlikely(*dma_addr == DMA_MAPPING_ERROR))
goto out_free;
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 8c4abe65cd49..e2538824ef52 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -212,7 +212,7 @@ static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys,
BUG_ON(dir == DMA_NONE);
if (attrs & DMA_ATTR_MMIO) {
- if (unlikely(!dma_capable(dev, phys, size, false))) {
+ if (unlikely(!dma_capable(dev, phys, size, false, attrs))) {
dev_err_once(
dev,
"DMA addr %pa+%zu overflow (mask %llx, bus limit %llx).\n",
@@ -231,7 +231,7 @@ static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys,
* we can safely return the device addr and not worry about bounce
* buffering it.
*/
- if (dma_capable(dev, dev_addr, size, true) &&
+ if (dma_capable(dev, dev_addr, size, true, attrs) &&
!dma_kmalloc_needs_bounce(dev, size, dir) &&
!range_straddles_page_boundary(phys, size) &&
!xen_arch_need_swiotlb(dev, phys, dev_addr) &&
@@ -253,7 +253,7 @@ static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys,
/*
* Ensure that the address returned is DMA'ble
*/
- if (unlikely(!dma_capable(dev, dev_addr, size, true))) {
+ if (unlikely(!dma_capable(dev, dev_addr, size, true, attrs))) {
__swiotlb_tbl_unmap_single(dev, map, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC,
swiotlb_find_pool(dev, map));
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 94fad4e7c11e..daa31a1adf7b 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -135,12 +135,20 @@ static inline bool force_dma_unencrypted(struct device *dev)
#endif /* CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED */
static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size,
- bool is_ram)
+ bool is_ram, unsigned long attrs)
{
dma_addr_t end = addr + size - 1;
if (addr == DMA_MAPPING_ERROR)
return false;
+ /*
+ * The DMA address was derived from encrypted RAM, but this device
+ * requires unencrypted DMA addresses. Treat it as not DMA-capable
+ * so the caller can fall back to a suitable SWIOTLB pool.
+ */
+ if (!(attrs & DMA_ATTR_CC_SHARED) && force_dma_unencrypted(dev))
+ return false;
+
if (is_ram && !IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) &&
min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn)))
return false;
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 7140c208c123..e05dc7649366 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -101,15 +101,15 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
if (attrs & DMA_ATTR_MMIO) {
dma_addr = phys;
- if (unlikely(!dma_capable(dev, dma_addr, size, false)))
+ if (unlikely(!dma_capable(dev, dma_addr, size, false, attrs)))
goto err_overflow;
} else if (attrs & DMA_ATTR_CC_SHARED) {
dma_addr = phys_to_dma_unencrypted(dev, phys);
- if (unlikely(!dma_capable(dev, dma_addr, size, false)))
+ if (unlikely(!dma_capable(dev, dma_addr, size, false, attrs)))
goto err_overflow;
} else {
dma_addr = phys_to_dma(dev, phys);
- if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
+ if (unlikely(!dma_capable(dev, dma_addr, size, true, attrs)) ||
dma_kmalloc_needs_bounce(dev, size, dir)) {
if (is_swiotlb_active(dev) &&
!(attrs & DMA_ATTR_REQUIRE_COHERENT))
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 2bf3981db35d..f4e8b241a1c4 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1678,7 +1678,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
else
dma_addr = phys_to_dma_encrypted(dev, swiotlb_addr);
- if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
+ if (unlikely(!dma_capable(dev, dma_addr, size, true, attrs))) {
__swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC,
swiotlb_find_pool(dev, swiotlb_addr));
--
2.43.0
^ permalink raw reply related
* [PATCH v5 08/20] dma-mapping: make dma_pgprot() honor DMA_ATTR_CC_SHARED
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:28 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Fold encrypted/decrypted pgprot selection into dma_pgprot() so callers
do not need to adjust the page protection separately.
Update dma_pgprot() to apply pgprot_decrypted() when
DMA_ATTR_CC_SHARED is set and pgprot_encrypted() otherwise Convert
the dma-direct allocation and mmap paths to pass DMA_ATTR_CC_SHARED
instead of open-coding force_dma_unencrypted() handling around
dma_pgprot().
Tested-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
kernel/dma/direct.c | 8 +++-----
kernel/dma/mapping.c | 16 ++++++++++++----
2 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 7cf1618a235d..429791b2599a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -290,9 +290,6 @@ void *dma_direct_alloc(struct device *dev, size_t size,
if (remap) {
pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
- if (force_dma_unencrypted(dev))
- prot = pgprot_decrypted(prot);
-
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
@@ -614,9 +611,10 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr));
int ret = -ENXIO;
- vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
if (force_dma_unencrypted(dev))
- vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+ attrs |= DMA_ATTR_CC_SHARED;
+
+ vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 23ed8eb9233e..44f715f3aa2d 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -543,13 +543,21 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs);
*/
pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
+ pgprot_t dma_prot;
+
if (dev_is_dma_coherent(dev))
- return prot;
+ dma_prot = prot;
#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
- if (attrs & DMA_ATTR_WRITE_COMBINE)
- return pgprot_writecombine(prot);
+ else if (attrs & DMA_ATTR_WRITE_COMBINE)
+ dma_prot = pgprot_writecombine(prot);
#endif
- return pgprot_dmacoherent(prot);
+ else
+ dma_prot = pgprot_dmacoherent(prot);
+
+ if (attrs & DMA_ATTR_CC_SHARED)
+ return pgprot_decrypted(dma_prot);
+ else
+ return pgprot_encrypted(dma_prot);
}
#endif /* CONFIG_MMU */
--
2.43.0
^ permalink raw reply related
* [PATCH v5 07/20] dma: swiotlb: track pool encryption state and honor DMA_ATTR_CC_SHARED
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:28 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Teach swiotlb to distinguish between encrypted and decrypted bounce
buffer pools, and make allocation and mapping paths select a pool whose
state matches the requested DMA attributes.
Add a unencrypted flag to io_tlb_mem, initialize it for the default and
restricted pools, and propagate DMA_ATTR_CC_SHARED into swiotlb pool
allocation. Reject swiotlb alloc/map requests when the selected pool does
not match the required encrypted/decrypted state.
Also return DMA addresses with the matching phys_to_dma_{encrypted,
unencrypted} helper so the DMA address encoding stays consistent with the
chosen pool.
Tested-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
include/linux/dma-direct.h | 10 +++
include/linux/swiotlb.h | 8 +-
kernel/dma/direct.c | 13 +++-
kernel/dma/swiotlb.c | 154 ++++++++++++++++++++++++++++---------
4 files changed, 142 insertions(+), 43 deletions(-)
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index c249912456f9..94fad4e7c11e 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -77,6 +77,10 @@ static inline dma_addr_t dma_range_map_max(const struct bus_dma_region *map)
#ifndef phys_to_dma_unencrypted
#define phys_to_dma_unencrypted phys_to_dma
#endif
+
+#ifndef phys_to_dma_encrypted
+#define phys_to_dma_encrypted phys_to_dma
+#endif
#else
static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr)
{
@@ -90,6 +94,12 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev,
{
return dma_addr_unencrypted(__phys_to_dma(dev, paddr));
}
+
+static inline dma_addr_t phys_to_dma_encrypted(struct device *dev,
+ phys_addr_t paddr)
+{
+ return dma_addr_encrypted(__phys_to_dma(dev, paddr));
+}
/*
* If memory encryption is supported, phys_to_dma will set the memory encryption
* bit in the DMA address, and dma_to_phys will clear it.
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 29187cec90d8..4dcbf3931be1 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -81,6 +81,7 @@ struct io_tlb_pool {
struct list_head node;
struct rcu_head rcu;
bool transient;
+ bool unencrypted;
#endif
};
@@ -111,6 +112,7 @@ struct io_tlb_mem {
struct dentry *debugfs;
bool force_bounce;
bool for_alloc;
+ bool unencrypted;
#ifdef CONFIG_SWIOTLB_DYNAMIC
bool can_grow;
u64 phys_limit;
@@ -282,7 +284,8 @@ static inline void swiotlb_sync_single_for_cpu(struct device *dev,
extern void swiotlb_print_info(void);
#ifdef CONFIG_DMA_RESTRICTED_POOL
-struct page *swiotlb_alloc(struct device *dev, size_t size);
+struct page *swiotlb_alloc(struct device *dev, size_t size,
+ unsigned long attrs);
bool swiotlb_free(struct device *dev, struct page *page, size_t size);
void swiotlb_free_from_pool(struct device *dev, phys_addr_t tlb_addr,
size_t size, struct io_tlb_pool *pool);
@@ -292,7 +295,8 @@ static inline bool is_swiotlb_for_alloc(struct device *dev)
return dev->dma_io_tlb_mem->for_alloc;
}
#else
-static inline struct page *swiotlb_alloc(struct device *dev, size_t size)
+static inline struct page *swiotlb_alloc(struct device *dev, size_t size,
+ unsigned long attrs)
{
return NULL;
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index dd959716df33..7cf1618a235d 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -96,9 +96,10 @@ static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
return ret;
}
-static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
+static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size,
+ unsigned long attrs)
{
- struct page *page = swiotlb_alloc(dev, size);
+ struct page *page = swiotlb_alloc(dev, size, attrs);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
swiotlb_free(dev, page, size);
@@ -258,8 +259,12 @@ void *dma_direct_alloc(struct device *dev, size_t size,
gfp, attrs);
if (is_swiotlb_for_alloc(dev)) {
- page = dma_direct_alloc_swiotlb(dev, size);
+ page = dma_direct_alloc_swiotlb(dev, size, attrs);
if (page) {
+ /*
+ * swiotlb allocations comes from pool already marked
+ * decrypted
+ */
mark_mem_decrypt = false;
goto setup_page;
}
@@ -407,7 +412,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
gfp, attrs);
if (is_swiotlb_for_alloc(dev)) {
- page = dma_direct_alloc_swiotlb(dev, size);
+ page = dma_direct_alloc_swiotlb(dev, size, attrs);
if (!page)
return NULL;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 78ce05857c00..2bf3981db35d 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -259,10 +259,21 @@ void __init swiotlb_update_mem_attributes(void)
struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long bytes;
+ /*
+ * if platform support memory encryption, swiotlb buffers are
+ * decrypted by default.
+ */
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
+ io_tlb_default_mem.unencrypted = true;
+ else
+ io_tlb_default_mem.unencrypted = false;
+
if (!mem->nslabs || mem->late_alloc)
return;
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
- set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
+
+ if (io_tlb_default_mem.unencrypted)
+ set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
}
static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
@@ -505,8 +516,10 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
if (!mem->slots)
goto error_slots;
- set_memory_decrypted((unsigned long)vstart,
- (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
+ if (io_tlb_default_mem.unencrypted)
+ set_memory_decrypted((unsigned long)vstart,
+ (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
+
swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true,
nareas);
add_mem_pool(&io_tlb_default_mem, mem);
@@ -539,7 +552,9 @@ void __init swiotlb_exit(void)
tbl_size = PAGE_ALIGN(mem->end - mem->start);
slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs));
- set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
+ if (io_tlb_default_mem.unencrypted)
+ set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
+
if (mem->late_alloc) {
area_order = get_order(array_size(sizeof(*mem->areas),
mem->nareas));
@@ -563,6 +578,7 @@ void __init swiotlb_exit(void)
* @gfp: GFP flags for the allocation.
* @bytes: Size of the buffer.
* @phys_limit: Maximum allowed physical address of the buffer.
+ * @unencrypted: true to allocate unencrypted memory, false for encrypted memory
*
* Allocate pages from the buddy allocator. If successful, make the allocated
* pages decrypted that they can be used for DMA.
@@ -570,7 +586,8 @@ void __init swiotlb_exit(void)
* Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN)
* if the allocated physical address was above @phys_limit.
*/
-static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
+static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes,
+ u64 phys_limit, bool unencrypted)
{
unsigned int order = get_order(bytes);
struct page *page;
@@ -588,13 +605,13 @@ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
}
vaddr = phys_to_virt(paddr);
- if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ if (unencrypted && set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
goto error;
return page;
error:
/* Intentional leak if pages cannot be encrypted again. */
- if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ if (unencrypted && !set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
__free_pages(page, order);
return NULL;
}
@@ -604,30 +621,26 @@ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
* @dev: Device for which a memory pool is allocated.
* @bytes: Size of the buffer.
* @phys_limit: Maximum allowed physical address of the buffer.
+ * @attrs: DMA attributes for the allocation.
* @gfp: GFP flags for the allocation.
*
* Return: Allocated pages, or %NULL on allocation failure.
*/
static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
- u64 phys_limit, gfp_t gfp)
+ u64 phys_limit, unsigned long attrs, gfp_t gfp)
{
struct page *page;
- unsigned long attrs = 0;
/*
* Allocate from the atomic pools if memory is encrypted and
* the allocation is atomic, because decrypting may block.
*/
- if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) {
+ if (!gfpflags_allow_blocking(gfp) && (attrs & DMA_ATTR_CC_SHARED)) {
void *vaddr;
if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
return NULL;
- /* swiotlb considered decrypted by default */
- if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
- attrs = DMA_ATTR_CC_SHARED;
-
return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
attrs, dma_coherent_ok);
}
@@ -638,7 +651,8 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
else if (phys_limit <= DMA_BIT_MASK(32))
gfp |= __GFP_DMA32;
- while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) {
+ while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit,
+ !!(attrs & DMA_ATTR_CC_SHARED)))) {
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
phys_limit < DMA_BIT_MASK(64) &&
!(gfp & (__GFP_DMA32 | __GFP_DMA)))
@@ -657,15 +671,18 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
* swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer
* @vaddr: Virtual address of the buffer.
* @bytes: Size of the buffer.
+ * @unencrypted: true if @vaddr was allocated decrypted and must be
+ * re-encrypted before being freed
*/
-static void swiotlb_free_tlb(void *vaddr, size_t bytes)
+static void swiotlb_free_tlb(void *vaddr, size_t bytes, bool unencrypted)
{
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
dma_free_from_pool(NULL, vaddr, bytes))
return;
/* Intentional leak if pages cannot be encrypted again. */
- if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ if (!unencrypted ||
+ !set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
__free_pages(virt_to_page(vaddr), get_order(bytes));
}
@@ -676,6 +693,7 @@ static void swiotlb_free_tlb(void *vaddr, size_t bytes)
* @nslabs: Desired (maximum) number of slabs.
* @nareas: Number of areas.
* @phys_limit: Maximum DMA buffer physical address.
+ * @attrs: DMA attributes for the allocation.
* @gfp: GFP flags for the allocations.
*
* Allocate and initialize a new IO TLB memory pool. The actual number of
@@ -686,7 +704,8 @@ static void swiotlb_free_tlb(void *vaddr, size_t bytes)
*/
static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
unsigned long minslabs, unsigned long nslabs,
- unsigned int nareas, u64 phys_limit, gfp_t gfp)
+ unsigned int nareas, u64 phys_limit,
+ unsigned long attrs, gfp_t gfp)
{
struct io_tlb_pool *pool;
unsigned int slot_order;
@@ -704,9 +723,10 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
if (!pool)
goto error;
pool->areas = (void *)pool + sizeof(*pool);
+ pool->unencrypted = !!(attrs & DMA_ATTR_CC_SHARED);
tlb_size = nslabs << IO_TLB_SHIFT;
- while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) {
+ while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, attrs, gfp))) {
if (nslabs <= minslabs)
goto error_tlb;
nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
@@ -724,7 +744,8 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
return pool;
error_slots:
- swiotlb_free_tlb(page_address(tlb), tlb_size);
+ swiotlb_free_tlb(page_address(tlb), tlb_size,
+ !!(attrs & DMA_ATTR_CC_SHARED));
error_tlb:
kfree(pool);
error:
@@ -742,7 +763,9 @@ static void swiotlb_dyn_alloc(struct work_struct *work)
struct io_tlb_pool *pool;
pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs,
- default_nareas, mem->phys_limit, GFP_KERNEL);
+ default_nareas, mem->phys_limit,
+ mem->unencrypted ? DMA_ATTR_CC_SHARED : 0,
+ GFP_KERNEL);
if (!pool) {
pr_warn_ratelimited("Failed to allocate new pool");
return;
@@ -762,7 +785,7 @@ static void swiotlb_dyn_free(struct rcu_head *rcu)
size_t tlb_size = pool->end - pool->start;
free_pages((unsigned long)pool->slots, get_order(slots_size));
- swiotlb_free_tlb(pool->vaddr, tlb_size);
+ swiotlb_free_tlb(pool->vaddr, tlb_size, pool->unencrypted);
kfree(pool);
}
@@ -1037,13 +1060,11 @@ static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
* Return: Index of the first allocated slot, or -1 on error.
*/
static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool,
- int area_index, phys_addr_t orig_addr, size_t alloc_size,
- unsigned int alloc_align_mask)
+ int area_index, phys_addr_t orig_addr, dma_addr_t tbl_dma_addr,
+ size_t alloc_size, unsigned int alloc_align_mask)
{
struct io_tlb_area *area = pool->areas + area_index;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
- dma_addr_t tbl_dma_addr =
- phys_to_dma_unencrypted(dev, pool->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask = dma_get_min_align_mask(dev);
unsigned int nslots = nr_slots(alloc_size), stride;
@@ -1056,6 +1077,8 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool
BUG_ON(!nslots);
BUG_ON(area_index >= pool->nareas);
+ tbl_dma_addr &= boundary_mask;
+
/*
* Historically, swiotlb allocations >= PAGE_SIZE were guaranteed to be
* page-aligned in the absence of any other alignment requirements.
@@ -1167,6 +1190,7 @@ static int swiotlb_search_area(struct device *dev, int start_cpu,
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
struct io_tlb_pool *pool;
+ dma_addr_t tbl_dma_addr;
int area_index;
int index = -1;
@@ -1175,9 +1199,15 @@ static int swiotlb_search_area(struct device *dev, int start_cpu,
if (cpu_offset >= pool->nareas)
continue;
area_index = (start_cpu + cpu_offset) & (pool->nareas - 1);
+
+ if (mem->unencrypted)
+ tbl_dma_addr = phys_to_dma_unencrypted(dev, pool->start);
+ else
+ tbl_dma_addr = phys_to_dma_encrypted(dev, pool->start);
+
index = swiotlb_search_pool_area(dev, pool, area_index,
- orig_addr, alloc_size,
- alloc_align_mask);
+ orig_addr, tbl_dma_addr,
+ alloc_size, alloc_align_mask);
if (index >= 0) {
*retpool = pool;
break;
@@ -1207,6 +1237,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
struct io_tlb_pool *pool;
+ dma_addr_t tbl_dma_addr;
unsigned long nslabs;
unsigned long flags;
u64 phys_limit;
@@ -1232,11 +1263,17 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
nslabs = nr_slots(alloc_size);
phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit,
+ mem->unencrypted ? DMA_ATTR_CC_SHARED : 0,
GFP_NOWAIT);
if (!pool)
return -1;
- index = swiotlb_search_pool_area(dev, pool, 0, orig_addr,
+ if (mem->unencrypted)
+ tbl_dma_addr = phys_to_dma_unencrypted(dev, pool->start);
+ else
+ tbl_dma_addr = phys_to_dma_encrypted(dev, pool->start);
+
+ index = swiotlb_search_pool_area(dev, pool, 0, orig_addr, tbl_dma_addr,
alloc_size, alloc_align_mask);
if (index < 0) {
swiotlb_dyn_free(&pool->rcu);
@@ -1281,15 +1318,23 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
size_t alloc_size, unsigned int alloc_align_mask,
struct io_tlb_pool **retpool)
{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
struct io_tlb_pool *pool;
+ dma_addr_t tbl_dma_addr;
int start, i;
int index;
- *retpool = pool = &dev->dma_io_tlb_mem->defpool;
+ *retpool = pool = &mem->defpool;
+ if (mem->unencrypted)
+ tbl_dma_addr = phys_to_dma_unencrypted(dev, pool->start);
+ else
+ tbl_dma_addr = phys_to_dma_encrypted(dev, pool->start);
+
i = start = raw_smp_processor_id() & (pool->nareas - 1);
do {
index = swiotlb_search_pool_area(dev, pool, i, orig_addr,
- alloc_size, alloc_align_mask);
+ tbl_dma_addr, alloc_size,
+ alloc_align_mask);
if (index >= 0)
return index;
if (++i >= pool->nareas)
@@ -1372,9 +1417,19 @@ static unsigned long mem_used(struct io_tlb_mem *mem)
* any pre- or post-padding for alignment
* @alloc_align_mask: Required start and end alignment of the allocated buffer
* @dir: DMA direction
- * @attrs: Optional DMA attributes for the map operation
+ * @attrs: Optional DMA attributes for the map operation, updated
+ * to match the selected SWIOTLB pool
*
* Find and allocate a suitable sequence of IO TLB slots for the request.
+ * The device's SWIOTLB pool must match the device's current DMA encryption
+ * requirements. If the device requires decrypted DMA, bouncing is done through
+ * an unencrypted pool and the mapping is marked shared. If the device can DMA
+ * to encrypted memory, bouncing is done through an encrypted pool even when the
+ * original DMA address was unencrypted. Enabling encrypted DMA for a device is
+ * therefore expected to update its default io_tlb_mem to an encrypted pool, so
+ * later bounce mappings for both encrypted and decrypted original memory use
+ * that encrypted pool.
+ *
* The allocated space starts at an alignment specified by alloc_align_mask,
* and the size of the allocated space is rounded up so that the total amount
* of allocated space is a multiple of (alloc_align_mask + 1). If
@@ -1411,6 +1466,16 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
+ /* swiotlb pool is incorrect for this device */
+ if (unlikely(mem->unencrypted != force_dma_unencrypted(dev)))
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+
+ /* Force attrs to match the kind of memory in the pool */
+ if (mem->unencrypted)
+ *attrs |= DMA_ATTR_CC_SHARED;
+ else
+ *attrs &= ~DMA_ATTR_CC_SHARED;
+
/*
* The default swiotlb memory pool is allocated with PAGE_SIZE
* alignment. If a mapping is requested with larger alignment,
@@ -1608,8 +1673,11 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
- /* Ensure that the address returned is DMA'ble */
- dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
+ if (attrs & DMA_ATTR_CC_SHARED)
+ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
+ else
+ dma_addr = phys_to_dma_encrypted(dev, swiotlb_addr);
+
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
__swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC,
@@ -1773,7 +1841,7 @@ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
#ifdef CONFIG_DMA_RESTRICTED_POOL
-struct page *swiotlb_alloc(struct device *dev, size_t size)
+struct page *swiotlb_alloc(struct device *dev, size_t size, unsigned long attrs)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
struct io_tlb_pool *pool;
@@ -1784,6 +1852,9 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
if (!mem)
return NULL;
+ if (mem->unencrypted != !!(attrs & DMA_ATTR_CC_SHARED))
+ return NULL;
+
align = (1 << (get_order(size) + PAGE_SHIFT)) - 1;
index = swiotlb_find_slots(dev, 0, size, align, &pool);
if (index == -1)
@@ -1859,9 +1930,18 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
kfree(mem);
return -ENOMEM;
}
+ /*
+ * if platform supports memory encryption,
+ * restricted mem pool is decrypted by default
+ */
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ mem->unencrypted = true;
+ set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
+ rmem->size >> PAGE_SHIFT);
+ } else {
+ mem->unencrypted = false;
+ }
- set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
- rmem->size >> PAGE_SHIFT);
swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs,
false, nareas);
mem->force_bounce = true;
--
2.43.0
^ permalink raw reply related
* [PATCH v5 06/20] dma: swiotlb: pass mapping attributes by reference
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:28 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Change swiotlb_tbl_map_single() to take the DMA mapping attributes by
reference and update the direct callers accordingly.
This is a preparatory change for a follow-up patch which updates the
attributes based on the selected swiotlb pool. Keeping the signature change
separate makes the follow-up patch easier to review.
No functional change in this patch.
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
drivers/iommu/dma-iommu.c | 2 +-
drivers/xen/swiotlb-xen.c | 2 +-
include/linux/swiotlb.h | 2 +-
kernel/dma/swiotlb.c | 6 +++---
4 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c2595bee3d41..725c7adb0a8d 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1180,7 +1180,7 @@ static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
trace_swiotlb_bounced(dev, phys, size);
phys = swiotlb_tbl_map_single(dev, phys, size, iova_mask(iovad), dir,
- attrs);
+ &attrs);
/*
* Untrusted devices should not see padding areas with random leftover
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 2cbf2b588f5b..8c4abe65cd49 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -243,7 +243,7 @@ static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys,
*/
trace_swiotlb_bounced(dev, dev_addr, size);
- map = swiotlb_tbl_map_single(dev, phys, size, 0, dir, attrs);
+ map = swiotlb_tbl_map_single(dev, phys, size, 0, dir, &attrs);
if (map == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 133bb8ca9032..29187cec90d8 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -238,7 +238,7 @@ static inline phys_addr_t default_swiotlb_limit(void)
phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
size_t mapping_size, unsigned int alloc_aligned_mask,
- enum dma_data_direction dir, unsigned long attrs);
+ enum dma_data_direction dir, unsigned long *attrs);
dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
size_t size, enum dma_data_direction dir, unsigned long attrs);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index be4d418d92ac..78ce05857c00 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1391,7 +1391,7 @@ static unsigned long mem_used(struct io_tlb_mem *mem)
*/
phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, unsigned int alloc_align_mask,
- enum dma_data_direction dir, unsigned long attrs)
+ enum dma_data_direction dir, unsigned long *attrs)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset;
@@ -1425,7 +1425,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size = ALIGN(mapping_size + offset, alloc_align_mask + 1);
index = swiotlb_find_slots(dev, orig_addr, size, alloc_align_mask, &pool);
if (index == -1) {
- if (!(attrs & DMA_ATTR_NO_WARN))
+ if (!(*attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
"swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
size, mem->nslabs, mem_used(mem));
@@ -1604,7 +1604,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size);
- swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, attrs);
+ swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, &attrs);
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
--
2.43.0
^ permalink raw reply related
* [PATCH v5 05/20] dma-pool: track decrypted atomic pools and select them via attrs
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:28 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Teach the atomic DMA pool code to distinguish between encrypted and
unencrypted pools, and make pool allocation select the matching pool based
on DMA attributes.
Introduce a dma_gen_pool wrapper that records whether a pool is
unencrypted, initialize that state when the atomic pools are created, and
use it when expanding and resizing the pools. Update dma_alloc_from_pool()
to take attrs and skip pools whose encrypted state does not match
DMA_ATTR_CC_SHARED. Update dma_free_from_pool() accordingly.
Also pass DMA_ATTR_CC_SHARED from the swiotlb atomic allocation path so
decrypted swiotlb allocations are taken from the correct atomic pool.
Tested-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Mostafa Saleh <smostafa@google.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
drivers/iommu/dma-iommu.c | 2 +-
include/linux/dma-map-ops.h | 2 +-
kernel/dma/direct.c | 11 ++-
kernel/dma/pool.c | 167 +++++++++++++++++++++++-------------
kernel/dma/swiotlb.c | 7 +-
5 files changed, 123 insertions(+), 66 deletions(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 54d96e847f16..c2595bee3d41 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1673,7 +1673,7 @@ void *iommu_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
!gfpflags_allow_blocking(gfp) && !coherent)
page = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &cpu_addr,
- gfp, NULL);
+ gfp, attrs, NULL);
else
cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs);
if (!cpu_addr)
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 6a1832a73cad..696b2c3a2305 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -212,7 +212,7 @@ void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot,
void dma_common_free_remap(void *cpu_addr, size_t size);
struct page *dma_alloc_from_pool(struct device *dev, size_t size,
- void **cpu_addr, gfp_t flags,
+ void **cpu_addr, gfp_t flags, unsigned long attrs,
bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t));
bool dma_free_from_pool(struct device *dev, void *start, size_t size);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index a224b1bed6f9..dd959716df33 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -154,7 +154,7 @@ static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
}
static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp)
+ dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
struct page *page;
u64 phys_limit;
@@ -164,7 +164,8 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
return NULL;
gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
- page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
+ page = dma_alloc_from_pool(dev, size, &ret, gfp, attrs,
+ dma_coherent_ok);
if (!page)
return NULL;
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
@@ -253,7 +254,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
*/
if ((remap || (attrs & DMA_ATTR_CC_SHARED)) &&
dma_direct_use_pool(dev, gfp))
- return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+ return dma_direct_alloc_from_pool(dev, size, dma_handle,
+ gfp, attrs);
if (is_swiotlb_for_alloc(dev)) {
page = dma_direct_alloc_swiotlb(dev, size);
@@ -401,7 +403,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
attrs |= DMA_ATTR_CC_SHARED;
if ((attrs & DMA_ATTR_CC_SHARED) && dma_direct_use_pool(dev, gfp))
- return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+ return dma_direct_alloc_from_pool(dev, size, dma_handle,
+ gfp, attrs);
if (is_swiotlb_for_alloc(dev)) {
page = dma_direct_alloc_swiotlb(dev, size);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 2b2fbb709242..be78474a6c49 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -12,12 +12,18 @@
#include <linux/set_memory.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include <linux/cc_platform.h>
-static struct gen_pool *atomic_pool_dma __ro_after_init;
+struct dma_gen_pool {
+ bool unencrypted;
+ struct gen_pool *pool;
+};
+
+static struct dma_gen_pool atomic_pool_dma __ro_after_init;
static unsigned long pool_size_dma;
-static struct gen_pool *atomic_pool_dma32 __ro_after_init;
+static struct dma_gen_pool atomic_pool_dma32 __ro_after_init;
static unsigned long pool_size_dma32;
-static struct gen_pool *atomic_pool_kernel __ro_after_init;
+static struct dma_gen_pool atomic_pool_kernel __ro_after_init;
static unsigned long pool_size_kernel;
/* Size can be defined by the coherent_pool command line */
@@ -76,11 +82,12 @@ static bool cma_in_zone(gfp_t gfp)
return true;
}
-static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
+static int atomic_pool_expand(struct dma_gen_pool *dma_pool, size_t pool_size,
gfp_t gfp)
{
unsigned int order;
struct page *page = NULL;
+ bool leak_pages = false;
void *addr;
int ret = -ENOMEM;
@@ -113,12 +120,17 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
* Memory in the atomic DMA pools must be unencrypted, the pools do not
* shrink so no re-encryption occurs in dma_direct_free().
*/
- ret = set_memory_decrypted((unsigned long)page_to_virt(page),
- 1 << order);
- if (ret)
- goto remove_mapping;
- ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page),
- pool_size, NUMA_NO_NODE);
+ if (dma_pool->unencrypted) {
+ ret = set_memory_decrypted((unsigned long)page_to_virt(page),
+ 1 << order);
+ if (ret) {
+ leak_pages = true;
+ goto remove_mapping;
+ }
+ }
+
+ ret = gen_pool_add_virt(dma_pool->pool, (unsigned long)addr,
+ page_to_phys(page), pool_size, NUMA_NO_NODE);
if (ret)
goto encrypt_mapping;
@@ -126,62 +138,67 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
return 0;
encrypt_mapping:
- ret = set_memory_encrypted((unsigned long)page_to_virt(page),
- 1 << order);
- if (WARN_ON_ONCE(ret)) {
- /* Decrypt succeeded but encrypt failed, purposely leak */
- goto out;
- }
+ if (dma_pool->unencrypted &&
+ set_memory_encrypted((unsigned long)page_to_virt(page), 1 << order))
+ leak_pages = true;
+
remove_mapping:
#ifdef CONFIG_DMA_DIRECT_REMAP
dma_common_free_remap(addr, pool_size);
free_page:
- __free_pages(page, order);
+ if (!leak_pages)
+ __free_pages(page, order);
#endif
out:
return ret;
}
-static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp)
+static void atomic_pool_resize(struct dma_gen_pool *dma_pool, gfp_t gfp)
{
- if (pool && gen_pool_avail(pool) < atomic_pool_size)
- atomic_pool_expand(pool, gen_pool_size(pool), gfp);
+ if (dma_pool->pool && gen_pool_avail(dma_pool->pool) < atomic_pool_size)
+ atomic_pool_expand(dma_pool, gen_pool_size(dma_pool->pool), gfp);
}
static void atomic_pool_work_fn(struct work_struct *work)
{
if (IS_ENABLED(CONFIG_ZONE_DMA))
- atomic_pool_resize(atomic_pool_dma,
+ atomic_pool_resize(&atomic_pool_dma,
GFP_KERNEL | GFP_DMA);
if (IS_ENABLED(CONFIG_ZONE_DMA32))
- atomic_pool_resize(atomic_pool_dma32,
+ atomic_pool_resize(&atomic_pool_dma32,
GFP_KERNEL | GFP_DMA32);
- atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
+ atomic_pool_resize(&atomic_pool_kernel, GFP_KERNEL);
}
-static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size,
- gfp_t gfp)
+static __init struct dma_gen_pool *__dma_atomic_pool_init(struct dma_gen_pool *dma_pool,
+ size_t pool_size, gfp_t gfp)
{
- struct gen_pool *pool;
int ret;
- pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
- if (!pool)
+ dma_pool->pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
+ if (!dma_pool->pool)
return NULL;
- gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);
+ gen_pool_set_algo(dma_pool->pool, gen_pool_first_fit_order_align, NULL);
+
+ /* if platform is using memory encryption atomic pools are by default decrypted. */
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
+ dma_pool->unencrypted = true;
+ else
+ dma_pool->unencrypted = false;
- ret = atomic_pool_expand(pool, pool_size, gfp);
+ ret = atomic_pool_expand(dma_pool, pool_size, gfp);
if (ret) {
- gen_pool_destroy(pool);
+ gen_pool_destroy(dma_pool->pool);
+ dma_pool->pool = NULL;
pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n",
pool_size >> 10, &gfp);
return NULL;
}
pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n",
- gen_pool_size(pool) >> 10, &gfp);
- return pool;
+ gen_pool_size(dma_pool->pool) >> 10, &gfp);
+ return dma_pool;
}
#ifdef CONFIG_ZONE_DMA32
@@ -207,21 +224,22 @@ static int __init dma_atomic_pool_init(void)
/* All memory might be in the DMA zone(s) to begin with */
if (has_managed_zone(ZONE_NORMAL)) {
- atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size,
- GFP_KERNEL);
- if (!atomic_pool_kernel)
+ __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, GFP_KERNEL);
+ if (!atomic_pool_kernel.pool)
ret = -ENOMEM;
}
+
if (has_managed_dma()) {
- atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
- GFP_KERNEL | GFP_DMA);
- if (!atomic_pool_dma)
+ __dma_atomic_pool_init(&atomic_pool_dma, atomic_pool_size,
+ GFP_KERNEL | GFP_DMA);
+ if (!atomic_pool_dma.pool)
ret = -ENOMEM;
}
+
if (has_managed_dma32) {
- atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
- GFP_KERNEL | GFP_DMA32);
- if (!atomic_pool_dma32)
+ __dma_atomic_pool_init(&atomic_pool_dma32, atomic_pool_size,
+ GFP_KERNEL | GFP_DMA32);
+ if (!atomic_pool_dma32.pool)
ret = -ENOMEM;
}
@@ -230,19 +248,44 @@ static int __init dma_atomic_pool_init(void)
}
postcore_initcall(dma_atomic_pool_init);
-static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
+static inline struct dma_gen_pool *__dma_guess_pool(struct dma_gen_pool *first,
+ struct dma_gen_pool *second, struct dma_gen_pool *third)
{
- if (prev == NULL) {
+ if (first->pool)
+ return first;
+ if (second && second->pool)
+ return second;
+ if (third && third->pool)
+ return third;
+ return NULL;
+}
+
+static inline struct dma_gen_pool *dma_guess_pool(struct dma_gen_pool *prev,
+ gfp_t gfp)
+{
+ if (!prev) {
if (gfp & GFP_DMA)
- return atomic_pool_dma ?: atomic_pool_dma32 ?: atomic_pool_kernel;
+ return __dma_guess_pool(&atomic_pool_dma,
+ &atomic_pool_dma32,
+ &atomic_pool_kernel);
+
if (gfp & GFP_DMA32)
- return atomic_pool_dma32 ?: atomic_pool_dma ?: atomic_pool_kernel;
- return atomic_pool_kernel ?: atomic_pool_dma32 ?: atomic_pool_dma;
+ return __dma_guess_pool(&atomic_pool_dma32,
+ &atomic_pool_dma,
+ &atomic_pool_kernel);
+
+ return __dma_guess_pool(&atomic_pool_kernel,
+ &atomic_pool_dma32,
+ &atomic_pool_dma);
}
- if (prev == atomic_pool_kernel)
- return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma;
- if (prev == atomic_pool_dma32)
- return atomic_pool_dma;
+
+ if (prev == &atomic_pool_kernel)
+ return __dma_guess_pool(&atomic_pool_dma32,
+ &atomic_pool_dma, NULL);
+
+ if (prev == &atomic_pool_dma32)
+ return __dma_guess_pool(&atomic_pool_dma, NULL, NULL);
+
return NULL;
}
@@ -272,16 +315,20 @@ static struct page *__dma_alloc_from_pool(struct device *dev, size_t size,
}
struct page *dma_alloc_from_pool(struct device *dev, size_t size,
- void **cpu_addr, gfp_t gfp,
+ void **cpu_addr, gfp_t gfp, unsigned long attrs,
bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t))
{
- struct gen_pool *pool = NULL;
+ struct dma_gen_pool *dma_pool = NULL;
struct page *page;
bool pool_found = false;
- while ((pool = dma_guess_pool(pool, gfp))) {
+ while ((dma_pool = dma_guess_pool(dma_pool, gfp))) {
+
+ if (dma_pool->unencrypted != !!(attrs & DMA_ATTR_CC_SHARED))
+ continue;
+
pool_found = true;
- page = __dma_alloc_from_pool(dev, size, pool, cpu_addr,
+ page = __dma_alloc_from_pool(dev, size, dma_pool->pool, cpu_addr,
phys_addr_ok);
if (page)
return page;
@@ -296,12 +343,14 @@ struct page *dma_alloc_from_pool(struct device *dev, size_t size,
bool dma_free_from_pool(struct device *dev, void *start, size_t size)
{
- struct gen_pool *pool = NULL;
+ struct dma_gen_pool *dma_pool = NULL;
+
+ while ((dma_pool = dma_guess_pool(dma_pool, 0))) {
- while ((pool = dma_guess_pool(pool, 0))) {
- if (!gen_pool_has_addr(pool, (unsigned long)start, size))
+ if (!gen_pool_has_addr(dma_pool->pool, (unsigned long)start, size))
continue;
- gen_pool_free(pool, (unsigned long)start, size);
+
+ gen_pool_free(dma_pool->pool, (unsigned long)start, size);
return true;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index ac03a6856c2e..be4d418d92ac 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -612,6 +612,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
u64 phys_limit, gfp_t gfp)
{
struct page *page;
+ unsigned long attrs = 0;
/*
* Allocate from the atomic pools if memory is encrypted and
@@ -623,8 +624,12 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
return NULL;
+ /* swiotlb considered decrypted by default */
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
+ attrs = DMA_ATTR_CC_SHARED;
+
return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
- dma_coherent_ok);
+ attrs, dma_coherent_ok);
}
gfp &= ~GFP_ZONEMASK;
--
2.43.0
^ permalink raw reply related
* [PATCH v5 04/20] dma-direct: use DMA_ATTR_CC_SHARED in alloc/free paths
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:27 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Propagate force_dma_unencrypted() into DMA_ATTR_CC_SHARED in the
dma-direct allocation path and use the attribute to drive the related
decisions.
This updates dma_direct_alloc(), dma_direct_free(), and
dma_direct_alloc_pages() to fold the forced unencrypted case into attrs.
Tested-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
kernel/dma/direct.c | 53 +++++++++++++++++++++++++++++++++++++--------
1 file changed, 44 insertions(+), 9 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index fe8e83a36058..a224b1bed6f9 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -193,16 +193,31 @@ void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
bool remap = false, set_uncached = false;
- bool mark_mem_decrypt = true;
+ bool mark_mem_decrypt = false;
struct page *page;
void *ret;
+ /*
+ * DMA_ATTR_CC_SHARED is not a caller-visible dma_alloc_*()
+ * attribute. The direct allocator uses it internally after it has
+ * decided that the backing pages must be shared/decrypted, so the
+ * rest of the allocation path can consistently select DMA addresses,
+ * choose compatible pools and restore encryption on free.
+ */
+ if (attrs & DMA_ATTR_CC_SHARED)
+ return NULL;
+
+ if (force_dma_unencrypted(dev)) {
+ attrs |= DMA_ATTR_CC_SHARED;
+ mark_mem_decrypt = true;
+ }
+
size = PAGE_ALIGN(size);
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
- if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
+ if (((attrs & (DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_CC_SHARED)) ==
+ DMA_ATTR_NO_KERNEL_MAPPING) && !is_swiotlb_for_alloc(dev))
return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
if (!dev_is_dma_coherent(dev)) {
@@ -236,7 +251,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
* Remapping or decrypting memory may block, allocate the memory from
* the atomic pools instead if we aren't allowed block.
*/
- if ((remap || force_dma_unencrypted(dev)) &&
+ if ((remap || (attrs & DMA_ATTR_CC_SHARED)) &&
dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
@@ -312,12 +327,24 @@ void dma_direct_free(struct device *dev, size_t size,
void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
phys_addr_t phys;
- bool mark_mem_encrypted = true;
+ bool mark_mem_encrypted = false;
struct io_tlb_pool *swiotlb_pool;
unsigned int page_order = get_order(size);
- if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
+ /* see dma_direct_alloc() for details */
+ WARN_ON(attrs & DMA_ATTR_CC_SHARED);
+
+ /*
+ * if the device had requested for an unencrypted buffer,
+ * convert it to encrypted on free
+ */
+ if (force_dma_unencrypted(dev)) {
+ attrs |= DMA_ATTR_CC_SHARED;
+ mark_mem_encrypted = true;
+ }
+
+ if (((attrs & (DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_CC_SHARED)) ==
+ DMA_ATTR_NO_KERNEL_MAPPING) && !is_swiotlb_for_alloc(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
dma_free_contiguous(dev, cpu_addr, size);
return;
@@ -366,10 +393,14 @@ void dma_direct_free(struct device *dev, size_t size,
struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
{
+ unsigned long attrs = 0;
struct page *page;
void *ret;
- if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
+ if (force_dma_unencrypted(dev))
+ attrs |= DMA_ATTR_CC_SHARED;
+
+ if ((attrs & DMA_ATTR_CC_SHARED) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
if (is_swiotlb_for_alloc(dev)) {
@@ -403,7 +434,11 @@ void dma_direct_free_pages(struct device *dev, size_t size,
phys_addr_t phys;
void *vaddr = page_address(page);
struct io_tlb_pool *swiotlb_pool;
- bool mark_mem_encrypted = true;
+ /*
+ * if the device had requested for an unencrypted buffer,
+ * convert it to encrypted on free
+ */
+ bool mark_mem_encrypted = force_dma_unencrypted(dev);
/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
--
2.43.0
^ permalink raw reply related
* [PATCH v5 03/20] dma-direct: swiotlb: handle swiotlb alloc/free outside __dma_direct_alloc_pages
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:27 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Move swiotlb allocation out of __dma_direct_alloc_pages() and handle it in
dma_direct_alloc() / dma_direct_alloc_pages().
This is needed for follow-up changes that simplify the handling of
memory encryption/decryption based on the DMA attribute flags.
swiotlb backing pages are already mapped decrypted by
swiotlb_update_mem_attributes() and rmem_swiotlb_device_init(), so
dma-direct should not call dma_set_decrypted() on allocation nor
dma_set_encrypted() on free for swiotlb-backed memory.
Update alloc/free paths to detect swiotlb-backed pages and skip
encrypt/decrypt transitions for those paths. Keep the existing highmem
rejection in dma_direct_alloc_pages() for swiotlb allocations.
Only for "restricted-dma-pool", we currently set `for_alloc = true`, while
rmem_swiotlb_device_init() decrypts the whole pool up front. This pool is
typically used together with "shared-dma-pool", where the shared region is
accessed after remap/ioremap and the returned address is suitable for
decrypted memory access. So existing code paths remain valid.
Tested-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
include/linux/swiotlb.h | 6 ++++
kernel/dma/direct.c | 71 ++++++++++++++++++++++++++++++-----------
kernel/dma/swiotlb.c | 6 ++++
3 files changed, 65 insertions(+), 18 deletions(-)
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 3dae0f592063..133bb8ca9032 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -284,6 +284,8 @@ extern void swiotlb_print_info(void);
#ifdef CONFIG_DMA_RESTRICTED_POOL
struct page *swiotlb_alloc(struct device *dev, size_t size);
bool swiotlb_free(struct device *dev, struct page *page, size_t size);
+void swiotlb_free_from_pool(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, struct io_tlb_pool *pool);
static inline bool is_swiotlb_for_alloc(struct device *dev)
{
@@ -299,6 +301,10 @@ static inline bool swiotlb_free(struct device *dev, struct page *page,
{
return false;
}
+static inline void swiotlb_free_from_pool(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, struct io_tlb_pool *pool)
+{
+}
static inline bool is_swiotlb_for_alloc(struct device *dev)
{
return false;
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index ec887f443741..fe8e83a36058 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -96,14 +96,6 @@ static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
return ret;
}
-static void __dma_direct_free_pages(struct device *dev, struct page *page,
- size_t size)
-{
- if (swiotlb_free(dev, page, size))
- return;
- dma_free_contiguous(dev, page, size);
-}
-
static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
{
struct page *page = swiotlb_alloc(dev, size);
@@ -125,9 +117,6 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
WARN_ON_ONCE(!PAGE_ALIGNED(size));
- if (is_swiotlb_for_alloc(dev))
- return dma_direct_alloc_swiotlb(dev, size);
-
gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
page = dma_alloc_contiguous(dev, size, gfp);
if (page) {
@@ -204,6 +193,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
bool remap = false, set_uncached = false;
+ bool mark_mem_decrypt = true;
struct page *page;
void *ret;
@@ -250,11 +240,21 @@ void *dma_direct_alloc(struct device *dev, size_t size,
dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+ if (is_swiotlb_for_alloc(dev)) {
+ page = dma_direct_alloc_swiotlb(dev, size);
+ if (page) {
+ mark_mem_decrypt = false;
+ goto setup_page;
+ }
+ return NULL;
+ }
+
/* we always manually zero the memory once we are done */
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
if (!page)
return NULL;
+setup_page:
/*
* dma_alloc_contiguous can return highmem pages depending on a
* combination the cma= arguments and per-arch setup. These need to be
@@ -281,7 +281,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
goto out_free_pages;
} else {
ret = page_address(page);
- if (dma_set_decrypted(dev, ret, size))
+ if (mark_mem_decrypt && dma_set_decrypted(dev, ret, size))
goto out_leak_pages;
}
@@ -298,10 +298,11 @@ void *dma_direct_alloc(struct device *dev, size_t size,
return ret;
out_encrypt_pages:
- if (dma_set_encrypted(dev, page_address(page), size))
+ if (mark_mem_decrypt && dma_set_encrypted(dev, page_address(page), size))
return NULL;
out_free_pages:
- __dma_direct_free_pages(dev, page, size);
+ if (!swiotlb_free(dev, page, size))
+ dma_free_contiguous(dev, page, size);
return NULL;
out_leak_pages:
return NULL;
@@ -310,6 +311,9 @@ void *dma_direct_alloc(struct device *dev, size_t size,
void dma_direct_free(struct device *dev, size_t size,
void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
+ phys_addr_t phys;
+ bool mark_mem_encrypted = true;
+ struct io_tlb_pool *swiotlb_pool;
unsigned int page_order = get_order(size);
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
@@ -338,16 +342,25 @@ void dma_direct_free(struct device *dev, size_t size,
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
return;
+ phys = dma_to_phys(dev, dma_addr);
+ swiotlb_pool = swiotlb_find_pool(dev, phys);
+ if (swiotlb_pool)
+ /* Swiotlb doesn't need a page attribute update on free */
+ mark_mem_encrypted = false;
+
if (is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
} else {
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
arch_dma_clear_uncached(cpu_addr, size);
- if (dma_set_encrypted(dev, cpu_addr, size))
+ if (mark_mem_encrypted && dma_set_encrypted(dev, cpu_addr, size))
return;
}
- __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
+ if (swiotlb_pool)
+ swiotlb_free_from_pool(dev, phys, size, swiotlb_pool);
+ else
+ dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
}
struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
@@ -359,6 +372,15 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+ if (is_swiotlb_for_alloc(dev)) {
+ page = dma_direct_alloc_swiotlb(dev, size);
+ if (!page)
+ return NULL;
+
+ ret = page_address(page);
+ goto setup_page;
+ }
+
page = __dma_direct_alloc_pages(dev, size, gfp, false);
if (!page)
return NULL;
@@ -366,6 +388,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
ret = page_address(page);
if (dma_set_decrypted(dev, ret, size))
goto out_leak_pages;
+setup_page:
memset(ret, 0, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
@@ -377,16 +400,28 @@ void dma_direct_free_pages(struct device *dev, size_t size,
struct page *page, dma_addr_t dma_addr,
enum dma_data_direction dir)
{
+ phys_addr_t phys;
void *vaddr = page_address(page);
+ struct io_tlb_pool *swiotlb_pool;
+ bool mark_mem_encrypted = true;
/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
dma_free_from_pool(dev, vaddr, size))
return;
- if (dma_set_encrypted(dev, vaddr, size))
+ phys = page_to_phys(page);
+ swiotlb_pool = swiotlb_find_pool(dev, phys);
+ if (swiotlb_pool)
+ mark_mem_encrypted = false;
+
+ if (mark_mem_encrypted && dma_set_encrypted(dev, vaddr, size))
return;
- __dma_direct_free_pages(dev, page, size);
+
+ if (swiotlb_pool)
+ swiotlb_free_from_pool(dev, phys, size, swiotlb_pool);
+ else
+ dma_free_contiguous(dev, page, size);
}
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1abd3e6146f4..ac03a6856c2e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1809,6 +1809,12 @@ bool swiotlb_free(struct device *dev, struct page *page, size_t size)
return true;
}
+void swiotlb_free_from_pool(struct device *dev, phys_addr_t tlb_addr, size_t size,
+ struct io_tlb_pool *pool)
+{
+ swiotlb_release_slots(dev, tlb_addr, pool);
+}
+
static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
struct device *dev)
{
--
2.43.0
^ permalink raw reply related
* [PATCH v5 02/20] [DO NOT MERGE] s390: Expose protected virtualization through cc_platform_has()
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:27 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Halil Pasic,
Matthew Rosato, Jaehoon Kim
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
Protected virtualization guests use memory encryption, so advertise that to
the rest of the kernel through cc_platform_has(CC_ATTR_MEM_ENCRYPT).
s390 already forces DMA mappings to be unencrypted for protected
virtualization guests through force_dma_unencrypted(). Add
ARCH_HAS_CC_PLATFORM and provide the matching cc_platform_has()
implementation
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
Cc: Halil Pasic <pasic@linux.ibm.com>
Cc: Matthew Rosato <mjrosato@linux.ibm.com>
Cc: Jaehoon Kim <jhkim@linux.ibm.com>
---
arch/s390/Kconfig | 1 +
arch/s390/mm/init.c | 14 ++++++++++++++
2 files changed, 15 insertions(+)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index ecbcbb781e40..9b5e6029e043 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -87,6 +87,7 @@ config S390
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_CC_CAN_LINK
+ select ARCH_HAS_CC_PLATFORM
select ARCH_HAS_CPU_FINALIZE_INIT
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 1f72efc2a579..ad3c6d92b801 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -50,6 +50,7 @@
#include <linux/virtio_anchor.h>
#include <linux/virtio_config.h>
#include <linux/execmem.h>
+#include <linux/cc_platform.h>
pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir");
pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir");
@@ -140,6 +141,19 @@ bool force_dma_unencrypted(struct device *dev)
return is_prot_virt_guest();
}
+
+bool cc_platform_has(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_MEM_ENCRYPT:
+ return is_prot_virt_guest();
+
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
+
/* protected virtualization */
static void __init pv_init(void)
{
--
2.43.0
^ permalink raw reply related
* [PATCH v5 01/20] [DO NOT MERGE] arm64/coco: Add pKVM as a CC platform
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:27 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>
pKVM does support memory encryption, expose that to the rest of
the kernel through cc_platform_has()
At the moment, all devices inside the guest are emulated which
requires its memory to be shared back to the host (decrypted), so
set force_dma_unencrypted() to always return true.
Signed-off-by: Mostafa Saleh <smostafa@google.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
arch/arm64/include/asm/hypervisor.h | 6 ++++++
arch/arm64/include/asm/mem_encrypt.h | 3 ++-
arch/arm64/kernel/rsi.c | 12 ------------
arch/arm64/mm/init.c | 13 +++++++++++++
drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c | 5 +++++
5 files changed, 26 insertions(+), 13 deletions(-)
diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h
index a12fd897c877..1b0e15f290be 100644
--- a/arch/arm64/include/asm/hypervisor.h
+++ b/arch/arm64/include/asm/hypervisor.h
@@ -10,8 +10,14 @@ void kvm_arm_target_impl_cpu_init(void);
#ifdef CONFIG_ARM_PKVM_GUEST
void pkvm_init_hyp_services(void);
+bool is_protected_kvm_guest(void);
#else
static inline void pkvm_init_hyp_services(void) { };
+
+static inline bool is_protected_kvm_guest(void)
+{
+ return false;
+}
#endif
static inline void kvm_arch_init_hyp_services(void)
diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h
index 314b2b52025f..636f45b4d8af 100644
--- a/arch/arm64/include/asm/mem_encrypt.h
+++ b/arch/arm64/include/asm/mem_encrypt.h
@@ -2,6 +2,7 @@
#ifndef __ASM_MEM_ENCRYPT_H
#define __ASM_MEM_ENCRYPT_H
+#include <asm/hypervisor.h>
#include <asm/rsi.h>
struct device;
@@ -20,7 +21,7 @@ int realm_register_memory_enc_ops(void);
static inline bool force_dma_unencrypted(struct device *dev)
{
- return is_realm_world();
+ return is_realm_world() || is_protected_kvm_guest();
}
/*
diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c
index 92160f2e57ff..25ca75ce1a4d 100644
--- a/arch/arm64/kernel/rsi.c
+++ b/arch/arm64/kernel/rsi.c
@@ -7,7 +7,6 @@
#include <linux/memblock.h>
#include <linux/psci.h>
#include <linux/swiotlb.h>
-#include <linux/cc_platform.h>
#include <linux/platform_device.h>
#include <asm/io.h>
@@ -23,17 +22,6 @@ EXPORT_SYMBOL(prot_ns_shared);
DEFINE_STATIC_KEY_FALSE_RO(rsi_present);
EXPORT_SYMBOL(rsi_present);
-bool cc_platform_has(enum cc_attr attr)
-{
- switch (attr) {
- case CC_ATTR_MEM_ENCRYPT:
- return is_realm_world();
- default:
- return false;
- }
-}
-EXPORT_SYMBOL_GPL(cc_platform_has);
-
static bool rsi_version_matches(void)
{
unsigned long ver_lower, ver_higher;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 97987f850a33..c1b223e7cc8e 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -12,6 +12,7 @@
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/cache.h>
+#include <linux/cc_platform.h>
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/initrd.h>
@@ -36,6 +37,7 @@
#include <asm/boot.h>
#include <asm/fixmap.h>
+#include <asm/hypervisor.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
#include <asm/kvm_host.h>
@@ -416,6 +418,17 @@ void dump_mem_limit(void)
}
}
+bool cc_platform_has(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_MEM_ENCRYPT:
+ return is_realm_world() || is_protected_kvm_guest();
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
+
#ifdef CONFIG_EXECMEM
static u64 module_direct_base __ro_after_init = 0;
static u64 module_plt_base __ro_after_init = 0;
diff --git a/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c b/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c
index 4230b817a80b..297e6d6019b8 100644
--- a/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c
+++ b/drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c
@@ -95,6 +95,11 @@ static int mmio_guard_ioremap_hook(phys_addr_t phys, size_t size,
return 0;
}
+bool is_protected_kvm_guest(void)
+{
+ return !!pkvm_granule;
+}
+
void pkvm_init_hyp_services(void)
{
int i;
--
2.43.0
^ permalink raw reply related
* [PATCH v5 00/20] dma-mapping: Use DMA_ATTR_CC_SHARED through direct, pool and swiotlb paths
From: Aneesh Kumar K.V (Arm) @ 2026-05-22 4:27 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
This series propagates DMA_ATTR_CC_SHARED through the dma-direct,
dma-pool, and swiotlb paths so that encrypted and decrypted DMA buffers
are handled consistently.
Today, the direct DMA path mostly relies on force_dma_unencrypted() for
shared/decrypted buffer handling. This series consolidates the
force_dma_unencrypted() checks in the top-level functions and ensures
that the remaining DMA interfaces use DMA attributes to make the correct
decisions.
The series:
- moves swiotlb-backed allocations out of __dma_direct_alloc_pages(),
- propagates DMA_ATTR_CC_SHARED through the dma-direct alloc/free
paths
- teaches the atomic DMA pools to track encrypted versus decrypted
state
- tracks swiotlb pool encryption state and enforces strict pool
selection
- centralizes encrypted/decrypted pgprot handling in dma_pgprot() using
DMA attributes
- passes DMA attributes down to dma_capable() so capability checks can
validate whether the selected DMA address encoding matches
DMA_ATTR_CC_SHARED
- makes dma_direct_map_phys() choose the DMA address encoding from
DMA_ATTR_CC_SHARED and fall back to swiotlb when a shared DMA request
cannot use the direct mapping, which lets arm64 and x86 CCA guests stop
relying on SWIOTLB_FORCE for DMA mappings
- use the selected swiotlb pool state to derive the returned DMA
address.
Changes since v4:
https://lore.kernel.org/all/20260512090408.794195-1-aneesh.kumar@kernel.org
* Add new patches based on Sashiko review:
swiotlb: Preserve allocation virtual address for dynamic pools
dma: free atomic pool pages by physical address
dma: swiotlb: handle set_memory_decrypted() failures
dma: swiotlb: free dynamic pools from process context
iommu/dma: Check atomic pool allocation result directly
* Include pKVM and s390 changes as dependent patches. These are not yet
ready to merge and are waiting for subsystem testing feedback.
* Drop the AMD GART patch because it requires wider testing.
* Update swiotlb_tbl_map_single() to take attrs by reference.
* Switch swiotlb_free() to use rcu_work.
* Avoid calling swiotlb_find_pool() multiple times in the free path.
* Make DMA_ATTR_MMIO imply DMA_ATTR_CC_SHARED for devices requiring unencrypted DMA.
Changes from v3:
https://lore.kernel.org/all/20260427055509.898190-1-aneesh.kumar@kernel.org
* Handle DMA_ATTR_MMIO correctly in dma_direct_map_phys()
* Address most of sashiko review
* Rebase to latest kernel
* drop SWIOTLB_FORCE for s390 and powerpc secure guest.
Changes from v2:
https://lore.kernel.org/all/20260420061415.3650870-1-aneesh.kumar@kernel.org
* pass attrs to dma_capable() and update direct, swiotlb, Xen swiotlb, and
x86 GART paths so the capability checks see the DMA address attr value
DMA_ATTR_CC_SHARED.
* rework dma_direct_map_phys() so DMA_ATTR_CC_SHARED selects
phys_to_dma_unencrypted() while the default path uses
phys_to_dma_encrypted(), with swiotlb fallback when the requested
shared/private state cannot be satisfied by a direct DMA address.
* stop relying on SWIOTLB_FORCE for arm64 and x86 CC guest DMA mappings;
swiotlb is still enabled there, but shared mappings is now selected
through the generic dma_direct_map_phys()/dma_capable() decision instead
of a global force-bounce flag.
Changes from v1:
https://lore.kernel.org/all/20260417085900.3062416-1-aneesh.kumar@kernel.org
* rebased to latest kernel (change from DMA_ATTR_CC_DECRYPTED -> DMA_ATTR_CC_SHARED)
* update the alloc path so DMA_ATTR_CC_SHARED is not a caller-visible attribute.
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Suzuki K Poulose <Suzuki.Poulose@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Mostafa Saleh <smostafa@google.com>
Cc: Petr Tesarik <ptesarik@suse.com>
Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Xu Yilun <yilun.xu@linux.intel.com>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: "Christophe Leroy (CS GROUP)" <chleroy@kernel.org>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: x86@kernel.org
Aneesh Kumar K.V (Arm) (20):
[DO NOT MERGE] arm64/coco: Add pKVM as a CC platform
[DO NOT MERGE] s390: Expose protected virtualization through
cc_platform_has()
dma-direct: swiotlb: handle swiotlb alloc/free outside
__dma_direct_alloc_pages
dma-direct: use DMA_ATTR_CC_SHARED in alloc/free paths
dma-pool: track decrypted atomic pools and select them via attrs
dma: swiotlb: pass mapping attributes by reference
dma: swiotlb: track pool encryption state and honor DMA_ATTR_CC_SHARED
dma-mapping: make dma_pgprot() honor DMA_ATTR_CC_SHARED
dma-direct: pass attrs to dma_capable() for DMA_ATTR_CC_SHARED checks
dma-direct: make dma_direct_map_phys() honor DMA_ATTR_CC_SHARED
dma-direct: set decrypted flag for remapped DMA allocations
dma-direct: select DMA address encoding from DMA_ATTR_CC_SHARED
dma-pool: fix page leak in atomic_pool_expand() cleanup
dma-direct: rename ret to cpu_addr in alloc helpers
dma-direct: return struct page from dma_direct_alloc_from_pool()
iommu/dma: Check atomic pool allocation result directly
dma: swiotlb: free dynamic pools from process context
dma: swiotlb: handle set_memory_decrypted() failures
dma: free atomic pool pages by physical address
swiotlb: Preserve allocation virtual address for dynamic pools
arch/arm64/include/asm/hypervisor.h | 6 +
arch/arm64/include/asm/mem_encrypt.h | 3 +-
arch/arm64/kernel/rsi.c | 12 -
arch/arm64/mm/init.c | 17 +-
arch/powerpc/platforms/pseries/svm.c | 2 +-
arch/s390/Kconfig | 1 +
arch/s390/mm/init.c | 16 +-
arch/x86/kernel/amd_gart_64.c | 30 +-
arch/x86/kernel/pci-dma.c | 4 +-
drivers/iommu/dma-iommu.c | 15 +-
drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c | 5 +
drivers/xen/swiotlb-xen.c | 8 +-
include/linux/dma-direct.h | 20 +-
include/linux/dma-map-ops.h | 3 +-
include/linux/swiotlb.h | 20 +-
kernel/dma/direct.c | 275 +++++++++++++-----
kernel/dma/direct.h | 47 +--
kernel/dma/mapping.c | 16 +-
kernel/dma/pool.c | 221 ++++++++++----
kernel/dma/swiotlb.c | 270 +++++++++++++----
20 files changed, 717 insertions(+), 274 deletions(-)
base-commit: 50897c955902c93ae71c38698abb910525ebdc89
--
2.43.0
^ permalink raw reply
* [soc:soc/defconfig] BUILD SUCCESS 9f86df230e276e5c24f5c6800f3ee63ab33167f7
From: kernel test robot @ 2026-05-22 3:54 UTC (permalink / raw)
To: Arnd Bergmann; +Cc: linux-arm-kernel, arm
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git soc/defconfig
branch HEAD: 9f86df230e276e5c24f5c6800f3ee63ab33167f7 Merge tag 'cix-defconfig-v7.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/peter.chen/cix into soc/defconfig
elapsed time: 727m
configs tested: 135
configs skipped: 133
The following configs have been built successfully.
More configs may be tested in the coming days.
tested configs:
alpha allnoconfig gcc-15.2.0
arc allmodconfig clang-16
arc allnoconfig gcc-15.2.0
arc allyesconfig clang-23
arc randconfig-001-20260522 clang-23
arc randconfig-002-20260522 clang-23
arm allnoconfig gcc-15.2.0
arm allyesconfig clang-16
arm randconfig-001-20260522 clang-23
arm randconfig-002-20260522 clang-23
arm randconfig-003-20260522 clang-23
arm randconfig-004-20260522 clang-23
arm64 allmodconfig clang-23
arm64 allnoconfig gcc-15.2.0
arm64 randconfig-001 gcc-14.3.0
arm64 randconfig-001-20260522 gcc-14.3.0
arm64 randconfig-002 gcc-14.3.0
arm64 randconfig-002-20260522 gcc-14.3.0
arm64 randconfig-003 gcc-14.3.0
arm64 randconfig-003-20260522 gcc-14.3.0
arm64 randconfig-004 gcc-14.3.0
arm64 randconfig-004-20260522 gcc-14.3.0
csky allmodconfig gcc-15.2.0
csky allnoconfig gcc-15.2.0
csky randconfig-001 gcc-14.3.0
csky randconfig-001-20260522 gcc-14.3.0
csky randconfig-002 gcc-14.3.0
csky randconfig-002-20260522 gcc-14.3.0
hexagon allnoconfig gcc-15.2.0
i386 allnoconfig gcc-15.2.0
i386 buildonly-randconfig-001-20260522 clang-20
i386 buildonly-randconfig-002-20260522 clang-20
i386 buildonly-randconfig-003-20260522 clang-20
i386 buildonly-randconfig-004-20260522 clang-20
i386 buildonly-randconfig-005-20260522 clang-20
i386 buildonly-randconfig-006-20260522 clang-20
i386 randconfig-011-20260522 clang-20
i386 randconfig-012-20260522 clang-20
i386 randconfig-013-20260522 clang-20
i386 randconfig-014-20260522 clang-20
i386 randconfig-015-20260522 clang-20
i386 randconfig-016-20260522 clang-20
i386 randconfig-017-20260522 clang-20
loongarch allmodconfig clang-23
loongarch allnoconfig gcc-15.2.0
loongarch defconfig clang-19
m68k allmodconfig gcc-15.2.0
m68k allnoconfig gcc-15.2.0
m68k allyesconfig clang-16
m68k defconfig clang-19
m68k m5272c3_defconfig gcc-15.2.0
microblaze allnoconfig gcc-15.2.0
microblaze allyesconfig gcc-15.2.0
microblaze defconfig clang-19
mips allmodconfig gcc-15.2.0
mips allnoconfig gcc-15.2.0
mips allyesconfig gcc-15.2.0
nios2 allmodconfig clang-23
nios2 allnoconfig clang-23
nios2 defconfig clang-19
openrisc allmodconfig clang-23
openrisc allnoconfig clang-23
openrisc defconfig gcc-15.2.0
parisc allmodconfig gcc-15.2.0
parisc allnoconfig clang-23
parisc allyesconfig clang-19
parisc defconfig gcc-15.2.0
parisc randconfig-001-20260522 gcc-12.5.0
parisc randconfig-002-20260522 gcc-12.5.0
parisc64 defconfig clang-19
powerpc allmodconfig gcc-15.2.0
powerpc allnoconfig clang-23
powerpc randconfig-001-20260522 gcc-12.5.0
powerpc randconfig-002-20260522 gcc-12.5.0
powerpc64 randconfig-001-20260522 gcc-12.5.0
powerpc64 randconfig-002-20260522 gcc-12.5.0
riscv allmodconfig clang-23
riscv allnoconfig clang-23
riscv allyesconfig clang-16
riscv defconfig gcc-15.2.0
s390 allmodconfig clang-19
s390 allnoconfig clang-23
s390 allyesconfig gcc-15.2.0
s390 defconfig gcc-15.2.0
sh allnoconfig clang-23
sh allyesconfig clang-19
sh defconfig gcc-14
sparc allnoconfig clang-23
sparc defconfig gcc-15.2.0
sparc randconfig-001-20260522 gcc-9.5.0
sparc randconfig-002-20260522 gcc-9.5.0
sparc64 allmodconfig clang-23
sparc64 defconfig gcc-14
sparc64 randconfig-001-20260522 gcc-9.5.0
sparc64 randconfig-002-20260522 gcc-9.5.0
um allmodconfig clang-19
um allnoconfig clang-23
um defconfig gcc-14
um i386_defconfig gcc-14
um randconfig-001-20260522 gcc-9.5.0
um randconfig-002-20260522 gcc-9.5.0
um x86_64_defconfig gcc-14
x86_64 allnoconfig clang-23
x86_64 buildonly-randconfig-001-20260522 gcc-14
x86_64 buildonly-randconfig-002-20260522 gcc-14
x86_64 buildonly-randconfig-003-20260522 gcc-14
x86_64 buildonly-randconfig-004-20260522 gcc-14
x86_64 buildonly-randconfig-005-20260522 gcc-14
x86_64 buildonly-randconfig-006-20260522 gcc-14
x86_64 defconfig gcc-14
x86_64 randconfig-001-20260522 gcc-14
x86_64 randconfig-002-20260522 gcc-14
x86_64 randconfig-003-20260522 gcc-14
x86_64 randconfig-004-20260522 gcc-14
x86_64 randconfig-005-20260522 gcc-14
x86_64 randconfig-006-20260522 gcc-14
x86_64 randconfig-011-20260522 gcc-14
x86_64 randconfig-012-20260522 gcc-14
x86_64 randconfig-013-20260522 gcc-14
x86_64 randconfig-014-20260522 gcc-14
x86_64 randconfig-015-20260522 gcc-14
x86_64 randconfig-016-20260522 gcc-14
x86_64 randconfig-071-20260522 clang-20
x86_64 randconfig-072-20260522 clang-20
x86_64 randconfig-073-20260522 clang-20
x86_64 randconfig-074-20260522 clang-20
x86_64 randconfig-075-20260522 clang-20
x86_64 randconfig-076-20260522 clang-20
x86_64 rhel-9.4-bpf gcc-14
x86_64 rhel-9.4-kunit gcc-14
x86_64 rhel-9.4-ltp gcc-14
xtensa allnoconfig clang-23
xtensa allyesconfig clang-23
xtensa randconfig-001-20260522 gcc-9.5.0
xtensa randconfig-002-20260522 gcc-9.5.0
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply
* Re: [PATCH v2] ARM: dts: aspeed: Enable networking for Asus Kommando IPMI Card
From: Anirudh Srinivasan @ 2026-05-22 3:49 UTC (permalink / raw)
To: Andrew Jeffery
Cc: Andrew Lunn, devicetree, linux-arm-kernel, linux-aspeed,
linux-kernel, Rob Herring, Conor Dooley, Krzysztof Kozlowski,
Joel Stanley
In-Reply-To: <20260331-asus-kommando-networking-v2-1-f7d72ae5d40d@gmail.com>
Hi Andrew,
On Tue, Mar 31, 2026 at 9:18 AM Anirudh Srinivasan
<anirudhsriniv@gmail.com> wrote:
>
> Adds the DT nodes needed for ethernet support for Asus Kommando, with
> phy mode set to rgmii-id.
>
> When this DT was originally added, the phy mode was set to rgmii (which
> was incorrect). It was suggested to remove networking support from the
> DT till the Aspeed networking driver was patched so that the correct phy
> mode could be used.
>
> The discussion in [1] mentions that u-boot was inserting clk delays that
> weren't needed, which resulted in needing to set the phy mode in linux
> to rgmii incorrectly. The solution suggested there was to patch u-boot to
> no longer insert these clk delays and use rgmii-id as the phy mode for
> any future DTs added to linux.
>
> This DT was tested (on the OpenBMC u-boot fork [2]) with a u-boot DT
> modified to insert clk delays of 0 (instead of patching u-boot itself).
> [3] adds a u-boot DT for this device (without networking) and describes
> how to patch it to add networking support. If this patched DT is used,
> then networking works with rgmii-id phy mode in both u-boot and linux.
>
> [1] https://lore.kernel.org/linux-aspeed/ef88bb50-9f2c-458d-a7e5-dc5ecb9c777a@lunn.ch/
> [2] https://github.com/openbmc/u-boot/tree/v2019.04-aspeed-openbmc
> [3] https://lore.kernel.org/openbmc/20260328-asus-kommando-v2-1-2a656f8cd314@gmail.com/
>
> Signed-off-by: Anirudh Srinivasan <anirudhsriniv@gmail.com>
> ---
> This patch is based off aspeed/arm/dt from bmc tree
> ---
> Changes in v2:
> - Commit message now mentions that the u-boot tested against is the
> openbmc u-boot fork
> - Link to v1: https://lore.kernel.org/r/20260328-asus-kommando-networking-v1-1-66d308b88536@gmail.com
> ---
> .../dts/aspeed/aspeed-bmc-asus-kommando-ipmi-card.dts | 18 ++++++++++++++++++
> 1 file changed, 18 insertions(+)
>
> diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-asus-kommando-ipmi-card.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-asus-kommando-ipmi-card.dts
> index ab7ad320067c1ddc0fea9ac386fd488c8ef28184..e0f7d92efa18ccbad2c336236c3b9d01b7de1bba 100644
> --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-asus-kommando-ipmi-card.dts
> +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-asus-kommando-ipmi-card.dts
> @@ -107,6 +107,24 @@ &gpio1 {
> /*18E0 32*/ "","","","","","","","";
> };
>
> +&mac2 {
> + status = "okay";
> +
> + phy-mode = "rgmii-id";
> + phy-handle = <ðphy2>;
> + pinctrl-names = "default";
> + pinctrl-0 = <&pinctrl_rgmii3_default>;
> +};
> +
> +&mdio2 {
> + status = "okay";
> +
> + ethphy2: ethernet-phy@0 {
> + compatible = "ethernet-phy-ieee802.3-c22";
> + reg = <0>;
> + };
> +};
> +
> &vhub {
> status = "okay";
> };
>
> ---
> base-commit: 76b4ec8efdc3887cdbf730da2e55881fc1a18770
> change-id: 20260328-asus-kommando-networking-5c0612aa6b8c
>
> Best regards,
> --
> Anirudh Srinivasan <anirudhsriniv@gmail.com>
>
While we're figuring out what to do with u-boot, what are your
thoughts on getting this patch in so that the kernel DTS changes
needed for networking land in this cycle?
The current commit message might become somewhat outdated if the
u-boot patch changes though, so not sure if that's okay.
--
Regards
Anirudh Srinivasan
^ permalink raw reply
* [PATCH 2/2] PCI: imx6: Add imx_pcie_perst_found() to inspect the parsed result
From: Sherry Sun (OSS) @ 2026-05-22 3:43 UTC (permalink / raw)
To: hongxing.zhu, l.stach, Frank.Li, bhelgaas, lpieralisi,
kwilczynski, mani, robh, s.hauer, kernel, festevam, will
Cc: imx, linux-pci, linux-arm-kernel, linux-kernel, sherry.sun
In-Reply-To: <20260522034344.1147775-1-sherry.sun@oss.nxp.com>
From: Sherry Sun <sherry.sun@nxp.com>
Since pci_host_common_parse_port() doesn't return failure for "property
not found"(-ENODEV), the caller should inspect the parsed result and
decide whether to fall back to the legacy binding.
Add imx_pcie_perst_found() to inspect the parsed result.
Signed-off-by: Sherry Sun <sherry.sun@nxp.com>
---
drivers/pci/controller/dwc/pci-imx6.c | 25 +++++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c
index b137551871fc..34756f28fcc6 100644
--- a/drivers/pci/controller/dwc/pci-imx6.c
+++ b/drivers/pci/controller/dwc/pci-imx6.c
@@ -1287,6 +1287,18 @@ static void imx_pcie_assert_perst(struct imx_pcie *imx_pcie, bool assert)
}
}
+static bool imx_pcie_perst_found(struct pci_host_bridge *bridge)
+{
+ struct pci_host_port *port;
+
+ list_for_each_entry(port, &bridge->ports, list) {
+ if (!list_empty(&port->perst))
+ return true;
+ }
+
+ return false;
+}
+
static int imx_pcie_host_init(struct dw_pcie_rp *pp)
{
struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
@@ -1299,15 +1311,12 @@ static int imx_pcie_host_init(struct dw_pcie_rp *pp)
/* Parse Root Port nodes if present */
ret = pci_host_common_parse_ports(dev, bridge);
if (ret) {
- if (ret != -ENODEV) {
- dev_err(dev, "Failed to parse Root Port nodes: %d\n", ret);
- return ret;
- }
+ dev_err(dev, "Failed to parse Root Port nodes: %d\n", ret);
+ return ret;
+ }
- /*
- * Fall back to legacy binding for DT backwards
- * compatibility
- */
+ /* Fallback to legacy binding for DT backwards compatibility. */
+ if (!imx_pcie_perst_found(bridge)) {
ret = imx_pcie_parse_legacy_binding(imx_pcie);
if (ret)
return ret;
--
2.37.1
^ permalink raw reply related
* [PATCH 1/2] PCI: host-generic: Simplify return value handling in pci_host_common_parse_port(s)
From: Sherry Sun (OSS) @ 2026-05-22 3:43 UTC (permalink / raw)
To: hongxing.zhu, l.stach, Frank.Li, bhelgaas, lpieralisi,
kwilczynski, mani, robh, s.hauer, kernel, festevam, will
Cc: imx, linux-pci, linux-arm-kernel, linux-kernel, sherry.sun
In-Reply-To: <20260522034344.1147775-1-sherry.sun@oss.nxp.com>
From: Sherry Sun <sherry.sun@nxp.com>
The pci_host_common_parse_port() shouldn't check the RC-level binding.
That's a policy decision that belongs to the caller, not this common
helper.
Simplify pci_host_common_parse_port() to only parses properties from the
Root Port(and its children) without checking the RC node. Also change
pci_host_common_parse_ports() to return 0 when no ports are found, since
it is not an error.
So now both functions won't return failure for "property not found" or
"port not found", they purely returns 0 on success and a negative error
code on real failures.
Signed-off-by: Sherry Sun <sherry.sun@nxp.com>
---
drivers/pci/controller/pci-host-common.c | 29 ++++--------------------
1 file changed, 5 insertions(+), 24 deletions(-)
diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c
index 67455caaf9e6..7ce5a939e3d9 100644
--- a/drivers/pci/controller/pci-host-common.c
+++ b/drivers/pci/controller/pci-host-common.c
@@ -108,8 +108,7 @@ static int pci_host_common_parse_perst(struct device *dev,
* dependencies and the driver may fail to operate if required resources
* are missing.
*
- * Return: 0 on success, -ENODEV if PERST# found in RC node (legacy binding
- * should be used), Other negative error codes on failure.
+ * Return: 0 on success, negative error codes on failure.
*/
static int pci_host_common_parse_port(struct device *dev,
struct pci_host_bridge *bridge,
@@ -128,22 +127,6 @@ static int pci_host_common_parse_port(struct device *dev,
if (ret)
return ret;
- /*
- * 1. PERST# found in RP or its child nodes - list is not empty,
- * continue
- *
- * 2. PERST# not found in RP/children, but found in RC node -
- * return -ENODEV to fallback legacy binding
- *
- * 3. PERST# not found anywhere - list is empty, continue (optional
- * PERST#)
- */
- if (list_empty(&port->perst)) {
- if (of_property_present(dev->of_node, "reset-gpios") ||
- of_property_present(dev->of_node, "reset-gpio"))
- return -ENODEV;
- }
-
INIT_LIST_HEAD(&port->list);
list_add_tail(&port->list, &bridge->ports);
@@ -158,13 +141,11 @@ static int pci_host_common_parse_port(struct device *dev,
* Iterate through child nodes of the host bridge and parse Root Port
* properties (currently only reset GPIOs).
*
- * Return: 0 on success, -ENODEV if no ports found or PERST# found in RC
- * node (legacy binding should be used), Other negative error codes on
- * failure.
+ * Return: 0 on success, negative error codes on failure.
*/
int pci_host_common_parse_ports(struct device *dev, struct pci_host_bridge *bridge)
{
- int ret = -ENODEV;
+ int ret = 0;
for_each_available_child_of_node_scoped(dev->of_node, of_port) {
if (!of_node_is_type(of_port, "pci"))
@@ -174,8 +155,8 @@ int pci_host_common_parse_ports(struct device *dev, struct pci_host_bridge *brid
goto err_cleanup;
}
- if (ret)
- return ret;
+ if (list_empty(&bridge->ports))
+ return 0;
return devm_add_action_or_reset(dev, pci_host_common_delete_ports,
&bridge->ports);
--
2.37.1
^ permalink raw reply related
* [PATCH 0/2] PCI: imx6: Improve PERST# fallback logic
From: Sherry Sun (OSS) @ 2026-05-22 3:43 UTC (permalink / raw)
To: hongxing.zhu, l.stach, Frank.Li, bhelgaas, lpieralisi,
kwilczynski, mani, robh, s.hauer, kernel, festevam, will
Cc: imx, linux-pci, linux-arm-kernel, linux-kernel, sherry.sun
From: Sherry Sun <sherry.sun@nxp.com>
The pci_host_common_parse_port() shouldn't decide whether to fall back
to the legacy RC-level binding by checking for "reset-gpios/reset-gpio"
properties on the RC node and returning -ENODEV. That's a policy
decision belongs to the caller, not this common helper.
This patch set improves the PERST# (PCIe reset) GPIO fallback logic
across the generic PCI host bridge helper and the i.MX6 PCIe driver.
Sherry Sun (2):
PCI: host-generic: Simplify return value handling in
pci_host_common_parse_port(s)
PCI: imx6: Add imx_pcie_perst_found() to inspect the parsed result
drivers/pci/controller/dwc/pci-imx6.c | 25 +++++++++++++-------
drivers/pci/controller/pci-host-common.c | 29 ++++--------------------
2 files changed, 22 insertions(+), 32 deletions(-)
base-commit: 550604d6c9b9efc8d068aff94dc301694a7afdee
--
2.37.1
^ permalink raw reply
* [GIT PULL] CIX dts changes for v7.2-rc1
From: Peter Chen @ 2026-05-22 3:21 UTC (permalink / raw)
To: soc, arm, soc; +Cc: fugang.duan, linux-arm-kernel, cix-kernel-upstream
The following changes since commit 254f49634ee16a731174d2ae34bc50bd5f45e731:
Linux 7.1-rc1 (2026-04-26 14:19:00 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/peter.chen/cix.git/ tags/cix-dt-v7.2-rc1
for you to fetch changes up to 89d0ad7f3a60d3076da2d90ece8d78d0eb60bb91:
arm64: dts: cix: Add CPU idle states for Sky1 (2026-05-11 11:36:40 +0800)
----------------------------------------------------------------
- Add cpuidle and cpufreq support for Sky1
----------------------------------------------------------------
Devin Li (2):
arm64: dts: cix: Add SCMI performance domains for CPUFreq on Sky1
arm64: dts: cix: Add CPU idle states for Sky1
arch/arm64/boot/dts/cix/sky1-power.h | 13 +++++++
arch/arm64/boot/dts/cix/sky1.dtsi | 70 ++++++++++++++++++++++++++++++++++++
2 files changed, 83 insertions(+)
--
Best regards,
Peter
^ permalink raw reply
* Re: [PATCH v6 01/10] dt-bindings: display: rockchip: analogix-dp: Fix hclk as third clock for RK3588
From: Damon Ding @ 2026-05-22 3:02 UTC (permalink / raw)
To: Conor Dooley
Cc: hjc, heiko, andy.yan, maarten.lankhorst, mripard, tzimmermann,
airlied, simona, robh, krzk+dt, conor+dt, andrzej.hajda,
neil.armstrong, rfoss, Laurent.pinchart, jonas, jernej.skrabec,
nicolas.frattaroli, cristian.ciocaltea, sebastian.reichel,
dmitry.baryshkov, luca.ceresoli, dianders, m.szyprowski,
dri-devel, devicetree, linux-arm-kernel, linux-rockchip,
linux-kernel
In-Reply-To: <20260521-waking-case-cd709f0a7712@spud>
Hi Conor,
On 5/22/2026 3:54 AM, Conor Dooley wrote:
> On Thu, May 21, 2026 at 04:08:26PM +0800, Damon Ding wrote:
>> RK3588 eDP controller requires HCLK_VO1 to access the VO1 GRF
>> registers and enable the video datapath.
>>
>> Previously, the clock was enabled implicitly via the 'rockchip,vo-grf'
>> phandle reference, which allowed the eDP to work without explicitly
>> managing the hclk_vo1 clock. However, this is not safe or explicit.
>>
>> To make the clock dependency explicit, enforce per-SoC clock-names
>> requirements:
>> - RK3288: 2 clocks (dp, pclk)
>> - RK3399: 3 clocks (dp, pclk, grf)
>> - RK3588: 3 clocks (dp, pclk, hclk)
>>
>> Do not reuse the 'grf' clock name for RK3588 because it represents
>> a different clock with distinct control logic:
>> - The 'grf' clock is only for GRF register access and is toggled
>> dynamically during register access.
>> - The 'hclk' clock controls both GRF access and video datapath
>> gating, and must remain enabled during probe.
>>
>> Fixes: f855146263b1 ("dt-bindings: display: rockchip: analogix-dp: Add support for RK3588")
>> Signed-off-by: Damon Ding <damon.ding@rock-chips.com>
>>
>> ---
>>
>> Changes in v4:
>> - Modify the commit msg.
>>
>> Changes in v5:
>> - Enforce the correct third clock name on a per-compatible basis.
>> - Modify the commit msg simultaneously.
>>
>> Changes in v6:
>> - Expand more detail commit msg about using hclk instead of grf clock.
>> ---
>> .../rockchip/rockchip,analogix-dp.yaml | 37 +++++++++++++++++--
>> 1 file changed, 33 insertions(+), 4 deletions(-)
>>
>> diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip,analogix-dp.yaml b/Documentation/devicetree/bindings/display/rockchip/rockchip,analogix-dp.yaml
>> index d99b23b88cc5..8001c1facf98 100644
>> --- a/Documentation/devicetree/bindings/display/rockchip/rockchip,analogix-dp.yaml
>> +++ b/Documentation/devicetree/bindings/display/rockchip/rockchip,analogix-dp.yaml
>> @@ -23,10 +23,7 @@ properties:
>>
>> clock-names:
>> minItems: 2
>> - items:
>> - - const: dp
>> - - const: pclk
>> - - const: grf
>
> Instead of removing this, you can make it
>
> items:
> - const: dp
> - const pclk
> - enum:
> - grf
> - hclk
>
>> + maxItems: 3
>
> And delete this.
>
Oh I see, thanks for clarifying.
The idea is to keep all valid clock names at the top level, and use only
minItems/maxItems in allOf to differentiate platforms.
>>
>> power-domains:
>> maxItems: 1
>> @@ -60,6 +57,33 @@ required:
>> allOf:
>> - $ref: /schemas/display/bridge/analogix,dp.yaml#
>>
>> + - if:
>> + properties:
>> + compatible:
>> + contains:
>> + enum:
>> + - rockchip,rk3288-dp
>> + then:
>> + properties:
>> + clock-names:
>> + items:
>> + - const: dp
>> + - const: pclk
>
> Then this becomes
> clock-names:
> maxItems: 2
> clocks:
> maxItems: 2
>
>> +
>> + - if:
>> + properties:
>> + compatible:
>> + contains:
>> + enum:
>> + - rockchip,rk3399-edp
>> + then:
>> + properties:
>> + clock-names:
>> + items:
>> + - const: dp
>> + - const: pclk
>> + - const: grf
>
> maybe this needs a minItems: 3? Depends on if the grf clock is
> mandatory. Also probably needs a
> clocks:
> minItems: 3
> if that's the case.
>
Yes, the minItems should be 3 for both clocks and clock-names.
>> +
>> - if:
>> properties:
>> compatible:
>> @@ -68,6 +92,11 @@ allOf:
>> - rockchip,rk3588-edp
>> then:
>> properties:
>> + clock-names:
>> + items:
>> + - const: dp
>> + - const: pclk
>> + - const: hclk
>
> And the same here as for the 3399.
>
Will update in v7.
Best regards,
Damon
^ permalink raw reply
* Re: [PATCH v9 3/4] coresight: cti: add Qualcomm extended CTI identification and quirks
From: Jie Gan @ 2026-05-22 2:38 UTC (permalink / raw)
To: Yingchao Deng, Suzuki K Poulose, Mike Leach, James Clark, Leo Yan,
Alexander Shishkin
Cc: coresight, linux-arm-kernel, linux-kernel, linux-arm-msm,
jinlong.mao, quic_yingdeng, tingwei.zhang
In-Reply-To: <20260521-extended_cti-v9-3-d21f4f92c51e@oss.qualcomm.com>
On 5/21/2026 8:16 PM, Yingchao Deng wrote:
> Qualcomm implements an extended variant of the ARM CoreSight CTI with a
> different register layout and vendor-specific behavior. While the
> programming model remains largely compatible, the register offsets differ
> from the standard ARM CTI and require explicit handling.
>
> Detect Qualcomm CTIs via the DEVARCH register and record this in the CTI
> driver data. Introduce a small mapping layer to translate standard CTI
> register offsets to Qualcomm-specific offsets, allowing the rest of the
> driver to use a common register access path.
>
> Additionally, handle a Qualcomm-specific quirk where the CLAIMSET
> register is incorrectly initialized to a non-zero value, which can cause
> tools or drivers to assume the component is already claimed. Clear the
> register during probe to reflect the actual unclaimed state.
>
> No functional change is intended for standard ARM CTI devices.
>
> Co-developed-by: Jinlong Mao <jinlong.mao@oss.qualcomm.com>
> Signed-off-by: Jinlong Mao <jinlong.mao@oss.qualcomm.com>
> Signed-off-by: Yingchao Deng <yingchao.deng@oss.qualcomm.com>
> ---
> drivers/hwtracing/coresight/coresight-cti-core.c | 27 +++++++++-
> drivers/hwtracing/coresight/coresight-cti.h | 7 ++-
> drivers/hwtracing/coresight/qcom-cti.h | 65 ++++++++++++++++++++++++
> 3 files changed, 97 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c
> index c5cc2706e241..2dac5eb4ecca 100644
> --- a/drivers/hwtracing/coresight/coresight-cti-core.c
> +++ b/drivers/hwtracing/coresight/coresight-cti-core.c
> @@ -21,6 +21,7 @@
>
> #include "coresight-priv.h"
> #include "coresight-cti.h"
> +#include "qcom-cti.h"
>
> /*
> * CTI devices can be associated with a PE, or be connected to CoreSight
> @@ -45,6 +46,9 @@ static DEFINE_MUTEX(ect_mutex);
> static void __iomem *__reg_addr(struct cti_drvdata *drvdata, u32 off,
> u32 index)
> {
> + if (unlikely(rvdata->is_qcom_cti))
typo.
s/rvdata/drvdata
Thanks,
Jie
> + off = cti_qcom_reg_off(off);
> +
> return drvdata->base + off + sizeof(u32) * index;
> }
>
> @@ -172,6 +176,9 @@ void cti_write_intack(struct device *dev, u32 ackval)
> /* DEVID[19:16] - number of CTM channels */
> #define CTI_DEVID_CTMCHANNELS(devid_val) ((int) BMVAL(devid_val, 16, 19))
>
> +/* DEVARCH[31:21] - ARCHITECT */
> +#define CTI_DEVARCH_ARCHITECT(devarch_val) ((int)BMVAL(devarch_val, 21, 31))
> +
> static int cti_set_default_config(struct device *dev,
> struct cti_drvdata *drvdata)
> {
> @@ -702,6 +709,7 @@ static int cti_probe(struct amba_device *adev, const struct amba_id *id)
> struct coresight_desc cti_desc;
> struct coresight_platform_data *pdata = NULL;
> struct resource *res = &adev->res;
> + u32 devarch;
>
> /* driver data*/
> drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
> @@ -726,6 +734,22 @@ static int cti_probe(struct amba_device *adev, const struct amba_id *id)
>
> raw_spin_lock_init(&drvdata->spinlock);
>
> + devarch = readl_relaxed(drvdata->base + CORESIGHT_DEVARCH);
> + if (CTI_DEVARCH_ARCHITECT(devarch) == ARCHITECT_QCOM) {
> + drvdata->is_qcom_cti = true;
> + /*
> + * QCOM CTI does not implement Claimtag functionality as
> + * per CoreSight specification, but its CLAIMSET register
> + * is incorrectly initialized to 0xF. This can mislead
> + * tools or drivers into thinking the component is claimed.
> + *
> + * Reset CLAIMSET to 0 to reflect that no claims are active.
> + */
> + CS_UNLOCK(drvdata->base);
> + writel_relaxed(0, drvdata->base + CORESIGHT_CLAIMSET);
> + CS_LOCK(drvdata->base);
> + }
> +
> /* initialise CTI driver config values */
> ret = cti_set_default_config(dev, drvdata);
> if (ret)
> @@ -782,7 +806,8 @@ static int cti_probe(struct amba_device *adev, const struct amba_id *id)
>
> /* all done - dec pm refcount */
> pm_runtime_put(&adev->dev);
> - dev_info(&drvdata->csdev->dev, "CTI initialized\n");
> + dev_info(&drvdata->csdev->dev,
> + "%sCTI initialized\n", drvdata->is_qcom_cti ? "QCOM " : "");
> return 0;
> }
>
> diff --git a/drivers/hwtracing/coresight/coresight-cti.h b/drivers/hwtracing/coresight/coresight-cti.h
> index 98b8de8a3687..08ea6daf5b3c 100644
> --- a/drivers/hwtracing/coresight/coresight-cti.h
> +++ b/drivers/hwtracing/coresight/coresight-cti.h
> @@ -54,10 +54,11 @@ struct fwnode_handle;
> /*
> * CTI CSSoc 600 has a max of 32 trigger signals per direction.
> * CTI CSSoc 400 has 8 IO triggers - other CTIs can be impl def.
> + * QCOM CTI supports up to 128 trigger signals per direction.
> * Max of in and out defined in the DEVID register.
> * - pick up actual number used from .dts parameters if present.
> */
> -#define CTIINOUTEN_MAX 32
> +#define CTIINOUTEN_MAX 128
>
> /**
> * Group of related trigger signals
> @@ -168,6 +169,9 @@ struct cti_config {
> * @spinlock: Control data access to one at a time.
> * @config: Configuration data for this CTI device.
> * @node: List entry of this device in the list of CTI devices.
> + * @is_qcom_cti: True if this CTI is a Qualcomm vendor-specific
> + * variant that requires register offset translation
> + * via cti_qcom_reg_off().
> */
> struct cti_drvdata {
> void __iomem *base;
> @@ -176,6 +180,7 @@ struct cti_drvdata {
> raw_spinlock_t spinlock;
> struct cti_config config;
> struct list_head node;
> + bool is_qcom_cti;
> };
>
> /*
> diff --git a/drivers/hwtracing/coresight/qcom-cti.h b/drivers/hwtracing/coresight/qcom-cti.h
> new file mode 100644
> index 000000000000..d3846613a0de
> --- /dev/null
> +++ b/drivers/hwtracing/coresight/qcom-cti.h
> @@ -0,0 +1,65 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> + */
> +
> +#ifndef _CORESIGHT_QCOM_CTI_H
> +#define _CORESIGHT_QCOM_CTI_H
> +
> +#include "coresight-cti.h"
> +
> +#define ARCHITECT_QCOM 0x477
> +
> +/* CTI programming registers */
> +#define QCOM_CTIINTACK 0x020
> +#define QCOM_CTIAPPSET 0x004
> +#define QCOM_CTIAPPCLEAR 0x008
> +#define QCOM_CTIAPPPULSE 0x00C
> +#define QCOM_CTIINEN 0x400
> +#define QCOM_CTIOUTEN 0x800
> +#define QCOM_CTITRIGINSTATUS 0x040
> +#define QCOM_CTITRIGOUTSTATUS 0x060
> +#define QCOM_CTICHINSTATUS 0x080
> +#define QCOM_CTICHOUTSTATUS 0x084
> +#define QCOM_CTIGATE 0x088
> +#define QCOM_ASICCTL 0x08C
> +/* Integration test registers */
> +#define QCOM_ITCHINACK 0xE70
> +#define QCOM_ITTRIGINACK 0xE80
> +#define QCOM_ITCHOUT 0xE74
> +#define QCOM_ITTRIGOUT 0xEA0
> +#define QCOM_ITCHOUTACK 0xE78
> +#define QCOM_ITTRIGOUTACK 0xEC0
> +#define QCOM_ITCHIN 0xE7C
> +#define QCOM_ITTRIGIN 0xEE0
> +
> +static inline u32 cti_qcom_reg_off(u32 offset)
> +{
> + switch (offset) {
> + case CTIINTACK: return QCOM_CTIINTACK;
> + case CTIAPPSET: return QCOM_CTIAPPSET;
> + case CTIAPPCLEAR: return QCOM_CTIAPPCLEAR;
> + case CTIAPPPULSE: return QCOM_CTIAPPPULSE;
> + case CTIINEN: return QCOM_CTIINEN;
> + case CTIOUTEN: return QCOM_CTIOUTEN;
> + case CTITRIGINSTATUS: return QCOM_CTITRIGINSTATUS;
> + case CTITRIGOUTSTATUS: return QCOM_CTITRIGOUTSTATUS;
> + case CTICHINSTATUS: return QCOM_CTICHINSTATUS;
> + case CTICHOUTSTATUS: return QCOM_CTICHOUTSTATUS;
> + case CTIGATE: return QCOM_CTIGATE;
> + case ASICCTL: return QCOM_ASICCTL;
> + case ITCHINACK: return QCOM_ITCHINACK;
> + case ITTRIGINACK: return QCOM_ITTRIGINACK;
> + case ITCHOUT: return QCOM_ITCHOUT;
> + case ITTRIGOUT: return QCOM_ITTRIGOUT;
> + case ITCHOUTACK: return QCOM_ITCHOUTACK;
> + case ITTRIGOUTACK: return QCOM_ITTRIGOUTACK;
> + case ITCHIN: return QCOM_ITCHIN;
> + case ITTRIGIN: return QCOM_ITTRIGIN;
> +
> + default:
> + return offset;
> + }
> +}
> +
> +#endif /* _CORESIGHT_QCOM_CTI_H */
>
^ permalink raw reply
* Re: [PATCH v2 0/5] mm: reduce mmap_lock contention and improve page fault performance
From: Barry Song (Xiaomi) @ 2026-05-22 2:33 UTC (permalink / raw)
To: willy
Cc: akpm, baohua, bhe, chentao, chrisl, david, jack, kasong,
kunwu.chan, liam, lianux.mm, linux-arm-kernel, linux-kernel,
linux-mm, linux-riscv, linux-s390, linuxppc-dev, liyangouwen1,
ljs, loongarch, mhocko, nphamcs, nzzhao, pfalcato, rppt,
shikemeng, surenb, vbabka, wanglian, youngjun.park
In-Reply-To: <ag4kj84EcKqamdB-@casper.infradead.org>
On Thu, May 21, 2026 at 5:16 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, May 21, 2026 at 05:14:20AM +0800, Barry Song wrote:
> > My understanding is that we should not blame applications here. This is 2026:
> > there are basically only two kinds of applications — single-threaded and
> > multi-threaded — and single-threaded applications are nearly extinct.
>
> all of the applications i run are either single threaded or don't fork.
> what multithreaded applications call fork?
As I replied to David [1], we cannot control what those apps do.
Technically, I agree with you that calling fork() within a
multithreaded app may not be a good idea. But in such a complex
ecosystem, we cannot simply say no to those apps.
Especially when our phones are improving the kernel with this fix,
our customers may instead complain that our phones regress their
apps first. That feels unfair.
I can offer a two-step plan. For the first step, we keep the
current approach of dropping the VMA lock and retrying page faults,
while trying to make the smallest possible change.
As discussed with Suren, the draft code is being changed from a
whitelist approach to a blacklist approach. This way, we do not
need to touch `filemap.c` at all (probably because you are already
maintaining `filemap.c` perfectly):
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 63de8e8684f2..4101d5fa7a82 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1322,6 +1322,7 @@ void do_user_addr_fault(struct pt_regs *regs,
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
+retry_vma:
vma = lock_vma_under_rcu(mm, address);
if (!vma)
goto lock_mmap;
@@ -1351,6 +1352,8 @@ void do_user_addr_fault(struct pt_regs *regs,
ARCH_DEFAULT_PKEY);
return;
}
+ if (!(fault & VM_FAULT_RETRY_HARD))
+ goto retry_vma;
lock_mmap:
retry:
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a308e2c23b82..eeb7d6091bef 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1659,6 +1659,7 @@ typedef __bitwise unsigned int vm_fault_t;
* @VM_FAULT_NOPAGE: ->fault installed the pte, not return page
* @VM_FAULT_LOCKED: ->fault locked the returned page
* @VM_FAULT_RETRY: ->fault blocked, must retry
+ * @VM_FAULT_RETRY_HARD: ->fault blocked, must retry via mmap_lock
* @VM_FAULT_FALLBACK: huge page fault failed, fall back to small
* @VM_FAULT_DONE_COW: ->fault has fully handled COW
* @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs
@@ -1678,10 +1679,11 @@ enum vm_fault_reason {
VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100,
VM_FAULT_LOCKED = (__force vm_fault_t)0x000200,
VM_FAULT_RETRY = (__force vm_fault_t)0x000400,
- VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800,
- VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
- VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
- VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
+ VM_FAULT_RETRY_HARD = (__force vm_fault_t)0x000800,
+ VM_FAULT_FALLBACK = (__force vm_fault_t)0x001000,
+ VM_FAULT_DONE_COW = (__force vm_fault_t)0x002000,
+ VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x004000,
+ VM_FAULT_COMPLETED = (__force vm_fault_t)0x008000,
VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
};
diff --git a/mm/memory.c b/mm/memory.c
index 7c020995eafc..b3e7ffdd83f9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3797,7 +3797,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
return 0;
vma_end_read(vma);
- return VM_FAULT_RETRY;
+ return VM_FAULT_RETRY | VM_FAULT_RETRY_HARD;
}
/**
@@ -3824,7 +3824,7 @@ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
return 0;
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
if (!mmap_read_trylock(vma->vm_mm))
- return VM_FAULT_RETRY;
+ return VM_FAULT_RETRY | VM_FAULT_RETRY_HARD;
}
if (__anon_vma_prepare(vma))
ret = VM_FAULT_OOM;
@@ -4778,7 +4778,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* under VMA lock.
*/
vma_end_read(vma);
- ret = VM_FAULT_RETRY;
+ ret = VM_FAULT_RETRY | VM_FAULT_RETRY_HARD;
goto out;
}
For the second step, we can move forward with your approach of
ripping out the PF retry code, after getting in touch with the
owners of those popular apps one by one to understand why they are
doing this and whether they can find a different approach. In
short, this would allow for a one- or two-year transition period.
What do you think about that?
[1] https://lore.kernel.org/linux-mm/CAGsJ_4xC5LdhuoWV1=tK-RZ5rkjc8aOKOkmb1L_8BG_3gtJhDg@mail.gmail.com/
^ permalink raw reply related
* [PATCH] ASoC: rockchip: i2s: Use managed hclk and runtime PM cleanup
From: Cássio Gabriel @ 2026-05-22 2:30 UTC (permalink / raw)
To: Liam Girdwood, Mark Brown, Takashi Iwai, Jaroslav Kysela,
Heiko Stuebner
Cc: linux-sound, linux-arm-kernel, linux-rockchip, linux-kernel,
Cássio Gabriel
The Rockchip I2S driver mixes devm-managed probe resources with manual
runtime PM and hclk cleanup. This leaves the remove path doing runtime PM
shutdown and clock disable before devm-managed ASoC and PCM resources are
released.
Keep the bus clock enabled for the device lifetime with
devm_clk_get_enabled(), and move the runtime PM teardown into devres so the
unwind order matches the managed registrations. This also removes the
remove callback, which only existed for cleanup.
Use a devm action for the final runtime suspend and register it before the
managed runtime PM action, so teardown disables runtime PM before forcing
the device into the suspended state.
Signed-off-by: Cássio Gabriel <cassiogabrielcontato@gmail.com>
---
sound/soc/rockchip/rockchip_i2s.c | 68 +++++++++++++++------------------------
1 file changed, 26 insertions(+), 42 deletions(-)
diff --git a/sound/soc/rockchip/rockchip_i2s.c b/sound/soc/rockchip/rockchip_i2s.c
index 49ff86b35ef1..4e3af0f37941 100644
--- a/sound/soc/rockchip/rockchip_i2s.c
+++ b/sound/soc/rockchip/rockchip_i2s.c
@@ -664,6 +664,14 @@ static const struct of_device_id rockchip_i2s_match[] __maybe_unused = {
};
MODULE_DEVICE_TABLE(of, rockchip_i2s_match);
+static void rockchip_i2s_suspend(void *data)
+{
+ struct device *dev = data;
+
+ if (!pm_runtime_status_suspended(dev))
+ i2s_runtime_suspend(dev);
+}
+
static int rockchip_i2s_init_dai(struct rk_i2s_dev *i2s, struct resource *res,
struct snd_soc_dai_driver **dp)
{
@@ -758,37 +766,28 @@ static int rockchip_i2s_probe(struct platform_device *pdev)
}
/* try to prepare related clocks */
- i2s->hclk = devm_clk_get(&pdev->dev, "i2s_hclk");
+ i2s->hclk = devm_clk_get_enabled(&pdev->dev, "i2s_hclk");
if (IS_ERR(i2s->hclk)) {
dev_err(&pdev->dev, "Can't retrieve i2s bus clock\n");
return PTR_ERR(i2s->hclk);
}
- ret = clk_prepare_enable(i2s->hclk);
- if (ret) {
- dev_err(i2s->dev, "hclock enable failed %d\n", ret);
- return ret;
- }
i2s->mclk = devm_clk_get(&pdev->dev, "i2s_clk");
if (IS_ERR(i2s->mclk)) {
dev_err(&pdev->dev, "Can't retrieve i2s master clock\n");
- ret = PTR_ERR(i2s->mclk);
- goto err_clk;
+ return PTR_ERR(i2s->mclk);
}
regs = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
- if (IS_ERR(regs)) {
- ret = PTR_ERR(regs);
- goto err_clk;
- }
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
i2s->regmap = devm_regmap_init_mmio(&pdev->dev, regs,
&rockchip_i2s_regmap_config);
if (IS_ERR(i2s->regmap)) {
dev_err(&pdev->dev,
"Failed to initialise managed register map\n");
- ret = PTR_ERR(i2s->regmap);
- goto err_clk;
+ return PTR_ERR(i2s->regmap);
}
i2s->bclk_ratio = 64;
@@ -799,8 +798,7 @@ static int rockchip_i2s_probe(struct platform_device *pdev)
i2s->bclk_off = pinctrl_lookup_state(i2s->pinctrl, "bclk_off");
if (IS_ERR_OR_NULL(i2s->bclk_off)) {
dev_err(&pdev->dev, "failed to find i2s bclk_off\n");
- ret = -EINVAL;
- goto err_clk;
+ return -EINVAL;
}
}
} else {
@@ -811,16 +809,23 @@ static int rockchip_i2s_probe(struct platform_device *pdev)
dev_set_drvdata(&pdev->dev, i2s);
- pm_runtime_enable(&pdev->dev);
+ ret = devm_add_action(&pdev->dev, rockchip_i2s_suspend, &pdev->dev);
+ if (ret)
+ return ret;
+
+ ret = devm_pm_runtime_enable(&pdev->dev);
+ if (ret)
+ return ret;
+
if (!pm_runtime_enabled(&pdev->dev)) {
ret = i2s_runtime_resume(&pdev->dev);
if (ret)
- goto err_pm_disable;
+ return ret;
}
ret = rockchip_i2s_init_dai(i2s, res, &dai);
if (ret)
- goto err_pm_disable;
+ return ret;
ret = devm_snd_soc_register_component(&pdev->dev,
&rockchip_i2s_component,
@@ -828,36 +833,16 @@ static int rockchip_i2s_probe(struct platform_device *pdev)
if (ret) {
dev_err(&pdev->dev, "Could not register DAI\n");
- goto err_suspend;
+ return ret;
}
ret = devm_snd_dmaengine_pcm_register(&pdev->dev, NULL, 0);
if (ret) {
dev_err(&pdev->dev, "Could not register PCM\n");
- goto err_suspend;
+ return ret;
}
return 0;
-
-err_suspend:
- if (!pm_runtime_status_suspended(&pdev->dev))
- i2s_runtime_suspend(&pdev->dev);
-err_pm_disable:
- pm_runtime_disable(&pdev->dev);
-err_clk:
- clk_disable_unprepare(i2s->hclk);
- return ret;
-}
-
-static void rockchip_i2s_remove(struct platform_device *pdev)
-{
- struct rk_i2s_dev *i2s = dev_get_drvdata(&pdev->dev);
-
- pm_runtime_disable(&pdev->dev);
- if (!pm_runtime_status_suspended(&pdev->dev))
- i2s_runtime_suspend(&pdev->dev);
-
- clk_disable_unprepare(i2s->hclk);
}
static const struct dev_pm_ops rockchip_i2s_pm_ops = {
@@ -866,7 +851,6 @@ static const struct dev_pm_ops rockchip_i2s_pm_ops = {
static struct platform_driver rockchip_i2s_driver = {
.probe = rockchip_i2s_probe,
- .remove = rockchip_i2s_remove,
.driver = {
.name = DRV_NAME,
.of_match_table = of_match_ptr(rockchip_i2s_match),
---
base-commit: 40cc9602caf2539369bd3dd7d66ee67e204e75ef
change-id: 20260521-asoc-rockchip-i2s-devm-cleanup-6cade5f495e5
Best regards,
--
Cássio Gabriel <cassiogabrielcontato@gmail.com>
^ permalink raw reply related
* Re: [PATCH v3 5/9] KVM: arm64: selftests: Add shadow_stage2 test
From: Itaru Kitayama @ 2026-05-22 2:02 UTC (permalink / raw)
To: Wei-Lin Chang
Cc: linux-kernel, kvm, linux-kselftest, linux-arm-kernel, kvmarm,
Paolo Bonzini, Shuah Khan, Marc Zyngier, Oliver Upton, Joey Gouly,
Suzuki K Poulose, Zenghui Yu, Catalin Marinas, Will Deacon
In-Reply-To: <20260516183003.799058-6-weilin.chang@arm.com>
On Sat, May 16, 2026 at 07:29:59PM +0100, Wei-Lin Chang wrote:
> The shadow_stage2 test is aimed to exercise the shadow page table
> management code in KVM. In this first patch a basic test similar to
> hello_nested is created. Right now it doesn't turn on stage-2 for the
> nested guest (L2) yet, therefore the shadow page table code in KVM will
> only be triggered minimally now.
>
> Signed-off-by: Wei-Lin Chang <weilin.chang@arm.com>
> ---
> tools/testing/selftests/kvm/Makefile.kvm | 1 +
> .../selftests/kvm/arm64/shadow_stage2.c | 128 ++++++++++++++++++
> 2 files changed, 129 insertions(+)
> create mode 100644 tools/testing/selftests/kvm/arm64/shadow_stage2.c
>
> diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
> index e8c108e0c487..c0fac2ba1339 100644
> --- a/tools/testing/selftests/kvm/Makefile.kvm
> +++ b/tools/testing/selftests/kvm/Makefile.kvm
> @@ -176,6 +176,7 @@ TEST_GEN_PROGS_arm64 += arm64/page_fault_test
> TEST_GEN_PROGS_arm64 += arm64/psci_test
> TEST_GEN_PROGS_arm64 += arm64/sea_to_user
> TEST_GEN_PROGS_arm64 += arm64/set_id_regs
> +TEST_GEN_PROGS_arm64 += arm64/shadow_stage2
> TEST_GEN_PROGS_arm64 += arm64/smccc_filter
> TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config
> TEST_GEN_PROGS_arm64 += arm64/vgic_init
> diff --git a/tools/testing/selftests/kvm/arm64/shadow_stage2.c b/tools/testing/selftests/kvm/arm64/shadow_stage2.c
> new file mode 100644
> index 000000000000..cf76a2b0582d
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/arm64/shadow_stage2.c
> @@ -0,0 +1,128 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * shadow_stage2 - Test correctness of shadow stage 2
> + */
> +
> +#include "nested.h"
> +#include "processor.h"
> +#include "test_util.h"
> +#include "ucall.h"
> +
> +#define XLATE2GPA (0xABCD)
> +#define L2STACKSZ (0x100)
> +
> +#define L2SUCCESS (0x0)
> +#define L2FAILED (0x1)
> +#define L2SYNC (0x2)
> +
> +/*
> + * TPIDR_EL2 is used to store vcpu id, so save and restore it.
> + */
> +static gpa_t ucall_translate_to_gpa(void *gva)
> +{
> + gpa_t gpa;
> + u64 vcpu_id = read_sysreg(tpidr_el2);
> +
> + GUEST_SYNC2(XLATE2GPA, gva);
> +
> + /* get the result from userspace */
> + gpa = read_sysreg(tpidr_el2);
> +
> + write_sysreg(vcpu_id, tpidr_el2);
> +
> + return gpa;
> +}
> +
> +static void l2_guest_code(void)
> +{
> + do_hvc(L2SYNC, 10, 0);
> + do_hvc(L2SYNC, 20, 0);
> + do_hvc(L2SYNC, 30, 0);
> +
> + do_hvc(L2SUCCESS, 0, 0);
> +}
> +
> +static void guest_code(void)
> +{
> + struct vcpu vcpu;
> + struct hyp_data hyp_data;
> + int ret, i = 0;
> + gpa_t l2_pc, l2_stack_top;
> + /* force 16-byte alignment for the stack pointer */
> + u8 l2_stack[L2STACKSZ] __attribute__((aligned(16)));
> +
> + GUEST_ASSERT_EQ(get_current_el(), 2);
> + GUEST_PRINTF("vEL2 entry\n");
> +
> + l2_pc = ucall_translate_to_gpa(l2_guest_code);
> + l2_stack_top = ucall_translate_to_gpa(&l2_stack[L2STACKSZ]);
> +
> + init_vcpu(&vcpu, l2_pc, l2_stack_top);
> + prepare_hyp();
> +
> + while (true) {
> + GUEST_PRINTF("L2 enter\n");
> + ret = run_l2(&vcpu, &hyp_data);
> + GUEST_PRINTF("L2 exit\n");
> + GUEST_ASSERT_EQ(ret, ARM_EXCEPTION_TRAP);
> + GUEST_ASSERT_EQ(ESR_ELx_EC(read_sysreg(esr_el2)), ESR_ELx_EC_HVC64);
> +
> + if (vcpu.context.regs.regs[0] == L2SYNC)
> + GUEST_SYNC3(L2SYNC, i++, vcpu.context.regs.regs[1]);
> + else
> + break;
> + }
> +
> + if (vcpu.context.regs.regs[0] != L2SUCCESS)
> + GUEST_FAIL("L2 failed\n");
> +
> + GUEST_PRINTF("L2 success!\n");
> + GUEST_DONE();
> +}
> +
> +int main(void)
> +{
> + struct kvm_vcpu_init init;
> + struct kvm_vcpu *vcpu;
> + struct kvm_vm *vm;
> + struct ucall uc;
> + gpa_t gpa;
> +
> + TEST_REQUIRE(kvm_check_cap(KVM_CAP_ARM_EL2));
> + vm = vm_create(1);
This assumes the default P40V48 4KB guest creation, so perhaps you'd
want to change this to __vm_create(VM_SHAPE(), 1, 0) so the test runs too on other
16KB and 64KB kernels? I've been testing it with your stage 2 unmapping
optimization series together with this v3 KVM selftest series.
Thanks,
Itaru.
> +
> + kvm_get_default_vcpu_target(vm, &init);
> + init.features[0] |= BIT(KVM_ARM_VCPU_HAS_EL2);
> + vcpu = aarch64_vcpu_add(vm, 0, &init, guest_code);
> + kvm_arch_vm_finalize_vcpus(vm);
> +
> + while (true) {
> + vcpu_run(vcpu);
> +
> + switch (get_ucall(vcpu, &uc)) {
> + case UCALL_SYNC:
> + if (uc.args[0] == XLATE2GPA) {
> + gpa = addr_gva2gpa(vm, (gva_t)uc.args[1]);
> + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TPIDR_EL2), gpa);
> + }
> + if (uc.args[0] == L2SYNC)
> + pr_info("L2SYNC, L1 info: %ld, L2 info: %ld\n", uc.args[1], uc.args[2]);
> + break;
> + case UCALL_PRINTF:
> + pr_info("[L1] %s", uc.buffer);
> + break;
> + case UCALL_DONE:
> + pr_info("DONE!\n");
> + goto end;
> + case UCALL_ABORT:
> + REPORT_GUEST_ASSERT(uc);
> + fallthrough;
> + default:
> + TEST_FAIL("Unhandled ucall: %ld\n", uc.cmd);
> + }
> + }
> +
> +end:
> + kvm_vm_free(vm);
> + return 0;
> +}
> --
> 2.43.0
>
^ permalink raw reply
* [PATCH v3 2/8] bpf: Recover arena kernel faults with scratch page
From: Tejun Heo @ 2026-05-22 1:59 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi
Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
David Hildenbrand, Mike Rapoport, Emil Tsalapatis,
kernel test robot, sched-ext, bpf, x86, linux-arm-kernel,
linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
BPF arena usage is becoming more prevalent, but kernel <-> BPF communication
over arena memory is awkward today. Data has to be staged through a trusted
kernel pointer with extra code and copying on the BPF side. While reads
through arena pointers can use a fault-safe helper, writes don't have a good
solution. The in-line alternative would need instruction emulation or asm
fixup labels.
Enable direct kernel-side reads and writes within GUARD_SZ / 2 of any
handed-in arena pointer, without bounds checking. A per-arena scratch page
is installed by the arch fault path into empty arena kernel PTEs - x86 from
page_fault_oops() for not-present faults, arm64 from __do_kernel_fault() for
translation faults, both after the existing exception-table and KFENCE
handling. The faulting instruction retries and the access is also reported
through the program's BPF stream, preserving error reporting.
bpf_prog_find_from_stack() resolves the current BPF program (and its arena)
from the kernel stack - no new bpf_run_ctx state is added. Recovery covers
the 4 GiB arena plus the upper half-guard (GUARD_SZ / 2). The lower
half-guard is excluded because well-behaved kfuncs only access forward from
arena pointers. The kfunc-author contract - access at most GUARD_SZ / 2 past
a handed-in pointer - is documented in Documentation/bpf/kfuncs.rst.
The install is lock-free via ptep_try_set(). On race-loss the winning
installer's PTE is already valid, so the access retry succeeds. The arena
clear path uses ptep_get_and_clear() so installer and clearer race through
atomic accessors. No flush_tlb_kernel_range() afterwards. Stale "not mapped"
entries just cause one extra re-fault, cheaper than a global IPI on every
install.
Scratch exists only to keep the kernel from oopsing on an in-line arena
access. Its presence at a PTE means the BPF program has already
malfunctioned, and the violation is reported through the program's BPF
stream. The only requirement for behavior on a scratched PTE is that the
kernel doesn't crash. In particular, any user-side access through such a PTE
may segfault. The shared scratch page is freed once during map destruction.
BPF instruction faults continue to use the existing JIT exception-table
path. This patch changes only the kernel-text fault path. No UAPI flag is
added. The new behavior is the default.
v2: Use ptep_get_and_clear() in apply_range_clear_cb(). (David)
v3: Stub bpf_arena_handle_page_fault() for !CONFIG_BPF_SYSCALL. (lkp)
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
---
Re-posting just patch 2/8 of the v3 series with the build fix. Will post
a full v4 rolling up all patches later if needed.
Documentation/bpf/kfuncs.rst | 14 +++
arch/arm64/mm/fault.c | 10 +-
arch/x86/mm/fault.c | 12 ++-
include/linux/bpf.h | 1 +
include/linux/bpf_defs.h | 19 ++++
kernel/bpf/arena.c | 177 +++++++++++++++++++++++++++--------
kernel/bpf/core.c | 5 +
7 files changed, 191 insertions(+), 47 deletions(-)
create mode 100644 include/linux/bpf_defs.h
diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 75e6c078e0e7..6d497e720998 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -462,6 +462,20 @@ In order to accommodate such requirements, the verifier will enforce strict
PTR_TO_BTF_ID type matching if two types have the exact same name, with one
being suffixed with ``___init``.
+2.8 Accessing arena memory through kfunc arguments
+--------------------------------------------------
+
+A read or write at any address inside an arena does not oops the kernel.
+Unallocated arena pages are lazily backed by a scratch page and the
+access is reported through the program's BPF stream as an error. Only
+the BPF program's correctness is affected; the kernel itself remains
+intact.
+
+The arena is followed by a ``GUARD_SZ / 2`` (32 KiB) guard region that
+is also covered by this recovery. A kfunc handed an arena pointer may
+therefore access up to ``GUARD_SZ / 2`` past it without bounds-checking
+against the arena. Larger accesses must verify the range explicitly.
+
.. _BPF_kfunc_lifecycle_expectations:
3. kfunc lifecycle expectations
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 920a8b244d59..0d58d667fcd8 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -9,6 +9,7 @@
#include <linux/acpi.h>
#include <linux/bitfield.h>
+#include <linux/bpf_defs.h>
#include <linux/extable.h>
#include <linux/kfence.h>
#include <linux/signal.h>
@@ -416,9 +417,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
} else if (addr < PAGE_SIZE) {
msg = "NULL pointer dereference";
} else {
- if (esr_fsc_is_translation_fault(esr) &&
- kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
- return;
+ if (esr_fsc_is_translation_fault(esr)) {
+ if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+ return;
+ if (bpf_arena_handle_page_fault(addr, esr & ESR_ELx_WNR, regs->pc))
+ return;
+ }
msg = "paging request";
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f0e77e084482..b0f103ddbd23 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,6 +8,7 @@
#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/memblock.h> /* max_low_pfn */
+#include <linux/bpf_defs.h> /* bpf_arena_handle_page_fault */
#include <linux/kfence.h> /* kfence_handle_page_fault */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
@@ -688,10 +689,13 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (IS_ENABLED(CONFIG_EFI))
efi_crash_gracefully_on_page_fault(address);
- /* Only not-present faults should be handled by KFENCE. */
- if (!(error_code & X86_PF_PROT) &&
- kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
- return;
+ /* Only not-present faults should be handled by KFENCE or BPF arena. */
+ if (!(error_code & X86_PF_PROT)) {
+ if (kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
+ return;
+ if (bpf_arena_handle_page_fault(address, error_code & X86_PF_WRITE, regs->ip))
+ return;
+ }
oops:
/*
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0136a108d083..831996c411cf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -6,6 +6,7 @@
#include <uapi/linux/bpf.h>
#include <uapi/linux/filter.h>
+#include <linux/bpf_defs.h>
#include <crypto/sha2.h>
#include <linux/workqueue.h>
diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
new file mode 100644
index 000000000000..2185cd3966d4
--- /dev/null
+++ b/include/linux/bpf_defs.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Subset of bpf.h declarations, split out so files that need only these
+ * declarations can avoid bpf.h's full include cost.
+ */
+#ifndef _LINUX_BPF_DEFS_H
+#define _LINUX_BPF_DEFS_H
+
+#ifdef CONFIG_BPF_SYSCALL
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
+#else
+static inline bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
+ unsigned long fault_ip)
+{
+ return false;
+}
+#endif
+
+#endif /* _LINUX_BPF_DEFS_H */
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 08d008cc471e..1c0b87ecc817 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -53,6 +53,7 @@ struct bpf_arena {
u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
+ struct page *scratch_page;
struct range_tree rt;
/* protects rt */
rqspinlock_t spinlock;
@@ -118,6 +119,11 @@ struct apply_range_data {
int i;
};
+struct clear_range_data {
+ struct llist_head *free_pages;
+ struct page *scratch_page;
+};
+
static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
struct apply_range_data *d = data;
@@ -144,33 +150,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
flush_cache_vmap(start, start + size);
}
-static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
{
+ struct clear_range_data *d = data;
pte_t old_pte;
struct page *page;
- /* sanity check */
- old_pte = ptep_get(pte);
+ /*
+ * Pairs with ptep_try_set() in the kernel-fault scratch installer.
+ * Both sides must be atomic.
+ */
+ old_pte = ptep_get_and_clear(&init_mm, addr, pte);
if (pte_none(old_pte) || !pte_present(old_pte))
- return 0; /* nothing to do */
+ return 0;
page = pte_page(old_pte);
if (WARN_ON_ONCE(!page))
return -EINVAL;
- pte_clear(&init_mm, addr, pte);
+ /*
+ * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
+ * scratches its PTE. A later bpf_arena_free_pages() over that range walks
+ * here. Without the skip, scratch_page would be freed.
+ */
+ if (page == d->scratch_page)
+ return 0;
+
+ __llist_add(&page->pcp_llist, d->free_pages);
+ return 0;
+}
- /* Add page to the list so it is freed later */
- if (free_pages)
- __llist_add(&page->pcp_llist, free_pages);
+static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ struct page *scratch_page = data;
+ if (!pte_none(ptep_get(pte)))
+ return 0;
+ /*
+ * Best-effort install. ptep_try_set() returns false only if another
+ * installer (real allocation or concurrent fault) won the cmpxchg.
+ * Their PTE is already valid, so the access retry succeeds.
+ *
+ * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just
+ * cause one extra re-fault through this same path.
+ */
+ ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL));
return 0;
}
static int populate_pgtable_except_pte(struct bpf_arena *arena)
{
+ /* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+ SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
}
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
@@ -221,22 +253,29 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
init_irq_work(&arena->free_irq, arena_free_irq);
INIT_WORK(&arena->free_work, arena_free_worker);
bpf_map_init_from_attr(&arena->map, attr);
+
+ err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
+ if (err)
+ goto err_free_arena;
+
range_tree_init(&arena->rt);
err = range_tree_set(&arena->rt, 0, attr->max_entries);
- if (err) {
- bpf_map_area_free(arena);
- goto err;
- }
+ if (err)
+ goto err_free_scratch;
mutex_init(&arena->lock);
raw_res_spin_lock_init(&arena->spinlock);
err = populate_pgtable_except_pte(arena);
- if (err) {
- range_tree_destroy(&arena->rt);
- bpf_map_area_free(arena);
- goto err;
- }
+ if (err)
+ goto err_destroy_rt;
return &arena->map;
+
+err_destroy_rt:
+ range_tree_destroy(&arena->rt);
+err_free_scratch:
+ __free_page(arena->scratch_page);
+err_free_arena:
+ bpf_map_area_free(arena);
err:
free_vm_area(kern_vm);
return ERR_PTR(err);
@@ -244,6 +283,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
+ struct bpf_arena *arena = data;
struct page *page;
pte_t pte;
@@ -251,6 +291,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
if (!pte_present(pte)) /* sanity check */
return 0;
page = pte_page(pte);
+ /*
+ * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
+ * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
+ */
+ if (page == arena->scratch_page)
+ return 0;
/*
* We do not update pte here:
* 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
@@ -286,9 +332,10 @@ static void arena_map_free(struct bpf_map *map)
* free those pages.
*/
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+ SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
free_vm_area(arena->kern_vm);
range_tree_destroy(&arena->rt);
+ __free_page(arena->scratch_page);
bpf_map_area_free(arena);
}
@@ -374,33 +421,37 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
return VM_FAULT_RETRY;
page = vmalloc_to_page((void *)kaddr);
- if (page)
+ if (page) {
+ if (page == arena->scratch_page)
+ /* BPF triggered scratch here; don't lazy-alloc over it */
+ goto out_sigsegv;
/* already have a page vmap-ed */
goto out;
+ }
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
/* User space requested to segfault when page is not allocated by bpf prog */
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
struct apply_range_data data = { .pages = &page, .i = 0 };
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
}
ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
free_pages_nolock(page, 0);
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
}
flush_vmap_cache(kaddr, PAGE_SIZE);
bpf_map_memcg_exit(old_memcg, new_memcg);
@@ -409,8 +460,9 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
vmf->page = page;
return 0;
-out_unlock_sigsegv:
+out_sigsegv_memcg:
bpf_map_memcg_exit(old_memcg, new_memcg);
+out_sigsegv:
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
return VM_FAULT_SIGSEGV;
}
@@ -668,6 +720,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
struct llist_head free_pages;
struct llist_node *pos, *t;
struct arena_free_span *s;
+ struct clear_range_data cdata;
unsigned long flags;
int ret = 0;
@@ -696,9 +749,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
range_tree_set(&arena->rt, pgoff, page_cnt);
init_llist_head(&free_pages);
+ cdata.free_pages = &free_pages;
+ cdata.scratch_page = arena->scratch_page;
/* clear ptes and collect struct pages */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &cdata);
/* drop the lock to do the tlb flush and zap pages */
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
@@ -788,6 +843,7 @@ static void arena_free_worker(struct work_struct *work)
struct arena_free_span *s;
u64 arena_vm_start, user_vm_start;
struct llist_head free_pages;
+ struct clear_range_data cdata;
struct page *page;
unsigned long full_uaddr;
long kaddr, page_cnt, pgoff;
@@ -801,6 +857,8 @@ static void arena_free_worker(struct work_struct *work)
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
init_llist_head(&free_pages);
+ cdata.free_pages = &free_pages;
+ cdata.scratch_page = arena->scratch_page;
arena_vm_start = bpf_arena_get_kern_vm_start(arena);
user_vm_start = bpf_arena_get_user_vm_start(arena);
@@ -813,7 +871,7 @@ static void arena_free_worker(struct work_struct *work)
/* clear ptes and collect pages in free_pages llist */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &cdata);
range_tree_set(&arena->rt, pgoff, page_cnt);
}
@@ -928,23 +986,12 @@ static int __init kfunc_init(void)
}
late_initcall(kfunc_init);
-void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
+ unsigned long addr, unsigned long fault_ip)
{
struct bpf_stream_stage ss;
- struct bpf_prog *prog;
u64 user_vm_start;
- /*
- * The RCU read lock is held to safely traverse the latch tree, but we
- * don't need its protection when accessing the prog, since it will not
- * disappear while we are handling the fault.
- */
- rcu_read_lock();
- prog = bpf_prog_ksym_find(fault_ip);
- rcu_read_unlock();
- if (!prog)
- return;
-
/* Use main prog for stream access */
prog = prog->aux->main_prog_aux->prog;
@@ -957,3 +1004,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo
bpf_stream_dump_stack(ss);
}));
}
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
+{
+ struct bpf_arena *arena;
+ struct bpf_prog *prog;
+ unsigned long kbase;
+ unsigned long page_addr = addr & PAGE_MASK;
+
+ prog = bpf_prog_find_from_stack();
+ if (!prog)
+ return false;
+
+ arena = prog->aux->arena;
+ /* a prog not using arena may be on stack, so arena can be NULL */
+ if (!arena)
+ return false;
+
+ kbase = bpf_arena_get_kern_vm_start(arena);
+
+ /*
+ * Recovery covers the 4 GiB mappable band plus the upper half-guard.
+ * Lower guard is unreachable from kfuncs; an address there indicates
+ * a different bug class - leave it to the regular kernel oops path.
+ */
+ if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2)
+ return false;
+
+ apply_to_page_range(&init_mm, page_addr, PAGE_SIZE,
+ apply_range_set_scratch_cb, arena->scratch_page);
+ flush_vmap_cache(page_addr, PAGE_SIZE);
+ __bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip);
+ return true;
+}
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+ struct bpf_prog *prog;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it will not
+ * disappear while we are handling the fault.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(fault_ip);
+ rcu_read_unlock();
+ if (!prog)
+ return;
+ __bpf_prog_report_arena_violation(prog, write, addr, fault_ip);
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 066b86e7233c..fa368d8920d9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3290,6 +3290,11 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
return 0;
}
+__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
+ unsigned long fault_ip)
+{
+ return false;
+}
#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
--
2.54.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox