* [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache
@ 2018-06-15 10:53 Vivek Gautam
[not found] ` <20180615105329.26800-1-vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-10-23 4:15 ` Tomasz Figa
0 siblings, 2 replies; 19+ messages in thread
From: Vivek Gautam @ 2018-06-15 10:53 UTC (permalink / raw)
To: will.deacon, robin.murphy, joro
Cc: linux-arm-kernel, iommu, linux-kernel, linux-arm-msm, pdaly,
Vivek Gautam
Qualcomm SoCs have an additional level of cache called the
System cache or Last level cache[1]. This cache sits right
before the DDR, and is tightly coupled with the memory
controller.
The cache is available to all the clients present in the
SoC system. The clients request their slices from this system
cache, make it active, and can then start using it. For these
clients with smmu, to start using the system cache for
dma buffers and related page tables [2], few of the memory
attributes need to be set accordingly.
This change makes the related memory Outer-Shareable, and
updates the MAIR with necessary protection.
The MAIR attribute requirements are:
Inner Cacheability = 0
Outer Cacheability = 1, Write-Back Write Allocate
Outer Shareability = 1
This change is a realisation of the following changes
from downstream msm-4.9:
iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT
iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT
[1] https://patchwork.kernel.org/patch/10422531/
[2] https://patchwork.kernel.org/patch/10302791/
Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
---
drivers/iommu/arm-smmu.c | 14 ++++++++++++++
drivers/iommu/io-pgtable-arm.c | 24 +++++++++++++++++++-----
drivers/iommu/io-pgtable.h | 4 ++++
include/linux/iommu.h | 4 ++++
4 files changed, 41 insertions(+), 5 deletions(-)
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index f7a96bcf94a6..8058e7205034 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -249,6 +249,7 @@ struct arm_smmu_domain {
struct mutex init_mutex; /* Protects smmu pointer */
spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */
struct iommu_domain domain;
+ bool has_sys_cache;
};
struct arm_smmu_option_prop {
@@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
+ if (smmu_domain->has_sys_cache)
+ pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
smmu_domain->smmu = smmu;
pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
@@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
case DOMAIN_ATTR_NESTING:
*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
return 0;
+ case DOMAIN_ATTR_USE_SYS_CACHE:
+ *((int *)data) = smmu_domain->has_sys_cache;
+ return 0;
default:
return -ENODEV;
}
@@ -1506,6 +1512,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+ case DOMAIN_ATTR_USE_SYS_CACHE:
+ if (smmu_domain->smmu) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ if (*((int *)data))
+ smmu_domain->has_sys_cache = true;
+ break;
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 010a254305dd..b2aee1828524 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -169,9 +169,11 @@
#define ARM_LPAE_MAIR_ATTR_DEVICE 0x04
#define ARM_LPAE_MAIR_ATTR_NC 0x44
#define ARM_LPAE_MAIR_ATTR_WBRWA 0xff
+#define ARM_LPAE_MAIR_ATTR_SYS_CACHE 0xf4
#define ARM_LPAE_MAIR_ATTR_IDX_NC 0
#define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1
#define ARM_LPAE_MAIR_ATTR_IDX_DEV 2
+#define ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE 3
/* IOPTE accessors */
#define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d))
@@ -442,6 +444,10 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
else if (prot & IOMMU_CACHE)
pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+ else if (prot & IOMMU_SYS_CACHE)
+ pte |= (ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE
+ << ARM_LPAE_PTE_ATTRINDX_SHIFT);
+
} else {
pte = ARM_LPAE_PTE_HAP_FAULT;
if (prot & IOMMU_READ)
@@ -771,7 +777,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
u64 reg;
struct arm_lpae_io_pgtable *data;
- if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA))
+ if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA |
+ IO_PGTABLE_QUIRK_SYS_CACHE))
return NULL;
data = arm_lpae_alloc_pgtable(cfg);
@@ -779,9 +786,14 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
return NULL;
/* TCR */
- reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
- (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) |
- (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
+ if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) {
+ reg = (ARM_LPAE_TCR_SH_OS << ARM_LPAE_TCR_SH0_SHIFT) |
+ (ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN0_SHIFT);
+ } else {
+ reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
+ (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT);
+ }
+ reg |= (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
switch (ARM_LPAE_GRANULE(data)) {
case SZ_4K:
@@ -833,7 +845,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
(ARM_LPAE_MAIR_ATTR_WBRWA
<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
(ARM_LPAE_MAIR_ATTR_DEVICE
- << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) |
+ (ARM_LPAE_MAIR_ATTR_SYS_CACHE
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE));
cfg->arm_lpae_s1_cfg.mair[0] = reg;
cfg->arm_lpae_s1_cfg.mair[1] = 0;
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index 2df79093cad9..b5a398380e9f 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -71,12 +71,16 @@ struct io_pgtable_cfg {
* be accessed by a fully cache-coherent IOMMU or CPU (e.g. for a
* software-emulated IOMMU), such that pagetable updates need not
* be treated as explicit DMA data.
+ *
+ * IO_PGTABLE_QUIRK_SYS_CACHE: Override the attributes set in TCR for
+ * the page table walker when using system cache.
*/
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS BIT(1)
#define IO_PGTABLE_QUIRK_TLBI_ON_MAP BIT(2)
#define IO_PGTABLE_QUIRK_ARM_MTK_4GB BIT(3)
#define IO_PGTABLE_QUIRK_NO_DMA BIT(4)
+ #define IO_PGTABLE_QUIRK_SYS_CACHE BIT(5)
unsigned long quirks;
unsigned long pgsize_bitmap;
unsigned int ias;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 19938ee6eb31..dacb9648e9b3 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -41,6 +41,9 @@
* if the IOMMU page table format is equivalent.
*/
#define IOMMU_PRIV (1 << 5)
+/* Use last level cache available with few architectures */
+#define IOMMU_SYS_CACHE (1 << 6)
+
struct iommu_ops;
struct iommu_group;
@@ -124,6 +127,7 @@ enum iommu_attr {
DOMAIN_ATTR_FSL_PAMU_ENABLE,
DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_NESTING, /* two stages of translation */
+ DOMAIN_ATTR_USE_SYS_CACHE,
DOMAIN_ATTR_MAX,
};
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
^ permalink raw reply related [flat|nested] 19+ messages in thread[parent not found: <20180615105329.26800-1-vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>]
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache [not found] ` <20180615105329.26800-1-vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org> @ 2018-06-15 16:52 ` Will Deacon [not found] ` <20180615165232.GE2202-5wv7dgnIgG8@public.gmane.org> 0 siblings, 1 reply; 19+ messages in thread From: Will Deacon @ 2018-06-15 16:52 UTC (permalink / raw) To: Vivek Gautam Cc: pdaly-sgV2jX0FEOL9JmXXK+q4OQ, linux-arm-msm-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r Hi Vivek, On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote: > Qualcomm SoCs have an additional level of cache called as > System cache or Last level cache[1]. This cache sits right > before the DDR, and is tightly coupled with the memory > controller. > The cache is available to all the clients present in the > SoC system. The clients request their slices from this system > cache, make it active, and can then start using it. For these > clients with smmu, to start using the system cache for > dma buffers and related page tables [2], few of the memory > attributes need to be set accordingly. > This change makes the related memory Outer-Shareable, and > updates the MAIR with necessary protection. > > The MAIR attribute requirements are: > Inner Cacheablity = 0 > Outer Cacheablity = 1, Write-Back Write Allocate > Outer Shareablity = 1 Hmm, so is this cache coherent with the CPU or not? Why don't normal non-cacheable mappings allocated in the LLC by default? 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c > index f7a96bcf94a6..8058e7205034 100644 > --- a/drivers/iommu/arm-smmu.c > +++ b/drivers/iommu/arm-smmu.c > @@ -249,6 +249,7 @@ struct arm_smmu_domain { > struct mutex init_mutex; /* Protects smmu pointer */ > spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ > struct iommu_domain domain; > + bool has_sys_cache; > }; > > struct arm_smmu_option_prop { > @@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, > > if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) > pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA; > + if (smmu_domain->has_sys_cache) > + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE; > > smmu_domain->smmu = smmu; > pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); > @@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, > case DOMAIN_ATTR_NESTING: > *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); > return 0; > + case DOMAIN_ATTR_USE_SYS_CACHE: > + *((int *)data) = smmu_domain->has_sys_cache; > + return 0; I really don't like exposing this to clients directly like this, particularly as there aren't any in-tree users. I would prefer that we provide a way for the io-pgtable code to have its MAIR values overridden so that all non-coherent DMA ends up using the system cache. Will ^ permalink raw reply [flat|nested] 19+ messages in thread
[parent not found: <20180615165232.GE2202-5wv7dgnIgG8@public.gmane.org>]
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache [not found] ` <20180615165232.GE2202-5wv7dgnIgG8@public.gmane.org> @ 2018-06-15 17:12 ` Jordan Crouse 2018-06-19 8:34 ` Vivek Gautam 1 sibling, 0 replies; 19+ messages in thread From: Jordan Crouse @ 2018-06-15 17:12 UTC (permalink / raw) To: Will Deacon Cc: pdaly-sgV2jX0FEOL9JmXXK+q4OQ, linux-arm-msm-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r On Fri, Jun 15, 2018 at 05:52:32PM +0100, Will Deacon wrote: > Hi Vivek, > > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote: > > Qualcomm SoCs have an additional level of cache called as > > System cache or Last level cache[1]. This cache sits right > > before the DDR, and is tightly coupled with the memory > > controller. > > The cache is available to all the clients present in the > > SoC system. The clients request their slices from this system > > cache, make it active, and can then start using it. For these > > clients with smmu, to start using the system cache for > > dma buffers and related page tables [2], few of the memory > > attributes need to be set accordingly. > > This change makes the related memory Outer-Shareable, and > > updates the MAIR with necessary protection. > > > > The MAIR attribute requirements are: > > Inner Cacheablity = 0 > > Outer Cacheablity = 1, Write-Back Write Allocate > > Outer Shareablity = 1 > > Hmm, so is this cache coherent with the CPU or not? Why don't normal > non-cacheable mappings allocated in the LLC by default? 
> > > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c > > index f7a96bcf94a6..8058e7205034 100644 > > --- a/drivers/iommu/arm-smmu.c > > +++ b/drivers/iommu/arm-smmu.c > > @@ -249,6 +249,7 @@ struct arm_smmu_domain { > > struct mutex init_mutex; /* Protects smmu pointer */ > > spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ > > struct iommu_domain domain; > > + bool has_sys_cache; > > }; > > > > struct arm_smmu_option_prop { > > @@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, > > > > if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) > > pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA; > > + if (smmu_domain->has_sys_cache) > > + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE; > > > > smmu_domain->smmu = smmu; > > pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); > > @@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, > > case DOMAIN_ATTR_NESTING: > > *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); > > return 0; > > + case DOMAIN_ATTR_USE_SYS_CACHE: > > + *((int *)data) = smmu_domain->has_sys_cache; > > + return 0; > > I really don't like exposing this to clients directly like this, > particularly as there aren't any in-tree users. I would prefer that we > provide a way for the io-pgtable code to have its MAIR values overridden > so that all non-coherent DMA ends up using the system cache. FWIW here is a future in-tree user for LLC: https://patchwork.freedesktop.org/series/40545/ Specifically: https://patchwork.freedesktop.org/patch/212400/ Jordan -- The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache [not found] ` <20180615165232.GE2202-5wv7dgnIgG8@public.gmane.org> 2018-06-15 17:12 ` Jordan Crouse @ 2018-06-19 8:34 ` Vivek Gautam [not found] ` <CAFp+6iFm29ufb2Pr7Gb-2O_aN3GQLH4rcyWhbQGZ3QiwCC8vPg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> 1 sibling, 1 reply; 19+ messages in thread From: Vivek Gautam @ 2018-06-19 8:34 UTC (permalink / raw) To: Will Deacon Cc: pdaly-sgV2jX0FEOL9JmXXK+q4OQ, linux-arm-msm, open list, list-Y9sIeH5OGRo@public.gmane.org:IOMMU DRIVERS <iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org>, Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>, , linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r Hi Will, On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org> wrote: > Hi Vivek, > > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote: >> Qualcomm SoCs have an additional level of cache called as >> System cache or Last level cache[1]. This cache sits right >> before the DDR, and is tightly coupled with the memory >> controller. >> The cache is available to all the clients present in the >> SoC system. The clients request their slices from this system >> cache, make it active, and can then start using it. For these >> clients with smmu, to start using the system cache for >> dma buffers and related page tables [2], few of the memory >> attributes need to be set accordingly. >> This change makes the related memory Outer-Shareable, and >> updates the MAIR with necessary protection. >> >> The MAIR attribute requirements are: >> Inner Cacheablity = 0 >> Outer Cacheablity = 1, Write-Back Write Allocate >> Outer Shareablity = 1 > > Hmm, so is this cache coherent with the CPU or not? Thanks for reviewing. Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable. The different masters such as GPU as able to allocated and activate a slice in this Last Level Cache. 
> Why don't normal > non-cacheable mappings allocated in the LLC by default? Sorry, I couldn't fully understand your question here. Few of the masters on qcom socs are not io-coherent, so for them the IC has to be marked as 0. But they are able to use the LLC with OC marked as 1. Handling the IO-coherency is possibly a separate change to address? > >> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c >> index f7a96bcf94a6..8058e7205034 100644 >> --- a/drivers/iommu/arm-smmu.c >> +++ b/drivers/iommu/arm-smmu.c >> @@ -249,6 +249,7 @@ struct arm_smmu_domain { >> struct mutex init_mutex; /* Protects smmu pointer */ >> spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ >> struct iommu_domain domain; >> + bool has_sys_cache; >> }; >> >> struct arm_smmu_option_prop { >> @@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, >> >> if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) >> pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA; >> + if (smmu_domain->has_sys_cache) >> + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE; >> >> smmu_domain->smmu = smmu; >> pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); >> @@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, >> case DOMAIN_ATTR_NESTING: >> *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); >> return 0; >> + case DOMAIN_ATTR_USE_SYS_CACHE: >> + *((int *)data) = smmu_domain->has_sys_cache; >> + return 0; > > I really don't like exposing this to clients directly like this, > particularly as there aren't any in-tree users. I would prefer that we > provide a way for the io-pgtable code to have its MAIR values overridden > so that all non-coherent DMA ends up using the system cache. >From the way it looks from the users of LLC (as also pointed to by Jordan), the masters have to request and activate their slices in the cache, and then they can start using it. Before that the transaction don't go through LLC. 
But I will try to find out more on this. Thanks & Regards Vivek > > Will > -- > To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 19+ messages in thread
[parent not found: <CAFp+6iFm29ufb2Pr7Gb-2O_aN3GQLH4rcyWhbQGZ3QiwCC8vPg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache [not found] ` <CAFp+6iFm29ufb2Pr7Gb-2O_aN3GQLH4rcyWhbQGZ3QiwCC8vPg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> @ 2018-06-27 16:37 ` Will Deacon [not found] ` <20180627163749.GA8729-5wv7dgnIgG8@public.gmane.org> 2018-09-20 11:41 ` Vivek Gautam 0 siblings, 2 replies; 19+ messages in thread From: Will Deacon @ 2018-06-27 16:37 UTC (permalink / raw) To: Vivek Gautam Cc: pdaly-sgV2jX0FEOL9JmXXK+q4OQ, linux-arm-msm, open list, list-Y9sIeH5OGRo@public.gmane.org:IOMMU DRIVERS <iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org>, Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>, , linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r Hi Vivek, On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote: > On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org> wrote: > > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote: > >> Qualcomm SoCs have an additional level of cache called as > >> System cache or Last level cache[1]. This cache sits right > >> before the DDR, and is tightly coupled with the memory > >> controller. > >> The cache is available to all the clients present in the > >> SoC system. The clients request their slices from this system > >> cache, make it active, and can then start using it. For these > >> clients with smmu, to start using the system cache for > >> dma buffers and related page tables [2], few of the memory > >> attributes need to be set accordingly. > >> This change makes the related memory Outer-Shareable, and > >> updates the MAIR with necessary protection. > >> > >> The MAIR attribute requirements are: > >> Inner Cacheablity = 0 > >> Outer Cacheablity = 1, Write-Back Write Allocate > >> Outer Shareablity = 1 > > > > Hmm, so is this cache coherent with the CPU or not? > > Thanks for reviewing. > Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable. 
> The different masters such as GPU as able to allocated and activate a slice > in this Last Level Cache. What I mean is, for example, if the CPU writes some data using Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient Read/Write-allocate and a device reads that data using your MAIR encoding above, is the device guaranteed to see the CPU writes after the CPU has executed a DSB instruction? I don't think so, because the ARM ARM would say that there's a mismatch on the Inner Cacheability attribute. > > Why don't normal > > non-cacheable mappings allocated in the LLC by default? > > Sorry, I couldn't fully understand your question here. > Few of the masters on qcom socs are not io-coherent, so for them > the IC has to be marked as 0. By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero so I don't understand the problem. What goes wrong if non-coherent devices use your MAIR encoding for their DMA buffers? > But they are able to use the LLC with OC marked as 1. The issue here is that whatever attributes we put in the SMMU need to align with the attributes used by the CPU in order to avoid introducing mismatched aliases. Currently, we support three types of mapping in the SMMU: 1. DMA non-coherent (e.g. "dma-coherent" is not set on the device) Normal, Inner Shareable, Inner/Outer Non-Cacheable 2. DMA coherent (e.g. "dma-coherent" is set on the device) [IOMMU_CACHE] Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient Read/Write-allocate 3. MMIO (e.g. MSI doorbell) [IOMMU_MMIO] Device-nGnRE (Outer Shareable) So either you override one of these types (I was suggesting (1)) or you need to create a new memory type, along with the infrastructure for it to be recognised on a per-device basis and used by the DMA API so that we don't get mismatched aliases on the CPU. Will ^ permalink raw reply [flat|nested] 19+ messages in thread
[parent not found: <20180627163749.GA8729-5wv7dgnIgG8@public.gmane.org>]
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache [not found] ` <20180627163749.GA8729-5wv7dgnIgG8@public.gmane.org> @ 2018-07-24 9:43 ` Vivek Gautam [not found] ` <CAFp+6iHnA1Jj8wKO08YYEBKVF2_3oEuOQOcW2boL=AYZ9+b=UA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> 0 siblings, 1 reply; 19+ messages in thread From: Vivek Gautam @ 2018-07-24 9:43 UTC (permalink / raw) To: Will Deacon Cc: linux-arm-msm, list-Y9sIeH5OGRo@public.gmane.org:IOMMU DRIVERS <iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org>, Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>, , open list, Linux ARM, pdaly-sgV2jX0FEOL9JmXXK+q4OQ Hi Will, On Wed, Jun 27, 2018 at 10:07 PM, Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org> wrote: > Hi Vivek, > > On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote: >> On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org> wrote: >> > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote: >> >> Qualcomm SoCs have an additional level of cache called as >> >> System cache or Last level cache[1]. This cache sits right >> >> before the DDR, and is tightly coupled with the memory >> >> controller. >> >> The cache is available to all the clients present in the >> >> SoC system. The clients request their slices from this system >> >> cache, make it active, and can then start using it. For these >> >> clients with smmu, to start using the system cache for >> >> dma buffers and related page tables [2], few of the memory >> >> attributes need to be set accordingly. >> >> This change makes the related memory Outer-Shareable, and >> >> updates the MAIR with necessary protection. >> >> >> >> The MAIR attribute requirements are: >> >> Inner Cacheablity = 0 >> >> Outer Cacheablity = 1, Write-Back Write Allocate >> >> Outer Shareablity = 1 >> > >> > Hmm, so is this cache coherent with the CPU or not? >> >> Thanks for reviewing. 
>> Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable. >> The different masters such as GPU as able to allocated and activate a slice >> in this Last Level Cache. > > What I mean is, for example, if the CPU writes some data using Normal, Inner > Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient > Read/Write-allocate and a device reads that data using your MAIR encoding > above, is the device guaranteed to see the CPU writes after the CPU has > executed a DSB instruction? > > I don't think so, because the ARM ARM would say that there's a mismatch on > the Inner Cacheability attribute. > >> > Why don't normal >> > non-cacheable mappings allocated in the LLC by default? >> >> Sorry, I couldn't fully understand your question here. >> Few of the masters on qcom socs are not io-coherent, so for them >> the IC has to be marked as 0. > > By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero > so I don't understand the problem. What goes wrong if non-coherent devices > use your MAIR encoding for their DMA buffers? > >> But they are able to use the LLC with OC marked as 1. > > The issue here is that whatever attributes we put in the SMMU need to align > with the attributes used by the CPU in order to avoid introducing mismatched > aliases. Currently, we support three types of mapping in the SMMU: > > 1. DMA non-coherent (e.g. "dma-coherent" is not set on the device) > Normal, Inner Shareable, Inner/Outer Non-Cacheable > > 2. DMA coherent (e.g. "dma-coherent" is set on the device) [IOMMU_CACHE] > Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer > Write-back, Non-transient Read/Write-allocate > > 3. MMIO (e.g. 
MSI doorbell) [IOMMU_MMIO] > Device-nGnRE (Outer Shareable) > > So either you override one of these types (I was suggesting (1)) or you need > to create a new memory type, along with the infrastructure for it to be > recognised on a per-device basis and used by the DMA API so that we don't > get mismatched aliases on the CPU. My apologies for delay in responding to this thread. I have been digging and getting in touch with internal tech teams to get more information on this. I will update as soon as I have enough details. Thanks. Best regards Vivek > > Will > _______________________________________________ > iommu mailing list > iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org > https://lists.linuxfoundation.org/mailman/listinfo/iommu -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 19+ messages in thread
[parent not found: <CAFp+6iHnA1Jj8wKO08YYEBKVF2_3oEuOQOcW2boL=AYZ9+b=UA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache [not found] ` <CAFp+6iHnA1Jj8wKO08YYEBKVF2_3oEuOQOcW2boL=AYZ9+b=UA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> @ 2018-09-19 19:35 ` Jordan Crouse 2018-09-20 10:25 ` Vivek Gautam 0 siblings, 1 reply; 19+ messages in thread From: Jordan Crouse @ 2018-09-19 19:35 UTC (permalink / raw) To: Vivek Gautam Cc: pdaly-sgV2jX0FEOL9JmXXK+q4OQ, linux-arm-msm, Will Deacon, open list, list-Y9sIeH5OGRo@public.gmane.org:IOMMU DRIVERS <iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org>, Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>, , Linux ARM On Tue, Jul 24, 2018 at 03:13:37PM +0530, Vivek Gautam wrote: > Hi Will, > > > On Wed, Jun 27, 2018 at 10:07 PM, Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org> wrote: > > Hi Vivek, > > > > On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote: > >> On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org> wrote: > >> > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote: > >> >> Qualcomm SoCs have an additional level of cache called as > >> >> System cache or Last level cache[1]. This cache sits right > >> >> before the DDR, and is tightly coupled with the memory > >> >> controller. > >> >> The cache is available to all the clients present in the > >> >> SoC system. The clients request their slices from this system > >> >> cache, make it active, and can then start using it. For these > >> >> clients with smmu, to start using the system cache for > >> >> dma buffers and related page tables [2], few of the memory > >> >> attributes need to be set accordingly. > >> >> This change makes the related memory Outer-Shareable, and > >> >> updates the MAIR with necessary protection. 
> >> >> > >> >> The MAIR attribute requirements are: > >> >> Inner Cacheablity = 0 > >> >> Outer Cacheablity = 1, Write-Back Write Allocate > >> >> Outer Shareablity = 1 > >> > > >> > Hmm, so is this cache coherent with the CPU or not? > >> > >> Thanks for reviewing. > >> Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable. > >> The different masters such as GPU as able to allocated and activate a slice > >> in this Last Level Cache. > > > > What I mean is, for example, if the CPU writes some data using Normal, Inner > > Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient > > Read/Write-allocate and a device reads that data using your MAIR encoding > > above, is the device guaranteed to see the CPU writes after the CPU has > > executed a DSB instruction? > > > > I don't think so, because the ARM ARM would say that there's a mismatch on > > the Inner Cacheability attribute. > > > >> > Why don't normal > >> > non-cacheable mappings allocated in the LLC by default? > >> > >> Sorry, I couldn't fully understand your question here. > >> Few of the masters on qcom socs are not io-coherent, so for them > >> the IC has to be marked as 0. > > > > By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero > > so I don't understand the problem. What goes wrong if non-coherent devices > > use your MAIR encoding for their DMA buffers? > > > >> But they are able to use the LLC with OC marked as 1. > > > > The issue here is that whatever attributes we put in the SMMU need to align > > with the attributes used by the CPU in order to avoid introducing mismatched > > aliases. Currently, we support three types of mapping in the SMMU: > > > > 1. DMA non-coherent (e.g. "dma-coherent" is not set on the device) > > Normal, Inner Shareable, Inner/Outer Non-Cacheable > > > > 2. DMA coherent (e.g. 
"dma-coherent" is set on the device) [IOMMU_CACHE] > > Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer > > Write-back, Non-transient Read/Write-allocate > > > > 3. MMIO (e.g. MSI doorbell) [IOMMU_MMIO] > > Device-nGnRE (Outer Shareable) > > > > So either you override one of these types (I was suggesting (1)) or you need > > to create a new memory type, along with the infrastructure for it to be > > recognised on a per-device basis and used by the DMA API so that we don't > > get mismatched aliases on the CPU. > > My apologies for delay in responding to this thread. > I have been digging and getting in touch with internal tech teams > to get more information on this. I will update as soon as I have enough > details. > Thanks. Hi Vivek. I want to revive this discussion. I believe that Andy has pulled in the base LLCC support so this the remaining dependency we need to implement the LLCC in the GPU driver. Thanks, Jordan -- The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache 2018-09-19 19:35 ` Jordan Crouse @ 2018-09-20 10:25 ` Vivek Gautam 0 siblings, 0 replies; 19+ messages in thread From: Vivek Gautam @ 2018-09-20 10:25 UTC (permalink / raw) To: Will Deacon, linux-arm-msm, list@263.net:IOMMU DRIVERS <iommu@lists.linux-foundation.org>, Joerg Roedel <joro@8bytes.org>,, open list, Linux ARM, pdaly On Thu, Sep 20, 2018 at 1:05 AM Jordan Crouse <jcrouse@codeaurora.org> wrote: > > On Tue, Jul 24, 2018 at 03:13:37PM +0530, Vivek Gautam wrote: > > Hi Will, > > > > > > On Wed, Jun 27, 2018 at 10:07 PM, Will Deacon <will.deacon@arm.com> wrote: > > > Hi Vivek, > > > > > > On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote: > > >> On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote: > > >> > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote: > > >> >> Qualcomm SoCs have an additional level of cache called as > > >> >> System cache or Last level cache[1]. This cache sits right > > >> >> before the DDR, and is tightly coupled with the memory > > >> >> controller. > > >> >> The cache is available to all the clients present in the > > >> >> SoC system. The clients request their slices from this system > > >> >> cache, make it active, and can then start using it. For these > > >> >> clients with smmu, to start using the system cache for > > >> >> dma buffers and related page tables [2], few of the memory > > >> >> attributes need to be set accordingly. > > >> >> This change makes the related memory Outer-Shareable, and > > >> >> updates the MAIR with necessary protection. > > >> >> > > >> >> The MAIR attribute requirements are: > > >> >> Inner Cacheablity = 0 > > >> >> Outer Cacheablity = 1, Write-Back Write Allocate > > >> >> Outer Shareablity = 1 > > >> > > > >> > Hmm, so is this cache coherent with the CPU or not? > > >> > > >> Thanks for reviewing. > > >> Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable. 
> > >> The different masters such as GPU as able to allocated and activate a slice > > >> in this Last Level Cache. > > > > > > What I mean is, for example, if the CPU writes some data using Normal, Inner > > > Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient > > > Read/Write-allocate and a device reads that data using your MAIR encoding > > > above, is the device guaranteed to see the CPU writes after the CPU has > > > executed a DSB instruction? > > > > > > I don't think so, because the ARM ARM would say that there's a mismatch on > > > the Inner Cacheability attribute. > > > > > >> > Why don't normal > > >> > non-cacheable mappings allocated in the LLC by default? > > >> > > >> Sorry, I couldn't fully understand your question here. > > >> Few of the masters on qcom socs are not io-coherent, so for them > > >> the IC has to be marked as 0. > > > > > > By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero > > > so I don't understand the problem. What goes wrong if non-coherent devices > > > use your MAIR encoding for their DMA buffers? > > > > > >> But they are able to use the LLC with OC marked as 1. > > > > > > The issue here is that whatever attributes we put in the SMMU need to align > > > with the attributes used by the CPU in order to avoid introducing mismatched > > > aliases. Currently, we support three types of mapping in the SMMU: > > > > > > 1. DMA non-coherent (e.g. "dma-coherent" is not set on the device) > > > Normal, Inner Shareable, Inner/Outer Non-Cacheable > > > > > > 2. DMA coherent (e.g. "dma-coherent" is set on the device) [IOMMU_CACHE] > > > Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer > > > Write-back, Non-transient Read/Write-allocate > > > > > > 3. MMIO (e.g. 
MSI doorbell) [IOMMU_MMIO] > > > Device-nGnRE (Outer Shareable) > > > > > > So either you override one of these types (I was suggesting (1)) or you need > > > to create a new memory type, along with the infrastructure for it to be > > > recognised on a per-device basis and used by the DMA API so that we don't > > > get mismatched aliases on the CPU. > > > > My apologies for delay in responding to this thread. > > I have been digging and getting in touch with internal tech teams > > to get more information on this. I will update as soon as I have enough > > details. > > Thanks. > > Hi Vivek. I want to revive this discussion. I believe that Andy has pulled > in the base LLCC support so this the remaining dependency we need to implement > the LLCC in the GPU driver. Hi Jordan, yes I was in process of gathering information about the system cache usage and the attributes configurations required when devices use system cache. Let me respond to Will's questions now. Thanks Vivek > > Thanks, > Jordan > > -- > The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, > a Linux Foundation Collaborative Project > _______________________________________________ > iommu mailing list > iommu@lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/iommu -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache 2018-06-27 16:37 ` Will Deacon [not found] ` <20180627163749.GA8729-5wv7dgnIgG8@public.gmane.org> @ 2018-09-20 11:41 ` Vivek Gautam 2018-09-28 13:19 ` Will Deacon 1 sibling, 1 reply; 19+ messages in thread From: Vivek Gautam @ 2018-09-20 11:41 UTC (permalink / raw) To: Will Deacon Cc: pdaly, linux-arm-msm, open list, list@263.net:IOMMU DRIVERS <iommu@lists.linux-foundation.org>, Joerg Roedel <joro@8bytes.org>,, Linux ARM, Jordan Crouse, pratikp Hi Will, On Wed, Jun 27, 2018 at 10:07 PM Will Deacon <will.deacon@arm.com> wrote: > > Hi Vivek, > > On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote: > > On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote: > > > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote: > > >> Qualcomm SoCs have an additional level of cache called as > > >> System cache or Last level cache[1]. This cache sits right > > >> before the DDR, and is tightly coupled with the memory > > >> controller. > > >> The cache is available to all the clients present in the > > >> SoC system. The clients request their slices from this system > > >> cache, make it active, and can then start using it. For these > > >> clients with smmu, to start using the system cache for > > >> dma buffers and related page tables [2], few of the memory > > >> attributes need to be set accordingly. > > >> This change makes the related memory Outer-Shareable, and > > >> updates the MAIR with necessary protection. > > >> > > >> The MAIR attribute requirements are: > > >> Inner Cacheablity = 0 > > >> Outer Cacheablity = 1, Write-Back Write Allocate > > >> Outer Shareablity = 1 > > > > > > Hmm, so is this cache coherent with the CPU or not? > > > > Thanks for reviewing. > > Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable. > > The different masters such as GPU as able to allocated and activate a slice > > in this Last Level Cache. 
> > What I mean is, for example, if the CPU writes some data using Normal, Inner > Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient > Read/Write-allocate and a device reads that data using your MAIR encoding > above, is the device guaranteed to see the CPU writes after the CPU has > executed a DSB instruction? No, these MAIR configurations don't guarantee that devices will have coherent view of what CPU writes. Not all devices can snoop into CPU caches (only IO-Coherent devices can). So a normal cached memory configuration in CPU MMU tables, and SMMU page tables is valid only for few devices that are IO-coherent. Moreover, CPU can lookup in system cache, and so do all devices; allocation will depend on h/w configurations and memory attributes. So anything that CPU caches in system cache will be coherently visible to devices. > > I don't think so, because the ARM ARM would say that there's a mismatch on > the Inner Cacheability attribute. > > > > Why don't normal > > > non-cacheable mappings allocated in the LLC by default? > > > > Sorry, I couldn't fully understand your question here. > > Few of the masters on qcom socs are not io-coherent, so for them > > the IC has to be marked as 0. > > By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero > so I don't understand the problem. What goes wrong if non-coherent devices > use your MAIR encoding for their DMA buffers? > > > But they are able to use the LLC with OC marked as 1. > > The issue here is that whatever attributes we put in the SMMU need to align > with the attributes used by the CPU in order to avoid introducing mismatched > aliases. Not really, right? Devices can use Inner non-Cacheable, Outer-cacheable (IC=0, OC=1) to allocate into the system cache (as these devices don't want to allocate in their inner caches), and the CPU will have a coherent view of these buffers/page-tables. This should be a normal cached non-IO-Coherent memory. 
But anything that CPU writes using Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient Read/Write-allocate, may not be visible to the device. Also added Jordan, and Pratik to this thread. Thanks & Regards Vivek > Currently, we support three types of mapping in the SMMU: > > 1. DMA non-coherent (e.g. "dma-coherent" is not set on the device) > Normal, Inner Shareable, Inner/Outer Non-Cacheable > > 2. DMA coherent (e.g. "dma-coherent" is set on the device) [IOMMU_CACHE] > Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer > Write-back, Non-transient Read/Write-allocate > > 3. MMIO (e.g. MSI doorbell) [IOMMU_MMIO] > Device-nGnRE (Outer Shareable) > > So either you override one of these types (I was suggesting (1)) or you need > to create a new memory type, along with the infrastructure for it to be > recognised on a per-device basis and used by the DMA API so that we don't > get mismatched aliases on the CPU. > > Will > _______________________________________________ > iommu mailing list > iommu@lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/iommu -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache 2018-09-20 11:41 ` Vivek Gautam @ 2018-09-28 13:19 ` Will Deacon 2018-10-05 5:25 ` Vivek Gautam 0 siblings, 1 reply; 19+ messages in thread From: Will Deacon @ 2018-09-28 13:19 UTC (permalink / raw) To: Vivek Gautam Cc: pdaly, linux-arm-msm, open list, list@263.net:IOMMU DRIVERS <iommu@lists.linux-foundation.org>, Joerg Roedel <joro@8bytes.org>,, Linux ARM, Jordan Crouse, pratikp Hi Vivek, On Thu, Sep 20, 2018 at 05:11:53PM +0530, Vivek Gautam wrote: > On Wed, Jun 27, 2018 at 10:07 PM Will Deacon <will.deacon@arm.com> wrote: > > On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote: > > > On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote: > > > > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote: > > > >> Qualcomm SoCs have an additional level of cache called as > > > >> System cache or Last level cache[1]. This cache sits right > > > >> before the DDR, and is tightly coupled with the memory > > > >> controller. > > > >> The cache is available to all the clients present in the > > > >> SoC system. The clients request their slices from this system > > > >> cache, make it active, and can then start using it. For these > > > >> clients with smmu, to start using the system cache for > > > >> dma buffers and related page tables [2], few of the memory > > > >> attributes need to be set accordingly. > > > >> This change makes the related memory Outer-Shareable, and > > > >> updates the MAIR with necessary protection. > > > >> > > > >> The MAIR attribute requirements are: > > > >> Inner Cacheablity = 0 > > > >> Outer Cacheablity = 1, Write-Back Write Allocate > > > >> Outer Shareablity = 1 > > > > > > > > Hmm, so is this cache coherent with the CPU or not? > > > > > > Thanks for reviewing. > > > Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable. 
> > > The different masters such as GPU as able to allocated and activate a slice > > > in this Last Level Cache. > > > > What I mean is, for example, if the CPU writes some data using Normal, Inner > > Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient > > Read/Write-allocate and a device reads that data using your MAIR encoding > > above, is the device guaranteed to see the CPU writes after the CPU has > > executed a DSB instruction? > > No, these MAIR configurations don't guarantee that devices will have > coherent view > of what CPU writes. Not all devices can snoop into CPU caches (only IO-Coherent > devices can). > So a normal cached memory configuration in CPU MMU tables, and SMMU page tables > is valid only for few devices that are IO-coherent. > > Moreover, CPU can lookup in system cache, and so do all devices; > allocation will depend on h/w configurations and memory attributes. > So anything that CPU caches in system cache will be coherently visible > to devices. > > > > > I don't think so, because the ARM ARM would say that there's a mismatch on > > the Inner Cacheability attribute. > > > > > > Why don't normal > > > > non-cacheable mappings allocated in the LLC by default? > > > > > > Sorry, I couldn't fully understand your question here. > > > Few of the masters on qcom socs are not io-coherent, so for them > > > the IC has to be marked as 0. > > > > By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero > > so I don't understand the problem. What goes wrong if non-coherent devices > > use your MAIR encoding for their DMA buffers? > > > > > But they are able to use the LLC with OC marked as 1. > > > > The issue here is that whatever attributes we put in the SMMU need to align > > with the attributes used by the CPU in order to avoid introducing mismatched > > aliases. > > Not really, right? 
> Devices can use Inner non-Cacheable, Outer-cacheable (IC=0, OC=1) to allocate > into the system cache (as these devices don't want to allocate in > their inner caches), > and the CPU will have a coherent view of these buffers/page-tables. > This should be > a normal cached non-IO-Coherent memory. > > But anything that CPU writes using Normal, Inner Shareable, > Inner/Outer Cacheable, > Inner/Outer Write-back, Non-transient Read/Write-allocate, may not be visible > to the device. > > Also added Jordan, and Pratik to this thread. Sorry, but I'm still completely confused. If you only end up with mismatched memory attributes in the non-coherent case, then why can't you just follow my suggestion to override the attributes for non-coherent mappings on your SoC? Will ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache 2018-09-28 13:19 ` Will Deacon @ 2018-10-05 5:25 ` Vivek Gautam 0 siblings, 0 replies; 19+ messages in thread From: Vivek Gautam @ 2018-10-05 5:25 UTC (permalink / raw) To: Will Deacon Cc: pdaly-sgV2jX0FEOL9JmXXK+q4OQ, linux-arm-msm, open list, list-Y9sIeH5OGRo@public.gmane.org:IOMMU DRIVERS <iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org>, Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>, , pratikp-sgV2jX0FEOL9JmXXK+q4OQ, Linux ARM Hi Will, On Fri, Sep 28, 2018 at 6:49 PM Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org> wrote: > > Hi Vivek, > > On Thu, Sep 20, 2018 at 05:11:53PM +0530, Vivek Gautam wrote: > > On Wed, Jun 27, 2018 at 10:07 PM Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org> wrote: > > > On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote: > > > > On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org> wrote: > > > > > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote: > > > > >> Qualcomm SoCs have an additional level of cache called as > > > > >> System cache or Last level cache[1]. This cache sits right > > > > >> before the DDR, and is tightly coupled with the memory > > > > >> controller. > > > > >> The cache is available to all the clients present in the > > > > >> SoC system. The clients request their slices from this system > > > > >> cache, make it active, and can then start using it. For these > > > > >> clients with smmu, to start using the system cache for > > > > >> dma buffers and related page tables [2], few of the memory > > > > >> attributes need to be set accordingly. > > > > >> This change makes the related memory Outer-Shareable, and > > > > >> updates the MAIR with necessary protection. 
> > > > >> > > > > >> The MAIR attribute requirements are: > > > > >> Inner Cacheablity = 0 > > > > >> Outer Cacheablity = 1, Write-Back Write Allocate > > > > >> Outer Shareablity = 1 > > > > > > > > > > Hmm, so is this cache coherent with the CPU or not? > > > > > > > > Thanks for reviewing. > > > > Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable. > > > > The different masters such as GPU as able to allocated and activate a slice > > > > in this Last Level Cache. > > > > > > What I mean is, for example, if the CPU writes some data using Normal, Inner > > > Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient > > > Read/Write-allocate and a device reads that data using your MAIR encoding > > > above, is the device guaranteed to see the CPU writes after the CPU has > > > executed a DSB instruction? > > > > No, these MAIR configurations don't guarantee that devices will have > > coherent view > > of what CPU writes. Not all devices can snoop into CPU caches (only IO-Coherent > > devices can). > > So a normal cached memory configuration in CPU MMU tables, and SMMU page tables > > is valid only for few devices that are IO-coherent. > > > > Moreover, CPU can lookup in system cache, and so do all devices; > > allocation will depend on h/w configurations and memory attributes. > > So anything that CPU caches in system cache will be coherently visible > > to devices. > > > > > > > > I don't think so, because the ARM ARM would say that there's a mismatch on > > > the Inner Cacheability attribute. > > > > > > > > Why don't normal > > > > > non-cacheable mappings allocated in the LLC by default? > > > > > > > > Sorry, I couldn't fully understand your question here. > > > > Few of the masters on qcom socs are not io-coherent, so for them > > > > the IC has to be marked as 0. > > > > > > By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero > > > so I don't understand the problem. 
What goes wrong if non-coherent devices > > > use your MAIR encoding for their DMA buffers? > > > > > > > But they are able to use the LLC with OC marked as 1. > > > > > > The issue here is that whatever attributes we put in the SMMU need to align > > > with the attributes used by the CPU in order to avoid introducing mismatched > > > aliases. > > > > Not really, right? > > Devices can use Inner non-Cacheable, Outer-cacheable (IC=0, OC=1) to allocate > > into the system cache (as these devices don't want to allocate in > > their inner caches), > > and the CPU will have a coherent view of these buffers/page-tables. > > This should be > > a normal cached non-IO-Coherent memory. > > > > But anything that CPU writes using Normal, Inner Shareable, > > Inner/Outer Cacheable, > > Inner/Outer Write-back, Non-transient Read/Write-allocate, may not be visible > > to the device. > > > > Also added Jordan, and Pratik to this thread. > > Sorry, but I'm still completely confused. > > If you only end up with mismatched memory attributes in the non-coherent > case, then why can't you just follow my suggestion to override the > attributes for non-coherent mappings on your SoC? As seen in downstream kernels there are few non-coherent devices which would not want to allocate in system cache, and therefore would want Inner/Outer non-cached memory. So, we may want to either override the attributes per-device, or as you suggested we may want to introduce another memory type 'sys-cached' that can be added with its separate infra. Thanks. [...] Best regards Vivek -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache 2018-06-15 10:53 [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache Vivek Gautam [not found] ` <20180615105329.26800-1-vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org> @ 2018-10-23 4:15 ` Tomasz Figa 2018-10-24 17:48 ` Vivek Gautam 1 sibling, 1 reply; 19+ messages in thread From: Tomasz Figa @ 2018-10-23 4:15 UTC (permalink / raw) To: Vivek Gautam Cc: Will Deacon, Robin Murphy, list@263.net:IOMMU DRIVERS <iommu@lists.linux-foundation.org>, Joerg Roedel <joro@8bytes.org>,, pdaly, linux-arm-msm, Linux Kernel Mailing List Hi Vivek, On Fri, Jun 15, 2018 at 7:53 PM Vivek Gautam <vivek.gautam@codeaurora.org> wrote: > > Qualcomm SoCs have an additional level of cache called as > System cache or Last level cache[1]. This cache sits right > before the DDR, and is tightly coupled with the memory > controller. > The cache is available to all the clients present in the > SoC system. The clients request their slices from this system > cache, make it active, and can then start using it. For these > clients with smmu, to start using the system cache for > dma buffers and related page tables [2], few of the memory > attributes need to be set accordingly. > This change makes the related memory Outer-Shareable, and > updates the MAIR with necessary protection. > > The MAIR attribute requirements are: > Inner Cacheablity = 0 > Outer Cacheablity = 1, Write-Back Write Allocate > Outer Shareablity = 1 > > This change is a realisation of following changes > from downstream msm-4.9: > iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT > iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT Would you be able to provide links to those 2 downstream changes? 
> > [1] https://patchwork.kernel.org/patch/10422531/ > [2] https://patchwork.kernel.org/patch/10302791/ > > Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org> > --- > drivers/iommu/arm-smmu.c | 14 ++++++++++++++ > drivers/iommu/io-pgtable-arm.c | 24 +++++++++++++++++++----- > drivers/iommu/io-pgtable.h | 4 ++++ > include/linux/iommu.h | 4 ++++ > 4 files changed, 41 insertions(+), 5 deletions(-) > > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c > index f7a96bcf94a6..8058e7205034 100644 > --- a/drivers/iommu/arm-smmu.c > +++ b/drivers/iommu/arm-smmu.c > @@ -249,6 +249,7 @@ struct arm_smmu_domain { > struct mutex init_mutex; /* Protects smmu pointer */ > spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ > struct iommu_domain domain; > + bool has_sys_cache; > }; > > struct arm_smmu_option_prop { > @@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, > > if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) > pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA; > + if (smmu_domain->has_sys_cache) > + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE; > > smmu_domain->smmu = smmu; > pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); > @@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, > case DOMAIN_ATTR_NESTING: > *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); > return 0; > + case DOMAIN_ATTR_USE_SYS_CACHE: > + *((int *)data) = smmu_domain->has_sys_cache; > + return 0; > default: > return -ENODEV; > } > @@ -1506,6 +1512,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, > smmu_domain->stage = ARM_SMMU_DOMAIN_S1; > > break; > + case DOMAIN_ATTR_USE_SYS_CACHE: > + if (smmu_domain->smmu) { > + ret = -EPERM; > + goto out_unlock; > + } > + if (*((int *)data)) > + smmu_domain->has_sys_cache = true; > + break; > default: > ret = -ENODEV; > } > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c > index 
010a254305dd..b2aee1828524 100644 > --- a/drivers/iommu/io-pgtable-arm.c > +++ b/drivers/iommu/io-pgtable-arm.c > @@ -169,9 +169,11 @@ > #define ARM_LPAE_MAIR_ATTR_DEVICE 0x04 > #define ARM_LPAE_MAIR_ATTR_NC 0x44 > #define ARM_LPAE_MAIR_ATTR_WBRWA 0xff > +#define ARM_LPAE_MAIR_ATTR_SYS_CACHE 0xf4 > #define ARM_LPAE_MAIR_ATTR_IDX_NC 0 > #define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1 > #define ARM_LPAE_MAIR_ATTR_IDX_DEV 2 > +#define ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE 3 > > /* IOPTE accessors */ > #define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d)) > @@ -442,6 +444,10 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > else if (prot & IOMMU_CACHE) > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > + else if (prot & IOMMU_SYS_CACHE) > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE > + << ARM_LPAE_PTE_ATTRINDX_SHIFT); > + Okay, so we favor the full caching (IC WBRWA, OC WBRWA, OS) first if requested or otherwise try to use system cache (IC NC, OC WBWA?, OS)? Sounds fine. nit: Unnecessary blank line. 
> } else { > pte = ARM_LPAE_PTE_HAP_FAULT; > if (prot & IOMMU_READ) > @@ -771,7 +777,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > u64 reg; > struct arm_lpae_io_pgtable *data; > > - if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA)) > + if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA | > + IO_PGTABLE_QUIRK_SYS_CACHE)) > return NULL; > > data = arm_lpae_alloc_pgtable(cfg); > @@ -779,9 +786,14 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > return NULL; > > /* TCR */ > - reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) | > - (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) | > - (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT); > + if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) { > + reg = (ARM_LPAE_TCR_SH_OS << ARM_LPAE_TCR_SH0_SHIFT) | > + (ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN0_SHIFT); Contrary to the earlier code which favored IC/IS if possible, here we seem to disable IC/IS if the SYS_CACHE quirk is requested, regardless of whether it could still be desirable to use IC/IS. Perhaps rather than IO_PGTABLE_QUIRK_SYS_CACHE, we need something like IO_PGTABLE_QUIRK_NO_INNER_CACHE? > + } else { > + reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) | > + (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT); > + } > + reg |= (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT); > [keeping the context] Best regards, Tomasz ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache 2018-10-23 4:15 ` Tomasz Figa @ 2018-10-24 17:48 ` Vivek Gautam 0 siblings, 0 replies; 19+ messages in thread From: Vivek Gautam @ 2018-10-24 17:48 UTC (permalink / raw) To: Tomasz Figa Cc: pdaly, linux-arm-msm, Will Deacon, open list, list@263.net:IOMMU DRIVERS <iommu@lists.linux-foundation.org>, Joerg Roedel <joro@8bytes.org>,, Robin Murphy, Linux ARM Hi Tomasz, On Tue, Oct 23, 2018 at 9:45 AM Tomasz Figa <tfiga@chromium.org> wrote: > > Hi Vivek, > > On Fri, Jun 15, 2018 at 7:53 PM Vivek Gautam > <vivek.gautam@codeaurora.org> wrote: > > > > Qualcomm SoCs have an additional level of cache called as > > System cache or Last level cache[1]. This cache sits right > > before the DDR, and is tightly coupled with the memory > > controller. > > The cache is available to all the clients present in the > > SoC system. The clients request their slices from this system > > cache, make it active, and can then start using it. For these > > clients with smmu, to start using the system cache for > > dma buffers and related page tables [2], few of the memory > > attributes need to be set accordingly. > > This change makes the related memory Outer-Shareable, and > > updates the MAIR with necessary protection. > > > > The MAIR attribute requirements are: > > Inner Cacheablity = 0 > > Outer Cacheablity = 1, Write-Back Write Allocate > > Outer Shareablity = 1 > > > > This change is a realisation of following changes > > from downstream msm-4.9: > > iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT > > iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT > > Would you be able to provide links to those 2 downstream changes? Thanks for the review. 
Here are the links for the changes: [1] -- iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT [2] -- iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT [1] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=bf762276796e79ca90014992f4d9da5593fa7d51 [2] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=d4c72c413ea27c43f60825193d4de9cb8ffd9602 > > > > > [1] https://patchwork.kernel.org/patch/10422531/ > > [2] https://patchwork.kernel.org/patch/10302791/ > > > > Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org> > > --- > > drivers/iommu/arm-smmu.c | 14 ++++++++++++++ > > drivers/iommu/io-pgtable-arm.c | 24 +++++++++++++++++++----- > > drivers/iommu/io-pgtable.h | 4 ++++ > > include/linux/iommu.h | 4 ++++ > > 4 files changed, 41 insertions(+), 5 deletions(-) > > > > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c > > index f7a96bcf94a6..8058e7205034 100644 > > --- a/drivers/iommu/arm-smmu.c > > +++ b/drivers/iommu/arm-smmu.c > > @@ -249,6 +249,7 @@ struct arm_smmu_domain { > > struct mutex init_mutex; /* Protects smmu pointer */ > > spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ > > struct iommu_domain domain; > > + bool has_sys_cache; > > }; > > > > struct arm_smmu_option_prop { > > @@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, > > > > if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) > > pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA; > > + if (smmu_domain->has_sys_cache) > > + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE; > > > > smmu_domain->smmu = smmu; > > pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); > > @@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, > > case DOMAIN_ATTR_NESTING: > > *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); > > return 0; > > + case DOMAIN_ATTR_USE_SYS_CACHE: > > + *((int *)data) = smmu_domain->has_sys_cache; > > + return 
0; > > default: > > return -ENODEV; > > } > > @@ -1506,6 +1512,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, > > smmu_domain->stage = ARM_SMMU_DOMAIN_S1; > > > > break; > > + case DOMAIN_ATTR_USE_SYS_CACHE: > > + if (smmu_domain->smmu) { > > + ret = -EPERM; > > + goto out_unlock; > > + } > > + if (*((int *)data)) > > + smmu_domain->has_sys_cache = true; > > + break; > > default: > > ret = -ENODEV; > > } > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c > > index 010a254305dd..b2aee1828524 100644 > > --- a/drivers/iommu/io-pgtable-arm.c > > +++ b/drivers/iommu/io-pgtable-arm.c > > @@ -169,9 +169,11 @@ > > #define ARM_LPAE_MAIR_ATTR_DEVICE 0x04 > > #define ARM_LPAE_MAIR_ATTR_NC 0x44 > > #define ARM_LPAE_MAIR_ATTR_WBRWA 0xff > > +#define ARM_LPAE_MAIR_ATTR_SYS_CACHE 0xf4 > > #define ARM_LPAE_MAIR_ATTR_IDX_NC 0 > > #define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1 > > #define ARM_LPAE_MAIR_ATTR_IDX_DEV 2 > > +#define ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE 3 > > > > /* IOPTE accessors */ > > #define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d)) > > @@ -442,6 +444,10 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > > else if (prot & IOMMU_CACHE) > > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > + else if (prot & IOMMU_SYS_CACHE) > > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE > > + << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > + > > Okay, so we favor the full caching (IC WBRWA, OC WBRWA, OS) first if > requested or otherwise try to use system cache (IC NC, OC WBWA?, OS)? > Sounds fine. That's right. When devices can use full caching, the system cache will also be used. Otherwise when devices can't allocate/use inner caches, system cache (Outer WBRWA, and IC NC) is the option. > > nit: Unnecessary blank line. will remove it. 
> > > } else { > > pte = ARM_LPAE_PTE_HAP_FAULT; > > if (prot & IOMMU_READ) > > @@ -771,7 +777,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > > u64 reg; > > struct arm_lpae_io_pgtable *data; > > > > - if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA)) > > + if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA | > > + IO_PGTABLE_QUIRK_SYS_CACHE)) > > return NULL; > > > > data = arm_lpae_alloc_pgtable(cfg); > > @@ -779,9 +786,14 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > > return NULL; > > > > /* TCR */ > > - reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) | > > - (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) | > > - (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT); > > + if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) { > > + reg = (ARM_LPAE_TCR_SH_OS << ARM_LPAE_TCR_SH0_SHIFT) | > > + (ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN0_SHIFT); > > Contrary to the earlier code which favored IC/IS if possible, here we > seem to disable IC/IS if the SYS_CACHE quirk is requested, regardless > of whether it could still be desirable to use IC/IS. Perhaps rather > than > IO_PGTABLE_QUIRK_SYS_CACHE, we need something like > IO_PGTABLE_QUIRK_NO_INNER_CACHE? IIUC, with QUIRK_NO_INNER_CACHE, we would explicitly handle the case of non I/O-coherent devices that can't use inner caches. Other devices will have coherent view of inner as well as outer caches (including system cache). Do I understand you correctly? 
Best regards Vivek > > > + } else { > > + reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) | > > + (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT); > > + } > > + reg |= (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT); > > > > [keeping the context] > > Best regards, > Tomasz > _______________________________________________ > iommu mailing list > iommu@lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/iommu -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache
@ 2018-12-04 11:01 Vivek Gautam
2018-12-04 15:21 ` Robin Murphy
0 siblings, 1 reply; 19+ messages in thread
From: Vivek Gautam @ 2018-12-04 11:01 UTC (permalink / raw)
To: joro-zLv9SwRftAIdnm+yROfE0A, will.deacon-5wv7dgnIgG8,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Cc: pdaly-sgV2jX0FEOL9JmXXK+q4OQ,
linux-arm-msm-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, robin.murphy-5wv7dgnIgG8,
pratikp-sgV2jX0FEOL9JmXXK+q4OQ
Qualcomm SoCs have an additional level of cache called the
System cache, aka. Last level cache (LLC). This cache sits right
before the DDR, and is tightly coupled with the memory controller.
The cache is available to all the clients present in the SoC system.
The clients request their slices from this system cache, make it
active, and can then start using it.
For these clients with SMMU, to start using the system cache for
buffers and related page tables [1], memory attributes need to be
set accordingly.
This change updates the MAIR and TCR configurations with correct
attributes to use this system cache.
To explain a little about memory attribute requirements here:
Non-coherent I/O devices can't look up into inner caches. However,
coherent I/O devices can. But both can allocate in the system cache
based on system policy and configured memory attributes in page
tables.
CPUs can access both inner and outer caches (including system cache,
aka. Last level cache), and can allocate into system cache too
based on memory attributes, and system policy.
Further looking at memory types, we have following -
a) Normal uncached :- MAIR 0x44, inner non-cacheable,
outer non-cacheable;
b) Normal cached :- MAIR 0xff, inner read write-back non-transient,
outer read write-back non-transient;
attribute setting for coherent I/O devices.
and, for non-coherent I/O devices that can allocate in system cache
another type gets added -
c) Normal sys-cached/non-inner-cached :-
MAIR 0xf4, inner non-cacheable,
outer read write-back non-transient
So, CPU will automatically use the system cache for memory marked as
normal cached. The normal sys-cached is downgraded to normal non-cached
memory for CPUs.
Coherent I/O devices can use system cache by marking the memory as
normal cached.
Non-coherent I/O devices, to use system cache, should mark the memory as
normal sys-cached in page tables.
This change is a realisation of following changes
from downstream msm-4.9:
iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT[2]
iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT[3]
[1] https://patchwork.kernel.org/patch/10302791/
[2] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=bf762276796e79ca90014992f4d9da5593fa7d51
[3] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=d4c72c413ea27c43f60825193d4de9cb8ffd9602
Signed-off-by: Vivek Gautam <vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
---
Changes since v1:
- Addressed Tomasz's comments for basing the change on
"NO_INNER_CACHE" concept for non-coherent I/O devices
rather than capturing "SYS_CACHE". This is to indicate
clearly the intent of non-coherent I/O devices that
can't access inner caches.
drivers/iommu/arm-smmu.c | 15 +++++++++++++++
drivers/iommu/dma-iommu.c | 3 +++
drivers/iommu/io-pgtable-arm.c | 22 +++++++++++++++++-----
drivers/iommu/io-pgtable.h | 5 +++++
include/linux/iommu.h | 3 +++
5 files changed, 43 insertions(+), 5 deletions(-)
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index ba18d89d4732..047f7ff95b0d 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -255,6 +255,7 @@ struct arm_smmu_domain {
struct mutex init_mutex; /* Protects smmu pointer */
spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */
struct iommu_domain domain;
+ bool no_inner_cache;
};
struct arm_smmu_option_prop {
@@ -897,6 +898,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
+ if (smmu_domain->no_inner_cache)
+ pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NO_IC;
+
smmu_domain->smmu = smmu;
pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops) {
@@ -1579,6 +1583,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
case DOMAIN_ATTR_NESTING:
*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
return 0;
+ case DOMAIN_ATTR_NO_IC:
+ *((int *)data) = smmu_domain->no_inner_cache;
+ return 0;
default:
return -ENODEV;
}
@@ -1619,6 +1626,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+ case DOMAIN_ATTR_NO_IC:
+ if (smmu_domain->smmu) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ if (*((int *)data))
+ smmu_domain->no_inner_cache = true;
+ break;
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index d1b04753b204..87c3d59c4a6c 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -354,6 +354,9 @@ int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
{
int prot = coherent ? IOMMU_CACHE : 0;
+ if (!coherent && (attrs & DOMAIN_ATTR_NO_IC))
+ prot |= IOMMU_NO_IC;
+
if (attrs & DMA_ATTR_PRIVILEGED)
prot |= IOMMU_PRIV;
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 237cacd4a62b..815b86067bcc 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -168,10 +168,12 @@
#define ARM_LPAE_MAIR_ATTR_MASK 0xff
#define ARM_LPAE_MAIR_ATTR_DEVICE 0x04
#define ARM_LPAE_MAIR_ATTR_NC 0x44
+#define ARM_LPAE_MAIR_ATTR_NO_IC 0xf4
#define ARM_LPAE_MAIR_ATTR_WBRWA 0xff
#define ARM_LPAE_MAIR_ATTR_IDX_NC 0
#define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1
#define ARM_LPAE_MAIR_ATTR_IDX_DEV 2
+#define ARM_LPAE_MAIR_ATTR_IDX_NO_IC 3
/* IOPTE accessors */
#define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d))
@@ -443,6 +445,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
else if (prot & IOMMU_CACHE)
pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+ else if (prot & IOMMU_NO_IC)
+ pte |= (ARM_LPAE_MAIR_ATTR_IDX_NO_IC
+ << ARM_LPAE_PTE_ATTRINDX_SHIFT);
} else {
pte = ARM_LPAE_PTE_HAP_FAULT;
if (prot & IOMMU_READ)
@@ -780,7 +785,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
struct arm_lpae_io_pgtable *data;
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA |
- IO_PGTABLE_QUIRK_NON_STRICT))
+ IO_PGTABLE_QUIRK_NON_STRICT |
+ IO_PGTABLE_QUIRK_NO_IC))
return NULL;
data = arm_lpae_alloc_pgtable(cfg);
@@ -788,9 +794,13 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
return NULL;
/* TCR */
- reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
- (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) |
- (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
+ if (cfg->quirks & IO_PGTABLE_QUIRK_NO_IC)
+ reg = ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN0_SHIFT;
+ else
+ reg = ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT;
+
+ reg |= (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
+ (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
switch (ARM_LPAE_GRANULE(data)) {
case SZ_4K:
@@ -842,7 +852,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
(ARM_LPAE_MAIR_ATTR_WBRWA
<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
(ARM_LPAE_MAIR_ATTR_DEVICE
- << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) |
+ (ARM_LPAE_MAIR_ATTR_NO_IC
+ << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_NO_IC));
cfg->arm_lpae_s1_cfg.mair[0] = reg;
cfg->arm_lpae_s1_cfg.mair[1] = 0;
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index 47d5ae559329..450a4adf9052 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -75,6 +75,10 @@ struct io_pgtable_cfg {
* IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs
* on unmap, for DMA domains using the flush queue mechanism for
* delayed invalidation.
+ *
+ * IO_PGTABLE_QUIRK_NO_IC: Override the attributes to use only the outer
+ * cache, and not inner cache for non-coherent devices doing normal
+ * sys-cached memory.
*/
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS BIT(1)
@@ -82,6 +86,7 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_MTK_4GB BIT(3)
#define IO_PGTABLE_QUIRK_NO_DMA BIT(4)
#define IO_PGTABLE_QUIRK_NON_STRICT BIT(5)
+ #define IO_PGTABLE_QUIRK_NO_IC BIT(6)
unsigned long quirks;
unsigned long pgsize_bitmap;
unsigned int ias;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a1d28f42cb77..c30ee7f8d82d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -41,6 +41,8 @@
* if the IOMMU page table format is equivalent.
*/
#define IOMMU_PRIV (1 << 5)
+/* Don't use inner caches */
+#define IOMMU_NO_IC (1 << 6)
struct iommu_ops;
struct iommu_group;
@@ -125,6 +127,7 @@ enum iommu_attr {
DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_NESTING, /* two stages of translation */
DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
+ DOMAIN_ATTR_NO_IC,
DOMAIN_ATTR_MAX,
};
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
^ permalink raw reply related [flat|nested] 19+ messages in thread* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache 2018-12-04 11:01 Vivek Gautam @ 2018-12-04 15:21 ` Robin Murphy [not found] ` <99682bd2-1ca6-406a-890c-b34c25a1b2b3-5wv7dgnIgG8@public.gmane.org> 0 siblings, 1 reply; 19+ messages in thread From: Robin Murphy @ 2018-12-04 15:21 UTC (permalink / raw) To: Vivek Gautam, joro, will.deacon, iommu Cc: robdclark, linux-kernel, tfiga, jcrouse, linux-arm-msm, pdaly, pratikp On 04/12/2018 11:01, Vivek Gautam wrote: > Qualcomm SoCs have an additional level of cache called as > System cache, aka. Last level cache (LLC). This cache sits right > before the DDR, and is tightly coupled with the memory controller. > The cache is available to all the clients present in the SoC system. > The clients request their slices from this system cache, make it > active, and can then start using it. > For these clients with smmu, to start using the system cache for > buffers and, related page tables [1], memory attributes need to be > set accordingly. > This change updates the MAIR and TCR configurations with correct > attributes to use this system cache. > > To explain a little about memory attribute requirements here: > > Non-coherent I/O devices can't look-up into inner caches. However, > coherent I/O devices can. But both can allocate in the system cache > based on system policy and configured memory attributes in page > tables. > CPUs can access both inner and outer caches (including system cache, > aka. Last level cache), and can allocate into system cache too > based on memory attributes, and system policy. > > Further looking at memory types, we have following - > a) Normal uncached :- MAIR 0x44, inner non-cacheable, > outer non-cacheable; > b) Normal cached :- MAIR 0xff, inner read write-back non-transient, > outer read write-back non-transient; > attribute setting for coherenet I/O devices. 
> > and, for non-coherent i/o devices that can allocate in system cache > another type gets added - > c) Normal sys-cached/non-inner-cached :- > MAIR 0xf4, inner non-cacheable, > outer read write-back non-transient > > So, CPU will automatically use the system cache for memory marked as > normal cached. The normal sys-cached is downgraded to normal non-cached > memory for CPUs. > Coherent I/O devices can use system cache by marking the memory as > normal cached. > Non-coherent I/O devices, to use system cache, should mark the memory as > normal sys-cached in page tables. > > This change is a realisation of following changes > from downstream msm-4.9: > iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT[2] > iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT[3] > > [1] https://patchwork.kernel.org/patch/10302791/ > [2] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=bf762276796e79ca90014992f4d9da5593fa7d51 > [3] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=d4c72c413ea27c43f60825193d4de9cb8ffd9602 > > Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org> > --- > > Changes since v1: > - Addressed Tomasz's comments for basing the change on > "NO_INNER_CACHE" concept for non-coherent I/O devices > rather than capturing "SYS_CACHE". This is to indicate > clearly the intent of non-coherent I/O devices that > can't access inner caches. That seems backwards to me - there is already a fundamental assumption that non-coherent devices can't access caches. What we're adding here is a weird exception where they *can* use some level of cache despite still being non-coherent overall. In other words, it's not a case of downgrading coherent devices' accesses to bypass inner caches, it's upgrading non-coherent devices' accesses to hit the outer cache. 
That's certainly the understanding I got from talking with Pratik at Plumbers, and it does appear to fit with your explanation above despite the final conclusion you draw being different. I do see what Tomasz meant in terms of the TCR attributes, but what we currently do there is a little unintuitive and not at all representative of actual mapping attributes - I'll come back to that inline. > drivers/iommu/arm-smmu.c | 15 +++++++++++++++ > drivers/iommu/dma-iommu.c | 3 +++ > drivers/iommu/io-pgtable-arm.c | 22 +++++++++++++++++----- > drivers/iommu/io-pgtable.h | 5 +++++ > include/linux/iommu.h | 3 +++ > 5 files changed, 43 insertions(+), 5 deletions(-) As a minor nit, I'd prefer this as at least two patches to separate the io-pgtable changes and arm-smmu changes - basically I'd expect it to look much the same as the non-strict mode support did. > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c > index ba18d89d4732..047f7ff95b0d 100644 > --- a/drivers/iommu/arm-smmu.c > +++ b/drivers/iommu/arm-smmu.c > @@ -255,6 +255,7 @@ struct arm_smmu_domain { > struct mutex init_mutex; /* Protects smmu pointer */ > spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ > struct iommu_domain domain; > + bool no_inner_cache; Can we keep all the domain flags together please? In fact, I'd be inclined to implement an options bitmap as we do elsewhere rather than proliferate multiple bools. > }; > > struct arm_smmu_option_prop { > @@ -897,6 +898,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, > if (smmu_domain->non_strict) > pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; > > + if (smmu_domain->no_inner_cache) > + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NO_IC; Maybe we need to be a bit cleverer about setting the quirk (and/or allowing the domain attribute to be set), since depending on configuration and hardware support the domain may end up picking a stage 2 or short-descriptor format and thus being rendered unusable. 
> + > smmu_domain->smmu = smmu; > pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); > if (!pgtbl_ops) { > @@ -1579,6 +1583,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, > case DOMAIN_ATTR_NESTING: > *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); > return 0; > + case DOMAIN_ATTR_NO_IC: > + *((int *)data) = smmu_domain->no_inner_cache; > + return 0; > default: > return -ENODEV; > } > @@ -1619,6 +1626,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, > else > smmu_domain->stage = ARM_SMMU_DOMAIN_S1; > break; > + case DOMAIN_ATTR_NO_IC: > + if (smmu_domain->smmu) { > + ret = -EPERM; > + goto out_unlock; > + } > + if (*((int *)data)) > + smmu_domain->no_inner_cache = true; This makes the attribute impossible to disable again, even before the domain is initialised - is that intentional? (and if so, why?) > + break; > default: > ret = -ENODEV; > } > diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c > index d1b04753b204..87c3d59c4a6c 100644 > --- a/drivers/iommu/dma-iommu.c > +++ b/drivers/iommu/dma-iommu.c > @@ -354,6 +354,9 @@ int dma_info_to_prot(enum dma_data_direction dir, bool coherent, > { > int prot = coherent ? IOMMU_CACHE : 0; > > + if (!coherent && (attrs & DOMAIN_ATTR_NO_IC)) > + prot |= IOMMU_NO_IC; > + Erm, that's going to be a hilariously unexpected interpretation of DMA_ATTR_FORCE_CONTIGUOUS... I'm not sure it would really makes sense to expose fine-grained controls at the DMA API level anyway, given the main point is to largely abstract away the notion of caches altogether. 
> if (attrs & DMA_ATTR_PRIVILEGED) > prot |= IOMMU_PRIV; > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c > index 237cacd4a62b..815b86067bcc 100644 > --- a/drivers/iommu/io-pgtable-arm.c > +++ b/drivers/iommu/io-pgtable-arm.c > @@ -168,10 +168,12 @@ > #define ARM_LPAE_MAIR_ATTR_MASK 0xff > #define ARM_LPAE_MAIR_ATTR_DEVICE 0x04 > #define ARM_LPAE_MAIR_ATTR_NC 0x44 > +#define ARM_LPAE_MAIR_ATTR_NO_IC 0xf4 > #define ARM_LPAE_MAIR_ATTR_WBRWA 0xff > #define ARM_LPAE_MAIR_ATTR_IDX_NC 0 > #define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1 > #define ARM_LPAE_MAIR_ATTR_IDX_DEV 2 > +#define ARM_LPAE_MAIR_ATTR_IDX_NO_IC 3 > > /* IOPTE accessors */ > #define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d)) > @@ -443,6 +445,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > else if (prot & IOMMU_CACHE) > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > + else if (prot & IOMMU_NO_IC) > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_NO_IC > + << ARM_LPAE_PTE_ATTRINDX_SHIFT); > } else { > pte = ARM_LPAE_PTE_HAP_FAULT; > if (prot & IOMMU_READ) > @@ -780,7 +785,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > struct arm_lpae_io_pgtable *data; > > if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA | > - IO_PGTABLE_QUIRK_NON_STRICT)) > + IO_PGTABLE_QUIRK_NON_STRICT | > + IO_PGTABLE_QUIRK_NO_IC)) > return NULL; > > data = arm_lpae_alloc_pgtable(cfg); > @@ -788,9 +794,13 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > return NULL; > > /* TCR */ > - reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) | > - (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) | > - (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT); The subtle assumption here is that if the SMMU is coherent then these are the attributes we actually want to use, but if it's non-coherent then the interconnect should ignore them anyway so it doesn't really matter. 
Does either of those aspects hold for qcom SoCs? TBH if we're going to touch the TCR attributes at all then we should probably correct that sloppiness first - there's an occasional argument for using non-cacheable pagetables even on a coherent SMMU if reducing snoop traffic/latency on walks outweighs the cost of cache maintenance on PTE updates, but anyone thinking they can get that by overriding dma-coherent silently gets the worst of both worlds thanks to this current TCR value. > + if (cfg->quirks & IO_PGTABLE_QUIRK_NO_IC) > + reg = ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN0_SHIFT; > + else > + reg = ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT; > + > + reg |= (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) | > + (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT); > > switch (ARM_LPAE_GRANULE(data)) { > case SZ_4K: > @@ -842,7 +852,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > (ARM_LPAE_MAIR_ATTR_WBRWA > << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) | > (ARM_LPAE_MAIR_ATTR_DEVICE > - << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)); > + << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) | > + (ARM_LPAE_MAIR_ATTR_NO_IC > + << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_NO_IC)); > > cfg->arm_lpae_s1_cfg.mair[0] = reg; > cfg->arm_lpae_s1_cfg.mair[1] = 0; > diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h > index 47d5ae559329..450a4adf9052 100644 > --- a/drivers/iommu/io-pgtable.h > +++ b/drivers/iommu/io-pgtable.h > @@ -75,6 +75,10 @@ struct io_pgtable_cfg { > * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs > * on unmap, for DMA domains using the flush queue mechanism for > * delayed invalidation. > + * > + * IO_PGTABLE_QUIRK_NO_IC: Override the attributes to use only the outer > + * cache, and not inner cache for non-coherent devices doing normal > + * sys-cached memory. 
As above, mappings for non-coherent devices would never be expected to have the inner cacheable attribute anyway, so the comment doesn't really make sense. > */ > #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) > #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) > @@ -82,6 +86,7 @@ struct io_pgtable_cfg { > #define IO_PGTABLE_QUIRK_ARM_MTK_4GB BIT(3) > #define IO_PGTABLE_QUIRK_NO_DMA BIT(4) > #define IO_PGTABLE_QUIRK_NON_STRICT BIT(5) > + #define IO_PGTABLE_QUIRK_NO_IC BIT(6) > unsigned long quirks; > unsigned long pgsize_bitmap; > unsigned int ias; > diff --git a/include/linux/iommu.h b/include/linux/iommu.h > index a1d28f42cb77..c30ee7f8d82d 100644 > --- a/include/linux/iommu.h > +++ b/include/linux/iommu.h > @@ -41,6 +41,8 @@ > * if the IOMMU page table format is equivalent. > */ > #define IOMMU_PRIV (1 << 5) > +/* Don't use inner caches */ > +#define IOMMU_NO_IC (1 << 6) As it stands, this sounds like it should only make sense when combined with IOMMU_CACHE, yet the implementation only affects mappings which would otherwise be INC-ONC anyway. It could really do with having some clearer expectations set. > > struct iommu_ops; > struct iommu_group; > @@ -125,6 +127,7 @@ enum iommu_attr { > DOMAIN_ATTR_FSL_PAMUV1, > DOMAIN_ATTR_NESTING, /* two stages of translation */ > DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, > + DOMAIN_ATTR_NO_IC, At the IOMMU API level, nobody's going to have a clue what "NO_IC" means. For how non-generic a concept it really is, I'd personally name it something like DOMAIN_ATTR_QCOM_SYSTEM_CACHE. Robin. > DOMAIN_ATTR_MAX, > }; > > ^ permalink raw reply [flat|nested] 19+ messages in thread
[parent not found: <99682bd2-1ca6-406a-890c-b34c25a1b2b3-5wv7dgnIgG8@public.gmane.org>]
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache [not found] ` <99682bd2-1ca6-406a-890c-b34c25a1b2b3-5wv7dgnIgG8@public.gmane.org> @ 2018-12-07 9:24 ` Vivek Gautam 2018-12-13 3:50 ` Tomasz Figa [not found] ` <CAFp+6iE7U3HCJwHkeL9A4DXUVryt13YxDCYfBDxcSSM2Z_vqcg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> 0 siblings, 2 replies; 19+ messages in thread From: Vivek Gautam @ 2018-12-07 9:24 UTC (permalink / raw) To: Robin Murphy Cc: pdaly-sgV2jX0FEOL9JmXXK+q4OQ, linux-arm-msm, Will Deacon, open list, list-Y9sIeH5OGRo@public.gmane.org:IOMMU DRIVERS <iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org>, Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>, , pratikp-sgV2jX0FEOL9JmXXK+q4OQ Hi Robin, On Tue, Dec 4, 2018 at 8:51 PM Robin Murphy <robin.murphy-5wv7dgnIgG8@public.gmane.org> wrote: > > On 04/12/2018 11:01, Vivek Gautam wrote: > > Qualcomm SoCs have an additional level of cache called as > > System cache, aka. Last level cache (LLC). This cache sits right > > before the DDR, and is tightly coupled with the memory controller. > > The cache is available to all the clients present in the SoC system. > > The clients request their slices from this system cache, make it > > active, and can then start using it. > > For these clients with smmu, to start using the system cache for > > buffers and, related page tables [1], memory attributes need to be > > set accordingly. > > This change updates the MAIR and TCR configurations with correct > > attributes to use this system cache. > > > > To explain a little about memory attribute requirements here: > > > > Non-coherent I/O devices can't look-up into inner caches. However, > > coherent I/O devices can. But both can allocate in the system cache > > based on system policy and configured memory attributes in page > > tables. > > CPUs can access both inner and outer caches (including system cache, > > aka. 
Last level cache), and can allocate into system cache too > > based on memory attributes, and system policy. > > > > Further looking at memory types, we have following - > > a) Normal uncached :- MAIR 0x44, inner non-cacheable, > > outer non-cacheable; > > b) Normal cached :- MAIR 0xff, inner read write-back non-transient, > > outer read write-back non-transient; > > attribute setting for coherenet I/O devices. > > > > and, for non-coherent i/o devices that can allocate in system cache > > another type gets added - > > c) Normal sys-cached/non-inner-cached :- > > MAIR 0xf4, inner non-cacheable, > > outer read write-back non-transient > > > > So, CPU will automatically use the system cache for memory marked as > > normal cached. The normal sys-cached is downgraded to normal non-cached > > memory for CPUs. > > Coherent I/O devices can use system cache by marking the memory as > > normal cached. > > Non-coherent I/O devices, to use system cache, should mark the memory as > > normal sys-cached in page tables. > > > > This change is a realisation of following changes > > from downstream msm-4.9: > > iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT[2] > > iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT[3] > > > > [1] https://patchwork.kernel.org/patch/10302791/ > > [2] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=bf762276796e79ca90014992f4d9da5593fa7d51 > > [3] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=d4c72c413ea27c43f60825193d4de9cb8ffd9602 > > > > Signed-off-by: Vivek Gautam <vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org> > > --- > > > > Changes since v1: > > - Addressed Tomasz's comments for basing the change on > > "NO_INNER_CACHE" concept for non-coherent I/O devices > > rather than capturing "SYS_CACHE". This is to indicate > > clearly the intent of non-coherent I/O devices that > > can't access inner caches. 
> > That seems backwards to me - there is already a fundamental assumption > that non-coherent devices can't access caches. What we're adding here is > a weird exception where they *can* use some level of cache despite still > being non-coherent overall. > > In other words, it's not a case of downgrading coherent devices' > accesses to bypass inner caches, it's upgrading non-coherent devices' > accesses to hit the outer cache. That's certainly the understanding I > got from talking with Pratik at Plumbers, and it does appear to fit with > your explanation above despite the final conclusion you draw being > different. Thanks for the thorough review of the change. Right, I guess it's rather an upgrade for non-coherent devices to use an outer cache than a downgrade for coherent devices. > > I do see what Tomasz meant in terms of the TCR attributes, but what we > currently do there is a little unintuitive and not at all representative > of actual mapping attributes - I'll come back to that inline. > > > drivers/iommu/arm-smmu.c | 15 +++++++++++++++ > > drivers/iommu/dma-iommu.c | 3 +++ > > drivers/iommu/io-pgtable-arm.c | 22 +++++++++++++++++----- > > drivers/iommu/io-pgtable.h | 5 +++++ > > include/linux/iommu.h | 3 +++ > > 5 files changed, 43 insertions(+), 5 deletions(-) > > As a minor nit, I'd prefer this as at least two patches to separate the > io-pgtable changes and arm-smmu changes - basically I'd expect it to > look much the same as the non-strict mode support did. Sure, will split the patch. > > > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c > > index ba18d89d4732..047f7ff95b0d 100644 > > --- a/drivers/iommu/arm-smmu.c > > +++ b/drivers/iommu/arm-smmu.c > > @@ -255,6 +255,7 @@ struct arm_smmu_domain { > > struct mutex init_mutex; /* Protects smmu pointer */ > > spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ > > struct iommu_domain domain; > > + bool no_inner_cache; > > Can we keep all the domain flags together please? 
In fact, I'd be > inclined to implement an options bitmap as we do elsewhere rather than > proliferate multiple bools. Yea, changing this to bitmap makes sense. Will update this. > > > }; > > > > struct arm_smmu_option_prop { > > @@ -897,6 +898,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, > > if (smmu_domain->non_strict) > > pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; > > > > + if (smmu_domain->no_inner_cache) > > + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NO_IC; > > Maybe we need to be a bit cleverer about setting the quirk (and/or > allowing the domain attribute to be set), since depending on > configuration and hardware support the domain may end up picking a stage > 2 or short-descriptor format and thus being rendered unusable. I don't think I completely get you here. But, do you mean that to set such quirks we should first check configurations such as the domain's stage, and the format before deciding whether we want to set this or not? > > > + > > smmu_domain->smmu = smmu; > > pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); > > if (!pgtbl_ops) { > > @@ -1579,6 +1583,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, > > case DOMAIN_ATTR_NESTING: > > *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); > > return 0; > > + case DOMAIN_ATTR_NO_IC: > > + *((int *)data) = smmu_domain->no_inner_cache; > > + return 0; > > default: > > return -ENODEV; > > } > > @@ -1619,6 +1626,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, > > else > > smmu_domain->stage = ARM_SMMU_DOMAIN_S1; > > break; > > + case DOMAIN_ATTR_NO_IC: > > + if (smmu_domain->smmu) { > > + ret = -EPERM; > > + goto out_unlock; > > + } > > + if (*((int *)data)) > > + smmu_domain->no_inner_cache = true; > > This makes the attribute impossible to disable again, even before the > domain is initialized - is that intentional? (and if so, why?) Right. I should add for data = 0 as well. 
That should help to disable this attribute again. > > > + break; > > default: > > ret = -ENODEV; > > } > > diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c > > index d1b04753b204..87c3d59c4a6c 100644 > > --- a/drivers/iommu/dma-iommu.c > > +++ b/drivers/iommu/dma-iommu.c > > @@ -354,6 +354,9 @@ int dma_info_to_prot(enum dma_data_direction dir, bool coherent, > > { > > int prot = coherent ? IOMMU_CACHE : 0; > > > > + if (!coherent && (attrs & DOMAIN_ATTR_NO_IC)) > > + prot |= IOMMU_NO_IC; > > + > > Erm, that's going to be a hilariously unexpected interpretation of > DMA_ATTR_FORCE_CONTIGUOUS... Right. :) I guess i will take your suggestion to have something like DOMAIN_ATTR_QCOM_SYSTEM_CACHE. > > I'm not sure it would really makes sense to expose fine-grained controls > at the DMA API level anyway, given the main point is to largely abstract > away the notion of caches altogether. But there are DMA devices (such as video) which use DMA mapping APIs only, and which are non-coherent-upgraded-to-use-sytem-cache. Such devices can't force set IOMMU quirks unless they do iommu_get_domain_for_dev() and then set the domain attributes. Will that be better way? 
> > > if (attrs & DMA_ATTR_PRIVILEGED) > > prot |= IOMMU_PRIV; > > > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c > > index 237cacd4a62b..815b86067bcc 100644 > > --- a/drivers/iommu/io-pgtable-arm.c > > +++ b/drivers/iommu/io-pgtable-arm.c > > @@ -168,10 +168,12 @@ > > #define ARM_LPAE_MAIR_ATTR_MASK 0xff > > #define ARM_LPAE_MAIR_ATTR_DEVICE 0x04 > > #define ARM_LPAE_MAIR_ATTR_NC 0x44 > > +#define ARM_LPAE_MAIR_ATTR_NO_IC 0xf4 > > #define ARM_LPAE_MAIR_ATTR_WBRWA 0xff > > #define ARM_LPAE_MAIR_ATTR_IDX_NC 0 > > #define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1 > > #define ARM_LPAE_MAIR_ATTR_IDX_DEV 2 > > +#define ARM_LPAE_MAIR_ATTR_IDX_NO_IC 3 > > > > /* IOPTE accessors */ > > #define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d)) > > @@ -443,6 +445,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > > else if (prot & IOMMU_CACHE) > > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > + else if (prot & IOMMU_NO_IC) > > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_NO_IC > > + << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > } else { > > pte = ARM_LPAE_PTE_HAP_FAULT; > > if (prot & IOMMU_READ) > > @@ -780,7 +785,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > > struct arm_lpae_io_pgtable *data; > > > > if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA | > > - IO_PGTABLE_QUIRK_NON_STRICT)) > > + IO_PGTABLE_QUIRK_NON_STRICT | > > + IO_PGTABLE_QUIRK_NO_IC)) > > return NULL; > > > > data = arm_lpae_alloc_pgtable(cfg); > > @@ -788,9 +794,13 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > > return NULL; > > > > /* TCR */ > > - reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) | > > - (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) | > > - (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT); > > The subtle assumption here is that if the SMMU is coherent then these > are the attributes we actually want to use, but if it's 
non-coherent > then the interconnect should ignore them anyway so it doesn't really > matter. Does either of those aspects hold for qcom SoCs? >From the downstream [1] it's clear that default for smmu is set to Non-cached access. So, I don't think the interconnect helps us. OR, possibly we are just forcing these mappings be uncached. > > TBH if we're going to touch the TCR attributes at all then we should > probably correct that sloppiness first - there's an occasional argument > for using non-cacheable pagetables even on a coherent SMMU if reducing > snoop traffic/latency on walks outweighs the cost of cache maintenance > on PTE updates, but anyone thinking they can get that by overriding > dma-coherent silently gets the worst of both worlds thanks to this > current TCR value. So, what do you suggest? This is something that's smmu's implementation specific detail, not something that's going to vary from one domain to another? Isn't that right? So, in that case additional dt property can help setting a quirk? 
> > > + if (cfg->quirks & IO_PGTABLE_QUIRK_NO_IC) > > + reg = ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN0_SHIFT; > > + else > > + reg = ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT; > > + > > + reg |= (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) | > > + (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT); > > > > switch (ARM_LPAE_GRANULE(data)) { > > case SZ_4K: > > @@ -842,7 +852,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > > (ARM_LPAE_MAIR_ATTR_WBRWA > > << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) | > > (ARM_LPAE_MAIR_ATTR_DEVICE > > - << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)); > > + << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) | > > + (ARM_LPAE_MAIR_ATTR_NO_IC > > + << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_NO_IC)); > > > > cfg->arm_lpae_s1_cfg.mair[0] = reg; > > cfg->arm_lpae_s1_cfg.mair[1] = 0; > > diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h > > index 47d5ae559329..450a4adf9052 100644 > > --- a/drivers/iommu/io-pgtable.h > > +++ b/drivers/iommu/io-pgtable.h > > @@ -75,6 +75,10 @@ struct io_pgtable_cfg { > > * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs > > * on unmap, for DMA domains using the flush queue mechanism for > > * delayed invalidation. > > + * > > + * IO_PGTABLE_QUIRK_NO_IC: Override the attributes to use only the outer > > + * cache, and not inner cache for non-coherent devices doing normal > > + * sys-cached memory. > > As above, mappings for non-coherent devices would never be expected to > have the inner cacheable attribute anyway, so the comment doesn't really > make sense. Right, will update this comment. 
> > > */ > > #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) > > #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) > > @@ -82,6 +86,7 @@ struct io_pgtable_cfg { > > #define IO_PGTABLE_QUIRK_ARM_MTK_4GB BIT(3) > > #define IO_PGTABLE_QUIRK_NO_DMA BIT(4) > > #define IO_PGTABLE_QUIRK_NON_STRICT BIT(5) > > + #define IO_PGTABLE_QUIRK_NO_IC BIT(6) > > unsigned long quirks; > > unsigned long pgsize_bitmap; > > unsigned int ias; > > diff --git a/include/linux/iommu.h b/include/linux/iommu.h > > index a1d28f42cb77..c30ee7f8d82d 100644 > > --- a/include/linux/iommu.h > > +++ b/include/linux/iommu.h > > @@ -41,6 +41,8 @@ > > * if the IOMMU page table format is equivalent. > > */ > > #define IOMMU_PRIV (1 << 5) > > +/* Don't use inner caches */ > > +#define IOMMU_NO_IC (1 << 6) > > As it stands, this sounds like it should only make sense when combined > with IOMMU_CACHE, yet the implementation only affects mappings which > would otherwise be INC-ONC anyway. It could really do with having some > clearer expectations set. Yea, the comment and a change in the string for this macro too can be updated to clearly show the intent of using system cache for non-coherent devices. Will do it. > > > > > struct iommu_ops; > > struct iommu_group; > > @@ -125,6 +127,7 @@ enum iommu_attr { > > DOMAIN_ATTR_FSL_PAMUV1, > > DOMAIN_ATTR_NESTING, /* two stages of translation */ > > DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, > > + DOMAIN_ATTR_NO_IC, > > At the IOMMU API level, nobody's going to have a clue what "NO_IC" > means. For how non-generic a concept it really is, I'd personally name > it something like DOMAIN_ATTR_QCOM_SYSTEM_CACHE. Yea, will update this too. > > Robin. > > > DOMAIN_ATTR_MAX, > > }; > > > > [1] https://source.codeaurora.org/quic/la/kernel/msm-4.9/tree/drivers/iommu/io-pgtable-arm.c?h=msm-4.9#n1028 Best regards Vivek -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache 2018-12-07 9:24 ` Vivek Gautam @ 2018-12-13 3:50 ` Tomasz Figa [not found] ` <CAAFQd5C+BygjdBhBOsiBW=4kOgC0a=V8s9om6jqa6yzmn0TEWw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> [not found] ` <CAFp+6iE7U3HCJwHkeL9A4DXUVryt13YxDCYfBDxcSSM2Z_vqcg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> 1 sibling, 1 reply; 19+ messages in thread From: Tomasz Figa @ 2018-12-13 3:50 UTC (permalink / raw) To: Vivek Gautam Cc: Robin Murphy, list@263.net:IOMMU DRIVERS <iommu@lists.linux-foundation.org>, Joerg Roedel <joro@8bytes.org>,, Will Deacon On Fri, Dec 7, 2018 at 6:25 PM Vivek Gautam <vivek.gautam@codeaurora.org> wrote: > > Hi Robin, > > On Tue, Dec 4, 2018 at 8:51 PM Robin Murphy <robin.murphy@arm.com> wrote: > > > > On 04/12/2018 11:01, Vivek Gautam wrote: > > > Qualcomm SoCs have an additional level of cache called as > > > System cache, aka. Last level cache (LLC). This cache sits right > > > before the DDR, and is tightly coupled with the memory controller. > > > The cache is available to all the clients present in the SoC system. > > > The clients request their slices from this system cache, make it > > > active, and can then start using it. > > > For these clients with smmu, to start using the system cache for > > > buffers and, related page tables [1], memory attributes need to be > > > set accordingly. > > > This change updates the MAIR and TCR configurations with correct > > > attributes to use this system cache. > > > > > > To explain a little about memory attribute requirements here: > > > > > > Non-coherent I/O devices can't look-up into inner caches. However, > > > coherent I/O devices can. But both can allocate in the system cache > > > based on system policy and configured memory attributes in page > > > tables. > > > CPUs can access both inner and outer caches (including system cache, > > > aka. 
Last level cache), and can allocate into system cache too > > > based on memory attributes, and system policy. > > > > > > Further looking at memory types, we have following - > > > a) Normal uncached :- MAIR 0x44, inner non-cacheable, > > > outer non-cacheable; > > > b) Normal cached :- MAIR 0xff, inner read write-back non-transient, > > > outer read write-back non-transient; > > > attribute setting for coherenet I/O devices. > > > > > > and, for non-coherent i/o devices that can allocate in system cache > > > another type gets added - > > > c) Normal sys-cached/non-inner-cached :- > > > MAIR 0xf4, inner non-cacheable, > > > outer read write-back non-transient > > > > > > So, CPU will automatically use the system cache for memory marked as > > > normal cached. The normal sys-cached is downgraded to normal non-cached > > > memory for CPUs. > > > Coherent I/O devices can use system cache by marking the memory as > > > normal cached. > > > Non-coherent I/O devices, to use system cache, should mark the memory as > > > normal sys-cached in page tables. > > > > > > This change is a realisation of following changes > > > from downstream msm-4.9: > > > iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT[2] > > > iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT[3] > > > > > > [1] https://patchwork.kernel.org/patch/10302791/ > > > [2] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=bf762276796e79ca90014992f4d9da5593fa7d51 > > > [3] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=d4c72c413ea27c43f60825193d4de9cb8ffd9602 > > > > > > Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org> > > > --- > > > > > > Changes since v1: > > > - Addressed Tomasz's comments for basing the change on > > > "NO_INNER_CACHE" concept for non-coherent I/O devices > > > rather than capturing "SYS_CACHE". This is to indicate > > > clearly the intent of non-coherent I/O devices that > > > can't access inner caches. 
> > > > That seems backwards to me - there is already a fundamental assumption > > that non-coherent devices can't access caches. What we're adding here is > > a weird exception where they *can* use some level of cache despite still > > being non-coherent overall. > > > > In other words, it's not a case of downgrading coherent devices' > > accesses to bypass inner caches, it's upgrading non-coherent devices' > > accesses to hit the outer cache. That's certainly the understanding I > > got from talking with Pratik at Plumbers, and it does appear to fit with > > your explanation above despite the final conclusion you draw being > > different. > > Thanks for the thorough review of the change. > Right, I guess it's rather an upgrade for non-coherent devices to use > an outer cache than a downgrade for coherent devices. > Note that it was not my suggestion to use "NO_INNER_CACHE" for enabling the system cache, sorry for not being clear. What I was asking for in my comment was regarding the previous patch disabling inner cache if system cache is requested, which may not make for coherent devices, which could benefit from using both inner and system cache. So note that there are several cases here: - coherent, IC, system cache alloc, - coherent. non-IC, system cache alloc, - coherent, IC, system cache look-up, - noncoherent device, non-IC, system cache alloc, - noncoherent device, non-IC, system cache look-up. Given the presence or lack of coherency for the device, which of the 2/3 options is the best depends on the use case, e.g. DMA/CPU access pattern, sharing memory between multiple devices, etc. Best regards, Tomasz ^ permalink raw reply [flat|nested] 19+ messages in thread
[parent not found: <CAAFQd5C+BygjdBhBOsiBW=4kOgC0a=V8s9om6jqa6yzmn0TEWw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache [not found] ` <CAAFQd5C+BygjdBhBOsiBW=4kOgC0a=V8s9om6jqa6yzmn0TEWw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> @ 2019-01-02 7:22 ` Vivek Gautam 0 siblings, 0 replies; 19+ messages in thread From: Vivek Gautam @ 2019-01-02 7:22 UTC (permalink / raw) To: Tomasz Figa Cc: pdaly-sgV2jX0FEOL9JmXXK+q4OQ, linux-arm-msm, Will Deacon, Linux Kernel Mailing List, open list:IOMMU DRIVERS, Robin Murphy, pratikp-sgV2jX0FEOL9JmXXK+q4OQ On Thu, Dec 13, 2018 at 9:20 AM Tomasz Figa <tfiga-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote: > > On Fri, Dec 7, 2018 at 6:25 PM Vivek Gautam <vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org> wrote: > > > > Hi Robin, > > > > On Tue, Dec 4, 2018 at 8:51 PM Robin Murphy <robin.murphy-5wv7dgnIgG8@public.gmane.org> wrote: > > > > > > On 04/12/2018 11:01, Vivek Gautam wrote: > > > > Qualcomm SoCs have an additional level of cache called as > > > > System cache, aka. Last level cache (LLC). This cache sits right > > > > before the DDR, and is tightly coupled with the memory controller. > > > > The cache is available to all the clients present in the SoC system. > > > > The clients request their slices from this system cache, make it > > > > active, and can then start using it. > > > > For these clients with smmu, to start using the system cache for > > > > buffers and, related page tables [1], memory attributes need to be > > > > set accordingly. > > > > This change updates the MAIR and TCR configurations with correct > > > > attributes to use this system cache. > > > > > > > > To explain a little about memory attribute requirements here: > > > > > > > > Non-coherent I/O devices can't look-up into inner caches. However, > > > > coherent I/O devices can. But both can allocate in the system cache > > > > based on system policy and configured memory attributes in page > > > > tables. > > > > CPUs can access both inner and outer caches (including system cache, > > > > aka. 
Last level cache), and can allocate into system cache too > > > > based on memory attributes, and system policy. > > > > > > > > Further looking at memory types, we have following - > > > > a) Normal uncached :- MAIR 0x44, inner non-cacheable, > > > > outer non-cacheable; > > > > b) Normal cached :- MAIR 0xff, inner read write-back non-transient, > > > > outer read write-back non-transient; > > > > attribute setting for coherenet I/O devices. > > > > > > > > and, for non-coherent i/o devices that can allocate in system cache > > > > another type gets added - > > > > c) Normal sys-cached/non-inner-cached :- > > > > MAIR 0xf4, inner non-cacheable, > > > > outer read write-back non-transient > > > > > > > > So, CPU will automatically use the system cache for memory marked as > > > > normal cached. The normal sys-cached is downgraded to normal non-cached > > > > memory for CPUs. > > > > Coherent I/O devices can use system cache by marking the memory as > > > > normal cached. > > > > Non-coherent I/O devices, to use system cache, should mark the memory as > > > > normal sys-cached in page tables. > > > > > > > > This change is a realisation of following changes > > > > from downstream msm-4.9: > > > > iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT[2] > > > > iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT[3] > > > > > > > > [1] https://patchwork.kernel.org/patch/10302791/ > > > > [2] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=bf762276796e79ca90014992f4d9da5593fa7d51 > > > > [3] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=d4c72c413ea27c43f60825193d4de9cb8ffd9602 > > > > > > > > Signed-off-by: Vivek Gautam <vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org> > > > > --- > > > > > > > > Changes since v1: > > > > - Addressed Tomasz's comments for basing the change on > > > > "NO_INNER_CACHE" concept for non-coherent I/O devices > > > > rather than capturing "SYS_CACHE". 
This is to indicate > > > > clearly the intent of non-coherent I/O devices that > > > > can't access inner caches. > > > > > > That seems backwards to me - there is already a fundamental assumption > > > that non-coherent devices can't access caches. What we're adding here is > > > a weird exception where they *can* use some level of cache despite still > > > being non-coherent overall. > > > > > > In other words, it's not a case of downgrading coherent devices' > > > accesses to bypass inner caches, it's upgrading non-coherent devices' > > > accesses to hit the outer cache. That's certainly the understanding I > > > got from talking with Pratik at Plumbers, and it does appear to fit with > > > your explanation above despite the final conclusion you draw being > > > different. > > > > Thanks for the thorough review of the change. > > Right, I guess it's rather an upgrade for non-coherent devices to use > > an outer cache than a downgrade for coherent devices. > > > > Note that it was not my suggestion to use "NO_INNER_CACHE" for > enabling the system cache, sorry for not being clear. What I was > asking for in my comment was regarding the previous patch disabling > inner cache if system cache is requested, which may not make for > coherent devices, which could benefit from using both inner and system > cache. Sorry for not taking the cue correctly. The intention of the change was to let coherent devices use system cache as well. But I guess the change wasn't designed correctly. > > So note that there are several cases here: > - coherent, IC, system cache alloc, > - coherent. non-IC, system cache alloc, > - coherent, IC, system cache look-up, > - noncoherent device, non-IC, system cache alloc, > - noncoherent device, non-IC, system cache look-up. > > Given the presence or lack of coherency for the device, which of the > 2/3 options is the best depends on the use case, e.g. DMA/CPU access > pattern, sharing memory between multiple devices, etc. 
> - coherent, IC, system cache alloc, - coherent, IC, system cache look-up, These two are default for coherent mappings. Coherent devices and coherent IOMMUs can use inner caches/outer caches, and can allocate and lookup in system caches. - noncoherent device, non-IC, system cache look-up, --> Always - noncoherent device, non-IC, system cache allocate. --> Depends on system policy. So, any page table memory for non-coherent SMMUs could very well use system cache and CPU is free to look-up into it. And so, both non-coherent and coherent devices can use the system cache. Best regards, Vivek > Best regards, > Tomasz > _______________________________________________ > iommu mailing list > iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org > https://lists.linuxfoundation.org/mailman/listinfo/iommu -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 19+ messages in thread
[parent not found: <CAFp+6iE7U3HCJwHkeL9A4DXUVryt13YxDCYfBDxcSSM2Z_vqcg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache [not found] ` <CAFp+6iE7U3HCJwHkeL9A4DXUVryt13YxDCYfBDxcSSM2Z_vqcg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> @ 2019-01-02 7:52 ` Vivek Gautam 0 siblings, 0 replies; 19+ messages in thread From: Vivek Gautam @ 2019-01-02 7:52 UTC (permalink / raw) To: Robin Murphy Cc: pdaly-sgV2jX0FEOL9JmXXK+q4OQ, linux-arm-msm, Will Deacon, open list, list-Y9sIeH5OGRo@public.gmane.org:IOMMU DRIVERS <iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org>, Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>, , pratikp-sgV2jX0FEOL9JmXXK+q4OQ Hi Robin, On Fri, Dec 7, 2018 at 2:54 PM Vivek Gautam <vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org> wrote: > > Hi Robin, > > On Tue, Dec 4, 2018 at 8:51 PM Robin Murphy <robin.murphy-5wv7dgnIgG8@public.gmane.org> wrote: > > > > On 04/12/2018 11:01, Vivek Gautam wrote: > > > Qualcomm SoCs have an additional level of cache called as > > > System cache, aka. Last level cache (LLC). This cache sits right > > > before the DDR, and is tightly coupled with the memory controller. > > > The cache is available to all the clients present in the SoC system. > > > The clients request their slices from this system cache, make it > > > active, and can then start using it. > > > For these clients with smmu, to start using the system cache for > > > buffers and, related page tables [1], memory attributes need to be > > > set accordingly. > > > This change updates the MAIR and TCR configurations with correct > > > attributes to use this system cache. > > > > > > To explain a little about memory attribute requirements here: > > > > > > Non-coherent I/O devices can't look-up into inner caches. However, > > > coherent I/O devices can. But both can allocate in the system cache > > > based on system policy and configured memory attributes in page > > > tables. > > > CPUs can access both inner and outer caches (including system cache, > > > aka. 
Last level cache), and can allocate into system cache too > > > based on memory attributes, and system policy. > > > > > > Further looking at memory types, we have following - > > > a) Normal uncached :- MAIR 0x44, inner non-cacheable, > > > outer non-cacheable; > > > b) Normal cached :- MAIR 0xff, inner read write-back non-transient, > > > outer read write-back non-transient; > > > attribute setting for coherenet I/O devices. > > > > > > and, for non-coherent i/o devices that can allocate in system cache > > > another type gets added - > > > c) Normal sys-cached/non-inner-cached :- > > > MAIR 0xf4, inner non-cacheable, > > > outer read write-back non-transient > > > > > > So, CPU will automatically use the system cache for memory marked as > > > normal cached. The normal sys-cached is downgraded to normal non-cached > > > memory for CPUs. > > > Coherent I/O devices can use system cache by marking the memory as > > > normal cached. > > > Non-coherent I/O devices, to use system cache, should mark the memory as > > > normal sys-cached in page tables. > > > > > > This change is a realisation of following changes > > > from downstream msm-4.9: > > > iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT[2] > > > iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT[3] > > > > > > [1] https://patchwork.kernel.org/patch/10302791/ > > > [2] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=bf762276796e79ca90014992f4d9da5593fa7d51 > > > [3] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=d4c72c413ea27c43f60825193d4de9cb8ffd9602 > > > > > > Signed-off-by: Vivek Gautam <vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org> > > > --- > > > > > > Changes since v1: > > > - Addressed Tomasz's comments for basing the change on > > > "NO_INNER_CACHE" concept for non-coherent I/O devices > > > rather than capturing "SYS_CACHE". 
This is to indicate > > > clearly the intent of non-coherent I/O devices that > > > can't access inner caches. > > > > That seems backwards to me - there is already a fundamental assumption > > that non-coherent devices can't access caches. What we're adding here is > > a weird exception where they *can* use some level of cache despite still > > being non-coherent overall. > > > > In other words, it's not a case of downgrading coherent devices' > > accesses to bypass inner caches, it's upgrading non-coherent devices' > > accesses to hit the outer cache. That's certainly the understanding I > > got from talking with Pratik at Plumbers, and it does appear to fit with > > your explanation above despite the final conclusion you draw being > > different. > > Thanks for the thorough review of the change. > Right, I guess it's rather an upgrade for non-coherent devices to use > an outer cache than a downgrade for coherent devices. > > > > > I do see what Tomasz meant in terms of the TCR attributes, but what we > > currently do there is a little unintuitive and not at all representative > > of actual mapping attributes - I'll come back to that inline. > > > > > drivers/iommu/arm-smmu.c | 15 +++++++++++++++ > > > drivers/iommu/dma-iommu.c | 3 +++ > > > drivers/iommu/io-pgtable-arm.c | 22 +++++++++++++++++----- > > > drivers/iommu/io-pgtable.h | 5 +++++ > > > include/linux/iommu.h | 3 +++ > > > 5 files changed, 43 insertions(+), 5 deletions(-) > > > > As a minor nit, I'd prefer this as at least two patches to separate the > > io-pgtable changes and arm-smmu changes - basically I'd expect it to > > look much the same as the non-strict mode support did. > > Sure, will split the patch. 
> > > > > > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c > > > index ba18d89d4732..047f7ff95b0d 100644 > > > --- a/drivers/iommu/arm-smmu.c > > > +++ b/drivers/iommu/arm-smmu.c > > > @@ -255,6 +255,7 @@ struct arm_smmu_domain { > > > struct mutex init_mutex; /* Protects smmu pointer */ > > > spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ > > > struct iommu_domain domain; > > > + bool no_inner_cache; > > > > Can we keep all the domain flags together please? In fact, I'd be > > inclined to implement an options bitmap as we do elsewhere rather than > > proliferate multiple bools. > > Yea, changing this to bitmap makes sense. Will update this. > > > > > > }; > > > > > > struct arm_smmu_option_prop { > > > @@ -897,6 +898,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, > > > if (smmu_domain->non_strict) > > > pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; > > > > > > + if (smmu_domain->no_inner_cache) > > > + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NO_IC; > > > > Maybe we need to be a bit cleverer about setting the quirk (and/or > > allowing the domain attribute to be set), since depending on > > configuration and hardware support the domain may end up picking a stage > > 2 or short-descriptor format and thus being rendered unusable. > > I don't think I completely get you here. > But, do you mean that to set such quirks we should first check configurations > such as the domain's stage, and the format before deciding whether > we want to set this or not? 
> > > > > > + > > > smmu_domain->smmu = smmu; > > > pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); > > > if (!pgtbl_ops) { > > > @@ -1579,6 +1583,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, > > > case DOMAIN_ATTR_NESTING: > > > *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); > > > return 0; > > > + case DOMAIN_ATTR_NO_IC: > > > + *((int *)data) = smmu_domain->no_inner_cache; > > > + return 0; > > > default: > > > return -ENODEV; > > > } > > > @@ -1619,6 +1626,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, > > > else > > > smmu_domain->stage = ARM_SMMU_DOMAIN_S1; > > > break; > > > + case DOMAIN_ATTR_NO_IC: > > > + if (smmu_domain->smmu) { > > > + ret = -EPERM; > > > + goto out_unlock; > > > + } > > > + if (*((int *)data)) > > > + smmu_domain->no_inner_cache = true; > > > > This makes the attribute impossible to disable again, even before the > > domain is initialized - is that intentional? (and if so, why?) > > Right. I should add for data = 0 as well. That should help to disable this > attribute again. > > > > > > + break; > > > default: > > > ret = -ENODEV; > > > } > > > diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c > > > index d1b04753b204..87c3d59c4a6c 100644 > > > --- a/drivers/iommu/dma-iommu.c > > > +++ b/drivers/iommu/dma-iommu.c > > > @@ -354,6 +354,9 @@ int dma_info_to_prot(enum dma_data_direction dir, bool coherent, > > > { > > > int prot = coherent ? IOMMU_CACHE : 0; > > > > > > + if (!coherent && (attrs & DOMAIN_ATTR_NO_IC)) > > > + prot |= IOMMU_NO_IC; > > > + > > > > Erm, that's going to be a hilariously unexpected interpretation of > > DMA_ATTR_FORCE_CONTIGUOUS... > > Right. :) > I guess i will take your suggestion to have something like > DOMAIN_ATTR_QCOM_SYSTEM_CACHE. 
> > > > > I'm not sure it would really makes sense to expose fine-grained controls > > at the DMA API level anyway, given the main point is to largely abstract > > away the notion of caches altogether. > > But there are DMA devices (such as video) which use DMA mapping APIs only, > and which are non-coherent-upgraded-to-use-sytem-cache. Such devices > can't force set IOMMU quirks unless they do iommu_get_domain_for_dev() > and then set the domain attributes. > Will that be better way? Any suggestions here? > > > > > > if (attrs & DMA_ATTR_PRIVILEGED) > > > prot |= IOMMU_PRIV; > > > > > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c > > > index 237cacd4a62b..815b86067bcc 100644 > > > --- a/drivers/iommu/io-pgtable-arm.c > > > +++ b/drivers/iommu/io-pgtable-arm.c > > > @@ -168,10 +168,12 @@ > > > #define ARM_LPAE_MAIR_ATTR_MASK 0xff > > > #define ARM_LPAE_MAIR_ATTR_DEVICE 0x04 > > > #define ARM_LPAE_MAIR_ATTR_NC 0x44 > > > +#define ARM_LPAE_MAIR_ATTR_NO_IC 0xf4 > > > #define ARM_LPAE_MAIR_ATTR_WBRWA 0xff > > > #define ARM_LPAE_MAIR_ATTR_IDX_NC 0 > > > #define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1 > > > #define ARM_LPAE_MAIR_ATTR_IDX_DEV 2 > > > +#define ARM_LPAE_MAIR_ATTR_IDX_NO_IC 3 > > > > > > /* IOPTE accessors */ > > > #define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d)) > > > @@ -443,6 +445,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > > > else if (prot & IOMMU_CACHE) > > > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > > > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > + else if (prot & IOMMU_NO_IC) > > > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_NO_IC > > > + << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > } else { > > > pte = ARM_LPAE_PTE_HAP_FAULT; > > > if (prot & IOMMU_READ) > > > @@ -780,7 +785,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > > > struct arm_lpae_io_pgtable *data; > > > > > > if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA | > > > - 
IO_PGTABLE_QUIRK_NON_STRICT)) > > > + IO_PGTABLE_QUIRK_NON_STRICT | > > > + IO_PGTABLE_QUIRK_NO_IC)) > > > return NULL; > > > > > > data = arm_lpae_alloc_pgtable(cfg); > > > @@ -788,9 +794,13 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) > > > return NULL; > > > > > > /* TCR */ > > > - reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) | > > > - (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) | > > > - (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT); > > > > The subtle assumption here is that if the SMMU is coherent then these > > are the attributes we actually want to use, but if it's non-coherent > > then the interconnect should ignore them anyway so it doesn't really > > matter. Does either of those aspects hold for qcom SoCs? > > From the downstream [1] it's clear that default for smmu is set to > Non-cached access. > So, I don't think the interconnect helps us. OR, possibly we are just > forcing these > mappings be uncached. > > > > > TBH if we're going to touch the TCR attributes at all then we should > > probably correct that sloppiness first - there's an occasional argument > > for using non-cacheable pagetables even on a coherent SMMU if reducing > > snoop traffic/latency on walks outweighs the cost of cache maintenance > > on PTE updates, but anyone thinking they can get that by overriding > > dma-coherent silently gets the worst of both worlds thanks to this > > current TCR value. > > So, what do you suggest? > This is something that's smmu's implementation specific detail, not something > that's going to vary from one domain to another? Isn't that right? > So, in that case additional dt property can help setting a quirk? I have a change that adds "arm,smmu-pgtable-non-coherent" option and based on that adds a quick IO_PGTABLE_QUIRK_NON_COHERENT. But before that I would like to check if we can make use of IO_PGTABLE_QUIRK_NO_DMA? 
In present design though we don't force page table mappings to be non-coherent based on this quirk. Do we just rely on the interconnect, as you said earlier, for non-coherent SMMU? Anyone who wants to just force smmu's page table to be non-coherent can use IO_PGTABLE_QUIRK_NON_COHERENT when not declaring the SMMU as dma-coherent. [snip] -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 19+ messages in thread
end of thread, other threads:[~2019-01-02 7:52 UTC | newest]
Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-06-15 10:53 [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache Vivek Gautam
[not found] ` <20180615105329.26800-1-vivek.gautam-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-06-15 16:52 ` Will Deacon
[not found] ` <20180615165232.GE2202-5wv7dgnIgG8@public.gmane.org>
2018-06-15 17:12 ` Jordan Crouse
2018-06-19 8:34 ` Vivek Gautam
[not found] ` <CAFp+6iFm29ufb2Pr7Gb-2O_aN3GQLH4rcyWhbQGZ3QiwCC8vPg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-06-27 16:37 ` Will Deacon
[not found] ` <20180627163749.GA8729-5wv7dgnIgG8@public.gmane.org>
2018-07-24 9:43 ` Vivek Gautam
[not found] ` <CAFp+6iHnA1Jj8wKO08YYEBKVF2_3oEuOQOcW2boL=AYZ9+b=UA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-09-19 19:35 ` Jordan Crouse
2018-09-20 10:25 ` Vivek Gautam
2018-09-20 11:41 ` Vivek Gautam
2018-09-28 13:19 ` Will Deacon
2018-10-05 5:25 ` Vivek Gautam
2018-10-23 4:15 ` Tomasz Figa
2018-10-24 17:48 ` Vivek Gautam
-- strict thread matches above, loose matches on Subject: below --
2018-12-04 11:01 Vivek Gautam
2018-12-04 15:21 ` Robin Murphy
[not found] ` <99682bd2-1ca6-406a-890c-b34c25a1b2b3-5wv7dgnIgG8@public.gmane.org>
2018-12-07 9:24 ` Vivek Gautam
2018-12-13 3:50 ` Tomasz Figa
[not found] ` <CAAFQd5C+BygjdBhBOsiBW=4kOgC0a=V8s9om6jqa6yzmn0TEWw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2019-01-02 7:22 ` Vivek Gautam
[not found] ` <CAFp+6iE7U3HCJwHkeL9A4DXUVryt13YxDCYfBDxcSSM2Z_vqcg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2019-01-02 7:52 ` Vivek Gautam
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).