From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Leizhen (ThunderTown)" Subject: Re: [PATCH v3 2/3] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock Date: Thu, 19 Jul 2018 15:22:22 +0800 Message-ID: <5B503C2E.40602@huawei.com> References: <625ffbc273577515c324f6deea66a366e675b751.1508334262.git.robin.murphy@arm.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <625ffbc273577515c324f6deea66a366e675b751.1508334262.git.robin.murphy-5wv7dgnIgG8@public.gmane.org> List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: iommu-bounces-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org Errors-To: iommu-bounces-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org To: Robin Murphy , will.deacon-5wv7dgnIgG8@public.gmane.org Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org, linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org List-Id: iommu@lists.linux-foundation.org On 2017/10/18 22:04, Robin Murphy wrote: > Even without the MSI trick, we can still do a lot better than hogging > the entire queue while it drains. All we actually need to do for the > necessary guarantee of completion is wait for our particular command to > have been consumed, and as long as we keep track of where it is there is > no need to block other CPUs from adding further commands in the > meantime. There is one theoretical (but incredibly unlikely) edge case > to avoid, where cons has wrapped twice to still appear 'behind' the sync > position - this is easily disambiguated by adding a generation count to > the queue to indicate when prod wraps, since cons cannot wrap twice > without prod having wrapped at least once. Hi Robin, I applied your patch and got below improvemnet for NVMe device. Randomly Read IOPS: 146K --> 214K Randomly Write IOPS: 143K --> 212K Tested-by: Zhen Lei > > Signed-off-by: Robin Murphy > --- > > v3: > - Move queue checks into helpers > - Avoid race by updating generation before prod (after some > deliberation I've concluded that it doesn't actually need any > special handling for the timeout failure case either) > > drivers/iommu/arm-smmu-v3.c | 91 +++++++++++++++++++++++++++++++++------------ > 1 file changed, 68 insertions(+), 23 deletions(-) > > diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c > index 83b76404e882..3130e7182dde 100644 > --- a/drivers/iommu/arm-smmu-v3.c > +++ b/drivers/iommu/arm-smmu-v3.c > @@ -417,7 +417,6 @@ > > /* High-level queue structures */ > #define ARM_SMMU_POLL_TIMEOUT_US 100 > -#define ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US 1000000 /* 1s! */ > #define ARM_SMMU_SYNC_TIMEOUT_US 1000000 /* 1s! */ > > #define MSI_IOVA_BASE 0x8000000 > @@ -540,6 +539,7 @@ struct arm_smmu_queue { > struct arm_smmu_cmdq { > struct arm_smmu_queue q; > spinlock_t lock; > + int generation; > }; > > struct arm_smmu_evtq { > @@ -737,6 +737,17 @@ static bool queue_empty(struct arm_smmu_queue *q) > Q_WRP(q, q->prod) == Q_WRP(q, q->cons); > } > > +static bool queue_behind(struct arm_smmu_queue *q, u32 idx) > +{ > + return Q_IDX(q, q->cons) < Q_IDX(q, idx); > +} > + > +static bool queue_ahead_not_wrapped(struct arm_smmu_queue *q, u32 idx) > +{ > + return Q_IDX(q, q->cons) >= Q_IDX(q, idx) && > + Q_WRP(q, q->cons) == Q_WRP(q, idx); > +} > + > static void queue_sync_cons(struct arm_smmu_queue *q) > { > q->cons = readl_relaxed(q->cons_reg); > @@ -770,21 +781,12 @@ static void queue_inc_prod(struct arm_smmu_queue *q) > writel(q->prod, q->prod_reg); > } > > -/* > - * Wait for the SMMU to consume items. If drain is true, wait until the queue > - * is empty. Otherwise, wait until there is at least one free slot. > - */ > -static int queue_poll_cons(struct arm_smmu_queue *q, bool drain, bool wfe) > +/* Wait for the SMMU to consume items, until there is at least one free slot */ > +static int queue_poll_cons(struct arm_smmu_queue *q, bool wfe) > { > - ktime_t timeout; > - unsigned int delay = 1; > + ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US); > > - /* Wait longer if it's queue drain */ > - timeout = ktime_add_us(ktime_get(), drain ? > - ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US : > - ARM_SMMU_POLL_TIMEOUT_US); > - > - while (queue_sync_cons(q), (drain ? !queue_empty(q) : queue_full(q))) { > + while (queue_sync_cons(q), queue_full(q)) { > if (ktime_compare(ktime_get(), timeout) > 0) > return -ETIMEDOUT; > > @@ -792,8 +794,7 @@ static int queue_poll_cons(struct arm_smmu_queue *q, bool drain, bool wfe) > wfe(); > } else { > cpu_relax(); > - udelay(delay); > - delay *= 2; > + udelay(1); > } > } > > @@ -959,15 +960,20 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) > queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); > } > > -static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) > +static u32 arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) > { > struct arm_smmu_queue *q = &smmu->cmdq.q; > bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); > > + if (Q_IDX(q, q->prod + 1) == 0) > + smmu->cmdq.generation++; > + > while (queue_insert_raw(q, cmd) == -ENOSPC) { > - if (queue_poll_cons(q, false, wfe)) > + if (queue_poll_cons(q, wfe)) > dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); > } > + > + return q->prod; > } > > static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, > @@ -1001,15 +1007,53 @@ static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx) > return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0; > } > > +static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx, > + int sync_gen) > +{ > + ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_SYNC_TIMEOUT_US); > + struct arm_smmu_queue *q = &smmu->cmdq.q; > + bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); > + unsigned int delay = 1; > + > + do { > + queue_sync_cons(q); > + /* > + * If we see updates quickly enough, cons has passed sync_idx, > + * but not yet wrapped. At worst, cons might have actually > + * wrapped an even number of times, but that still guarantees > + * the original sync must have been consumed. > + */ > + if (queue_ahead_not_wrapped(q, sync_idx)) > + return 0; > + /* > + * Otherwise, cons may have passed sync_idx and wrapped one or > + * more times to appear behind it again, but in that case prod > + * must also be one or more generations ahead. > + */ > + if (queue_behind(q, sync_idx) && > + READ_ONCE(smmu->cmdq.generation) != sync_gen) > + return 0; > + > + if (wfe) { > + wfe(); > + } else { > + cpu_relax(); > + udelay(delay); > + delay *= 2; > + } > + } while (ktime_before(ktime_get(), timeout)); > + > + return -ETIMEDOUT; > +} > + > static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) > { > u64 cmd[CMDQ_ENT_DWORDS]; > unsigned long flags; > - bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); > bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) && > (smmu->features & ARM_SMMU_FEAT_COHERENCY); > struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC }; > - int ret; > + int ret, sync_idx, sync_gen; > > if (msi) { > ent.sync.msidata = atomic_inc_return_relaxed(&smmu->sync_nr); > @@ -1018,13 +1062,14 @@ static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) > arm_smmu_cmdq_build_cmd(cmd, &ent); > > spin_lock_irqsave(&smmu->cmdq.lock, flags); > - arm_smmu_cmdq_insert_cmd(smmu, cmd); > - if (!msi) > - ret = queue_poll_cons(&smmu->cmdq.q, true, wfe); > + sync_idx = arm_smmu_cmdq_insert_cmd(smmu, cmd); > + sync_gen = READ_ONCE(smmu->cmdq.generation); > spin_unlock_irqrestore(&smmu->cmdq.lock, flags); > > if (msi) > ret = arm_smmu_sync_poll_msi(smmu, ent.sync.msidata); > + else > + ret = arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen); > if (ret) > dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n"); > } > -- Thanks! BestRegards From mboxrd@z Thu Jan 1 00:00:00 1970 From: thunder.leizhen@huawei.com (Leizhen (ThunderTown)) Date: Thu, 19 Jul 2018 15:22:22 +0800 Subject: [PATCH v3 2/3] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock In-Reply-To: <625ffbc273577515c324f6deea66a366e675b751.1508334262.git.robin.murphy@arm.com> References: <625ffbc273577515c324f6deea66a366e675b751.1508334262.git.robin.murphy@arm.com> Message-ID: <5B503C2E.40602@huawei.com> To: linux-arm-kernel@lists.infradead.org List-Id: linux-arm-kernel.lists.infradead.org On 2017/10/18 22:04, Robin Murphy wrote: > Even without the MSI trick, we can still do a lot better than hogging > the entire queue while it drains. All we actually need to do for the > necessary guarantee of completion is wait for our particular command to > have been consumed, and as long as we keep track of where it is there is > no need to block other CPUs from adding further commands in the > meantime. There is one theoretical (but incredibly unlikely) edge case > to avoid, where cons has wrapped twice to still appear 'behind' the sync > position - this is easily disambiguated by adding a generation count to > the queue to indicate when prod wraps, since cons cannot wrap twice > without prod having wrapped at least once. Hi Robin, I applied your patch and got below improvemnet for NVMe device. Randomly Read IOPS: 146K --> 214K Randomly Write IOPS: 143K --> 212K Tested-by: Zhen Lei > > Signed-off-by: Robin Murphy > --- > > v3: > - Move queue checks into helpers > - Avoid race by updating generation before prod (after some > deliberation I've concluded that it doesn't actually need any > special handling for the timeout failure case either) > > drivers/iommu/arm-smmu-v3.c | 91 +++++++++++++++++++++++++++++++++------------ > 1 file changed, 68 insertions(+), 23 deletions(-) > > diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c > index 83b76404e882..3130e7182dde 100644 > --- a/drivers/iommu/arm-smmu-v3.c > +++ b/drivers/iommu/arm-smmu-v3.c > @@ -417,7 +417,6 @@ > > /* High-level queue structures */ > #define ARM_SMMU_POLL_TIMEOUT_US 100 > -#define ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US 1000000 /* 1s! */ > #define ARM_SMMU_SYNC_TIMEOUT_US 1000000 /* 1s! */ > > #define MSI_IOVA_BASE 0x8000000 > @@ -540,6 +539,7 @@ struct arm_smmu_queue { > struct arm_smmu_cmdq { > struct arm_smmu_queue q; > spinlock_t lock; > + int generation; > }; > > struct arm_smmu_evtq { > @@ -737,6 +737,17 @@ static bool queue_empty(struct arm_smmu_queue *q) > Q_WRP(q, q->prod) == Q_WRP(q, q->cons); > } > > +static bool queue_behind(struct arm_smmu_queue *q, u32 idx) > +{ > + return Q_IDX(q, q->cons) < Q_IDX(q, idx); > +} > + > +static bool queue_ahead_not_wrapped(struct arm_smmu_queue *q, u32 idx) > +{ > + return Q_IDX(q, q->cons) >= Q_IDX(q, idx) && > + Q_WRP(q, q->cons) == Q_WRP(q, idx); > +} > + > static void queue_sync_cons(struct arm_smmu_queue *q) > { > q->cons = readl_relaxed(q->cons_reg); > @@ -770,21 +781,12 @@ static void queue_inc_prod(struct arm_smmu_queue *q) > writel(q->prod, q->prod_reg); > } > > -/* > - * Wait for the SMMU to consume items. If drain is true, wait until the queue > - * is empty. Otherwise, wait until there is at least one free slot. > - */ > -static int queue_poll_cons(struct arm_smmu_queue *q, bool drain, bool wfe) > +/* Wait for the SMMU to consume items, until there is at least one free slot */ > +static int queue_poll_cons(struct arm_smmu_queue *q, bool wfe) > { > - ktime_t timeout; > - unsigned int delay = 1; > + ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US); > > - /* Wait longer if it's queue drain */ > - timeout = ktime_add_us(ktime_get(), drain ? > - ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US : > - ARM_SMMU_POLL_TIMEOUT_US); > - > - while (queue_sync_cons(q), (drain ? !queue_empty(q) : queue_full(q))) { > + while (queue_sync_cons(q), queue_full(q)) { > if (ktime_compare(ktime_get(), timeout) > 0) > return -ETIMEDOUT; > > @@ -792,8 +794,7 @@ static int queue_poll_cons(struct arm_smmu_queue *q, bool drain, bool wfe) > wfe(); > } else { > cpu_relax(); > - udelay(delay); > - delay *= 2; > + udelay(1); > } > } > > @@ -959,15 +960,20 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) > queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); > } > > -static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) > +static u32 arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) > { > struct arm_smmu_queue *q = &smmu->cmdq.q; > bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); > > + if (Q_IDX(q, q->prod + 1) == 0) > + smmu->cmdq.generation++; > + > while (queue_insert_raw(q, cmd) == -ENOSPC) { > - if (queue_poll_cons(q, false, wfe)) > + if (queue_poll_cons(q, wfe)) > dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); > } > + > + return q->prod; > } > > static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, > @@ -1001,15 +1007,53 @@ static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx) > return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0; > } > > +static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx, > + int sync_gen) > +{ > + ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_SYNC_TIMEOUT_US); > + struct arm_smmu_queue *q = &smmu->cmdq.q; > + bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); > + unsigned int delay = 1; > + > + do { > + queue_sync_cons(q); > + /* > + * If we see updates quickly enough, cons has passed sync_idx, > + * but not yet wrapped. At worst, cons might have actually > + * wrapped an even number of times, but that still guarantees > + * the original sync must have been consumed. > + */ > + if (queue_ahead_not_wrapped(q, sync_idx)) > + return 0; > + /* > + * Otherwise, cons may have passed sync_idx and wrapped one or > + * more times to appear behind it again, but in that case prod > + * must also be one or more generations ahead. > + */ > + if (queue_behind(q, sync_idx) && > + READ_ONCE(smmu->cmdq.generation) != sync_gen) > + return 0; > + > + if (wfe) { > + wfe(); > + } else { > + cpu_relax(); > + udelay(delay); > + delay *= 2; > + } > + } while (ktime_before(ktime_get(), timeout)); > + > + return -ETIMEDOUT; > +} > + > static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) > { > u64 cmd[CMDQ_ENT_DWORDS]; > unsigned long flags; > - bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); > bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) && > (smmu->features & ARM_SMMU_FEAT_COHERENCY); > struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC }; > - int ret; > + int ret, sync_idx, sync_gen; > > if (msi) { > ent.sync.msidata = atomic_inc_return_relaxed(&smmu->sync_nr); > @@ -1018,13 +1062,14 @@ static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) > arm_smmu_cmdq_build_cmd(cmd, &ent); > > spin_lock_irqsave(&smmu->cmdq.lock, flags); > - arm_smmu_cmdq_insert_cmd(smmu, cmd); > - if (!msi) > - ret = queue_poll_cons(&smmu->cmdq.q, true, wfe); > + sync_idx = arm_smmu_cmdq_insert_cmd(smmu, cmd); > + sync_gen = READ_ONCE(smmu->cmdq.generation); > spin_unlock_irqrestore(&smmu->cmdq.lock, flags); > > if (msi) > ret = arm_smmu_sync_poll_msi(smmu, ent.sync.msidata); > + else > + ret = arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen); > if (ret) > dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n"); > } > -- Thanks! BestRegards