linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API
       [not found] <20231118155105.25678-1-yury.norov@gmail.com>
@ 2023-11-18 15:50 ` Yury Norov
  2023-11-21 15:53   ` Will Deacon
  2023-11-18 15:50 ` [PATCH 08/34] drivers/perf: optimize ali_drw_get_counter_idx() by using find_bit() Yury Norov
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Yury Norov @ 2023-11-18 15:50 UTC (permalink / raw)
  To: linux-kernel, Will Deacon, Mark Rutland, linux-arm-kernel
  Cc: Yury Norov, Jan Kara, Mirsad Todorovac, Matthew Wilcox,
	Rasmus Villemoes, Andy Shevchenko, Maxim Kuvyrkov, Alexey Klimov

Switch subsystem to use atomic find_bit() or atomic iterators as
appropriate.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/perf/arm-cci.c        | 23 +++++------------------
 drivers/perf/arm-ccn.c        | 10 ++--------
 drivers/perf/arm_dmc620_pmu.c |  9 ++-------
 drivers/perf/arm_pmuv3.c      |  8 ++------
 4 files changed, 11 insertions(+), 39 deletions(-)

diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
index 61de861eaf91..70fbf9d09d37 100644
--- a/drivers/perf/arm-cci.c
+++ b/drivers/perf/arm-cci.c
@@ -320,12 +320,8 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu,
 		return CCI400_PMU_CYCLE_CNTR_IDX;
 	}
 
-	for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx)
-		if (!test_and_set_bit(idx, hw->used_mask))
-			return idx;
-
-	/* No counters available */
-	return -EAGAIN;
+	idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);
+	return idx < CCI_PMU_CNTR_LAST(cci_pmu) + 1 ? idx : -EAGAIN;
 }
 
 static int cci400_validate_hw_event(struct cci_pmu *cci_pmu, unsigned long hw_event)
@@ -802,13 +798,8 @@ static int pmu_get_event_idx(struct cci_pmu_hw_events *hw, struct perf_event *ev
 	if (cci_pmu->model->get_event_idx)
 		return cci_pmu->model->get_event_idx(cci_pmu, hw, cci_event);
 
-	/* Generic code to find an unused idx from the mask */
-	for (idx = 0; idx <= CCI_PMU_CNTR_LAST(cci_pmu); idx++)
-		if (!test_and_set_bit(idx, hw->used_mask))
-			return idx;
-
-	/* No counters available */
-	return -EAGAIN;
+	idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);
+	return idx < CCI_PMU_CNTR_LAST(cci_pmu) + 1 ? idx : -EAGAIN;
 }
 
 static int pmu_map_event(struct perf_event *event)
@@ -861,12 +852,8 @@ static void pmu_free_irq(struct cci_pmu *cci_pmu)
 {
 	int i;
 
-	for (i = 0; i < cci_pmu->nr_irqs; i++) {
-		if (!test_and_clear_bit(i, &cci_pmu->active_irqs))
-			continue;
-
+	for_each_test_and_clear_bit(i, &cci_pmu->active_irqs, cci_pmu->nr_irqs)
 		free_irq(cci_pmu->irqs[i], cci_pmu);
-	}
 }
 
 static u32 pmu_read_counter(struct perf_event *event)
diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c
index 728d13d8e98a..d657701b1f23 100644
--- a/drivers/perf/arm-ccn.c
+++ b/drivers/perf/arm-ccn.c
@@ -589,15 +589,9 @@ static const struct attribute_group *arm_ccn_pmu_attr_groups[] = {
 
 static int arm_ccn_pmu_alloc_bit(unsigned long *bitmap, unsigned long size)
 {
-	int bit;
-
-	do {
-		bit = find_first_zero_bit(bitmap, size);
-		if (bit >= size)
-			return -EAGAIN;
-	} while (test_and_set_bit(bit, bitmap));
+	int bit = find_and_set_bit(bitmap, size);
 
-	return bit;
+	return bit < size ? bit : -EAGAIN;
 }
 
 /* All RN-I and RN-D nodes have identical PMUs */
diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
index 30cea6859574..e41c84dabc3e 100644
--- a/drivers/perf/arm_dmc620_pmu.c
+++ b/drivers/perf/arm_dmc620_pmu.c
@@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event)
 		end_idx = DMC620_PMU_MAX_COUNTERS;
 	}
 
-	for (idx = start_idx; idx < end_idx; ++idx) {
-		if (!test_and_set_bit(idx, dmc620_pmu->used_mask))
-			return idx;
-	}
-
-	/* The counters are all in use. */
-	return -EAGAIN;
+	idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx);
+	return idx < end_idx ? idx : -EAGAIN;
 }
 
 static inline
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 18b91b56af1d..784b0383e9f8 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -825,13 +825,9 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc,
 				    struct arm_pmu *cpu_pmu)
 {
-	int idx;
+	int idx = find_and_set_next_bit(cpuc->used_mask, cpu_pmu->num_events, ARMV8_IDX_COUNTER0);
 
-	for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx++) {
-		if (!test_and_set_bit(idx, cpuc->used_mask))
-			return idx;
-	}
-	return -EAGAIN;
+	return idx < cpu_pmu->num_events ? idx : -EAGAIN;
 }
 
 static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc,
-- 
2.39.2


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 08/34] drivers/perf: optimize ali_drw_get_counter_idx() by using find_bit()
       [not found] <20231118155105.25678-1-yury.norov@gmail.com>
  2023-11-18 15:50 ` [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API Yury Norov
@ 2023-11-18 15:50 ` Yury Norov
  2023-11-21 15:54   ` Will Deacon
  2023-11-18 15:50 ` [PATCH 17/34] iommu: use atomic find_bit() API where appropriate Yury Norov
  2023-11-18 15:51 ` [PATCH 31/34] drivers/perf: optimize m1_pmu_get_event_idx() by using find_bit() API Yury Norov
  3 siblings, 1 reply; 10+ messages in thread
From: Yury Norov @ 2023-11-18 15:50 UTC (permalink / raw)
  To: linux-kernel, Shuai Xue, Will Deacon, Mark Rutland,
	linux-arm-kernel
  Cc: Yury Norov, Jan Kara, Mirsad Todorovac, Matthew Wilcox,
	Rasmus Villemoes, Andy Shevchenko, Maxim Kuvyrkov, Alexey Klimov

The function searches used_mask for a set bit in a for-loop bit by bit.
We can do it faster by using atomic find_and_set_bit().

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/perf/alibaba_uncore_drw_pmu.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c
index 19d459a36be5..2a3b7701d568 100644
--- a/drivers/perf/alibaba_uncore_drw_pmu.c
+++ b/drivers/perf/alibaba_uncore_drw_pmu.c
@@ -274,15 +274,9 @@ static const struct attribute_group *ali_drw_pmu_attr_groups[] = {
 static int ali_drw_get_counter_idx(struct perf_event *event)
 {
 	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
-	int idx;
+	int idx = find_and_set_bit(drw_pmu->used_mask, ALI_DRW_PMU_COMMON_MAX_COUNTERS);
 
-	for (idx = 0; idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS; ++idx) {
-		if (!test_and_set_bit(idx, drw_pmu->used_mask))
-			return idx;
-	}
-
-	/* The counters are all in use. */
-	return -EBUSY;
+	return idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS ? idx : -EBUSY;
 }
 
 static u64 ali_drw_pmu_read_counter(struct perf_event *event)
-- 
2.39.2


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 17/34] iommu: use atomic find_bit() API where appropriate
       [not found] <20231118155105.25678-1-yury.norov@gmail.com>
  2023-11-18 15:50 ` [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API Yury Norov
  2023-11-18 15:50 ` [PATCH 08/34] drivers/perf: optimize ali_drw_get_counter_idx() by using find_bit() Yury Norov
@ 2023-11-18 15:50 ` Yury Norov
  2023-11-18 15:51 ` [PATCH 31/34] drivers/perf: optimize m1_pmu_get_event_idx() by using find_bit() API Yury Norov
  3 siblings, 0 replies; 10+ messages in thread
From: Yury Norov @ 2023-11-18 15:50 UTC (permalink / raw)
  To: linux-kernel, Will Deacon, Robin Murphy, Joerg Roedel, Andy Gross,
	Bjorn Andersson, Konrad Dybcio, Yury Norov, linux-arm-kernel,
	iommu, linux-arm-msm
  Cc: Jan Kara, Mirsad Todorovac, Matthew Wilcox, Rasmus Villemoes,
	Andy Shevchenko, Maxim Kuvyrkov, Alexey Klimov

Fix opencoded find_and_set_next_bit() in __arm_smmu_alloc_bitmap()
and msm_iommu_alloc_ctx(), and make them nice one-liner wrappers.

While here, refactor msm_iommu_attach_dev() and msm_iommu_alloc_ctx()
so that error codes don't mismatch.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/iommu/arm/arm-smmu/arm-smmu.h | 10 ++--------
 drivers/iommu/msm_iommu.c             | 18 ++++--------------
 2 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 703fd5817ec1..004a4704ebf1 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -453,15 +453,9 @@ struct arm_smmu_impl {
 
 static inline int __arm_smmu_alloc_bitmap(unsigned long *map, int start, int end)
 {
-	int idx;
+	int idx = find_and_set_next_bit(map, end, start);
 
-	do {
-		idx = find_next_zero_bit(map, end, start);
-		if (idx == end)
-			return -ENOSPC;
-	} while (test_and_set_bit(idx, map));
-
-	return idx;
+	return idx < end ? idx : -ENOSPC;
 }
 
 static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n)
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index f86af9815d6f..67124f4228b1 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -185,17 +185,9 @@ static const struct iommu_flush_ops msm_iommu_flush_ops = {
 	.tlb_add_page = __flush_iotlb_page,
 };
 
-static int msm_iommu_alloc_ctx(unsigned long *map, int start, int end)
+static int msm_iommu_alloc_ctx(struct msm_iommu_dev *iommu)
 {
-	int idx;
-
-	do {
-		idx = find_next_zero_bit(map, end, start);
-		if (idx == end)
-			return -ENOSPC;
-	} while (test_and_set_bit(idx, map));
-
-	return idx;
+	return find_and_set_bit(iommu->context_map, iommu->ncb);
 }
 
 static void msm_iommu_free_ctx(unsigned long *map, int idx)
@@ -418,10 +410,8 @@ static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
 					ret = -EEXIST;
 					goto fail;
 				}
-				master->num =
-					msm_iommu_alloc_ctx(iommu->context_map,
-							    0, iommu->ncb);
-				if (IS_ERR_VALUE(master->num)) {
+				master->num = msm_iommu_alloc_ctx(iommu);
+				if (master->num >= iommu->ncb) {
 					ret = -ENODEV;
 					goto fail;
 				}
-- 
2.39.2


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 31/34] drivers/perf: optimize m1_pmu_get_event_idx() by using find_bit() API
       [not found] <20231118155105.25678-1-yury.norov@gmail.com>
                   ` (2 preceding siblings ...)
  2023-11-18 15:50 ` [PATCH 17/34] iommu: use atomic find_bit() API where appropriate Yury Norov
@ 2023-11-18 15:51 ` Yury Norov
  2023-11-18 18:40   ` Marc Zyngier
  3 siblings, 1 reply; 10+ messages in thread
From: Yury Norov @ 2023-11-18 15:51 UTC (permalink / raw)
  To: linux-kernel, Will Deacon, Mark Rutland, Marc Zyngier,
	linux-arm-kernel
  Cc: Yury Norov, Jan Kara, Mirsad Todorovac, Matthew Wilcox,
	Rasmus Villemoes, Andy Shevchenko, Maxim Kuvyrkov, Alexey Klimov

The function searches used_mask for a bit in a for-loop bit by bit.
We can do it faster by using atomic find_and_set_bit().

The comment to the function says that it searches for the first free
counter, but obviously for_each_set_bit() searches for the first set
counter. The following test_and_set_bit() tries to enable already set
bit, which is weird.

This patch, by using find_and_set_bit(), fixes this automatically.

Fixes: a639027a1be1 ("drivers/perf: Add Apple icestorm/firestorm CPU PMU driver")
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/perf/apple_m1_cpu_pmu.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c
index cd2de44b61b9..2d50670ffb01 100644
--- a/drivers/perf/apple_m1_cpu_pmu.c
+++ b/drivers/perf/apple_m1_cpu_pmu.c
@@ -447,12 +447,8 @@ static int m1_pmu_get_event_idx(struct pmu_hw_events *cpuc,
 	 * counting on the PMU at any given time, and by placing the
 	 * most constraining events first.
 	 */
-	for_each_set_bit(idx, &affinity, M1_PMU_NR_COUNTERS) {
-		if (!test_and_set_bit(idx, cpuc->used_mask))
-			return idx;
-	}
-
-	return -EAGAIN;
+	idx = find_and_set_bit(cpuc->used_mask, M1_PMU_NR_COUNTERS);
+	return idx < M1_PMU_NR_COUNTERS ? idx : -EAGAIN;
 }
 
 static void m1_pmu_clear_event_idx(struct pmu_hw_events *cpuc,
-- 
2.39.2


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 31/34] drivers/perf: optimize m1_pmu_get_event_idx() by using find_bit() API
  2023-11-18 15:51 ` [PATCH 31/34] drivers/perf: optimize m1_pmu_get_event_idx() by using find_bit() API Yury Norov
@ 2023-11-18 18:40   ` Marc Zyngier
  2023-11-18 18:45     ` Yury Norov
  0 siblings, 1 reply; 10+ messages in thread
From: Marc Zyngier @ 2023-11-18 18:40 UTC (permalink / raw)
  To: Yury Norov
  Cc: linux-kernel, Will Deacon, Mark Rutland, linux-arm-kernel,
	Jan Kara, Mirsad Todorovac, Matthew Wilcox, Rasmus Villemoes,
	Andy Shevchenko, Maxim Kuvyrkov, Alexey Klimov

On Sat, 18 Nov 2023 15:51:02 +0000,
Yury Norov <yury.norov@gmail.com> wrote:
> 
> The function searches used_mask for a bit in a for-loop bit by bit.
> We can do it faster by using atomic find_and_set_bit().

Sure, let's do things fast. Correctness is overrated anyway.

> 
> The comment to the function says that it searches for the first free
> counter, but obviously for_each_set_bit() searches for the first set
> counter.

No it doesn't. It iterates over the counters the event can count on.

> The following test_and_set_bit() tries to enable already set
> bit, which is weird.

Maybe you could try to actually read the code?

> 
> This patch, by using find_and_set_bit(), fixes this automatically.

This doesn't fix anything, but instead actively breaks the driver.

> 
> Fixes: a639027a1be1 ("drivers/perf: Add Apple icestorm/firestorm CPU PMU driver")
> Signed-off-by: Yury Norov <yury.norov@gmail.com>
> ---
>  drivers/perf/apple_m1_cpu_pmu.c | 8 ++------
>  1 file changed, 2 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c
> index cd2de44b61b9..2d50670ffb01 100644
> --- a/drivers/perf/apple_m1_cpu_pmu.c
> +++ b/drivers/perf/apple_m1_cpu_pmu.c
> @@ -447,12 +447,8 @@ static int m1_pmu_get_event_idx(struct pmu_hw_events *cpuc,
>  	 * counting on the PMU at any given time, and by placing the
>  	 * most constraining events first.
>  	 */
> -	for_each_set_bit(idx, &affinity, M1_PMU_NR_COUNTERS) {
> -		if (!test_and_set_bit(idx, cpuc->used_mask))
> -			return idx;
> -	}
> -
> -	return -EAGAIN;
> +	idx = find_and_set_bit(cpuc->used_mask, M1_PMU_NR_COUNTERS);
> +	return idx < M1_PMU_NR_COUNTERS ? idx : -EAGAIN;

So now you're picking any possible counter, irrespective of the
possible affinity of the event. This is great.

	M.

-- 
Without deviation from the norm, progress is not possible.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 31/34] drivers/perf: optimize m1_pmu_get_event_idx() by using find_bit() API
  2023-11-18 18:40   ` Marc Zyngier
@ 2023-11-18 18:45     ` Yury Norov
  0 siblings, 0 replies; 10+ messages in thread
From: Yury Norov @ 2023-11-18 18:45 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: linux-kernel, Will Deacon, Mark Rutland, linux-arm-kernel,
	Jan Kara, Mirsad Todorovac, Matthew Wilcox, Rasmus Villemoes,
	Andy Shevchenko, Maxim Kuvyrkov, Alexey Klimov

On Sat, Nov 18, 2023 at 06:40:43PM +0000, Marc Zyngier wrote:
> On Sat, 18 Nov 2023 15:51:02 +0000,
> Yury Norov <yury.norov@gmail.com> wrote:
> > 
> > The function searches used_mask for a bit in a for-loop bit by bit.
> > We can do it faster by using atomic find_and_set_bit().
> 
> Sure, let's do things fast. Correctness is overrated anyway.
> 
> > 
> > The comment to the function says that it searches for the first free
> > counter, but obviously for_each_set_bit() searches for the first set
> > counter.
> 
> No it doesn't. It iterates over the counters the event can count on.
> 
> > The following test_and_set_bit() tries to enable already set
> > bit, which is weird.
> 
> Maybe you could try to actually read the code?
> 
> > 
> > This patch, by using find_and_set_bit(), fixes this automatically.
> 
> This doesn't fix anything, but instead actively breaks the driver.
> 
> > 
> > Fixes: a639027a1be1 ("drivers/perf: Add Apple icestorm/firestorm CPU PMU driver")
> > Signed-off-by: Yury Norov <yury.norov@gmail.com>
> > ---
> >  drivers/perf/apple_m1_cpu_pmu.c | 8 ++------
> >  1 file changed, 2 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c
> > index cd2de44b61b9..2d50670ffb01 100644
> > --- a/drivers/perf/apple_m1_cpu_pmu.c
> > +++ b/drivers/perf/apple_m1_cpu_pmu.c
> > @@ -447,12 +447,8 @@ static int m1_pmu_get_event_idx(struct pmu_hw_events *cpuc,
> >  	 * counting on the PMU at any given time, and by placing the
> >  	 * most constraining events first.
> >  	 */
> > -	for_each_set_bit(idx, &affinity, M1_PMU_NR_COUNTERS) {
> > -		if (!test_and_set_bit(idx, cpuc->used_mask))
> > -			return idx;
> > -	}
> > -
> > -	return -EAGAIN;
> > +	idx = find_and_set_bit(cpuc->used_mask, M1_PMU_NR_COUNTERS);
> > +	return idx < M1_PMU_NR_COUNTERS ? idx : -EAGAIN;
> 
> So now you're picking any possible counter, irrespective of the
> possible affinity of the event. This is great.

Ok, I'll drop the patch. Sorry.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API
  2023-11-18 15:50 ` [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API Yury Norov
@ 2023-11-21 15:53   ` Will Deacon
  2023-11-21 16:16     ` Yury Norov
  0 siblings, 1 reply; 10+ messages in thread
From: Will Deacon @ 2023-11-21 15:53 UTC (permalink / raw)
  To: Yury Norov
  Cc: linux-kernel, Mark Rutland, linux-arm-kernel, Jan Kara,
	Mirsad Todorovac, Matthew Wilcox, Rasmus Villemoes,
	Andy Shevchenko, Maxim Kuvyrkov, Alexey Klimov

On Sat, Nov 18, 2023 at 07:50:38AM -0800, Yury Norov wrote:
> Switch subsystem to use atomic find_bit() or atomic iterators as
> appropriate.
> 
> Signed-off-by: Yury Norov <yury.norov@gmail.com>
> ---
>  drivers/perf/arm-cci.c        | 23 +++++------------------
>  drivers/perf/arm-ccn.c        | 10 ++--------
>  drivers/perf/arm_dmc620_pmu.c |  9 ++-------
>  drivers/perf/arm_pmuv3.c      |  8 ++------
>  4 files changed, 11 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
> index 61de861eaf91..70fbf9d09d37 100644
> --- a/drivers/perf/arm-cci.c
> +++ b/drivers/perf/arm-cci.c
> @@ -320,12 +320,8 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu,
>  		return CCI400_PMU_CYCLE_CNTR_IDX;
>  	}
>  
> -	for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx)
> -		if (!test_and_set_bit(idx, hw->used_mask))
> -			return idx;
> -
> -	/* No counters available */
> -	return -EAGAIN;
> +	idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);

CCI400_PMU_CNTR0_IDX is defined as 1, so isn't this wrong?

[...]

> diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
> index 30cea6859574..e41c84dabc3e 100644
> --- a/drivers/perf/arm_dmc620_pmu.c
> +++ b/drivers/perf/arm_dmc620_pmu.c
> @@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event)
>  		end_idx = DMC620_PMU_MAX_COUNTERS;
>  	}
>  
> -	for (idx = start_idx; idx < end_idx; ++idx) {
> -		if (!test_and_set_bit(idx, dmc620_pmu->used_mask))
> -			return idx;
> -	}
> -
> -	/* The counters are all in use. */
> -	return -EAGAIN;
> +	idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx);

It might just be me, but I'd find this a tonne easier to read if you swapped
the last two arguments around so that the offset came before the limit in
the new function.

Will

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 08/34] drivers/perf: optimize ali_drw_get_counter_idx() by using find_bit()
  2023-11-18 15:50 ` [PATCH 08/34] drivers/perf: optimize ali_drw_get_counter_idx() by using find_bit() Yury Norov
@ 2023-11-21 15:54   ` Will Deacon
  0 siblings, 0 replies; 10+ messages in thread
From: Will Deacon @ 2023-11-21 15:54 UTC (permalink / raw)
  To: Yury Norov
  Cc: linux-kernel, Shuai Xue, Mark Rutland, linux-arm-kernel, Jan Kara,
	Mirsad Todorovac, Matthew Wilcox, Rasmus Villemoes,
	Andy Shevchenko, Maxim Kuvyrkov, Alexey Klimov

On Sat, Nov 18, 2023 at 07:50:39AM -0800, Yury Norov wrote:
> The function searches used_mask for a set bit in a for-loop bit by bit.
> We can do it faster by using atomic find_and_set_bit().
> 
> Signed-off-by: Yury Norov <yury.norov@gmail.com>
> ---
>  drivers/perf/alibaba_uncore_drw_pmu.c | 10 ++--------
>  1 file changed, 2 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c
> index 19d459a36be5..2a3b7701d568 100644
> --- a/drivers/perf/alibaba_uncore_drw_pmu.c
> +++ b/drivers/perf/alibaba_uncore_drw_pmu.c
> @@ -274,15 +274,9 @@ static const struct attribute_group *ali_drw_pmu_attr_groups[] = {
>  static int ali_drw_get_counter_idx(struct perf_event *event)
>  {
>  	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
> -	int idx;
> +	int idx = find_and_set_bit(drw_pmu->used_mask, ALI_DRW_PMU_COMMON_MAX_COUNTERS);
>  
> -	for (idx = 0; idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS; ++idx) {
> -		if (!test_and_set_bit(idx, drw_pmu->used_mask))
> -			return idx;
> -	}
> -
> -	/* The counters are all in use. */
> -	return -EBUSY;
> +	return idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS ? idx : -EBUSY;
>  }
>  
>  static u64 ali_drw_pmu_read_counter(struct perf_event *event)
> -- 
> 2.39.2

Acked-by: Will Deacon <will@kernel.org>

Will


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API
  2023-11-21 15:53   ` Will Deacon
@ 2023-11-21 16:16     ` Yury Norov
  2023-11-21 16:17       ` Will Deacon
  0 siblings, 1 reply; 10+ messages in thread
From: Yury Norov @ 2023-11-21 16:16 UTC (permalink / raw)
  To: Will Deacon
  Cc: linux-kernel, Mark Rutland, linux-arm-kernel, Jan Kara,
	Mirsad Todorovac, Matthew Wilcox, Rasmus Villemoes,
	Andy Shevchenko, Maxim Kuvyrkov, Alexey Klimov

On Tue, Nov 21, 2023 at 03:53:44PM +0000, Will Deacon wrote:
> On Sat, Nov 18, 2023 at 07:50:38AM -0800, Yury Norov wrote:
> > Switch subsystem to use atomic find_bit() or atomic iterators as
> > appropriate.
> > 
> > Signed-off-by: Yury Norov <yury.norov@gmail.com>
> > ---
> >  drivers/perf/arm-cci.c        | 23 +++++------------------
> >  drivers/perf/arm-ccn.c        | 10 ++--------
> >  drivers/perf/arm_dmc620_pmu.c |  9 ++-------
> >  drivers/perf/arm_pmuv3.c      |  8 ++------
> >  4 files changed, 11 insertions(+), 39 deletions(-)
> > 
> > diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
> > index 61de861eaf91..70fbf9d09d37 100644
> > --- a/drivers/perf/arm-cci.c
> > +++ b/drivers/perf/arm-cci.c
> > @@ -320,12 +320,8 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu,
> >  		return CCI400_PMU_CYCLE_CNTR_IDX;
> >  	}
> >  
> > -	for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx)
> > -		if (!test_and_set_bit(idx, hw->used_mask))
> > -			return idx;
> > -
> > -	/* No counters available */
> > -	return -EAGAIN;
> > +	idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);
> 
> CCI400_PMU_CNTR0_IDX is defined as 1, so isn't this wrong?

You're right. Will fix in v2
 
> [...]
> 
> > diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
> > index 30cea6859574..e41c84dabc3e 100644
> > --- a/drivers/perf/arm_dmc620_pmu.c
> > +++ b/drivers/perf/arm_dmc620_pmu.c
> > @@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event)
> >  		end_idx = DMC620_PMU_MAX_COUNTERS;
> >  	}
> >  
> > -	for (idx = start_idx; idx < end_idx; ++idx) {
> > -		if (!test_and_set_bit(idx, dmc620_pmu->used_mask))
> > -			return idx;
> > -	}
> > -
> > -	/* The counters are all in use. */
> > -	return -EAGAIN;
> > +	idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx);
> 
> It might just be me, but I'd find this a tonne easier to read if you swapped
> the last two arguments around so that the offset came before the limit in
> the new function.

I personally agree, but we already have find_next_*_bit(addr, nbits, offset)
functions, and having atomic versions of the same with different order
of arguments will make it even more messy...

Thanks,
        Yury

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API
  2023-11-21 16:16     ` Yury Norov
@ 2023-11-21 16:17       ` Will Deacon
  0 siblings, 0 replies; 10+ messages in thread
From: Will Deacon @ 2023-11-21 16:17 UTC (permalink / raw)
  To: Yury Norov
  Cc: linux-kernel, Mark Rutland, linux-arm-kernel, Jan Kara,
	Mirsad Todorovac, Matthew Wilcox, Rasmus Villemoes,
	Andy Shevchenko, Maxim Kuvyrkov, Alexey Klimov

On Tue, Nov 21, 2023 at 08:16:13AM -0800, Yury Norov wrote:
> On Tue, Nov 21, 2023 at 03:53:44PM +0000, Will Deacon wrote:
> > On Sat, Nov 18, 2023 at 07:50:38AM -0800, Yury Norov wrote:
> > > Switch subsystem to use atomic find_bit() or atomic iterators as
> > > appropriate.
> > > 
> > > Signed-off-by: Yury Norov <yury.norov@gmail.com>
> > > ---
> > >  drivers/perf/arm-cci.c        | 23 +++++------------------
> > >  drivers/perf/arm-ccn.c        | 10 ++--------
> > >  drivers/perf/arm_dmc620_pmu.c |  9 ++-------
> > >  drivers/perf/arm_pmuv3.c      |  8 ++------
> > >  4 files changed, 11 insertions(+), 39 deletions(-)
> > > 
> > > diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
> > > index 61de861eaf91..70fbf9d09d37 100644
> > > --- a/drivers/perf/arm-cci.c
> > > +++ b/drivers/perf/arm-cci.c
> > > @@ -320,12 +320,8 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu,
> > >  		return CCI400_PMU_CYCLE_CNTR_IDX;
> > >  	}
> > >  
> > > -	for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx)
> > > -		if (!test_and_set_bit(idx, hw->used_mask))
> > > -			return idx;
> > > -
> > > -	/* No counters available */
> > > -	return -EAGAIN;
> > > +	idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);
> > 
> > CCI400_PMU_CNTR0_IDX is defined as 1, so isn't this wrong?
> 
> You're right. Will fix in v2
>  
> > [...]
> > 
> > > diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
> > > index 30cea6859574..e41c84dabc3e 100644
> > > --- a/drivers/perf/arm_dmc620_pmu.c
> > > +++ b/drivers/perf/arm_dmc620_pmu.c
> > > @@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event)
> > >  		end_idx = DMC620_PMU_MAX_COUNTERS;
> > >  	}
> > >  
> > > -	for (idx = start_idx; idx < end_idx; ++idx) {
> > > -		if (!test_and_set_bit(idx, dmc620_pmu->used_mask))
> > > -			return idx;
> > > -	}
> > > -
> > > -	/* The counters are all in use. */
> > > -	return -EAGAIN;
> > > +	idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx);
> > 
> > It might just be me, but I'd find this a tonne easier to read if you swapped
> > the last two arguments around so that the offset came before the limit in
> > the new function.
> 
> I personally agree, but we already have find_next_*_bit(addr, nbits, offset)
> functions, and having atomic versions of the same with different order
> of arguments will make it even more messy...

Urgh, and there's loads of them too :(

Will

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2023-11-21 16:18 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20231118155105.25678-1-yury.norov@gmail.com>
2023-11-18 15:50 ` [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API Yury Norov
2023-11-21 15:53   ` Will Deacon
2023-11-21 16:16     ` Yury Norov
2023-11-21 16:17       ` Will Deacon
2023-11-18 15:50 ` [PATCH 08/34] drivers/perf: optimize ali_drw_get_counter_idx() by using find_bit() Yury Norov
2023-11-21 15:54   ` Will Deacon
2023-11-18 15:50 ` [PATCH 17/34] iommu: use atomic find_bit() API where appropriate Yury Norov
2023-11-18 15:51 ` [PATCH 31/34] drivers/perf: optimize m1_pmu_get_event_idx() by using find_bit() API Yury Norov
2023-11-18 18:40   ` Marc Zyngier
2023-11-18 18:45     ` Yury Norov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).