public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/2] iommu/iova: Make the rcache depot properly flexible
@ 2023-08-21 18:22 Robin Murphy
  2023-08-21 18:22 ` [PATCH v2 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Robin Murphy @ 2023-08-21 18:22 UTC (permalink / raw)
  To: joro
  Cc: will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

v1: https://lore.kernel.org/linux-iommu/cover.1692033783.git.robin.murphy@arm.com/

Hi all,

Here's a quick update, cleaning up the silly bugs from v1 and taking
John's suggestion to improve the comments.

Cheers,
Robin.


Robin Murphy (2):
  iommu/iova: Make the rcache depot scale better
  iommu/iova: Manage the depot list size

 drivers/iommu/iova.c | 94 ++++++++++++++++++++++++++++++--------------
 1 file changed, 64 insertions(+), 30 deletions(-)

-- 
2.39.2.101.g768bb238c484.dirty


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2 1/2] iommu/iova: Make the rcache depot scale better
  2023-08-21 18:22 [PATCH v2 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
@ 2023-08-21 18:22 ` Robin Murphy
  2023-08-25 16:55   ` Jerry Snitselaar
  2023-08-21 18:22 ` [PATCH v2 2/2] iommu/iova: Manage the depot list size Robin Murphy
  2023-08-29  1:20 ` [PATCH v2 0/2] iommu/iova: Make the rcache depot properly flexible zhangzekun (A)
  2 siblings, 1 reply; 8+ messages in thread
From: Robin Murphy @ 2023-08-21 18:22 UTC (permalink / raw)
  To: joro
  Cc: will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

The algorithm in the original paper specifies the storage of full
magazines in the depot as an unbounded list rather than a fixed-size
array. It turns out to be pretty straightforward to do this in our
implementation with no significant loss of efficiency. This allows
the depot to scale up to the working set sizes of larger systems,
while also potentially saving some memory on smaller ones too.

Since this involves touching struct iova_magazine with the requisite
care, we may as well reinforce the comment with a proper assertion too.

Reviewed-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v2: Fix freeing loops, Improve comment and add matching assertion

 drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 10b964600948..dd2309e9a6c5 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -622,15 +622,19 @@ EXPORT_SYMBOL_GPL(reserve_iova);
 /*
  * As kmalloc's buffer size is fixed to power of 2, 127 is chosen to
  * assure size of 'iova_magazine' to be 1024 bytes, so that no memory
- * will be wasted.
+ * will be wasted. Since only full magazines are inserted into the depot,
+ * we don't need to waste PFN capacity on a separate list head either.
  */
 #define IOVA_MAG_SIZE 127
-#define MAX_GLOBAL_MAGS 32	/* magazines per bin */
 
 struct iova_magazine {
-	unsigned long size;
+	union {
+		unsigned long size;
+		struct iova_magazine *next;
+	};
 	unsigned long pfns[IOVA_MAG_SIZE];
 };
+static_assert(!(sizeof(struct iova_magazine) & (sizeof(struct iova_magazine) - 1)));
 
 struct iova_cpu_rcache {
 	spinlock_t lock;
@@ -640,8 +644,7 @@ struct iova_cpu_rcache {
 
 struct iova_rcache {
 	spinlock_t lock;
-	unsigned long depot_size;
-	struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+	struct iova_magazine *depot;
 	struct iova_cpu_rcache __percpu *cpu_rcaches;
 };
 
@@ -717,6 +720,21 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
 	mag->pfns[mag->size++] = pfn;
 }
 
+static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
+{
+	struct iova_magazine *mag = rcache->depot;
+
+	rcache->depot = mag->next;
+	mag->size = IOVA_MAG_SIZE;
+	return mag;
+}
+
+static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *mag)
+{
+	mag->next = rcache->depot;
+	rcache->depot = mag;
+}
+
 int iova_domain_init_rcaches(struct iova_domain *iovad)
 {
 	unsigned int cpu;
@@ -734,7 +752,6 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
 
 		rcache = &iovad->rcaches[i];
 		spin_lock_init(&rcache->lock);
-		rcache->depot_size = 0;
 		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
 						     cache_line_size());
 		if (!rcache->cpu_rcaches) {
@@ -776,7 +793,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 				 struct iova_rcache *rcache,
 				 unsigned long iova_pfn)
 {
-	struct iova_magazine *mag_to_free = NULL;
 	struct iova_cpu_rcache *cpu_rcache;
 	bool can_insert = false;
 	unsigned long flags;
@@ -794,12 +810,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 
 		if (new_mag) {
 			spin_lock(&rcache->lock);
-			if (rcache->depot_size < MAX_GLOBAL_MAGS) {
-				rcache->depot[rcache->depot_size++] =
-						cpu_rcache->loaded;
-			} else {
-				mag_to_free = cpu_rcache->loaded;
-			}
+			iova_depot_push(rcache, cpu_rcache->loaded);
 			spin_unlock(&rcache->lock);
 
 			cpu_rcache->loaded = new_mag;
@@ -812,11 +823,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 
 	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
 
-	if (mag_to_free) {
-		iova_magazine_free_pfns(mag_to_free, iovad);
-		iova_magazine_free(mag_to_free);
-	}
-
 	return can_insert;
 }
 
@@ -854,9 +860,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
 		has_pfn = true;
 	} else {
 		spin_lock(&rcache->lock);
-		if (rcache->depot_size > 0) {
+		if (rcache->depot) {
 			iova_magazine_free(cpu_rcache->loaded);
-			cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+			cpu_rcache->loaded = iova_depot_pop(rcache);
 			has_pfn = true;
 		}
 		spin_unlock(&rcache->lock);
@@ -895,9 +901,8 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 	struct iova_rcache *rcache;
 	struct iova_cpu_rcache *cpu_rcache;
 	unsigned int cpu;
-	int i, j;
 
-	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+	for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
 		rcache = &iovad->rcaches[i];
 		if (!rcache->cpu_rcaches)
 			break;
@@ -907,8 +912,8 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 			iova_magazine_free(cpu_rcache->prev);
 		}
 		free_percpu(rcache->cpu_rcaches);
-		for (j = 0; j < rcache->depot_size; ++j)
-			iova_magazine_free(rcache->depot[j]);
+		while (rcache->depot)
+			iova_magazine_free(iova_depot_pop(rcache));
 	}
 
 	kfree(iovad->rcaches);
@@ -942,16 +947,16 @@ static void free_global_cached_iovas(struct iova_domain *iovad)
 {
 	struct iova_rcache *rcache;
 	unsigned long flags;
-	int i, j;
 
-	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+	for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
 		rcache = &iovad->rcaches[i];
 		spin_lock_irqsave(&rcache->lock, flags);
-		for (j = 0; j < rcache->depot_size; ++j) {
-			iova_magazine_free_pfns(rcache->depot[j], iovad);
-			iova_magazine_free(rcache->depot[j]);
+		while (rcache->depot) {
+			struct iova_magazine *mag = iova_depot_pop(rcache);
+
+			iova_magazine_free_pfns(mag, iovad);
+			iova_magazine_free(mag);
 		}
-		rcache->depot_size = 0;
 		spin_unlock_irqrestore(&rcache->lock, flags);
 	}
 }
-- 
2.39.2.101.g768bb238c484.dirty


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 2/2] iommu/iova: Manage the depot list size
  2023-08-21 18:22 [PATCH v2 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
  2023-08-21 18:22 ` [PATCH v2 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
@ 2023-08-21 18:22 ` Robin Murphy
  2023-08-25 17:01   ` Jerry Snitselaar
  2023-09-07 17:54   ` Srivastava, Dheeraj Kumar
  2023-08-29  1:20 ` [PATCH v2 0/2] iommu/iova: Make the rcache depot properly flexible zhangzekun (A)
  2 siblings, 2 replies; 8+ messages in thread
From: Robin Murphy @ 2023-08-21 18:22 UTC (permalink / raw)
  To: joro
  Cc: will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

Automatically scaling the depot up to suit the peak capacity of a
workload is all well and good, but it would be nice to have a way to
scale it back down again if the workload changes. To that end, add
background reclaim that will gradually free surplus magazines if the
depot size remains above a reasonable threshold for long enough.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v2: fix reschedule delay typo

 drivers/iommu/iova.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index dd2309e9a6c5..436f42855c29 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -11,6 +11,7 @@
 #include <linux/smp.h>
 #include <linux/bitops.h>
 #include <linux/cpu.h>
+#include <linux/workqueue.h>
 
 /* The anchor node sits above the top of the usable address space */
 #define IOVA_ANCHOR	~0UL
@@ -627,6 +628,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
  */
 #define IOVA_MAG_SIZE 127
 
+#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
+
 struct iova_magazine {
 	union {
 		unsigned long size;
@@ -644,8 +647,11 @@ struct iova_cpu_rcache {
 
 struct iova_rcache {
 	spinlock_t lock;
+	unsigned int depot_size;
 	struct iova_magazine *depot;
 	struct iova_cpu_rcache __percpu *cpu_rcaches;
+	struct iova_domain *iovad;
+	struct delayed_work work;
 };
 
 static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
@@ -726,6 +732,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
 
 	rcache->depot = mag->next;
 	mag->size = IOVA_MAG_SIZE;
+	rcache->depot_size--;
 	return mag;
 }
 
@@ -733,6 +740,24 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma
 {
 	mag->next = rcache->depot;
 	rcache->depot = mag;
+	rcache->depot_size++;
+}
+
+static void iova_depot_work_func(struct work_struct *work)
+{
+	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
+	struct iova_magazine *mag = NULL;
+
+	spin_lock(&rcache->lock);
+	if (rcache->depot_size > num_online_cpus())
+		mag = iova_depot_pop(rcache);
+	spin_unlock(&rcache->lock);
+
+	if (mag) {
+		iova_magazine_free_pfns(mag, rcache->iovad);
+		iova_magazine_free(mag);
+		schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
+	}
 }
 
 int iova_domain_init_rcaches(struct iova_domain *iovad)
@@ -752,6 +777,8 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
 
 		rcache = &iovad->rcaches[i];
 		spin_lock_init(&rcache->lock);
+		rcache->iovad = iovad;
+		INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func);
 		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
 						     cache_line_size());
 		if (!rcache->cpu_rcaches) {
@@ -812,6 +839,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 			spin_lock(&rcache->lock);
 			iova_depot_push(rcache, cpu_rcache->loaded);
 			spin_unlock(&rcache->lock);
+			schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
 
 			cpu_rcache->loaded = new_mag;
 			can_insert = true;
@@ -912,6 +940,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 			iova_magazine_free(cpu_rcache->prev);
 		}
 		free_percpu(rcache->cpu_rcaches);
+		cancel_delayed_work_sync(&rcache->work);
 		while (rcache->depot)
 			iova_magazine_free(iova_depot_pop(rcache));
 	}
-- 
2.39.2.101.g768bb238c484.dirty


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 1/2] iommu/iova: Make the rcache depot scale better
  2023-08-21 18:22 ` [PATCH v2 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
@ 2023-08-25 16:55   ` Jerry Snitselaar
  0 siblings, 0 replies; 8+ messages in thread
From: Jerry Snitselaar @ 2023-08-25 16:55 UTC (permalink / raw)
  To: Robin Murphy
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava

On Mon, Aug 21, 2023 at 07:22:51PM +0100, Robin Murphy wrote:
> The algorithm in the original paper specifies the storage of full
> magazines in the depot as an unbounded list rather than a fixed-size
> array. It turns out to be pretty straightforward to do this in our
> implementation with no significant loss of efficiency. This allows
> the depot to scale up to the working set sizes of larger systems,
> while also potentially saving some memory on smaller ones too.
> 
> Since this involves touching struct iova_magazine with the requisite
> care, we may as well reinforce the comment with a proper assertion too.
> 
> Reviewed-by: John Garry <john.g.garry@oracle.com>
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---


Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 2/2] iommu/iova: Manage the depot list size
  2023-08-21 18:22 ` [PATCH v2 2/2] iommu/iova: Manage the depot list size Robin Murphy
@ 2023-08-25 17:01   ` Jerry Snitselaar
  2023-09-07 17:54   ` Srivastava, Dheeraj Kumar
  1 sibling, 0 replies; 8+ messages in thread
From: Jerry Snitselaar @ 2023-08-25 17:01 UTC (permalink / raw)
  To: Robin Murphy
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava

On Mon, Aug 21, 2023 at 07:22:52PM +0100, Robin Murphy wrote:
> Automatically scaling the depot up to suit the peak capacity of a
> workload is all well and good, but it would be nice to have a way to
> scale it back down again if the workload changes. To that end, add
> background reclaim that will gradually free surplus magazines if the
> depot size remains above a reasonable threshold for long enough.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>

Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-08-21 18:22 [PATCH v2 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
  2023-08-21 18:22 ` [PATCH v2 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
  2023-08-21 18:22 ` [PATCH v2 2/2] iommu/iova: Manage the depot list size Robin Murphy
@ 2023-08-29  1:20 ` zhangzekun (A)
  2 siblings, 0 replies; 8+ messages in thread
From: zhangzekun (A) @ 2023-08-29  1:20 UTC (permalink / raw)
  To: Robin Murphy
  Cc: will, iommu, linux-kernel, john.g.garry, dheerajkumar.srivastava,
	jsnitsel, joro



在 2023/8/22 2:22, Robin Murphy 写道:
> v1: https://lore.kernel.org/linux-iommu/cover.1692033783.git.robin.murphy@arm.com/
>
> Hi all,
>
> Here's a quick update, cleaning up the silly bugs from v1 and taking
> John's suggestion to improve the comments.
>
> Cheers,
> Robin.
>
>
> Robin Murphy (2):
>    iommu/iova: Make the rcache depot scale better
>    iommu/iova: Manage the depot list size
>
>   drivers/iommu/iova.c | 94 ++++++++++++++++++++++++++++++--------------
>   1 file changed, 64 insertions(+), 30 deletions(-)
>

Sorry for the delay, I ran into some trouble with our performance test 
environment. I have done some performance testing for these patches, and 
the results look good.

Thanks,
Zekun

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 2/2] iommu/iova: Manage the depot list size
  2023-08-21 18:22 ` [PATCH v2 2/2] iommu/iova: Manage the depot list size Robin Murphy
  2023-08-25 17:01   ` Jerry Snitselaar
@ 2023-09-07 17:54   ` Srivastava, Dheeraj Kumar
  2023-09-07 23:28     ` Robin Murphy
  1 sibling, 1 reply; 8+ messages in thread
From: Srivastava, Dheeraj Kumar @ 2023-09-07 17:54 UTC (permalink / raw)
  To: Robin Murphy, joro
  Cc: will, iommu, linux-kernel, zhangzekun11, john.g.garry, jsnitsel,
	wyes.karny, vasant.hegde

Hi Robin,

On 8/21/2023 11:52 PM, Robin Murphy wrote:
> Automatically scaling the depot up to suit the peak capacity of a
> workload is all well and good, but it would be nice to have a way to
> scale it back down again if the workload changes. To that end, add
> background reclaim that will gradually free surplus magazines if the
> depot size remains above a reasonable threshold for long enough.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> v2: fix reschedule delay typo
> 
>   drivers/iommu/iova.c | 29 +++++++++++++++++++++++++++++
>   1 file changed, 29 insertions(+)
> 
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index dd2309e9a6c5..436f42855c29 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -11,6 +11,7 @@
>   #include <linux/smp.h>
>   #include <linux/bitops.h>
>   #include <linux/cpu.h>
> +#include <linux/workqueue.h>
>   
>   /* The anchor node sits above the top of the usable address space */
>   #define IOVA_ANCHOR	~0UL
> @@ -627,6 +628,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>    */
>   #define IOVA_MAG_SIZE 127
>   
> +#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
> +
>   struct iova_magazine {
>   	union {
>   		unsigned long size;
> @@ -644,8 +647,11 @@ struct iova_cpu_rcache {
>   
>   struct iova_rcache {
>   	spinlock_t lock;
> +	unsigned int depot_size;
>   	struct iova_magazine *depot;
>   	struct iova_cpu_rcache __percpu *cpu_rcaches;
> +	struct iova_domain *iovad;
> +	struct delayed_work work;
>   };
>   
>   static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
> @@ -726,6 +732,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
>   
>   	rcache->depot = mag->next;
>   	mag->size = IOVA_MAG_SIZE;
> +	rcache->depot_size--;
>   	return mag;
>   }
>   
> @@ -733,6 +740,24 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma
>   {
>   	mag->next = rcache->depot;
>   	rcache->depot = mag;
> +	rcache->depot_size++;
> +}
> +
> +static void iova_depot_work_func(struct work_struct *work)
> +{
> +	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
> +	struct iova_magazine *mag = NULL;
> +
> +	spin_lock(&rcache->lock);
> +	if (rcache->depot_size > num_online_cpus())
> +		mag = iova_depot_pop(rcache);
> +	spin_unlock(&rcache->lock);
> +

Running fio test on nvme disk is causing hard LOCKUP issue with the 
below kernel logs

[ 7620.507689] watchdog: Watchdog detected hard LOCKUP on cpu 334
[ 7620.507694] Modules linked in: nvme_fabrics intel_rapl_msr 
intel_rapl_common amd64_edac edac_mce_amd kvm_amd kvm rapl wmi_bmof 
ipmi_ssif binfmt_misc nls_iso8859_1 input_leds joydev acpi_ipmi ipmi_si 
ccp ipmi_devintf k10temp ipmi_msghandler mac_hid sch_fq_codel 
dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr ramoops 
reed_solomon pstore_blk pstore_zone efi_pstore ip_tables x_tables 
autofs4 btrfs blake2b_generic raid10 raid456 async_raid6_recov 
async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 
raid0 multipath linear hid_generic usbhid ast dax_hmem i2c_algo_bit hid 
drm_shmem_helper drm_kms_helper cxl_acpi crct10dif_pclmul crc32_pclmul 
ghash_clmulni_intel sha512_ssse3 aesni_intel crypto_simd cryptd cxl_core 
nvme ahci drm tg3 nvme_core xhci_pci libahci i2c_piix4 xhci_pci_renesas wmi
[ 7620.507795] CPU: 334 PID: 3524 Comm: kworker/334:1 Not tainted 
6.5.0-woiovapatchfix #1
[ 7620.507800] Hardware name: AMD Corporation
[ 7620.507803] Workqueue: events iova_depot_work_func
[ 7620.507813] RIP: 0010:native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.507823] Code: 24 0f b6 d2 c1 e2 08 30 e4 09 d0 a9 00 01 ff ff 0f 
85 f6 01 00 00 85 c0 74 14 41 0f b6 04 24 84 c0 74 0b f3 90 41 0f b6 04 
24 <84> c0 75 f5 b8 01 00 00 00 66 41 89 04 24 5b 41 5c 41 5d 41 5e 41
[ 7620.507826] RSP: 0018:ffa000001e450c60 EFLAGS: 00000002
[ 7620.507830] RAX: 0000000000000001 RBX: 000000000000007f RCX: 
0000000000000001
[ 7620.507832] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 
ff11000167b7c000
[ 7620.507834] RBP: ffa000001e450c88 R08: 0000000000000086 R09: 
ff110055f3bf9800
[ 7620.507836] R10: 0000000000001000 R11: ffa000001e450ff8 R12: 
ff11000167b7c000
[ 7620.507838] R13: ff110060639dce08 R14: 000000000007fc54 R15: 
ff110055f3bf9800
[ 7620.507840] FS:  0000000000000000(0000) GS:ff11005ecf380000(0000) 
knlGS:0000000000000000
[ 7620.507843] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7620.507846] CR2: 00007f3678bc4010 CR3: 000000016e00c003 CR4: 
0000000000771ee0
[ 7620.507848] PKRU: 55555554
[ 7620.507850] Call Trace:
[ 7620.507852]  <NMI>
[ 7620.507857]  ? show_regs+0x6e/0x80
[ 7620.507865]  ? watchdog_hardlockup_check+0x16e/0x330
[ 7620.507872]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507877]  ? watchdog_overflow_callback+0x70/0x80
[ 7620.507883]  ? __perf_event_overflow+0x13b/0x2b0
[ 7620.507888]  ? x86_perf_event_set_period+0xe7/0x1d0
[ 7620.507895]  ? perf_event_overflow+0x1d/0x30
[ 7620.507901]  ? amd_pmu_v2_handle_irq+0x158/0x2f0
[ 7620.507911]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507914]  ? flush_tlb_one_kernel+0x12/0x30
[ 7620.507920]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507923]  ? set_pte_vaddr_p4d+0x5c/0x70
[ 7620.507931]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507933]  ? set_pte_vaddr+0x80/0xb0
[ 7620.507938]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507942]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507945]  ? native_set_fixmap+0x6d/0x90
[ 7620.507949]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507952]  ? ghes_copy_tofrom_phys+0x79/0x120
[ 7620.507960]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507962]  ? __ghes_peek_estatus.isra.0+0x4e/0xc0
[ 7620.507968]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507973]  ? perf_event_nmi_handler+0x31/0x60
[ 7620.507978]  ? nmi_handle+0x68/0x180
[ 7620.507985]  ? default_do_nmi+0x45/0x120
[ 7620.507991]  ? exc_nmi+0x142/0x1c0
[ 7620.507996]  ? end_repeat_nmi+0x16/0x67
[ 7620.508006]  ? native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.508012]  ? native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.508019]  ? native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.508024]  </NMI>
[ 7620.508026]  <IRQ>
[ 7620.508029]  _raw_spin_lock+0x2d/0x40
[ 7620.508034]  free_iova_fast+0x12e/0x1b0
[ 7620.508041]  fq_ring_free+0x9b/0x150
[ 7620.508045]  ? amd_iommu_unmap_pages+0x49/0x130
[ 7620.508051]  iommu_dma_free_iova+0x10c/0x300
[ 7620.508058]  __iommu_dma_unmap+0xca/0x140
[ 7620.508067]  iommu_dma_unmap_page+0x4f/0xb0
[ 7620.508073]  dma_unmap_page_attrs+0x183/0x1c0
[ 7620.508081]  nvme_unmap_data+0x105/0x150 [nvme]
[ 7620.508095]  nvme_pci_complete_batch+0xaf/0xd0 [nvme]
[ 7620.508106]  nvme_irq+0x7b/0x90 [nvme]
[ 7620.508116]  ? __pfx_nvme_pci_complete_batch+0x10/0x10 [nvme]
[ 7620.508125]  __handle_irq_event_percpu+0x50/0x1b0
[ 7620.508133]  handle_irq_event+0x3d/0x80
[ 7620.508138]  handle_edge_irq+0x90/0x240
[ 7620.508143]  __common_interrupt+0x52/0x100
[ 7620.508149]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.508153]  common_interrupt+0x8d/0xa0
[ 7620.508157]  </IRQ>
[ 7620.508158]  <TASK>
[ 7620.508161]  asm_common_interrupt+0x2b/0x40
[ 7620.508165] RIP: 0010:iova_depot_work_func+0x28/0xb0
[ 7620.508170] Code: 90 90 f3 0f 1e fa 0f 1f 44 00 00 55 48 89 e5 41 56 
4c 8d 77 e0 41 55 41 54 49 89 fc 4c 89 f7 e8 2e 9d 5e 00 8b 05 58 d6 bb 
01 <41> 39 44 24 e4 77 14 4c 89 f7 e8 f9 9d 5e 00 41 5c 41 5d 41 5e 5d
[ 7620.508173] RSP: 0018:ffa0000025513e30 EFLAGS: 00000246
[ 7620.508176] RAX: 0000000000000200 RBX: ff110001988f4540 RCX: 
ff11005ecf3b3ca8
[ 7620.508178] RDX: 0000000000000001 RSI: ff110001000420b0 RDI: 
ff11000167b7c000
[ 7620.508180] RBP: ffa0000025513e48 R08: ff110001000420b0 R09: 
ff110001988f45c0
[ 7620.508181] R10: 0000000000000007 R11: 0000000000000007 R12: 
ff11000167b7c020
[ 7620.508183] R13: ff110001001f1600 R14: ff11000167b7c000 R15: 
ff110001001f1605
[ 7620.508193]  process_one_work+0x178/0x350
[ 7620.508202]  ? __pfx_worker_thread+0x10/0x10
[ 7620.508207]  worker_thread+0x2f7/0x420
[ 7620.508215]  ? __pfx_worker_thread+0x10/0x10
[ 7620.508220]  kthread+0xf8/0x130
[ 7620.508226]  ? __pfx_kthread+0x10/0x10
[ 7620.508231]  ret_from_fork+0x3d/0x60
[ 7620.508235]  ? __pfx_kthread+0x10/0x10
[ 7620.508240]  ret_from_fork_asm+0x1b/0x30
[ 7620.508252]  </TASK>

Looking into the trace and your patch figured out that in 
iova_depot_work_func workqueue function rcache->lock is taken via 
spin_lock. But the same lock will be taken from IRQ context also.
So, to prevent IRQ when the rcache->lock is taken we should disable IRQ.
Therefore use spin_lock_irqsave in place of normal spin_lock.

Below changes fixes the issue

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 436f42855c29..d30e453d0fb4 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -747,11 +747,12 @@ static void iova_depot_work_func(struct 
work_struct *work)
  {
	struct iova_rcache *rcache = container_of(work, typeof(*rcache), 
work.work);
	struct iova_magazine *mag = NULL;
+	unsigned long flags;

-	spin_lock(&rcache->lock);
+	spin_lock_irqsave(&rcache->lock, flags);
	if (rcache->depot_size > num_online_cpus())
		mag = iova_depot_pop(rcache);
-	spin_unlock(&rcache->lock);
+	spin_unlock_irqrestore(&rcache->lock, flags);

	if (mag) {
		iova_magazine_free_pfns(mag, rcache->iovad);


> +	if (mag) {
> +		iova_magazine_free_pfns(mag, rcache->iovad);
> +		iova_magazine_free(mag);
> +		schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
> +	}
>   }
>   
>   int iova_domain_init_rcaches(struct iova_domain *iovad)
> @@ -752,6 +777,8 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
>   
>   		rcache = &iovad->rcaches[i];
>   		spin_lock_init(&rcache->lock);
> +		rcache->iovad = iovad;
> +		INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func);
>   		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
>   						     cache_line_size());
>   		if (!rcache->cpu_rcaches) {
> @@ -812,6 +839,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
>   			spin_lock(&rcache->lock);
>   			iova_depot_push(rcache, cpu_rcache->loaded);
>   			spin_unlock(&rcache->lock);
> +			schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
>   
>   			cpu_rcache->loaded = new_mag;
>   			can_insert = true;
> @@ -912,6 +940,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
>   			iova_magazine_free(cpu_rcache->prev);
>   		}
>   		free_percpu(rcache->cpu_rcaches);
> +		cancel_delayed_work_sync(&rcache->work);
>   		while (rcache->depot)
>   			iova_magazine_free(iova_depot_pop(rcache));
>   	}

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 2/2] iommu/iova: Manage the depot list size
  2023-09-07 17:54   ` Srivastava, Dheeraj Kumar
@ 2023-09-07 23:28     ` Robin Murphy
  0 siblings, 0 replies; 8+ messages in thread
From: Robin Murphy @ 2023-09-07 23:28 UTC (permalink / raw)
  To: Srivastava, Dheeraj Kumar, joro
  Cc: will, iommu, linux-kernel, zhangzekun11, john.g.garry, jsnitsel,
	wyes.karny, vasant.hegde

On 2023-09-07 18:54, Srivastava, Dheeraj Kumar wrote:
> Hi Robin,
> 
> On 8/21/2023 11:52 PM, Robin Murphy wrote:
>> Automatically scaling the depot up to suit the peak capacity of a
>> workload is all well and good, but it would be nice to have a way to
>> scale it back down again if the workload changes. To that end, add
>> backround reclaim that will gradually free surplus magazines if the

[ bah, I'll have to fix that typo too - thanks for the squiggle, 
Thunderbird ]

[...]
> Looking into the trace and your patch figured out that in 
> iova_depot_work_func workqueue function rcache->lock is taken via 
> spin_lock. But the same lock will be taken from IRQ context also.
> So, to prevent IRQ when the rcache->lock is taken we should disable IRQ.
> Therefore use spin_lock_irqsave in place of normal spin_lock.

Oof, indeed it seems I totally failed to consider IRQs... this tweak 
looks right to me, thanks for the catch! I'll get a v3 ready for -rc1...

Cheers,
Robin.

> Below changes fixes the issue
> 
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index 436f42855c29..d30e453d0fb4 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -747,11 +747,12 @@ static void iova_depot_work_func(struct 
> work_struct *work)
>   {
>      struct iova_rcache *rcache = container_of(work, typeof(*rcache), 
> work.work);
>      struct iova_magazine *mag = NULL;
> +    unsigned long flags;
> 
> -    spin_lock(&rcache->lock);
> +    spin_lock_irqsave(&rcache->lock, flags);
>      if (rcache->depot_size > num_online_cpus())
>          mag = iova_depot_pop(rcache);
> -    spin_unlock(&rcache->lock);
> +    spin_unlock_irqrestore(&rcache->lock, flags);
> 
>      if (mag) {
>          iova_magazine_free_pfns(mag, rcache->iovad);
> 
> 
>> +    if (mag) {
>> +        iova_magazine_free_pfns(mag, rcache->iovad);
>> +        iova_magazine_free(mag);
>> +        schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
>> +    }
>>   }
>>   int iova_domain_init_rcaches(struct iova_domain *iovad)
>> @@ -752,6 +777,8 @@ int iova_domain_init_rcaches(struct iova_domain 
>> *iovad)
>>           rcache = &iovad->rcaches[i];
>>           spin_lock_init(&rcache->lock);
>> +        rcache->iovad = iovad;
>> +        INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func);
>>           rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
>>                                cache_line_size());
>>           if (!rcache->cpu_rcaches) {
>> @@ -812,6 +839,7 @@ static bool __iova_rcache_insert(struct 
>> iova_domain *iovad,
>>               spin_lock(&rcache->lock);
>>               iova_depot_push(rcache, cpu_rcache->loaded);
>>               spin_unlock(&rcache->lock);
>> +            schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
>>               cpu_rcache->loaded = new_mag;
>>               can_insert = true;
>> @@ -912,6 +940,7 @@ static void free_iova_rcaches(struct iova_domain 
>> *iovad)
>>               iova_magazine_free(cpu_rcache->prev);
>>           }
>>           free_percpu(rcache->cpu_rcaches);
>> +        cancel_delayed_work_sync(&rcache->work);
>>           while (rcache->depot)
>>               iova_magazine_free(iova_depot_pop(rcache));
>>       }

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2023-09-07 23:28 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-08-21 18:22 [PATCH v2 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
2023-08-21 18:22 ` [PATCH v2 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
2023-08-25 16:55   ` Jerry Snitselaar
2023-08-21 18:22 ` [PATCH v2 2/2] iommu/iova: Manage the depot list size Robin Murphy
2023-08-25 17:01   ` Jerry Snitselaar
2023-09-07 17:54   ` Srivastava, Dheeraj Kumar
2023-09-07 23:28     ` Robin Murphy
2023-08-29  1:20 ` [PATCH v2 0/2] iommu/iova: Make the rcache depot properly flexible zhangzekun (A)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox