* [PATCH v4] nvdimm/btt: Handle preemption in BTT lane acquisition
@ 2026-05-14 20:17 Alison Schofield
From: Alison Schofield @ 2026-05-14 20:17 UTC (permalink / raw)
To: Dan Williams; +Cc: Alison Schofield, nvdimm
BTT lanes serialize access to per-lane metadata and workspace state
during BTT I/O. The btt-check unit test reports data mismatches during
BTT writes due to a race in lane acquisition that can lead to silent
data corruption.
The existing lane model uses a spinlock together with a per-CPU
recursion count. That recursion model stopped being valid after BTT
lanes became preemptible: another task can run on the same CPU,
observe a non-zero recursion count, bypass locking, and use the same
lane concurrently.
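For illustration, here is the old fast path (removed in the diff below)
annotated with one interleaving that loses the lock; the comments are a
sketch of the race, not a trace from the test:

	/* old nd_region_acquire_lane(), num_lanes < nr_cpu_ids path */
	lane = cpu % nd_region->num_lanes;
	ndl_count = per_cpu_ptr(nd_region->lane, cpu);
	ndl_lock = per_cpu_ptr(nd_region->lane, lane);
	if (ndl_count->count++ == 0)	/* task A: 0 -> 1, takes the lock */
		spin_lock(&ndl_lock->lock);
	/*
	 * Task A is preempted here. Task B runs on the same CPU, sees
	 * count == 1, skips spin_lock(), and uses the same lane
	 * concurrently with task A.
	 */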
BTT lanes are also held across metadata and data updates that can
reach nvdimm_flush(). Some provider flush callbacks can sleep, making
a spinlock the wrong primitive for the lane lifetime. That issue
predates this fix, but becomes more visible now that BTT lanes are
preemptible.
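The shape of the problem, sketched (nvdimm_flush() is the existing
region flush entry point; the sequence stands in for the BTT write
path rather than quoting it):

	spin_lock(&ndl_lock->lock);	/* lane held, atomic context */
	/* ... per-lane BTT metadata and data updates ... */
	nvdimm_flush(nd_region, bio);	/* provider callback may sleep */
	spin_unlock(&ndl_lock->lock);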
Replace the spinlock-based recursion model with a dynamically
allocated per-lane mutex array and take the lane lock unconditionally.
Add might_sleep() to catch any future atomic-context caller.
Found with the ndctl unit test btt-check.sh.
Fixes: 36c75ce3bd29 ("nd_btt: Make BTT lanes preemptible")
Assisted-by: Claude Sonnet 4.5
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
---
Changes in v4:
- Replace per-CPU lane storage with a dynamically allocated mutex array (Sashiko)
- Remove the recursion fast path and take the lane lock unconditionally
- Update commit log
Changes in v3:
- Replace spinlock with a per-lane mutex (Aboorva)
Changes in v2:
- Use spin_(un)lock_bh() (Sashiko AI)
- Update commit log for softirq re-entry and the spinlock change
A new unit test to stress this is under review here:
https://lore.kernel.org/nvdimm/20260424233633.3762217-1-alison.schofield@intel.com/
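For reference, the caller pattern with this change applied is the plain
acquire/use/release sequence; this is a sketch mirroring the BTT I/O
path, not a quote of it:

	unsigned int lane;

	lane = nd_region_acquire_lane(nd_region);	/* may sleep */
	/* per-lane BTT log and data updates, may reach nvdimm_flush() */
	nd_region_release_lane(nd_region, lane);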
drivers/nvdimm/nd.h | 5 ++-
drivers/nvdimm/region_devs.c | 62 +++++++++++-------------------------
2 files changed, 20 insertions(+), 47 deletions(-)
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index b199eea3260e..69f329075527 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -366,8 +366,7 @@ unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd);
res; res = next, next = next ? next->sibling : NULL)
struct nd_percpu_lane {
- int count;
- spinlock_t lock;
+ struct mutex lock; /* serialize lane access */
};
enum nd_label_flags {
@@ -420,7 +419,7 @@ struct nd_region {
struct kernfs_node *bb_state;
struct badblocks bb;
struct nd_interleave_set *nd_set;
- struct nd_percpu_lane __percpu *lane;
+ struct nd_percpu_lane *lane;
int (*flush)(struct nd_region *nd_region, struct bio *bio);
struct nd_mapping mapping[] __counted_by(ndr_mappings);
};
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index e35c2e18518f..bc5e402bbd9a 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -192,7 +192,7 @@ static void nd_region_release(struct device *dev)
put_device(&nvdimm->dev);
}
- free_percpu(nd_region->lane);
+ kfree(nd_region->lane);
if (!test_bit(ND_REGION_CXL, &nd_region->flags))
memregion_free(nd_region->id);
kfree(nd_region);
@@ -904,52 +904,28 @@ void nd_region_advance_seeds(struct nd_region *nd_region, struct device *dev)
* nd_region_acquire_lane - allocate and lock a lane
* @nd_region: region id and number of lanes possible
*
- * A lane correlates to a BLK-data-window and/or a log slot in the BTT.
- * We optimize for the common case where there are 256 lanes, one
- * per-cpu. For larger systems we need to lock to share lanes. For now
- * this implementation assumes the cost of maintaining an allocator for
- * free lanes is on the order of the lock hold time, so it implements a
- * static lane = cpu % num_lanes mapping.
+ * A lane correlates to a log slot in the BTT. Lanes are shared across
+ * CPUs using a static lane = cpu % num_lanes mapping, with a per-lane
+ * mutex to serialize access.
*
- * In the case of a BTT instance on top of a BLK namespace a lane may be
- * acquired recursively. We lock on the first instance.
- *
- * In the case of a BTT instance on top of PMEM, we only acquire a lane
- * for the BTT metadata updates.
+ * Callers must be in sleepable context. The only in-tree caller is
+ * BTT's ->submit_bio handler (btt_read_pg / btt_write_pg).
*/
unsigned int nd_region_acquire_lane(struct nd_region *nd_region)
{
- unsigned int cpu, lane;
+ unsigned int lane;
- migrate_disable();
- cpu = smp_processor_id();
- if (nd_region->num_lanes < nr_cpu_ids) {
- struct nd_percpu_lane *ndl_lock, *ndl_count;
-
- lane = cpu % nd_region->num_lanes;
- ndl_count = per_cpu_ptr(nd_region->lane, cpu);
- ndl_lock = per_cpu_ptr(nd_region->lane, lane);
- if (ndl_count->count++ == 0)
- spin_lock(&ndl_lock->lock);
- } else
- lane = cpu;
+ might_sleep();
+ lane = raw_smp_processor_id() % nd_region->num_lanes;
+ mutex_lock(&nd_region->lane[lane].lock);
return lane;
}
EXPORT_SYMBOL(nd_region_acquire_lane);
void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane)
{
- if (nd_region->num_lanes < nr_cpu_ids) {
- unsigned int cpu = smp_processor_id();
- struct nd_percpu_lane *ndl_lock, *ndl_count;
-
- ndl_count = per_cpu_ptr(nd_region->lane, cpu);
- ndl_lock = per_cpu_ptr(nd_region->lane, lane);
- if (--ndl_count->count == 0)
- spin_unlock(&ndl_lock->lock);
- }
- migrate_enable();
+ mutex_unlock(&nd_region->lane[lane].lock);
}
EXPORT_SYMBOL(nd_region_release_lane);
@@ -1019,17 +995,16 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
goto err_id;
}
- nd_region->lane = alloc_percpu(struct nd_percpu_lane);
+ nd_region->num_lanes = ndr_desc->num_lanes;
+ if (!nd_region->num_lanes)
+ goto err_percpu;
+ nd_region->lane = kcalloc(nd_region->num_lanes,
+ sizeof(*nd_region->lane), GFP_KERNEL);
if (!nd_region->lane)
goto err_percpu;
- for (i = 0; i < nr_cpu_ids; i++) {
- struct nd_percpu_lane *ndl;
-
- ndl = per_cpu_ptr(nd_region->lane, i);
- spin_lock_init(&ndl->lock);
- ndl->count = 0;
- }
+ for (i = 0; i < nd_region->num_lanes; i++)
+ mutex_init(&nd_region->lane[i].lock);
for (i = 0; i < ndr_desc->num_mappings; i++) {
struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
@@ -1046,7 +1021,6 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
}
nd_region->provider_data = ndr_desc->provider_data;
nd_region->nd_set = ndr_desc->nd_set;
- nd_region->num_lanes = ndr_desc->num_lanes;
nd_region->flags = ndr_desc->flags;
nd_region->ro = ro;
nd_region->numa_node = ndr_desc->numa_node;
base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
--
2.37.3
* Re: [PATCH v4] nvdimm/btt: Handle preemption in BTT lane acquisition
2026-05-14 20:17 [PATCH v4] nvdimm/btt: Handle preemption in BTT lane acquisition Alison Schofield
@ 2026-05-14 20:23 ` Alison Schofield
0 siblings, 0 replies; 2+ messages in thread
From: Alison Schofield @ 2026-05-14 20:23 UTC (permalink / raw)
To: Dan Williams, Vishal Verma, Dave Jiang, Ira Weiny,
Aboorva Devarajan; +Cc: nvdimm
On Thu, May 14, 2026 at 01:17:54PM -0700, Alison Schofield wrote:
Sorry to folks that got cut from the To: list.
Here you go.