* [PATCH v2 0/4] kvfree_rcu() updates related to polled API
@ 2022-11-29 15:58 Uladzislau Rezki (Sony)
2022-11-29 15:58 ` [PATCH v2 1/4] rcu/kvfree: Switch to a generic linked list API Uladzislau Rezki (Sony)
` (4 more replies)
0 siblings, 5 replies; 13+ messages in thread
From: Uladzislau Rezki (Sony) @ 2022-11-29 15:58 UTC (permalink / raw)
To: LKML, RCU, Paul E . McKenney
Cc: Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
Uladzislau Rezki, Oleksiy Avramchenko
v1 -> v2:
- Rebase on latest dev.2022.11.23a branch.
Uladzislau Rezki (Sony) (4):
rcu/kvfree: Switch to a generic linked list API
rcu/kvfree: Move bulk/list reclaim to separate functions
rcu/kvfree: Move need_offload_krc() out of krcp->lock
rcu/kvfree: Use a polled API to speedup a reclaim process
kernel/rcu/tree.c | 221 +++++++++++++++++++++++++++-------------------
1 file changed, 131 insertions(+), 90 deletions(-)
--
2.30.2
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH v2 1/4] rcu/kvfree: Switch to a generic linked list API
2022-11-29 15:58 [PATCH v2 0/4] kvfree_rcu() updates related to polled API Uladzislau Rezki (Sony)
@ 2022-11-29 15:58 ` Uladzislau Rezki (Sony)
2022-11-29 15:58 ` [PATCH v2 2/4] rcu/kvfree: Move bulk/list reclaim to separate functions Uladzislau Rezki (Sony)
` (3 subsequent siblings)
4 siblings, 0 replies; 13+ messages in thread
From: Uladzislau Rezki (Sony) @ 2022-11-29 15:58 UTC (permalink / raw)
To: LKML, RCU, Paul E . McKenney
Cc: Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
Uladzislau Rezki, Oleksiy Avramchenko
To make the code more readable and less confusing, switch
to the standard circular doubly-linked list API. It allows
us to simplify the code, since basic list operations are
well defined and documented.
Please note, this patch does not introduce any functional
change; it is limited to refactoring of the existing code.
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
kernel/rcu/tree.c | 89 +++++++++++++++++++++++------------------------
1 file changed, 43 insertions(+), 46 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 219b4b516f38..1c79b244aac9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2740,13 +2740,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
/**
* struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked between each other
* @nr_records: Number of active pointers in the array
- * @next: Next bulk object in the block chain
* @records: Array of the kvfree_rcu() pointers
*/
struct kvfree_rcu_bulk_data {
+ struct list_head list;
unsigned long nr_records;
- struct kvfree_rcu_bulk_data *next;
void *records[];
};
@@ -2762,21 +2762,21 @@ struct kvfree_rcu_bulk_data {
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
- struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
+ struct list_head bulk_head_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
@@ -2800,7 +2800,7 @@ struct kfree_rcu_cpu_work {
*/
struct kfree_rcu_cpu {
struct rcu_head *head;
- struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
+ struct list_head bulk_head[FREE_N_CHANNELS];
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
raw_spinlock_t lock;
struct delayed_work monitor_work;
@@ -2895,12 +2895,13 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)
/*
* This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bkvhead_free or ->head_free.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
- struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct list_head bulk_head[FREE_N_CHANNELS];
struct rcu_head *head, *next;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
@@ -2912,10 +2913,8 @@ static void kfree_rcu_work(struct work_struct *work)
raw_spin_lock_irqsave(&krcp->lock, flags);
// Channels 1 and 2.
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- bkvhead[i] = krwp->bkvhead_free[i];
- krwp->bkvhead_free[i] = NULL;
- }
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
// Channel 3.
head = krwp->head_free;
@@ -2924,36 +2923,33 @@ static void kfree_rcu_work(struct work_struct *work)
// Handle the first two channels.
for (i = 0; i < FREE_N_CHANNELS; i++) {
- for (; bkvhead[i]; bkvhead[i] = bnext) {
- bnext = bkvhead[i]->next;
- debug_rcu_bhead_unqueue(bkvhead[i]);
+ list_for_each_entry_safe(bnode, n, &bulk_head[i], list) {
+ debug_rcu_bhead_unqueue(bnode);
rcu_lock_acquire(&rcu_callback_map);
if (i == 0) { // kmalloc() / kfree().
trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bkvhead[i]->nr_records,
- bkvhead[i]->records);
+ rcu_state.name, bnode->nr_records,
+ bnode->records);
- kfree_bulk(bkvhead[i]->nr_records,
- bkvhead[i]->records);
+ kfree_bulk(bnode->nr_records, bnode->records);
} else { // vmalloc() / vfree().
- for (j = 0; j < bkvhead[i]->nr_records; j++) {
+ for (j = 0; j < bnode->nr_records; j++) {
trace_rcu_invoke_kvfree_callback(
- rcu_state.name,
- bkvhead[i]->records[j], 0);
+ rcu_state.name, bnode->records[j], 0);
- vfree(bkvhead[i]->records[j]);
+ vfree(bnode->records[j]);
}
}
rcu_lock_release(&rcu_callback_map);
raw_spin_lock_irqsave(&krcp->lock, flags);
- if (put_cached_bnode(krcp, bkvhead[i]))
- bkvhead[i] = NULL;
+ if (put_cached_bnode(krcp, bnode))
+ bnode = NULL;
raw_spin_unlock_irqrestore(&krcp->lock, flags);
- if (bkvhead[i])
- free_page((unsigned long) bkvhead[i]);
+ if (bnode)
+ free_page((unsigned long) bnode);
cond_resched_tasks_rcu_qs();
}
@@ -2989,7 +2985,7 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)
int i;
for (i = 0; i < FREE_N_CHANNELS; i++)
- if (krcp->bkvhead[i])
+ if (!list_empty(&krcp->bulk_head[i]))
return true;
return !!krcp->head;
@@ -3026,21 +3022,20 @@ static void kfree_rcu_monitor(struct work_struct *work)
for (i = 0; i < KFREE_N_BATCHES; i++) {
struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
- // Try to detach bkvhead or head and attach it over any
+ // Try to detach bulk_head or head and attach it over any
// available corresponding free channel. It can be that
// a previous RCU batch is in progress, it means that
// immediately to queue another one is not possible so
// in that case the monitor work is rearmed.
- if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
- (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
+ if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) ||
+ (!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) ||
(krcp->head && !krwp->head_free)) {
+
// Channel 1 corresponds to the SLAB-pointer bulk path.
// Channel 2 corresponds to vmalloc-pointer bulk path.
for (j = 0; j < FREE_N_CHANNELS; j++) {
- if (!krwp->bkvhead_free[j]) {
- krwp->bkvhead_free[j] = krcp->bkvhead[j];
- krcp->bkvhead[j] = NULL;
- }
+ if (list_empty(&krwp->bulk_head_free[j]))
+ list_replace_init(&krcp->bulk_head[j], &krwp->bulk_head_free[j]);
}
// Channel 3 corresponds to both SLAB and vmalloc
@@ -3152,10 +3147,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
return false;
idx = !!is_vmalloc_addr(ptr);
+ bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+ struct kvfree_rcu_bulk_data, list);
/* Check if a new block is required. */
- if (!(*krcp)->bkvhead[idx] ||
- (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+ if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
bnode = get_cached_bnode(*krcp);
if (!bnode && can_alloc) {
krc_this_cpu_unlock(*krcp, *flags);
@@ -3179,18 +3175,13 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
if (!bnode)
return false;
- /* Initialize the new block. */
+ // Initialize the new block and attach it.
bnode->nr_records = 0;
- bnode->next = (*krcp)->bkvhead[idx];
-
- /* Attach it to the head. */
- (*krcp)->bkvhead[idx] = bnode;
+ list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
}
/* Finally insert. */
- (*krcp)->bkvhead[idx]->records
- [(*krcp)->bkvhead[idx]->nr_records++] = ptr;
-
+ bnode->records[bnode->nr_records++] = ptr;
return true;
}
@@ -4779,7 +4770,7 @@ struct workqueue_struct *rcu_gp_wq;
static void __init kfree_rcu_batch_init(void)
{
int cpu;
- int i;
+ int i, j;
/* Clamp it to [0:100] seconds interval. */
if (rcu_delay_page_cache_fill_msec < 0 ||
@@ -4799,8 +4790,14 @@ static void __init kfree_rcu_batch_init(void)
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
+
+ for (j = 0; j < FREE_N_CHANNELS; j++)
+ INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
}
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
--
2.30.2
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH v2 2/4] rcu/kvfree: Move bulk/list reclaim to separate functions
2022-11-29 15:58 [PATCH v2 0/4] kvfree_rcu() updates related to polled API Uladzislau Rezki (Sony)
2022-11-29 15:58 ` [PATCH v2 1/4] rcu/kvfree: Switch to a generic linked list API Uladzislau Rezki (Sony)
@ 2022-11-29 15:58 ` Uladzislau Rezki (Sony)
2022-11-29 15:58 ` [PATCH v2 3/4] rcu/kvfree: Move need_offload_krc() out of krcp->lock Uladzislau Rezki (Sony)
` (2 subsequent siblings)
4 siblings, 0 replies; 13+ messages in thread
From: Uladzislau Rezki (Sony) @ 2022-11-29 15:58 UTC (permalink / raw)
To: LKML, RCU, Paul E . McKenney
Cc: Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
Uladzislau Rezki, Oleksiy Avramchenko
There are two different paths by which memory is reclaimed.
Currently this is open-coded, which makes it a bit messy
and harder to read.
Introduce two separate functions, kvfree_rcu_list() and
kvfree_rcu_bulk(), to cover the two independent cases.
Please note, this patch does not introduce any functional
change; it is limited to refactoring of the existing code.
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
kernel/rcu/tree.c | 114 ++++++++++++++++++++++++++--------------------
1 file changed, 65 insertions(+), 49 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1c79b244aac9..445f8c11a9a3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2893,6 +2893,65 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)
return freed;
}
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+ unsigned long flags;
+ int i;
+
+ debug_rcu_bhead_unqueue(bnode);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name, bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ if (put_cached_bnode(krcp, bnode))
+ bnode = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (bnode)
+ free_page((unsigned long) bnode);
+
+ cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+ struct rcu_head *next;
+
+ for (; head; head = next) {
+ void *ptr = (void *) head->func;
+ unsigned long offset = (void *) head - ptr;
+
+ next = head->next;
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
+
+ if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+ kvfree(ptr);
+
+ rcu_lock_release(&rcu_callback_map);
+ cond_resched_tasks_rcu_qs();
+ }
+}
+
/*
* This function is invoked in workqueue context after a grace period.
* It frees all the objects queued on ->bulk_head_free or ->head_free.
@@ -2902,10 +2961,10 @@ static void kfree_rcu_work(struct work_struct *work)
unsigned long flags;
struct kvfree_rcu_bulk_data *bnode, *n;
struct list_head bulk_head[FREE_N_CHANNELS];
- struct rcu_head *head, *next;
+ struct rcu_head *head;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
- int i, j;
+ int i;
krwp = container_of(to_rcu_work(work),
struct kfree_rcu_cpu_work, rcu_work);
@@ -2922,38 +2981,9 @@ static void kfree_rcu_work(struct work_struct *work)
raw_spin_unlock_irqrestore(&krcp->lock, flags);
// Handle the first two channels.
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- list_for_each_entry_safe(bnode, n, &bulk_head[i], list) {
- debug_rcu_bhead_unqueue(bnode);
-
- rcu_lock_acquire(&rcu_callback_map);
- if (i == 0) { // kmalloc() / kfree().
- trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bnode->nr_records,
- bnode->records);
-
- kfree_bulk(bnode->nr_records, bnode->records);
- } else { // vmalloc() / vfree().
- for (j = 0; j < bnode->nr_records; j++) {
- trace_rcu_invoke_kvfree_callback(
- rcu_state.name, bnode->records[j], 0);
-
- vfree(bnode->records[j]);
- }
- }
- rcu_lock_release(&rcu_callback_map);
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (put_cached_bnode(krcp, bnode))
- bnode = NULL;
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- if (bnode)
- free_page((unsigned long) bnode);
-
- cond_resched_tasks_rcu_qs();
- }
- }
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
/*
* This is used when the "bulk" path can not be used for the
@@ -2962,21 +2992,7 @@ static void kfree_rcu_work(struct work_struct *work)
* queued on a linked list through their rcu_head structures.
* This list is named "Channel 3".
*/
- for (; head; head = next) {
- void *ptr = (void *) head->func;
- unsigned long offset = (void *) head - ptr;
-
- next = head->next;
- debug_rcu_head_unqueue((struct rcu_head *)ptr);
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
-
- if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
- kvfree(ptr);
-
- rcu_lock_release(&rcu_callback_map);
- cond_resched_tasks_rcu_qs();
- }
+ kvfree_rcu_list(head);
}
static bool
--
2.30.2
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH v2 3/4] rcu/kvfree: Move need_offload_krc() out of krcp->lock
2022-11-29 15:58 [PATCH v2 0/4] kvfree_rcu() updates related to polled API Uladzislau Rezki (Sony)
2022-11-29 15:58 ` [PATCH v2 1/4] rcu/kvfree: Switch to a generic linked list API Uladzislau Rezki (Sony)
2022-11-29 15:58 ` [PATCH v2 2/4] rcu/kvfree: Move bulk/list reclaim to separate functions Uladzislau Rezki (Sony)
@ 2022-11-29 15:58 ` Uladzislau Rezki (Sony)
2022-11-29 23:38 ` Paul E. McKenney
2022-11-29 15:58 ` [PATCH v2 4/4] rcu/kvfree: Use a polled API to speedup a reclaim process Uladzislau Rezki (Sony)
2022-11-29 16:37 ` [PATCH v2 0/4] kvfree_rcu() updates related to polled API Paul E. McKenney
4 siblings, 1 reply; 13+ messages in thread
From: Uladzislau Rezki (Sony) @ 2022-11-29 15:58 UTC (permalink / raw)
To: LKML, RCU, Paul E . McKenney
Cc: Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
Uladzislau Rezki, Oleksiy Avramchenko
Currently the need_offload_krc() function requires krcp->lock
to be held, because krcp->head cannot be checked concurrently.
Fix it by updating krcp->head using the WRITE_ONCE() macro, so
that it becomes lock-free and readers can safely see valid data
without any locking.
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
kernel/rcu/tree.c | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 445f8c11a9a3..c94c17194299 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3058,7 +3058,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
- krcp->head = NULL;
+ WRITE_ONCE(krcp->head, NULL);
}
WRITE_ONCE(krcp->count, 0);
@@ -3072,6 +3072,8 @@ static void kfree_rcu_monitor(struct work_struct *work)
}
}
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
// If there is nothing to detach, it means that our job is
// successfully done here. In case of having at least one
// of the channels that is still busy we should rearm the
@@ -3079,8 +3081,6 @@ static void kfree_rcu_monitor(struct work_struct *work)
// still in progress.
if (need_offload_krc(krcp))
schedule_delayed_monitor_work(krcp);
-
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static enum hrtimer_restart
@@ -3250,7 +3250,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
head->func = ptr;
head->next = krcp->head;
- krcp->head = head;
+ WRITE_ONCE(krcp->head, head);
success = true;
}
@@ -3327,15 +3327,12 @@ static struct shrinker kfree_rcu_shrinker = {
void __init kfree_rcu_scheduler_running(void)
{
int cpu;
- unsigned long flags;
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- raw_spin_lock_irqsave(&krcp->lock, flags);
if (need_offload_krc(krcp))
schedule_delayed_monitor_work(krcp);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
--
2.30.2
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH v2 4/4] rcu/kvfree: Use a polled API to speedup a reclaim process
2022-11-29 15:58 [PATCH v2 0/4] kvfree_rcu() updates related to polled API Uladzislau Rezki (Sony)
` (2 preceding siblings ...)
2022-11-29 15:58 ` [PATCH v2 3/4] rcu/kvfree: Move need_offload_krc() out of krcp->lock Uladzislau Rezki (Sony)
@ 2022-11-29 15:58 ` Uladzislau Rezki (Sony)
2022-12-01 23:45 ` Paul E. McKenney
2022-11-29 16:37 ` [PATCH v2 0/4] kvfree_rcu() updates related to polled API Paul E. McKenney
4 siblings, 1 reply; 13+ messages in thread
From: Uladzislau Rezki (Sony) @ 2022-11-29 15:58 UTC (permalink / raw)
To: LKML, RCU, Paul E . McKenney
Cc: Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
Uladzislau Rezki, Oleksiy Avramchenko
Currently all objects placed into a batch require a full GP
to pass before the objects in that batch become eligible to
be freed.
The problem is that many pointers may already have passed
several GP sequences, so there is no need for an extra delay:
such objects can be reclaimed right away without waiting.
In order to reduce the memory footprint, this patch introduces
a per-page grace-period-controlling mechanism. It allows us
to distinguish pointers for which a grace period has passed
from those for which it has not.
The reclaim thread in its turn frees the memory in reverse
order, starting from the tail, because a GP has likely passed
for objects in those pages. If a page with a GP sequence in
the list hits the condition where its GP is not yet ready, we
bail out, requesting one more grace period in order to
complete the drain process for the remaining pages.
Test example:
kvm.sh --memory 10G --torture rcuscale --allcpus --duration 1 \
--kconfig CONFIG_NR_CPUS=64 \
--kconfig CONFIG_RCU_NOCB_CPU=y \
--kconfig CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y \
--kconfig CONFIG_RCU_LAZY=n \
--bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 \
rcuscale.holdoff=20 rcuscale.kfree_loops=10000 \
torture.disable_onoff_at_boot" --trust-make
Total time taken by all kfree'ers: 8535693700 ns, loops: 10000, batches: 1188, memory footprint: 2248MB
Total time taken by all kfree'ers: 8466933582 ns, loops: 10000, batches: 1157, memory footprint: 2820MB
Total time taken by all kfree'ers: 5375602446 ns, loops: 10000, batches: 1130, memory footprint: 6502MB
Total time taken by all kfree'ers: 7523283832 ns, loops: 10000, batches: 1006, memory footprint: 3343MB
Total time taken by all kfree'ers: 6459171956 ns, loops: 10000, batches: 1150, memory footprint: 6549MB
Total time taken by all kfree'ers: 8560060176 ns, loops: 10000, batches: 1787, memory footprint: 61MB
Total time taken by all kfree'ers: 8573885501 ns, loops: 10000, batches: 1777, memory footprint: 93MB
Total time taken by all kfree'ers: 8320000202 ns, loops: 10000, batches: 1727, memory footprint: 66MB
Total time taken by all kfree'ers: 8552718794 ns, loops: 10000, batches: 1790, memory footprint: 75MB
Total time taken by all kfree'ers: 8601368792 ns, loops: 10000, batches: 1724, memory footprint: 62MB
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
kernel/rcu/tree.c | 47 +++++++++++++++++++++++++++++++++++++++--------
1 file changed, 39 insertions(+), 8 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c94c17194299..44279ca488ef 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2741,11 +2741,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
/**
* struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
* @list: List node. All blocks are linked between each other
+ * @gp_snap: Snapshot of RCU state for objects placed to this bulk
* @nr_records: Number of active pointers in the array
* @records: Array of the kvfree_rcu() pointers
*/
struct kvfree_rcu_bulk_data {
struct list_head list;
+ unsigned long gp_snap;
unsigned long nr_records;
void *records[];
};
@@ -2762,13 +2764,15 @@ struct kvfree_rcu_bulk_data {
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Snapshot of RCU state for objects placed to "@head_free"
* @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
- struct rcu_work rcu_work;
+ struct work_struct rcu_work;
struct rcu_head *head_free;
+ unsigned long head_free_gp_snap;
struct list_head bulk_head_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
@@ -2964,10 +2968,11 @@ static void kfree_rcu_work(struct work_struct *work)
struct rcu_head *head;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
+ unsigned long head_free_gp_snap;
int i;
- krwp = container_of(to_rcu_work(work),
- struct kfree_rcu_cpu_work, rcu_work);
+ krwp = container_of(work,
+ struct kfree_rcu_cpu_work, rcu_work);
krcp = krwp->krcp;
raw_spin_lock_irqsave(&krcp->lock, flags);
@@ -2978,12 +2983,29 @@ static void kfree_rcu_work(struct work_struct *work)
// Channel 3.
head = krwp->head_free;
krwp->head_free = NULL;
+ head_free_gp_snap = krwp->head_free_gp_snap;
raw_spin_unlock_irqrestore(&krcp->lock, flags);
// Handle the first two channels.
- for (i = 0; i < FREE_N_CHANNELS; i++)
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ // Start from the tail page, so a GP is likely passed for it.
+ list_for_each_entry_safe_reverse(bnode, n, &bulk_head[i], list) {
+ // Not yet ready? Bail out since we need one more GP.
+ if (!poll_state_synchronize_rcu(bnode->gp_snap))
+ break;
+
+ list_del_init(&bnode->list);
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ // Please note a request for one more extra GP can
+ // occur only once for all objects in this batch.
+ if (!list_empty(&bulk_head[i]))
+ synchronize_rcu();
+
list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
kvfree_rcu_bulk(krcp, bnode, i);
+ }
/*
* This is used when the "bulk" path can not be used for the
@@ -2992,7 +3014,10 @@ static void kfree_rcu_work(struct work_struct *work)
* queued on a linked list through their rcu_head structures.
* This list is named "Channel 3".
*/
- kvfree_rcu_list(head);
+ if (head) {
+ cond_synchronize_rcu(head_free_gp_snap);
+ kvfree_rcu_list(head);
+ }
}
static bool
@@ -3059,6 +3084,11 @@ static void kfree_rcu_monitor(struct work_struct *work)
if (!krwp->head_free) {
krwp->head_free = krcp->head;
WRITE_ONCE(krcp->head, NULL);
+
+ // Take a snapshot for this krwp. Please note no more
+ // any objects can be added to attached head_free channel
+ // therefore fixate a GP for it here.
+ krwp->head_free_gp_snap = get_state_synchronize_rcu();
}
WRITE_ONCE(krcp->count, 0);
@@ -3068,7 +3098,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// be that the work is in the pending state when
// channels have been detached following by each
// other.
- queue_rcu_work(system_wq, &krwp->rcu_work);
+ queue_work(system_wq, &krwp->rcu_work);
}
}
@@ -3196,8 +3226,9 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
}
- /* Finally insert. */
+ // Finally insert and update the GP for this page.
bnode->records[bnode->nr_records++] = ptr;
+ bnode->gp_snap = get_state_synchronize_rcu();
return true;
}
@@ -4801,7 +4832,7 @@ static void __init kfree_rcu_batch_init(void)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
for (i = 0; i < KFREE_N_BATCHES; i++) {
- INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
+ INIT_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
for (j = 0; j < FREE_N_CHANNELS; j++)
--
2.30.2
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH v2 0/4] kvfree_rcu() updates related to polled API
2022-11-29 15:58 [PATCH v2 0/4] kvfree_rcu() updates related to polled API Uladzislau Rezki (Sony)
` (3 preceding siblings ...)
2022-11-29 15:58 ` [PATCH v2 4/4] rcu/kvfree: Use a polled API to speedup a reclaim process Uladzislau Rezki (Sony)
@ 2022-11-29 16:37 ` Paul E. McKenney
4 siblings, 0 replies; 13+ messages in thread
From: Paul E. McKenney @ 2022-11-29 16:37 UTC (permalink / raw)
To: Uladzislau Rezki (Sony)
Cc: LKML, RCU, Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
Oleksiy Avramchenko
On Tue, Nov 29, 2022 at 04:58:18PM +0100, Uladzislau Rezki (Sony) wrote:
> v1 -> v2:
> - Rebase on latest dev.2022.11.23a branch.
Much better, thank you!
I have pulled these in for review and testing.
Thanx, Paul
> Uladzislau Rezki (Sony) (4):
> rcu/kvfree: Switch to a generic linked list API
> rcu/kvfree: Move bulk/list reclaim to separate functions
> rcu/kvfree: Move need_offload_krc() out of krcp->lock
> rcu/kvfree: Use a polled API to speedup a reclaim process
>
> kernel/rcu/tree.c | 221 +++++++++++++++++++++++++++-------------------
> 1 file changed, 131 insertions(+), 90 deletions(-)
>
> --
> 2.30.2
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/4] rcu/kvfree: Move need_offload_krc() out of krcp->lock
2022-11-29 15:58 ` [PATCH v2 3/4] rcu/kvfree: Move need_offload_krc() out of krcp->lock Uladzislau Rezki (Sony)
@ 2022-11-29 23:38 ` Paul E. McKenney
2022-11-30 12:56 ` Uladzislau Rezki
0 siblings, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2022-11-29 23:38 UTC (permalink / raw)
To: Uladzislau Rezki (Sony)
Cc: LKML, RCU, Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
Oleksiy Avramchenko
On Tue, Nov 29, 2022 at 04:58:21PM +0100, Uladzislau Rezki (Sony) wrote:
> Currently a need_offload_krc() function requires the krcp->lock
> to be held because krcp->head can not be checked concurrently.
>
> Fix it by updating the krcp->head using WRITE_ONCE() macro so
> it becomes lock-free and safe for readers to see a valid data
> without any locking.
Don't we also need to use READ_ONCE() for the code loading this krcp->head
pointer? Or do the remaining plain C-language accesses somehow avoid
running concurrently with those new WRITE_ONCE() invocations?
Thanx, Paul
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
> kernel/rcu/tree.c | 11 ++++-------
> 1 file changed, 4 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 445f8c11a9a3..c94c17194299 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3058,7 +3058,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
> // objects queued on the linked list.
> if (!krwp->head_free) {
> krwp->head_free = krcp->head;
> - krcp->head = NULL;
> + WRITE_ONCE(krcp->head, NULL);
> }
>
> WRITE_ONCE(krcp->count, 0);
> @@ -3072,6 +3072,8 @@ static void kfree_rcu_monitor(struct work_struct *work)
> }
> }
>
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> +
> // If there is nothing to detach, it means that our job is
> // successfully done here. In case of having at least one
> // of the channels that is still busy we should rearm the
> @@ -3079,8 +3081,6 @@ static void kfree_rcu_monitor(struct work_struct *work)
> // still in progress.
> if (need_offload_krc(krcp))
> schedule_delayed_monitor_work(krcp);
> -
> - raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> static enum hrtimer_restart
> @@ -3250,7 +3250,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
>
> head->func = ptr;
> head->next = krcp->head;
> - krcp->head = head;
> + WRITE_ONCE(krcp->head, head);
> success = true;
> }
>
> @@ -3327,15 +3327,12 @@ static struct shrinker kfree_rcu_shrinker = {
> void __init kfree_rcu_scheduler_running(void)
> {
> int cpu;
> - unsigned long flags;
>
> for_each_possible_cpu(cpu) {
> struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
> - raw_spin_lock_irqsave(&krcp->lock, flags);
> if (need_offload_krc(krcp))
> schedule_delayed_monitor_work(krcp);
> - raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
> }
>
> --
> 2.30.2
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/4] rcu/kvfree: Move need_offload_krc() out of krcp->lock
2022-11-29 23:38 ` Paul E. McKenney
@ 2022-11-30 12:56 ` Uladzislau Rezki
2022-11-30 18:44 ` Paul E. McKenney
0 siblings, 1 reply; 13+ messages in thread
From: Uladzislau Rezki @ 2022-11-30 12:56 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki (Sony), LKML, RCU, Frederic Weisbecker,
Neeraj Upadhyay, Joel Fernandes, Oleksiy Avramchenko
On Tue, Nov 29, 2022 at 03:38:33PM -0800, Paul E. McKenney wrote:
> On Tue, Nov 29, 2022 at 04:58:21PM +0100, Uladzislau Rezki (Sony) wrote:
> > Currently a need_offload_krc() function requires the krcp->lock
> > to be held because krcp->head can not be checked concurrently.
> >
> > Fix it by updating the krcp->head using WRITE_ONCE() macro so
> > it becomes lock-free and safe for readers to see a valid data
> > without any locking.
>
> Don't we also need to use READ_ONCE() for the code loading this krcp->head
> pointer? Or do the remaining plain C-language accesses somehow avoid
> running concurrently with those new WRITE_ONCE() invocations?
>
It can be concurrent. I was thinking about it. For some reason I decided
to keep the readers as "regular" ones for loading krcp->head.
In this case, as a worst-case scenario, it might take time for readers
to see an updated value.
So I need to update it or upload one more patch on top of v2. Should
I upload a new patch?
Thanks!
--
Uladzislau Rezki
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/4] rcu/kvfree: Move need_offload_krc() out of krcp->lock
2022-11-30 12:56 ` Uladzislau Rezki
@ 2022-11-30 18:44 ` Paul E. McKenney
2022-12-02 13:19 ` Uladzislau Rezki
0 siblings, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2022-11-30 18:44 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: LKML, RCU, Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
Oleksiy Avramchenko
On Wed, Nov 30, 2022 at 01:56:17PM +0100, Uladzislau Rezki wrote:
> On Tue, Nov 29, 2022 at 03:38:33PM -0800, Paul E. McKenney wrote:
> > On Tue, Nov 29, 2022 at 04:58:21PM +0100, Uladzislau Rezki (Sony) wrote:
> > > Currently a need_offload_krc() function requires the krcp->lock
> > > to be held because krcp->head can not be checked concurrently.
> > >
> > > Fix it by updating the krcp->head using WRITE_ONCE() macro so
> > > it becomes lock-free and safe for readers to see a valid data
> > > without any locking.
> >
> > Don't we also need to use READ_ONCE() for the code loading this krcp->head
> > pointer? Or do the remaining plain C-language accesses somehow avoid
> > running concurrently with those new WRITE_ONCE() invocations?
> >
> It can be concurrent. I was thinking about it. For some reason i decided
> to keep readers as a "regular" ones for loading the krcp->head.
>
> In this case it might take time for readers to see an updated value
> as a worst case scenario.
>
> So i need to update it or upload one more patch on top of v2. Should
> i upload a new patch?
Sending an additional patch should be fine. Unless you would rather it
be folded into one of the existing patches, in which case please start
with the set that I have queued.
Thanx, Paul
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 4/4] rcu/kvfree: Use a polled API to speedup a reclaim process
2022-11-29 15:58 ` [PATCH v2 4/4] rcu/kvfree: Use a polled API to speedup a reclaim process Uladzislau Rezki (Sony)
@ 2022-12-01 23:45 ` Paul E. McKenney
2022-12-02 12:54 ` Uladzislau Rezki
0 siblings, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2022-12-01 23:45 UTC (permalink / raw)
To: Uladzislau Rezki (Sony)
Cc: LKML, RCU, Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
Oleksiy Avramchenko
On Tue, Nov 29, 2022 at 04:58:22PM +0100, Uladzislau Rezki (Sony) wrote:
> Currently all objects placed into a batch require a full GP
> after passing which objects in a batch are eligible to be
> freed.
>
> The problem is that many pointers may already passed several
> GP sequences so there is no need for them in extra delay and
> such objects can be reclaimed right away without waiting.
>
> In order to reduce a memory footprint this patch introduces
> a per-page-grace-period-controlling mechanism. It allows us
> to distinguish pointers for which a grace period is passed
> and for which not.
>
> A reclaim thread in its turn frees a memory in a reverse
> order starting from a tail because a GP is likely passed
> for objects in a page. If a page with a GP sequence in a
> list hits a condition when a GP is not ready we bail out
> requesting one more grace period in order to complete a
> drain process for left pages.
>
> Test example:
>
> kvm.sh --memory 10G --torture rcuscale --allcpus --duration 1 \
> --kconfig CONFIG_NR_CPUS=64 \
> --kconfig CONFIG_RCU_NOCB_CPU=y \
> --kconfig CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y \
> --kconfig CONFIG_RCU_LAZY=n \
> --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 \
> rcuscale.holdoff=20 rcuscale.kfree_loops=10000 \
> torture.disable_onoff_at_boot" --trust-make
>
> Total time taken by all kfree'ers: 8535693700 ns, loops: 10000, batches: 1188, memory footprint: 2248MB
> Total time taken by all kfree'ers: 8466933582 ns, loops: 10000, batches: 1157, memory footprint: 2820MB
> Total time taken by all kfree'ers: 5375602446 ns, loops: 10000, batches: 1130, memory footprint: 6502MB
> Total time taken by all kfree'ers: 7523283832 ns, loops: 10000, batches: 1006, memory footprint: 3343MB
> Total time taken by all kfree'ers: 6459171956 ns, loops: 10000, batches: 1150, memory footprint: 6549MB
>
> Total time taken by all kfree'ers: 8560060176 ns, loops: 10000, batches: 1787, memory footprint: 61MB
> Total time taken by all kfree'ers: 8573885501 ns, loops: 10000, batches: 1777, memory footprint: 93MB
> Total time taken by all kfree'ers: 8320000202 ns, loops: 10000, batches: 1727, memory footprint: 66MB
> Total time taken by all kfree'ers: 8552718794 ns, loops: 10000, batches: 1790, memory footprint: 75MB
> Total time taken by all kfree'ers: 8601368792 ns, loops: 10000, batches: 1724, memory footprint: 62MB
>
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
A couple more questions interspersed below upon further reflection.
Thoughts?
Thanx, Paul
> ---
> kernel/rcu/tree.c | 47 +++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 39 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index c94c17194299..44279ca488ef 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2741,11 +2741,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
> /**
> * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
> * @list: List node. All blocks are linked between each other
> + * @gp_snap: Snapshot of RCU state for objects placed to this bulk
> * @nr_records: Number of active pointers in the array
> * @records: Array of the kvfree_rcu() pointers
> */
> struct kvfree_rcu_bulk_data {
> struct list_head list;
> + unsigned long gp_snap;
> unsigned long nr_records;
> void *records[];
> };
> @@ -2762,13 +2764,15 @@ struct kvfree_rcu_bulk_data {
> * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
> * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
> * @head_free: List of kfree_rcu() objects waiting for a grace period
> + * @head_free_gp_snap: Snapshot of RCU state for objects placed to "@head_free"
> * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
> * @krcp: Pointer to @kfree_rcu_cpu structure
> */
>
> struct kfree_rcu_cpu_work {
> - struct rcu_work rcu_work;
> + struct work_struct rcu_work;
> struct rcu_head *head_free;
> + unsigned long head_free_gp_snap;
> struct list_head bulk_head_free[FREE_N_CHANNELS];
> struct kfree_rcu_cpu *krcp;
> };
> @@ -2964,10 +2968,11 @@ static void kfree_rcu_work(struct work_struct *work)
> struct rcu_head *head;
> struct kfree_rcu_cpu *krcp;
> struct kfree_rcu_cpu_work *krwp;
> + unsigned long head_free_gp_snap;
> int i;
>
> - krwp = container_of(to_rcu_work(work),
> - struct kfree_rcu_cpu_work, rcu_work);
> + krwp = container_of(work,
> + struct kfree_rcu_cpu_work, rcu_work);
> krcp = krwp->krcp;
>
> raw_spin_lock_irqsave(&krcp->lock, flags);
> @@ -2978,12 +2983,29 @@ static void kfree_rcu_work(struct work_struct *work)
> // Channel 3.
> head = krwp->head_free;
> krwp->head_free = NULL;
> + head_free_gp_snap = krwp->head_free_gp_snap;
> raw_spin_unlock_irqrestore(&krcp->lock, flags);
>
> // Handle the first two channels.
> - for (i = 0; i < FREE_N_CHANNELS; i++)
> + for (i = 0; i < FREE_N_CHANNELS; i++) {
> + // Start from the tail page, so a GP is likely passed for it.
> + list_for_each_entry_safe_reverse(bnode, n, &bulk_head[i], list) {
> + // Not yet ready? Bail out since we need one more GP.
> + if (!poll_state_synchronize_rcu(bnode->gp_snap))
> + break;
> +
> + list_del_init(&bnode->list);
> + kvfree_rcu_bulk(krcp, bnode, i);
> + }
> +
> + // Please note a request for one more extra GP can
> + // occur only once for all objects in this batch.
> + if (!list_empty(&bulk_head[i]))
> + synchronize_rcu();
Does directly invoking synchronize_rcu() instead of using queue_rcu_work()
provide benefits, for example, reduced memory footprint? If not,
it would be good to instead use queue_rcu_work() in order to avoid an
unnecessary context switch in this workqueue handler.
My concern is that an RCU CPU stall might otherwise end up tying up more
workqueue kthreads as well as more memory.
> list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
> kvfree_rcu_bulk(krcp, bnode, i);
> + }
>
> /*
> * This is used when the "bulk" path can not be used for the
> @@ -2992,7 +3014,10 @@ static void kfree_rcu_work(struct work_struct *work)
> * queued on a linked list through their rcu_head structures.
> * This list is named "Channel 3".
> */
> - kvfree_rcu_list(head);
> + if (head) {
> + cond_synchronize_rcu(head_free_gp_snap);
And similarly here.
> + kvfree_rcu_list(head);
> + }
> }
>
> static bool
> @@ -3059,6 +3084,11 @@ static void kfree_rcu_monitor(struct work_struct *work)
> if (!krwp->head_free) {
> krwp->head_free = krcp->head;
> WRITE_ONCE(krcp->head, NULL);
> +
> + // Take a snapshot for this krwp. Please note no more
> + // any objects can be added to attached head_free channel
> + // therefore fixate a GP for it here.
> + krwp->head_free_gp_snap = get_state_synchronize_rcu();
> }
>
> WRITE_ONCE(krcp->count, 0);
> @@ -3068,7 +3098,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
> // be that the work is in the pending state when
> // channels have been detached following by each
> // other.
> - queue_rcu_work(system_wq, &krwp->rcu_work);
> + queue_work(system_wq, &krwp->rcu_work);
> }
> }
>
> @@ -3196,8 +3226,9 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
> list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
> }
>
> - /* Finally insert. */
> + // Finally insert and update the GP for this page.
> bnode->records[bnode->nr_records++] = ptr;
> + bnode->gp_snap = get_state_synchronize_rcu();
> return true;
> }
>
> @@ -4801,7 +4832,7 @@ static void __init kfree_rcu_batch_init(void)
> struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
> for (i = 0; i < KFREE_N_BATCHES; i++) {
> - INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> + INIT_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> krcp->krw_arr[i].krcp = krcp;
>
> for (j = 0; j < FREE_N_CHANNELS; j++)
> --
> 2.30.2
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 4/4] rcu/kvfree: Use a polled API to speedup a reclaim process
2022-12-01 23:45 ` Paul E. McKenney
@ 2022-12-02 12:54 ` Uladzislau Rezki
2022-12-02 19:14 ` Paul E. McKenney
0 siblings, 1 reply; 13+ messages in thread
From: Uladzislau Rezki @ 2022-12-02 12:54 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki (Sony), LKML, RCU, Frederic Weisbecker,
Neeraj Upadhyay, Joel Fernandes, Oleksiy Avramchenko
>
> A couple more questions interspersed below upon further reflection.
>
> Thoughts?
>
See below my thoughts:
> Thanx, Paul
>
> > ---
> > kernel/rcu/tree.c | 47 +++++++++++++++++++++++++++++++++++++++--------
> > 1 file changed, 39 insertions(+), 8 deletions(-)
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index c94c17194299..44279ca488ef 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -2741,11 +2741,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
> > /**
> > * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
> > * @list: List node. All blocks are linked between each other
> > + * @gp_snap: Snapshot of RCU state for objects placed to this bulk
> > * @nr_records: Number of active pointers in the array
> > * @records: Array of the kvfree_rcu() pointers
> > */
> > struct kvfree_rcu_bulk_data {
> > struct list_head list;
> > + unsigned long gp_snap;
> > unsigned long nr_records;
> > void *records[];
> > };
> > @@ -2762,13 +2764,15 @@ struct kvfree_rcu_bulk_data {
> > * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
> > * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
> > * @head_free: List of kfree_rcu() objects waiting for a grace period
> > + * @head_free_gp_snap: Snapshot of RCU state for objects placed to "@head_free"
> > * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
> > * @krcp: Pointer to @kfree_rcu_cpu structure
> > */
> >
> > struct kfree_rcu_cpu_work {
> > - struct rcu_work rcu_work;
> > + struct work_struct rcu_work;
> > struct rcu_head *head_free;
> > + unsigned long head_free_gp_snap;
> > struct list_head bulk_head_free[FREE_N_CHANNELS];
> > struct kfree_rcu_cpu *krcp;
> > };
> > @@ -2964,10 +2968,11 @@ static void kfree_rcu_work(struct work_struct *work)
> > struct rcu_head *head;
> > struct kfree_rcu_cpu *krcp;
> > struct kfree_rcu_cpu_work *krwp;
> > + unsigned long head_free_gp_snap;
> > int i;
> >
> > - krwp = container_of(to_rcu_work(work),
> > - struct kfree_rcu_cpu_work, rcu_work);
> > + krwp = container_of(work,
> > + struct kfree_rcu_cpu_work, rcu_work);
> > krcp = krwp->krcp;
> >
> > raw_spin_lock_irqsave(&krcp->lock, flags);
> > @@ -2978,12 +2983,29 @@ static void kfree_rcu_work(struct work_struct *work)
> > // Channel 3.
> > head = krwp->head_free;
> > krwp->head_free = NULL;
> > + head_free_gp_snap = krwp->head_free_gp_snap;
> > raw_spin_unlock_irqrestore(&krcp->lock, flags);
> >
> > // Handle the first two channels.
> > - for (i = 0; i < FREE_N_CHANNELS; i++)
> > + for (i = 0; i < FREE_N_CHANNELS; i++) {
> > + // Start from the tail page, so a GP is likely passed for it.
> > + list_for_each_entry_safe_reverse(bnode, n, &bulk_head[i], list) {
> > + // Not yet ready? Bail out since we need one more GP.
> > + if (!poll_state_synchronize_rcu(bnode->gp_snap))
> > + break;
> > +
> > + list_del_init(&bnode->list);
> > + kvfree_rcu_bulk(krcp, bnode, i);
> > + }
> > +
> > + // Please note a request for one more extra GP can
> > + // occur only once for all objects in this batch.
> > + if (!list_empty(&bulk_head[i]))
> > + synchronize_rcu();
>
> Does directly invoking synchronize_rcu() instead of using queue_rcu_work()
> provide benefits, for example, reduced memory footprint?
>
queue_rcu_work() will delay freeing of all objects in a batch. We can
make use of it but it should be only for the ones which still require
a grace period. The memory footprint and the time depend on when our
callback is invoked by the RCU core to queue the reclaim work.
Such a time can be long, because it depends on many factors:
- scheduling delays in waking gp;
- scheduling delays in kicking nocb;
- delays in waiting in a "cblist":
- dequeuing and invoking f(rhp);
- delay in waking our final reclaim work and giving it CPU time.
This patch combines the possibility to reclaim, as soon as possible,
objects which have passed a grace period with requesting one more GP
for the ones which have not passed it yet.
>
> If not, it would be good to instead use queue_rcu_work() in order
> to avoid an unnecessary context switch in this workqueue handler.
>
I went with the easiest way from a code perspective since I do not
see problems with the current approach from a testing and personal
point of view.
If we are about to do that, I need to add extra logic to split ready
and not-ready pointers for direct reclaim and pass the rest over to
queue_rcu_work().
I can check how it goes.
>
> My concern is that an RCU CPU stall might otherwise end up tying up more
> workqueue kthreads as well as more memory.
>
There is a limit. We have two batches, one work for each. Suppose the
reclaim kthread is stuck in synchronize_rcu() so it does not make any
progress. In this case the same work can only be in a pending state and
nothing more, no matter how many times queue_work() is invoked:
2 * num_possible_cpus();
If we end up in an RCU stall we will not be able to reclaim anyway.
--
Uladzislau Rezki
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/4] rcu/kvfree: Move need_offload_krc() out of krcp->lock
2022-11-30 18:44 ` Paul E. McKenney
@ 2022-12-02 13:19 ` Uladzislau Rezki
0 siblings, 0 replies; 13+ messages in thread
From: Uladzislau Rezki @ 2022-12-02 13:19 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki, LKML, RCU, Frederic Weisbecker, Neeraj Upadhyay,
Joel Fernandes, Oleksiy Avramchenko
On Wed, Nov 30, 2022 at 10:44:55AM -0800, Paul E. McKenney wrote:
> On Wed, Nov 30, 2022 at 01:56:17PM +0100, Uladzislau Rezki wrote:
> > On Tue, Nov 29, 2022 at 03:38:33PM -0800, Paul E. McKenney wrote:
> > > On Tue, Nov 29, 2022 at 04:58:21PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > Currently a need_offload_krc() function requires the krcp->lock
> > > > to be held because krcp->head can not be checked concurrently.
> > > >
> > > > Fix it by updating the krcp->head using WRITE_ONCE() macro so
> > > > it becomes lock-free and safe for readers to see a valid data
> > > > without any locking.
> > >
> > > Don't we also need to use READ_ONCE() for the code loading this krcp->head
> > > pointer? Or do the remaining plain C-language accesses somehow avoid
> > > running concurrently with those new WRITE_ONCE() invocations?
> > >
> > It can be concurrent. I was thinking about it. For some reason i decided
> > to keep readers as a "regular" ones for loading the krcp->head.
> >
> > In this case it might take time for readers to see an updated value
> > as a worst case scenario.
> >
> > So i need to update it or upload one more patch on top of v2. Should
> > i upload a new patch?
>
> Sending an additional patch should be fine. Unless you would rather it
> be folded into one of the existing patches, in which case please start
> with the set that I have queued.
>
Done.
--
Uladzislau Rezki
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 4/4] rcu/kvfree: Use a polled API to speedup a reclaim process
2022-12-02 12:54 ` Uladzislau Rezki
@ 2022-12-02 19:14 ` Paul E. McKenney
0 siblings, 0 replies; 13+ messages in thread
From: Paul E. McKenney @ 2022-12-02 19:14 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: LKML, RCU, Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
Oleksiy Avramchenko
On Fri, Dec 02, 2022 at 01:54:17PM +0100, Uladzislau Rezki wrote:
> >
> > A couple more questions interspersed below upon further reflection.
> >
> > Thoughts?
> >
> See below my thoughts:
>
> > Thanx, Paul
> >
> > > ---
> > > kernel/rcu/tree.c | 47 +++++++++++++++++++++++++++++++++++++++--------
> > > 1 file changed, 39 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index c94c17194299..44279ca488ef 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -2741,11 +2741,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
> > > /**
> > > * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
> > > * @list: List node. All blocks are linked between each other
> > > + * @gp_snap: Snapshot of RCU state for objects placed to this bulk
> > > * @nr_records: Number of active pointers in the array
> > > * @records: Array of the kvfree_rcu() pointers
> > > */
> > > struct kvfree_rcu_bulk_data {
> > > struct list_head list;
> > > + unsigned long gp_snap;
> > > unsigned long nr_records;
> > > void *records[];
> > > };
> > > @@ -2762,13 +2764,15 @@ struct kvfree_rcu_bulk_data {
> > > * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
> > > * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
> > > * @head_free: List of kfree_rcu() objects waiting for a grace period
> > > + * @head_free_gp_snap: Snapshot of RCU state for objects placed to "@head_free"
> > > * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
> > > * @krcp: Pointer to @kfree_rcu_cpu structure
> > > */
> > >
> > > struct kfree_rcu_cpu_work {
> > > - struct rcu_work rcu_work;
> > > + struct work_struct rcu_work;
> > > struct rcu_head *head_free;
> > > + unsigned long head_free_gp_snap;
> > > struct list_head bulk_head_free[FREE_N_CHANNELS];
> > > struct kfree_rcu_cpu *krcp;
> > > };
> > > @@ -2964,10 +2968,11 @@ static void kfree_rcu_work(struct work_struct *work)
> > > struct rcu_head *head;
> > > struct kfree_rcu_cpu *krcp;
> > > struct kfree_rcu_cpu_work *krwp;
> > > + unsigned long head_free_gp_snap;
> > > int i;
> > >
> > > - krwp = container_of(to_rcu_work(work),
> > > - struct kfree_rcu_cpu_work, rcu_work);
> > > + krwp = container_of(work,
> > > + struct kfree_rcu_cpu_work, rcu_work);
> > > krcp = krwp->krcp;
> > >
> > > raw_spin_lock_irqsave(&krcp->lock, flags);
> > > @@ -2978,12 +2983,29 @@ static void kfree_rcu_work(struct work_struct *work)
> > > // Channel 3.
> > > head = krwp->head_free;
> > > krwp->head_free = NULL;
> > > + head_free_gp_snap = krwp->head_free_gp_snap;
> > > raw_spin_unlock_irqrestore(&krcp->lock, flags);
> > >
> > > // Handle the first two channels.
> > > - for (i = 0; i < FREE_N_CHANNELS; i++)
> > > + for (i = 0; i < FREE_N_CHANNELS; i++) {
> > > + // Start from the tail page, so a GP is likely passed for it.
> > > + list_for_each_entry_safe_reverse(bnode, n, &bulk_head[i], list) {
> > > + // Not yet ready? Bail out since we need one more GP.
> > > + if (!poll_state_synchronize_rcu(bnode->gp_snap))
> > > + break;
> > > +
> > > + list_del_init(&bnode->list);
> > > + kvfree_rcu_bulk(krcp, bnode, i);
> > > + }
> > > +
> > > + // Please note a request for one more extra GP can
> > > + // occur only once for all objects in this batch.
> > > + if (!list_empty(&bulk_head[i]))
> > > + synchronize_rcu();
> >
> > Does directly invoking synchronize_rcu() instead of using queue_rcu_work()
> > provide benefits, for example, reduced memory footprint?
> >
> queue_rcu_work() will delay freeing of all objects in a batch. We can
> make use of it but it should be only for the ones which still require
> a grace period. A memory footprint and a time depends on when our
> callback is invoked by the RCU-core to queue the reclaim work.
>
> Such time can be long, because it depends on many factors:
>
> - scheduling delays in waking gp;
> - scheduling delays in kicking nocb;
> - delays in waiting in a "cblist":
> - dequeuing and invoking f(rhp);
> - delay in waking our final reclaim work and giving it a CPU time.
>
> This patch combines a possibility to reclaim asap for objects which
> passed a grace period and requesting one more GP for the ones which
> have not passed it yet.
Understood. It would be necessary to split the list in order to
immediately reclaim those whose grace periods have completed.
Then the remaining objects (only those whose grace periods have
not completed) would be passed to queue_rcu_work().
> > If not, it would be good to instead use queue_rcu_work() in order
> > to avoid an unnecessary context switch in this workqueue handler.
> >
> I went by the most easiest way from code perspective since i do not
> see problems with a current approach from testing and personal point
> of views.
I am worried about corner cases where memory is low and RCU grace periods
are being delayed and workqueues are running short of kthreads.
> If we are about to do that i need to add extra logic to split ready
> and not ready pointers for direct reclaim and the rest over the
> queu_rcu_work().
Agreed.
> I can check how it goes.
Please!
> > My concern is that an RCU CPU stall might otherwise end up tying up more
> > workqueue kthreads as well as more memory.
> >
> There is a limit. We have two batches, one work for each. Suppose the
> reclaim kthread is stuck in synchronize_rcu() so it does not do any
> progress. In this case same work can be only in pending state and
> nothing more no matter how many times the queue_work() is invoked:
>
> 2 * num_possible_cpus();
>
> If we end up in RCU stall we will not be able to reclaim anyway.
Understood.
The goal is not to make progress because, as you say, we cannot make any
progress until the RCU grace period completes. The goal is instead to
avoid tying up workqueue kthreads while in that sad state.
Thanx, Paul
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2022-12-02 19:14 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-11-29 15:58 [PATCH v2 0/4] kvfree_rcu() updates related to polled API Uladzislau Rezki (Sony)
2022-11-29 15:58 ` [PATCH v2 1/4] rcu/kvfree: Switch to a generic linked list API Uladzislau Rezki (Sony)
2022-11-29 15:58 ` [PATCH v2 2/4] rcu/kvfree: Move bulk/list reclaim to separate functions Uladzislau Rezki (Sony)
2022-11-29 15:58 ` [PATCH v2 3/4] rcu/kvfree: Move need_offload_krc() out of krcp->lock Uladzislau Rezki (Sony)
2022-11-29 23:38 ` Paul E. McKenney
2022-11-30 12:56 ` Uladzislau Rezki
2022-11-30 18:44 ` Paul E. McKenney
2022-12-02 13:19 ` Uladzislau Rezki
2022-11-29 15:58 ` [PATCH v2 4/4] rcu/kvfree: Use a polled API to speedup a reclaim process Uladzislau Rezki (Sony)
2022-12-01 23:45 ` Paul E. McKenney
2022-12-02 12:54 ` Uladzislau Rezki
2022-12-02 19:14 ` Paul E. McKenney
2022-11-29 16:37 ` [PATCH v2 0/4] kvfree_rcu() updates related to polled API Paul E. McKenney
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox